diff --git a/.clang-format b/.clang-format
index 9ba433b173..aff93435f5 100644
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
-
diff --git a/.clang_format.hook b/.clang_format.hook
new file mode 100755
index 0000000000..1d92821686
--- /dev/null
+++ b/.clang_format.hook
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+readonly VERSION="3.8"
+
+version=$(clang-format -version)
+
+if ! [[ $version == *"$VERSION"* ]]; then
+    echo "clang-format version check failed."
+    echo "a version contains '$VERSION' is needed, but get '$version'"
+    echo "you can install the right version, and make an soft-link to '\$PATH' env"
+    exit -1
+fi
+
+clang-format "$@"
diff --git a/.copyright.hook b/.copyright.hook
new file mode 100644
index 0000000000..09afff2072
--- /dev/null
+++ b/.copyright.hook
@@ -0,0 +1,121 @@
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import datetime
+import io
+import platform
+import re
+import sys
+
+COPYRIGHT = '''
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+'''
+
+LANG_COMMENT_MARK = None
+
+NEW_LINE_MARK = None
+
+COPYRIGHT_HEADER = None
+
+if platform.system() == "Windows":
+    NEW_LINE_MARK = "\r\n"
+else:
+    NEW_LINE_MARK = '\n'
+    COPYRIGHT_HEADER = COPYRIGHT.split(NEW_LINE_MARK)[1]
+    p = re.search('(\d{4})', COPYRIGHT_HEADER).group(0)
+    process = subprocess.Popen(["date", "+%Y"], stdout=subprocess.PIPE)
+    date, err = process.communicate()
+    date = date.decode("utf-8").rstrip("\n")
+    COPYRIGHT_HEADER = COPYRIGHT_HEADER.replace(p, date)
+
+
+def generate_copyright(template, lang='C'):
+    if lang == 'Python':
+        LANG_COMMENT_MARK = '#'
+    else:
+        LANG_COMMENT_MARK = "//"
+
+    lines = template.split(NEW_LINE_MARK)
+    BLANK = " "
+    ans = LANG_COMMENT_MARK + BLANK + COPYRIGHT_HEADER + NEW_LINE_MARK
+    for lino, line in enumerate(lines):
+        if lino == 0 or lino == 1 or lino == len(lines) - 1: continue
+        if len(line) == 0:
+            BLANK = ""
+        else:
+            BLANK = " "
+        ans += LANG_COMMENT_MARK + BLANK + line + NEW_LINE_MARK
+
+    return ans + "\n"
+
+
+def lang_type(filename):
+    if filename.endswith(".py"):
+        return "Python"
+    elif filename.endswith((".h", ".c", ".hpp", ".cc", ".cpp", ".cu", ".cuh",
+                            ".go", ".proto")):
+        return "C"
+    else:
+        print("Unsupported filetype %s" % filename)
+        sys.exit(0)
+
+
+PYTHON_ENCODE = re.compile("^[ \t\v]*#.*?coding[:=][ \t]*([-_.a-zA-Z0-9]+)")
+
+
+def main(argv=None):
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    retv = 0
+    for filename in args.filenames:
+        # Read the first two lines to detect shebang lines, encoding
+        # declarations and existing copyright notices, then the whole file.
+        with io.open(filename, encoding="utf-8") as fd:
+            first_line = fd.readline()
+            second_line = fd.readline()
+            fd.seek(0)
+            original_contents = fd.read()
+        if "COPYRIGHT (C)" in first_line.upper():
+            continue
+        if first_line.startswith("#!") or PYTHON_ENCODE.match(
+                second_line) is not None or PYTHON_ENCODE.match(
+                    first_line) is not None:
+            continue
+        new_contents = generate_copyright(
+            COPYRIGHT, lang_type(filename)) + original_contents
+        print('Auto Insert Copyright Header {}'.format(filename))
+        retv = 1
+        with io.open(filename, 'w') as output_file:
+            output_file.write(new_contents)
+
+    return retv
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/.gitignore b/.gitignore
index 275173b967..ac56a3320e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,13 @@ third_party/
 
 # clion workspace.
 cmake-build-*
+
+# generated while compiling
+python/paddle/v2/fluid/core.so
+paddle/pybind/pybind.h
+CMakeFiles
+cmake_install.cmake
+paddle/.timestamp
+python/paddlepaddle.egg-info/
+python/paddle/version.py
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4cd8eb12f6..89c620bb2f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,7 +17,25 @@
     -   id: detect-private-key
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
     -   id: end-of-file-fixer
--   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+-   repo: local
     hooks:
-    -   id: clang-formater
+    -   id: clang-format-with-version-check
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: bash ./.clang_format.hook -i
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
+    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
+    hooks:
+    -   id: go-fmt
+        types:
+        - go
+-   repo: local
+    hooks:
+    -   id: copyright_checker
+        name: copyright_checker
+        entry: python ./.copyright.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
+        exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$
diff --git a/.travis.yml b/.travis.yml
index 44b755ee32..e2d49daa19 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,29 +1,26 @@
 language: cpp
 cache:
   directories:
-    - $HOME/third_party
     - $HOME/.ccache
     - $HOME/.cache/pip
+    - $TRAVIS_BUILD_DIR/build/third_party
 sudo: required
 dist: trusty
 os:
   - linux
 env:
-  - JOB=DOCS
-  - JOB=BUILD_AND_TEST
-  - JOB=PRE_COMMIT
+  - JOB=build_doc
+  - JOB=check_style
 addons:
   apt:
     packages:
       - gcc-4.8
       - g++-4.8
-      - gfortran-4.8
       - git
       - build-essential
       - python
       - python-pip
       - python2.7-dev
-      - python-numpy
       - python-wheel
       - libboost-dev
       - curl
@@ -33,29 +30,27 @@ addons:
       - automake
       - libtool
       - ccache
+  ssh_known_hosts: 52.76.173.135
 before_install:
-  - |
-    if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
-      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
-        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
-        then
-          echo "Only markdown docs were updated, stopping build process."
-          exit
-        fi
-      fi
-    fi
-  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
   # protobuf version.
-  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
-  - pip install rarfile
+  - sudo pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
+  - sudo pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
-  - | 
-    timeout 2580 paddle/scripts/travis/main.sh  # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+  - |
+    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else exit 1; fi;
+  - |
+    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;
+    if [[ "$TRAVIS_BRANCH" != "develop"  && ! "$TRAVIS_BRANCH" =~ ^v[[:digit:]]+\.[[:digit:]]+(\.[[:digit:]]+)?(-\S*)?$ ]]; then exit 0; fi;
+    export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
+    export DOCS_DIR=`pwd`
+    cd ..
+    curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc
 notifications:
   email:
     on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
index d5baee2161..4db4a4a8e7 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -1,28 +1,48 @@
 | Github account | name |
 |---|---|
-| reyoung | Yang Yu |
+| backyes | Yan-Fei Wang |
+| beckett1124 | Bin Qi |
+| Canpio | Jia-Yi Feng |
+| chengxiaohua1105 | Xiao-Hua Cheng |
+| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
+| cxysteven | Xing-Yi Cheng |
+| dzhwinter | Zhi-Hong Dong |
+| emailweixu | Wei Xu |
 | gangliao | Gang Liao |
-| luotao01 | Tao Luo |
-| jacquesqiao | Long-Fei Qiao |
-| qingqing01 | Qing-Qing Dang |
+| gongweibao | Wei-Bao Gong |
+| Guo Sheng | Sheng Guo |
+| Haichao-Zhang | Hai-Chao Zhang |
 | hedaoyuan | Dao-Yuan He |
-| wangyang59 | Yang Wang |
+| helinwang | He-Lin Wang |
+| jacquesqiao | Long-Fei Qiao |
+| kuke | Yi-Bing Liu |
+| lcy-seso | Ying Cao |
+| lipeng-unisound | Peng Li |
+| liuyuan | Yuan Liu |
+| livc | Zhao Li |
+| llxxxll | Yong-Feng Liu |
+| luotao01 | Tao Luo |
+| lzhao4ever | Liang Zhao |
+| NHZlX | Zhao-Long Xing |
+| pakchoi | Chuan-Jiang Song |
+| pengli09 | Peng Li |
+| pkuyym | Ya-Ming Yang |
 | QiJune | Jun Qi |
+| qingqing01 | Qing-Qing Dang |
+| reyoung | Yang Yu |
+| Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
-| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | typhoonzero | Yi Wu |
-| backyes | Yan-Fei Wang |
-| pengli09 | Peng Li |
-| livc | Zhao Li |
+| wanghaoshuang | Hao-Shuang Wang |
+| wangyang59 | Yang Wang |
+| wangzhen-nlp | Zhen Wang |
+| wen-bo-yang | Wen-Bo Yang |
+| wwhu | Wei-Wei Hu |
+| xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
+| xujun05 | Jun Xu |
+| xushaoyong | Shao-Yong Xu |
 | Yancey1989 | Xu Yan |
-| emailweixu | Wei Xu |
-| wen-bo-yang | Wen-Bo Yang |
-| helinwang | He-Lin Wang |
-| lcy-seso | Ying Cao |
-| Zrachel | Rui-Qing Zhang |
-| Haichao-Zhang | Hai-Chao Zhang |
-| gongweibao | Wei-Bao Gong |
-| lzhao4ever | Liang Zhao |
+| zhaopu7 | Pu Zhao |
 | zhouxiao-coder | Xiao Zhou |
-| lipeng-unisound | Peng Li |
+| Zrachel | Rui-Qing Zhang |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 79210d0436..7c7eb260ae 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,13 +13,17 @@
 # limitations under the License
 
 cmake_minimum_required(VERSION 3.0)
-
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 include(system)
 
 project(paddle CXX C Go)
+message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
+        "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
+        "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
 
 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
@@ -33,6 +37,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -47,6 +52,14 @@ option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
+# TODO: Compile only the PaddlePaddle Fluid version when the WITH_FLUID option is ON.
+option(WITH_FLUID       "Compile PaddlePaddle fluid only (TODO)"        ON)
+option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
+option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
+option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
+option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -55,22 +68,36 @@ if(NOT CMAKE_BUILD_TYPE)
       FORCE)
 endif()
 
-if(ANDROID)
-    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+if(ANDROID OR IOS)
+    if(ANDROID)
+        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
+            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
+        endif()
     endif()
 
     set(WITH_GPU OFF CACHE STRING
-        "Disable GPU when cross-compiling for Android" FORCE)
+        "Disable GPU when cross-compiling for Android and iOS" FORCE)
     set(WITH_AVX OFF CACHE STRING
-        "Disable AVX when cross-compiling for Android" FORCE)
+        "Disable AVX when cross-compiling for Android and iOS" FORCE)
     set(WITH_PYTHON OFF CACHE STRING
-        "Disable PYTHON when cross-compiling for Android" FORCE)
+        "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
     set(WITH_RDMA OFF CACHE STRING
-        "Disable RDMA when cross-compiling for Android" FORCE)
-endif(ANDROID)
+        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_GOLANG OFF CACHE STRING
+        "Disable golang when cross-compiling for Android and iOS" FORCE)
+
+    # Compile PaddlePaddle mobile inference library
+    if (NOT WITH_C_API)
+        set(WITH_C_API ON CACHE STRING
+            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
+    endif()
+    set(MOBILE_INFERENCE ON)
+    add_definitions(-DPADDLE_MOBILE_INFERENCE)
+endif()
 
-set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
 if (WITH_C_API AND WITH_PYTHON)
@@ -79,8 +106,27 @@ if (WITH_C_API AND WITH_PYTHON)
     "different Python interpreter from compiling.")
 endif()
 
+if (WITH_C_API)
+  set(WITH_FLUID OFF CACHE STRING "Disable installing Fluid when compiling the C_API" FORCE)
+endif()
+
+if(MOBILE_INFERENCE)
+    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
+else()
+    set(THIRD_PARTY_BUILD_TYPE Release)
+endif()
+
+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND AVX2_FOUND)
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
+
 ########################################################################################
 
+include(external/mklml)     # download mklml package
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@@ -88,10 +134,19 @@ include(external/gtest)     # download, build, install gtest
 include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
+include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/boost)     # download, build, install boost
 include(external/any)       # download libn::any
+include(external/eigen)     # download eigen3
+include(external/pybind11)  # download pybind11
+include(external/nccl)
+include(external/cares)
+include(external/grpc)
 
+include(cudnn)              # set cudnn libraries, must before configure
+include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
@@ -99,14 +154,14 @@ include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
-include(configure)          # add paddle env configuration
 
-include_directories("${PROJ_ROOT}")
-include_directories("${PROJ_ROOT}/paddle/cuda/include")
+
+include_directories("${PADDLE_SOURCE_DIR}")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
+include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 
 set(EXTERNAL_LIBS
     ${GFLAGS_LIBRARIES}
@@ -118,15 +173,45 @@ set(EXTERNAL_LIBS
 )
 
 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIB ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIB ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
-    endif(NOT WITH_DSO)
+  include(cuda)
 endif(WITH_GPU)
 
+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
+if(WITH_MKLDNN)
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
+endif()
+
+if(USE_NNPACK)
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
+endif(USE_NNPACK)
+
 add_subdirectory(proto)
+
+if(NOT MOBILE_INFERENCE)
+    # "add_subdirectory(go)" should be placed after the following loine,
+    # because it depends on paddle/optimizer.
+    add_subdirectory(paddle/optimizer)
+endif()
+
+# "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
+# placed after this block, because they depend on it.
+if(WITH_GOLANG)
+    add_subdirectory(go)
+endif(WITH_GOLANG)
+
+set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
+
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+
 add_subdirectory(paddle)
-add_subdirectory(python)
+if(WITH_PYTHON)
+  add_subdirectory(python)
+endif()
 
 if(WITH_DOC)
     add_subdirectory(doc)
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
new file mode 100644
index 0000000000..54131b48ec
--- /dev/null
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,46 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at paddle-dev@baidu.com. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version]
+
+[homepage]: http://contributor-covenant.org
+[version]: http://contributor-covenant.org/version/1/4/
diff --git a/CODE_OF_CONDUCT_cn.md b/CODE_OF_CONDUCT_cn.md
new file mode 100644
index 0000000000..2be794f1f3
--- /dev/null
+++ b/CODE_OF_CONDUCT_cn.md
@@ -0,0 +1,50 @@
+# 参与者公约
+
+## 我们的保证
+
+为了促进一个开放透明且友好的环境,我们作为贡献者和维护者保证:无论年龄、种族、民族、性别认同和表达(方式)、体型、身体健全与否、经验水平、国籍、个人表现、宗教或性别取向,参与者在我们项目和社区中都免于骚扰。
+
+## 我们的标准
+
+有助于创造正面环境的行为包括但不限于:
+* 使用友好和包容性语言
+* 尊重不同的观点和经历
+* 耐心地接受建设性批评
+* 关注对社区最有利的事情
+* 友善对待其他社区成员
+
+身为参与者不能接受的行为包括但不限于:
+* 使用与性有关的言语或是图像,以及不受欢迎的性骚扰
+* 捣乱/煽动/造谣的行为或进行侮辱/贬损的评论,人身攻击及政治攻击
+* 公开或私下的骚扰
+* 未经许可地发布他人的个人资料,例如住址或是电子地址
+* 其他可以被合理地认定为不恰当或者违反职业操守的行为
+
+## 我们的责任
+
+项目维护者有责任为「可接受的行为」标准做出诠释,以及对已发生的不被接受的行为采取恰当且公平的纠正措施。
+
+项目维护者有权利及责任去删除、编辑、拒绝与本行为标准有所违背的评论(comments)、提交(commits)、代码、wiki 编辑、问题(issues)和其他贡献,以及项目维护者可暂时或永久性的禁止任何他们认为有不适当、威胁、冒犯、有害行为的贡献者。
+
+## 使用范围
+
+当一个人代表该项目或是其社区时,本行为标准适用于其项目平台和公共平台。
+
+代表项目或是社区的情况,举例来说包括使用官方项目的电子邮件地址、通过官方的社区媒体账号发布或线上或线下事件中担任指定代表。
+
+该项目的呈现方式可由其项目维护者进行进一步的定义及解释。
+
+## 强制执行
+
+可以通过paddle-dev@baidu.com,来联系项目团队来举报滥用、骚扰或其他不被接受的行为。
+
+任何维护团队认为有必要且适合的所有投诉都将进行审查及调查,并做出相对应的回应。项目小组有对事件回报者有保密的义务。具体执行的方针近一步细节可能会单独公布。
+
+没有切实地遵守或是执行本行为标准的项目维护人员,可能会因项目领导人或是其他成员的决定,暂时或是永久地取消其参与资格。
+
+## 来源
+
+本行为标准改编自[贡献者公约][主页],版本 1.4
+可在此查看:https://www.contributor-covenant.org/zh-cn/version/1/4/code-of-conduct.html
+
+[主页]: https://www.contributor-covenant.org
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 0d4bb973ae..a60453ff4e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1 +1,157 @@
-./doc/howto/dev/contribute_to_paddle_en.md
+# Contribute Code
+
+We sincerely appreciate your contribution.  This document explains our workflow and work style.
+
+## Workflow
+
+PaddlePaddle uses this [Git branching model](http://nvie.com/posts/a-successful-git-branching-model/).  The following steps walk through a typical contribution.
+
+1. Fork
+
+   Our development community has been growing rapidly; it doesn't make sense for everyone to write into the official repo.  So, please file Pull Requests from your fork.  To make a fork, just head over to the GitHub page and click the ["Fork" button](https://help.github.com/articles/fork-a-repo/).
+
+1. Clone
+
+   To copy your fork to your local computer, please run
+
+   ```bash
+   git clone https://github.com/your-github-account/paddle
+   cd paddle
+   ```
+
+1. Create the local feature branch
+
+   For daily work like adding a new feature or fixing a bug, please create a feature branch before coding:
+
+   ```bash
+   git checkout -b my-cool-stuff
+   ```
+
+1. Commit
+
+   Before issuing your first `git commit` command, please install [`pre-commit`](http://pre-commit.com/) by running the following commands:
+
+   ```bash
+   pip install pre-commit
+   pre-commit install
+   ```
+
+   Our pre-commit configuration requires clang-format 3.8 for auto-formatting C/C++ code and yapf for Python code.
+
+   Once installed, `pre-commit` checks the style of code and documentation in every commit.  You will see something like the following when you run `git commit`:
+
+   ```
+   ➜  git commit
+   CRLF end-lines remover...............................(no files to check)Skipped
+   yapf.................................................(no files to check)Skipped
+   Check for added large files..............................................Passed
+   Check for merge conflicts................................................Passed
+   Check for broken symlinks................................................Passed
+   Detect Private Key...................................(no files to check)Skipped
+   Fix End of Files.....................................(no files to check)Skipped
+   clang-formater.......................................(no files to check)Skipped
+   [my-cool-stuff c703c041] add test file
+    1 file changed, 0 insertions(+), 0 deletions(-)
+    create mode 100644 233
+   ```
+
+1. Build and test
+
+   Users can build PaddlePaddle natively on Linux and Mac OS X.  But to unify the build environment and to make debugging easier, the recommended way is [using Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/build_en.md).
+
+1. Keep pulling
+
+   An experienced Git user pulls from the official repo often -- daily or even hourly -- so they notice conflicts with others' work early, when smaller conflicts are easier to resolve.
+
+   ```bash
+   git remote add upstream https://github.com/PaddlePaddle/Paddle
+   git pull upstream develop
+   ```
+
+1. Push and file a pull request
+
+   You can "push" your local work into your forked repo:
+
+   ```bash
+   git push origin my-cool-stuff
+   ```
+
+   The push allows you to create a pull request, requesting owners of this [official repo](https://github.com/PaddlePaddle/Paddle) to pull your change into the official one.
+
+   To create a pull request, please follow [these steps](https://help.github.com/articles/creating-a-pull-request/).
+
+   If your change is for fixing an issue, please write ["Fixes <issue-URL>"](https://help.github.com/articles/closing-issues-using-keywords/) in the description section of your pull request.  GitHub will then close the issue automatically when the owners merge your pull request.
+
+   Please remember to specify some reviewers for your pull request.  If you don't know who the right reviewers are, please follow GitHub's recommendations.
+
+
+1. Delete local and remote branches
+
+   To keep your local workspace and your fork clean, you might want to remove merged branches:
+
+   ```bash
+   git push origin :my-cool-stuff
+   git checkout develop
+   git pull upstream develop
+   git branch -d my-cool-stuff
+   ```
+
+### Code Review
+
+-  Please feel free to ping your reviewers by sending them the URL of your pull request via IM or email.  Please do this after your pull request passes the CI.
+
+- Please reply to every review comment.  If you accept a suggestion, please write "Done"; otherwise, please give a reason.
+
+- If you don't want your reviewers to get overwhelmed by email notifications, you can reply to their comments [in a batch](https://help.github.com/articles/reviewing-proposed-changes-in-a-pull-request/).
+
+- Avoid unnecessary commits.  Some developers commit often.  It is recommended to squash a sequence of small changes into one commit by running `git commit --amend` instead of `git commit`.
+
+
+## Coding Standard
+
+### Code Style
+
+Our C/C++ code follows the [Google style guide](http://google.github.io/styleguide/cppguide.html).
+
+Our Python code follows the [PEP8 style guide](https://www.python.org/dev/peps/pep-0008/).
+
+Our build process helps to check the code style.  In [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/docker/build.sh#L42), the entry point of our [builder Docker image](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/Dockerfile#L88), the CMake argument `WITH_STYLE_CHECK` is set to `ON` by default, so style checking runs as part of the build.
+
+Please install pre-commit, which automatically reformats the changes to C/C++ and Python code whenever we run `git commit`.  To check the whole codebase, we can run the command `pre-commit run -a`, as in the [`check_style.sh` file](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/paddle/scripts/travis/check_style.sh#L30), which is invoked by [our Travis CI configuration](https://github.com/PaddlePaddle/Paddle/blob/b84e8226514b8bb4405c3c28e54aa5077193d179/.travis.yml#L43).
+
+### Unit Tests
+
+Please remember to add related unit tests.
+
+- For C/C++ code, please follow [`google-test` Primer](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
+
+- For Python code, please use [Python's standard `unittest` package](http://pythontesting.net/framework/unittest/unittest-introduction/); a minimal sketch follows this list.
+
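+A minimal `unittest` sketch (the `add` function and `TestAdd` class here are hypothetical stand-ins for real library code):
+
+```python
+import unittest
+
+
+def add(x, y):
+    # Hypothetical function under test; replace with the real code being tested.
+    return x + y
+
+
+class TestAdd(unittest.TestCase):
+    def test_add_integers(self):
+        self.assertEqual(add(1, 2), 3)
+
+
+if __name__ == '__main__':
+    unittest.main()
+```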
+
+### Writing Logs
+
+We use [glog](https://github.com/google/glog) for logging in our C/C++ code.
+
+For general information, please use `LOG`.  For debug information, please use [`VLOG`](http://htmlpreview.github.io/?https://github.com/google/glog/blob/master/doc/glog.html#verbose).  The reason is at [here](https://groups.google.com/a/chromium.org/d/msg/chromium-dev/3NDNd1KzXeY/AZKMMx37fdQJ).
+
+`VLOG` requires a *verbose level* parameter.  For example:
+
+```c++
+VLOG(3) << "Operator FC is taking " << num_inputs << "inputs."
+```
+
+When we run a PaddlePaddle application or test, we can specify a verbose threshold.  For example:
+
+```bash
+GLOG_vmodule=buddy_allocator=2 \
+GLOG_v=10 \
+python \
+../python/paddle/v2/framework/tests/test_recurrent_op.py
+```
+
+This will enable VLOG messages from `buddy_allocator.{h,cc}` at levels 0 through 2 (set by `GLOG_vmodule`) and from everywhere else at levels 0 through 10 (set by `GLOG_v`), so the example `VLOG(3)` message above will be printed.  This suggests that we output the most important messages at lower verbose levels, so they display with higher probability.  When coding C++, please follow the verbose level convention as follows:
+
+- verbose level 1: [framework](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework)
+- verbose level 3: [operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)
+- verbose level 5: [memory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory), [platform](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform)
+- verbose level 7: [math](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/math)
diff --git a/Dockerfile b/Dockerfile
index 39af60966b..6ac9901ac6 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -10,13 +10,11 @@ RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ub
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_STYLE_CHECK
 
 ENV WOBOQ OFF
-ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_GPU=${WITH_GPU:-ON}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
 ENV HOME /root
 # Add bash enhancements
@@ -24,24 +22,27 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc g++ \
-    automake locales clang-format-3.8 swig doxygen cmake  \
-    liblapack-dev liblapacke-dev libboost-dev \
+    python-matplotlib gcc-4.8 g++-4.8 \
+    automake locales clang-format swig doxygen cmake  \
+    liblapack-dev liblapacke-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
     apt-get clean -y
 
-# Install Go
-RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
-    tar -C /usr/local -xzf go.tgz && \
+# Install Go and glide
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
     mkdir /root/gopath && \
-    rm go.tgz
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -s -q https://glide.sh/get | sh
 
 # git credential to skip password typing
 RUN git config --global credential.helper store
@@ -52,19 +53,23 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8
 # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter
 # version util jupyter fixes this issue.
 RUN pip install --upgrade pip && \
-    pip install -U 'protobuf==3.1.0' && \
-    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U wheel && \
     pip install -U docopt PyYAML sphinx && \
-    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
-    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ 
-    pip install rarfile
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark
+
+RUN pip install pre-commit 'ipython==5.3.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip install opencv-python
+
+COPY ./python/requirements.txt /root/
+RUN pip install -r /root/requirements.txt
 
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 
+
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
     (cd /woboq \
diff --git a/Dockerfile.android b/Dockerfile.android
index fa24f6f06c..9d13a414f6 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -4,9 +4,16 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 
+# ENV variables
+ARG ANDROID_ABI
+ARG ANDROID_API
+
+ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
+ENV ANDROID_API=${ANDROID_API:-21}
+
 ENV HOME=/root \
     ANDROID_NDK_HOME=/opt/android-ndk-linux \
-    ANDROID_STANDALONE_TOOLCHAIN=/opt/android-toolchain-gcc
+    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
 
 RUN apt-get update && \
     apt-get install -y \
@@ -14,6 +21,16 @@ RUN apt-get update && \
     wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
     apt-get clean -y
 
+# Install Go and glide
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
@@ -26,13 +43,12 @@ RUN pip install --upgrade pip && \
     pip install pre-commit
 
 # Android NDK
-RUN mkdir /opt/android-ndk-tmp && \
+RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
+    mkdir -p /opt/android-ndk-tmp && \
     cd /opt/android-ndk-tmp && \
     wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
     unzip -q android-ndk-r14b-linux-x86_64.zip && \
     mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
-    ${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh --arch=arm --platform=android-21 --install-dir=${ANDROID_STANDALONE_TOOLCHAIN} && \
-    rm -rf /opt/android-ndk-tmp && \
-    rm -rf ${ANDROID_NDK_HOME}
+    rm -rf /opt/android-ndk-tmp
 
 CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
diff --git a/README.md b/README.md
index fa16cc3cf2..d06375a444 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -36,7 +36,8 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
     examples:
 
       - Optimized math operations through SSE/AVX intrinsics, BLAS libraries
-      (e.g. MKL, ATLAS, cuBLAS) or customized CPU/GPU kernels.
+      (e.g. MKL, OpenBLAS, cuBLAS) or customized CPU/GPU kernels.
+      - Optimized CNN networks through MKL-DNN library.
       - Highly optimized recurrent networks which can handle **variable-length**
       sequence without padding.
       - Optimized local and distributed training for models with high dimensional
@@ -51,45 +52,46 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 - **Connected to Products**
 
     In addition, PaddlePaddle is also designed to be easily deployable. At Baidu,
-    PaddlePaddle has been deployed into products or service with a vast number
+    PaddlePaddle has been deployed into products and services with a vast number
     of users, including ad click-through rate (CTR) prediction, large-scale image
     classification, optical character recognition(OCR), search ranking, computer
     virus detection, recommendation, etc. It is widely utilized in products at
-    Baidu and it has achieved a significant impact. We hope you can also exploit
-    the capability of PaddlePaddle to make a huge impact for your product.
+    Baidu and it has achieved a significant impact. We hope you can also explore
+    the capability of PaddlePaddle to make an impact on your product.
 
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/build_from_source_en.html).
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html) and
+[Chinese](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html) documentation.
 
-- [Deep Learning 101](http://book.paddlepaddle.org/index.html)
+- [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)
 
-  You might want to start from the this online interactive book that can run in Jupyter Notebook.
+  You might want to start from this online interactive book that can run in a Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/cluster_train_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/usage/cluster/k8s_en.html)
 
    You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://www.paddlepaddle.org/docs/develop/documentation/en/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html)
 
    We appreciate your contributions!
 
+
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
diff --git a/RELEASE.cn.md b/RELEASE.cn.md
index 5deaf230a8..494c59730d 100644
--- a/RELEASE.cn.md
+++ b/RELEASE.cn.md
@@ -1,3 +1,62 @@
+# v0.11.0版本
+
+## PaddlePaddle Fluid
+
+- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如:
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的
+源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。
+
+
+## 新特点
+
+* 发布 `PaddlePaddle Fluid`。
+* 增加了用于模型预测的C-API。
+* 用Fluid API实现了一个简单的GAN的例子。
+* 增加了关于性能调优的文档。
+* 为`paddle.v2.dataset`下载数据集提供了重试机制.
+* C++中使用protobuf-lite替换protobuf减少了二进制的大小。
+* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment).
+* 基于Bazel API,利用cmake实现了一套新的构建函数库。
+* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。
+  - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet
+  - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。
+* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign)
+* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod)
+* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance)
+* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq)
+* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score)
+* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice)
+* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* 增加移动端友好的网页
+
+## 改进
+
+* 使用一个Python`whl`包即可安装.
+* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。
+* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。
+* 删除了有一些bug的BarrierStat。
+* 清理和删除了paddle::Parameter中未使用的函数。
+* 删除了ProtoDataProvider。
+* Huber loss同时支持回归和分类。
+* 为sequence pooling 层增加`stride`参数。
+* v2 API自动使用cudnn batch normalization。
+* 可以使用一个固定的参数名共享BN层的参数。
+* 2D convolution operation支持variable-dimension input特性。
+* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。
+* 优化网页导航。
+
+## 错误修复
+
+* 修复ROI pooling的Bug. cc9a761
+* 修复当label是dense vector时AUC变成0的问题. #5274
+* 修复WarpCTC 层的Bug.
+
+
 # v0.10.0版本
 
 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
diff --git a/RELEASE.md b/RELEASE.md
index 146f7afa7d..5a62c95513 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,75 @@
+# Release v0.11.0
+
+## PaddlePaddle Fluid
+
+- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*.  Fluid is
+  designed to allow users to program like PyTorch and TensorFlow Eager Execution.
+  In these systems, there is no longer the concept *model* and applications
+  do not include a symbolic description of a graph of operators nor a sequence
+  of layers. Instead, applications look exactly like a usual program that
+  describes a process of training or inference.  The difference between
+  Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's
+  control-flow constructs such as `if-then-else` or `for`.  Instead, Fluid provides its own
+  C++ implementations and their Python bindings using the `with` statement.  For an example, see:
+
+  https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44
+
+- In 0.11.0, we provide a C++ class `Executor` to run a Fluid program.
+Executor works like an interpreter. In future versions, we will improve
+`Executor` into a debugger like GDB, and we might provide some compilers,
+which, for example, takes an application like the above one, and outputs
+an equivalent C++ source program, which can be compiled using
+[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html)
+to generate binaries that use CUDA, or using
+[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries
+that make full use of Intel CPUs.
+
+## New Features
+
+* Release `PaddlePaddle Fluid`.
+* Add a C-API for model inference.
+* Use the Fluid API to create a simple GAN demo.
+* Add a development guide about performance tuning.
+* Add retries when downloading `paddle.v2.dataset` datasets.
+* Link protobuf-lite instead of protobuf in C++ to reduce the binary size.
+* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released.
+* A new set of CMake functions for Paddle, modeled on the Bazel API.
+* Automatically download and compile the Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when building with `WITH_MKL=ON`.
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn):
+  - Completed 11 MKL-DNN layers: Convolution, Fully Connected, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN.
+  - Completed 3 MKL-DNN networks: VGG-19, ResNet-50, GoogLeNet.
+  - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML.
+* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign).
+* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod).
+* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance).
+* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq).
+* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score).
+* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice).
+* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv)
+* Add mobile friendly webpages.
+
+## Improvements
+
+* Build and install using a single `whl` package.
+* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标).
+* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices.
+* Remove buggy BarrierStat.
+* Clean and remove unused functions in paddle::Parameter.
+* Remove ProtoDataProvider.
+* Huber loss supports both regression and classification.
+* Add the `stride` parameter for sequence pooling layers.
+* Enable the v2 API to use cuDNN batch normalization automatically.
+* The BN layer's parameters can be shared by fixing the parameter name.
+* Support variable-dimension input features for the 2D convolution operation.
+* Refine the CUDA-related CMake code to automatically detect the GPU architecture.
+* Improved website navigation.
+
+## Bug Fixes
+
+* Fix bug in ROI pooling. cc9a761
+* Fix AUC is zero when label is dense vector. #5274
+* Fix bug in WarpCTC layer.
+
 # Release v0.10.0
 
 We are glad to release version 0.10.0.  In this version, we are happy to release the new 
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
new file mode 100644
index 0000000000..8b7dc5b7db
--- /dev/null
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -0,0 +1,112 @@
+# Benchmark
+
+Machine:
+
+- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket
+- Laptop: TBD
+
+System: CentOS release 6.3 (Final), Docker 1.12.1.
+
+PaddlePaddle:
+- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
+  - MKL-DNN tag v0.11
+  - MKLML 2018.0.1.20171007
+- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
+  - OpenBLAS v0.2.20
+
+On each machine, we test and compare the performance of single-node training using MKL-DNN, MKLML, and OpenBLAS respectively.
+
+## Benchmark Model
+
+### Server
+
+#### Training
+Tested with batch sizes 64, 128, and 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz.
+Note that the speeds below include forward, backward, and parameter-update time, so the data cannot be compared directly with the benchmark of the caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which covers only the forward and backward passes. The parameter-update time becomes significant when the weights are large, especially for AlexNet.
+
+Input image size - 3 * 224 * 224, Time: images/second
+
+- VGG-19
+
+| BatchSize    | 64    | 128  | 256     |
+|--------------|-------| -----| --------|
+| OpenBLAS     | 7.80  | 9.00  | 10.80  | 
+| MKLML        | 12.12 | 13.70 | 16.18  |
+| MKL-DNN      | 28.46 | 29.83 | 30.44  |
+
+<img src="figs/vgg-cpu-train.png" width="500">
+
+- ResNet-50
+
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 25.22 | 25.68 | 27.12  | 
+| MKLML        | 32.52 | 31.89 | 33.12  |
+| MKL-DNN      | 81.69 | 82.35 | 84.08  |
+
+<img src="figs/resnet-cpu-train.png" width="500">
+
+- GoogLeNet
+
+| BatchSize    | 64    | 128   | 256    |
+|--------------|-------| ------| -------|
+| OpenBLAS     | 89.52 | 96.97 | 108.25 | 
+| MKLML        | 128.46| 137.89| 158.63 |
+| MKL-DNN      | 250.46| 264.83| 269.50 |
+
+<img src="figs/googlenet-cpu-train.png" width="500">
+
+- AlexNet
+
+| BatchSize    | 64     | 128    | 256    |
+|--------------|--------| ------ | -------|
+| OpenBLAS     | 45.62  | 72.79  | 107.22 | 
+| MKLML        | 66.37  | 105.60 | 144.04 |
+| MKL-DNN      | 399.00 | 498.94 | 626.53 | 
+
+<img src="figs/alexnet-cpu-train.png" width="500">
+
+#### Inference
+Tested with batch sizes 1, 2, 4, 8, and 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz.
+- VGG-19
+
+| BatchSize | 1     | 2     | 4     | 8     | 16    |
+|-----------|-------|-------|-------|-------|-------|
+| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
+| MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
+| MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+<img src="figs/vgg-cpu-infer.png" width="500">
+
+- ResNet-50
+
+| BatchSize | 1     | 2      | 4      | 8      | 16     |
+|-----------|-------|--------|--------|--------|--------|
+| OpenBLAS  | 3.31  | 6.72   | 11.59  | 13.17  | 9.27   |
+| MKLML     | 6.33  | 12.02  | 22.88  | 40.53  | 63.09  |
+| MKL-DNN   | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
+<img src="figs/resnet-cpu-infer.png" width="500">
+
+- GoogLeNet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
+| MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
+| MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
+
+<img src="figs/googlenet-cpu-infer.png" width="500">
+
+- AlexNet
+
+| BatchSize | 1      | 2      | 4      | 8      | 16     |
+|-----------|--------|--------|--------|--------|--------|
+| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
+| MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
+| MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
+
+<img src="figs/alexnet-cpu-infer.png" width="500">
+
+### Laptop
+TBD
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
new file mode 100644
index 0000000000..b619613ea7
--- /dev/null
+++ b/benchmark/cluster/README.md
@@ -0,0 +1,78 @@
+# Cluster Training Benchmark
+
+## Setup
+
+- Platform
+  - Kubernetes: v1.6.2
+  - Linux Kernel: v3.10.0
+
+- Resource
+  - CPU: 10 Cores per Pod
+  - Memory: 5GB per Pod
+
+- Docker Image
+
+  We use different base Docker images to run the benchmark on Kubernetes:
+  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
+  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
+  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
+
+- Model
+  vgg16 is used in this benchmark.
+
+## Cases
+
+- Variable
+  - Batch Size of training data.
+  - PServer count of the training job.
+  - The number of trainers.
+
+- Invariant
+  - The resource of trainer/pserver Pod.
+
+### Measure the Performance for Different Batch Size
+
+- PServer Count: 40
+- Trainer Count: 100
+- Metrics: mini-batch / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure the Performance for Different PServer Count
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure Parallel Efficiency By Increasing Trainer Count
+
+- PServer Count: 20
+- Batch Size: 64
+- Metrics:
+
+$S = \frac{T_1}{T_N}$
+
+where $S$ is the speedup, and $T_1$ and $T_N$ are the training times with 1 and $N$ trainers respectively.
+The parallel efficiency is then:
+
+$E = \frac{S}{N}$
+
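+For example, with illustrative numbers: if one trainer finishes a pass in $T_1 = 3000$ seconds and 10 trainers finish in $T_{10} = 600$ seconds, then $S = 3000 / 600 = 5$ and $E = 5 / 10 = 0.5$, i.e. 50% parallel efficiency.
+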
+| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
+| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
+
+## Reproduce the benchmark
+
+TODO
diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png
new file mode 100644
index 0000000000..6215ae4e42
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ
diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png
new file mode 100644
index 0000000000..b3200bbc04
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ
diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png
new file mode 100644
index 0000000000..19478d433b
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
new file mode 100644
index 0000000000..4e86e058d0
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png
new file mode 100644
index 0000000000..bc43d4b8d2
Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
new file mode 100644
index 0000000000..96746b1759
Binary files /dev/null and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png
new file mode 100644
index 0000000000..3a51ec6c47
Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
new file mode 100644
index 0000000000..6d548cfd59
Binary files /dev/null and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/alexnet.py b/benchmark/paddle/image/alexnet.py
index 3358d43a4b..7029608187 100644
--- a/benchmark/paddle/image/alexnet.py
+++ b/benchmark/paddle/image/alexnet.py
@@ -1,4 +1,16 @@
-#!/usr/bin/env python
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
@@ -6,10 +18,24 @@ height = 227
 width = 227
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+gp = get_config_arg('layer_num', int, 1)
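+# Note: 'layer_num' is reused here as the group count (gp) for the grouped conv layers below.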
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
 
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 
 settings(
     batch_size=batch_size,
@@ -31,7 +57,7 @@ net = img_pool_layer(input=net, pool_size=3, stride=2)
 
 # conv2
 net = img_conv_layer(
-    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=1)
+    input=net, filter_size=5, num_filters=256, stride=1, padding=2, groups=gp)
 net = img_cmrnorm_layer(input=net, size=5, scale=0.0001, power=0.75)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 
@@ -40,11 +66,11 @@ net = img_conv_layer(
     input=net, filter_size=3, num_filters=384, stride=1, padding=1)
 # conv4
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=384, stride=1, padding=1, groups=gp)
 
 # conv5
 net = img_conv_layer(
-    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=1)
+    input=net, filter_size=3, num_filters=256, stride=1, padding=1, groups=gp)
 net = img_pool_layer(input=net, pool_size=3, stride=2)
 
 net = fc_layer(
@@ -59,6 +85,9 @@ net = fc_layer(
     layer_attr=ExtraAttr(drop_rate=0.5))
 net = fc_layer(input=net, size=1000, act=SoftmaxActivation())
 
-lab = data_layer('label', num_class)
-loss = cross_entropy(input=net, label=lab)
-outputs(loss)
+if is_infer:
+    outputs(net)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=net, label=lab)
+    outputs(loss)
diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py
index bc893bab98..2a850ccb7f 100644
--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -5,10 +5,24 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
-
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+use_gpu = get_config_arg('use_gpu', bool, True)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
+
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 
 settings(
     batch_size=batch_size,
@@ -16,6 +30,8 @@ settings(
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))
 
+conv_projection = conv_projection if use_gpu else img_conv_layer
+
 def inception2(name, input, channels, \
     filter1,
     filter3R, filter3,
@@ -138,12 +154,11 @@ def inception(name, input, channels, \
     cat = concat_layer(
         name=name,
         input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
         act=ReluActivation())
     return cat
 
 
-lab = data_layer(name="label", size=1000)
 data = data_layer(name="input", size=3 * height * width)
 
 # stage 1
@@ -221,6 +236,10 @@ pool5 = img_pool_layer(
 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
 out3 = fc_layer(
     name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-loss3 = cross_entropy(name='loss3', input=out3, label=lab)
 
-outputs(loss3)
+if is_infer:
+    outputs(out3)
+else:
+    lab = data_layer(name="label", size=num_class)
+    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+    outputs(loss3)
diff --git a/benchmark/paddle/image/plotlog.py b/benchmark/paddle/image/plotlog.py
new file mode 100644
index 0000000000..8679d4f272
--- /dev/null
+++ b/benchmark/paddle/image/plotlog.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import argparse
+import matplotlib.pyplot as plt
+
+
+def parse_args():
+    parser = argparse.ArgumentParser('Parse Log')
+    parser.add_argument(
+        '--file_path', '-f', type=str, help='the path of the log file')
+    parser.add_argument(
+        '--sample_rate',
+        '-s',
+        type=float,
+        default=1.0,
+        help='the rate to take samples from log')
+    parser.add_argument(
+        '--log_period', '-p', type=int, default=1, help='the period of log')
+
+    args = parser.parse_args()
+    return args
+
+
+def parse_file(file_name):
+    loss = []
+    error = []
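+    # Expected log line shape (illustrative; the exact fields depend on the
+    # trainer's log format): 5 space-separated fields starting with 'pass',
+    # with 'loss=<v>,' as the 3rd field and 'error=<v>,' as the 4th.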
+    with open(file_name) as f:
+        for i, line in enumerate(f):
+            line = line.strip()
+            if not line.startswith('pass'):
+                continue
+            line_split = line.split(' ')
+            if len(line_split) != 5:
+                continue
+
+            loss_str = line_split[2][:-1]
+            cur_loss = float(loss_str.split('=')[-1])
+            loss.append(cur_loss)
+
+            err_str = line_split[3][:-1]
+            cur_err = float(err_str.split('=')[-1])
+            error.append(cur_err)
+
+    accuracy = [1.0 - err for err in error]
+
+    return loss, accuracy
+
+
+def sample(metric, sample_rate):
+    interval = int(1.0 / sample_rate)
+    if interval > len(metric):
+        return metric[:1]
+
+    num = len(metric) // interval
+    idx = [interval * i for i in range(num)]
+    metric_sample = [metric[i] for i in idx]
+    return metric_sample
+
+
+def plot_metric(metric,
+                batch_id,
+                graph_title,
+                line_style='b-',
+                line_label='y',
+                line_num=1):
+    plt.figure()
+    plt.title(graph_title)
+    if line_num == 1:
+        plt.plot(batch_id, metric, line_style, label=line_label)
+    else:
+        for i in range(line_num):
+            plt.plot(batch_id, metric[i], line_style[i], label=line_label[i])
+    plt.xlabel('batch')
+    plt.ylabel(graph_title)
+    plt.legend()
+    plt.savefig(graph_title + '.jpg')
+    plt.close()
+
+
+def main():
+    args = parse_args()
+    assert args.sample_rate > 0. and args.sample_rate <= 1.0, "The sample rate should be in the range (0, 1]."
+
+    loss, accuracy = parse_file(args.file_path)
+    batch = [args.log_period * i for i in range(len(loss))]
+
+    batch_sample = sample(batch, args.sample_rate)
+    loss_sample = sample(loss, args.sample_rate)
+    accuracy_sample = sample(accuracy, args.sample_rate)
+
+    plot_metric(loss_sample, batch_sample, 'loss', line_label='loss')
+    plot_metric(
+        accuracy_sample,
+        batch_sample,
+        'accuracy',
+        line_style='g-',
+        line_label='accuracy')
+
+
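+# Example usage (log file name is illustrative):
+#   python plotlog.py -f train.log -s 0.1 -p 10
+# Saves loss.jpg and accuracy.jpg in the current directory.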
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py
index 1ac47212b5..21e0d381aa 100644
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import io, os
 import random
 import numpy as np
@@ -13,14 +27,21 @@ def initHook(settings, height, width, color, num_class, **kwargs):
         settings.data_size = settings.height * settings.width * 3
     else:
         settings.data_size = settings.height * settings.width
-
-    settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+    settings.is_infer = kwargs.get('is_infer', False)
+    settings.num_samples = kwargs.get('num_samples', 2560)
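+    # In inference mode only the image slot is fed; training also feeds an integer label.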
+    if settings.is_infer:
+        settings.slots = [dense_vector(settings.data_size)]
+    else:
+        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
 
 
 @provider(
     init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(1024):
+    for i in xrange(settings.num_samples):
         img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class)
-        yield img.astype('float32'), int(lab)
+        if settings.is_infer:
+            yield img.astype('float32')
+        else:
+            lab = random.randint(0, settings.num_class - 1)
+            yield img.astype('float32'), int(lab)
diff --git a/v1_api_demo/model_zoo/resnet/resnet.py b/benchmark/paddle/image/resnet.py
similarity index 65%
rename from v1_api_demo/model_zoo/resnet/resnet.py
rename to benchmark/paddle/image/resnet.py
index 6fdd97fefc..2846e4763f 100644
--- a/v1_api_demo/model_zoo/resnet/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -1,65 +1,37 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#!/usr/bin/env python
 from paddle.trainer_config_helpers import *
-"""
-paper: https://arxiv.org/abs/1512.03385
-"""
-is_test = get_config_arg("is_test", bool, False)
-is_predict = get_config_arg("is_predict", bool, False)
-data_provider = get_config_arg("data_provider", bool, True)
-layer_num = get_config_arg("layer_num", int, 50)
-
-if not is_predict and data_provider:
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args = {
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224,
-        'crop_size': 224,
-        'color': True,
-        'swap_channel:': [2, 1, 0]
-    }
-    define_py_data_sources2(
-        train_list,
-        'example/test.list',
-        module="example.image_list_provider",
-        obj="processData",
-        args=args)
 
-batch_size = 1
-learning_rate = 0.1 / batch_size
-momentum = 0.9
-weight_decay = 0.0001 * batch_size
-default_momentum(momentum)
-default_decay_rate(weight_decay)
-
-Settings(
-    algorithm='sgd',
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg("layer_num", int, 50)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
+
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
+define_py_data_sources2(
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
+
+settings(
     batch_size=batch_size,
-    learning_rate=learning_rate,
-
-    # set the appropriate parameters according your schedule
-    learning_method='momentum',
-    learning_rate_decay_a=0.5,
-    learning_rate_decay_b=1200000 * 10,
-    learning_rate_schedule="discexp", )
+    learning_rate=0.01 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
 
 
+####################### Network Configuration #######################
 def conv_bn_layer(name,
                   input,
                   filter_size,
@@ -85,7 +57,10 @@ def conv_bn_layer(name,
         act=LinearActivation(),
         bias_attr=False)
     return batch_norm_layer(
-        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+        name=name + "_bn",
+        input=tmp,
+        act=active_type,
+        use_global_stats=is_infer)
 
 
 def bottleneck_block(name, input, num_filters1, num_filters2):
@@ -168,6 +143,9 @@ def mid_projection(name, input, num_filters1, num_filters2, stride=2):
         name=name + "_addto", input=[branch1, last_name], act=ReluActivation())
 
 
+img = data_layer(name='image', size=height * width * 3)
+
+
 def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
     """
     A wrapper for 50,101,152 layers of ResNet.
@@ -178,10 +156,9 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
     """
     # For ImageNet
     # conv1: 112x112
-    img = data_layer(name='input', size=224 * 224 * 3)
     tmp = conv_bn_layer(
         "conv1",
-        img,
+        input=img,
         filter_size=7,
         channels=3,
         num_filters=64,
@@ -233,39 +210,21 @@ def deep_res_net(res2_num=3, res3_num=4, res4_num=6, res5_num=3):
         stride=1,
         pool_type=AvgPooling())
 
-    output = fc_layer(
-        name='output', input=tmp, size=1000, act=SoftmaxActivation())
-
-    if not is_predict:
-        classification_cost(
-            input=output, label=data_layer(
-                name='label', size=1))
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
 
 
-def res_net_50():
-    deep_res_net(3, 4, 6, 3)
-
-
-def res_net_101():
-    deep_res_net(3, 4, 23, 3)
-
-
-def res_net_152():
-    deep_res_net(3, 8, 36, 3)
-
-
-if not is_predict:
-    Inputs("input", "label")
-else:
-    Inputs("input")
-# Outputs("cost-softmax" if not is_predict else "output")
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-
 if layer_num == 50:
-    res_net_50()
+    resnet = deep_res_net(3, 4, 6, 3)
 elif layer_num == 101:
-    res_net_101()
+    resnet = deep_res_net(3, 4, 23, 3)
 elif layer_num == 152:
-    res_net_152()
+    resnet = deep_res_net(3, 8, 36, 3)
 else:
     print("Wrong layer number.")
+
+if is_infer:
+    outputs(resnet)
+else:
+    lbl = data_layer(name="label", size=num_class)
+    loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(loss)
diff --git a/benchmark/paddle/image/run_mkl_infer.sh b/benchmark/paddle/image/run_mkl_infer.sh
new file mode 100755
index 0000000000..62c9bf6efd
--- /dev/null
+++ b/benchmark/paddle/image/run_mkl_infer.sh
@@ -0,0 +1,87 @@
+set -e
+
+function clock_to_seconds() {
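+  # Convert an HH:MM:SS(.ss) timestamp to seconds, e.g. 01:02:03 -> 3723.00.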
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    if [ $thread -gt $bs ]; then
+      thread=$bs
+    fi
+    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "Training model ${topology}_${layer_num}"
+    paddle train --job=train \
+      --config="${topology}.py" \
+      --use_mkldnn=True \
+      --use_gpu=False \
+      --trainer_count=1 \
+      --num_passes=1 \
+      --save_dir="models/${topology}-${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num},num_samples=256" \
+      > /dev/null 2>&1
+    echo "Done"
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # Compute the time of the last 5 log periods (1280 samples);
+  # everything before that is treated as warm-up.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+if [ ! -d "models" ]; then
+  mkdir -p models
+fi
+
+# inference benchmark
+for use_mkldnn in True False; do
+  for batchsize in 1 2 4 8 16; do
+    infer vgg 19 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer alexnet 2 $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/run_mkl_train.sh b/benchmark/paddle/image/run_mkl_train.sh
new file mode 100755
index 0000000000..03d2d378fb
--- /dev/null
+++ b/benchmark/paddle/image/run_mkl_train.sh
@@ -0,0 +1,52 @@
+set -e
+
+function train() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    # each trainer uses only 1 core to avoid conflicts
+    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=10 \
+    --test_period=100 \
+    --config_args=$args \
+    2>&1 | tee ${log}
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
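+  # avg_time from the log is in ms per mini-batch (hence the *1000 below to get images/sec).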
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for use_mkldnn in True False; do
+  for batchsize in 64 128 256; do
+    train vgg 19 $batchsize $use_mkldnn
+    train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
+    train alexnet 2 $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
new file mode 100755
index 0000000000..a9a7b8a667
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -0,0 +1,69 @@
+set -e
+
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+
+function infer() {
+  export OPENBLAS_MAIN_FREE=1
+  topology=$1
+  layer_num=$2
+  bs=$3
+  trainers=`nproc`
+  if [ $trainers -gt $bs ]; then
+    trainers=$bs
+  fi
+  log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
+  threads=$((`nproc` / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  export OPENBLAS_NUM_THREADS=$threads
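+  # e.g. on a 20-core machine with bs=8: trainers=8 and OPENBLAS_NUM_THREADS=2.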
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "./run_mkl_infer.sh to save the model first"
+    exit 0
+  fi
+  log_period=$((32 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$trainers \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # Compute the time of the last 5 log periods (160 = 32 * 5 samples);
+  # everything before that is treated as warm-up.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(160 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 160 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# inference benchmark
+for batchsize in 1 2 4 8 16; do
+  infer vgg 19 $batchsize
+  infer resnet 50 $batchsize
+  infer googlenet v1 $batchsize
+  infer alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/run_openblas_train.sh b/benchmark/paddle/image/run_openblas_train.sh
new file mode 100755
index 0000000000..935cff6f2c
--- /dev/null
+++ b/benchmark/paddle/image/run_openblas_train.sh
@@ -0,0 +1,41 @@
+set -e
+
+function train() {
+  export OPENBLAS_NUM_THREADS=1
+  topology=$1
+  layer_num=$2
+  bs=$3
+  thread=`nproc`
+  # each trainer uses only 1 core to avoid conflicts
+  log="logs/train-${topology}-${layer_num}-${thread}openblas-${bs}.log"
+  args="batch_size=${bs},layer_num=${layer_num}"
+  config="${topology}.py"
+  paddle train --job=time \
+    --config=$config \
+    --use_mkldnn=False \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=3 \
+    --test_period=30 \
+    --config_args=$args \
+    2>&1 | tee ${log}
+
+  avg_time=`tail ${log} -n 1 | awk -F ' ' '{print $8}' | sed 's/avg=//'`
+  fps=`awk 'BEGIN{printf "%.2f",('$bs' / '$avg_time' * 1000)}'`
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+
+# training benchmark
+for batchsize in 64 128 256; do
+  train vgg 19 $batchsize
+  train resnet 50 $batchsize
+  train googlenet v1 $batchsize
+  train alexnet 2 $batchsize
+done
diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py
new file mode 100644
index 0000000000..ca0a6798fb
--- /dev/null
+++ b/benchmark/paddle/image/vgg.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python
+from paddle.trainer_config_helpers import *
+
+height = 224
+width = 224
+num_class = 1000
+batch_size = get_config_arg('batch_size', int, 64)
+layer_num = get_config_arg('layer_num', int, 19)
+is_infer = get_config_arg("is_infer", bool, False)
+num_samples = get_config_arg('num_samples', int, 2560)
+
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer,
+    'num_samples': num_samples
+}
+define_py_data_sources2(
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
+
+settings(
+    batch_size=batch_size,
+    learning_rate=0.001 / batch_size,
+    learning_method=MomentumOptimizer(0.9),
+    regularization=L2Regularization(0.0005 * batch_size))
+
+img = data_layer(name='image', size=height * width * 3)
+
+
+def vgg_network(vgg_num=3):
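+    # vgg_num sets the number of conv layers in each of the three deeper
+    # conv groups: 3 builds VGG-16, 4 builds VGG-19.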
+    tmp = img_conv_group(
+        input=img,
+        num_channels=3,
+        conv_padding=1,
+        conv_num_filter=[64, 64],
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_size=2,
+        pool_stride=2,
+        pool_type=MaxPooling())
+
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=[128, 128],
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    channels = []
+    for i in range(vgg_num):
+        channels.append(256)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    channels = []
+    for i in range(vgg_num):
+        channels.append(512)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+    tmp = img_conv_group(
+        input=tmp,
+        conv_num_filter=channels,
+        conv_padding=1,
+        conv_filter_size=3,
+        conv_act=ReluActivation(),
+        pool_stride=2,
+        pool_type=MaxPooling(),
+        pool_size=2)
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    tmp = fc_layer(
+        input=tmp,
+        size=4096,
+        act=ReluActivation(),
+        layer_attr=ExtraAttr(drop_rate=0.5))
+
+    return fc_layer(input=tmp, size=num_class, act=SoftmaxActivation())
+
+
+if layer_num == 16:
+    vgg = vgg_network(3)
+elif layer_num == 19:
+    vgg = vgg_network(4)
+else:
+    print("Wrong layer number.")
+
+if is_infer:
+    outputs(vgg)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=vgg, label=lab)
+    outputs(loss)
diff --git a/benchmark/paddle/rnn/imdb.py b/benchmark/paddle/rnn/imdb.py
index fc4ed4025f..c3b5faa19a 100755
--- a/benchmark/paddle/rnn/imdb.py
+++ b/benchmark/paddle/rnn/imdb.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from __future__ import print_function
 import six.moves.cPickle as pickle
 import gzip
diff --git a/benchmark/paddle/rnn/provider.py b/benchmark/paddle/rnn/provider.py
index 928ca75daf..f35cd5b079 100644
--- a/benchmark/paddle/rnn/provider.py
+++ b/benchmark/paddle/rnn/provider.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import io, os
 import random
 import numpy as np
diff --git a/benchmark/tensorflow/image/alexnet.py b/benchmark/tensorflow/image/alexnet.py
index f6a39ef778..a37d7e7c62 100644
--- a/benchmark/tensorflow/image/alexnet.py
+++ b/benchmark/tensorflow/image/alexnet.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/benchmark/tensorflow/image/alexnet_multi_gpu.py b/benchmark/tensorflow/image/alexnet_multi_gpu.py
index 7b5ee78f4d..2ebab8fb60 100644
--- a/benchmark/tensorflow/image/alexnet_multi_gpu.py
+++ b/benchmark/tensorflow/image/alexnet_multi_gpu.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/benchmark/tensorflow/image/googlenet.py b/benchmark/tensorflow/image/googlenet.py
index decf855b54..1202cbb171 100644
--- a/benchmark/tensorflow/image/googlenet.py
+++ b/benchmark/tensorflow/image/googlenet.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from six.moves import xrange
 from datetime import datetime
 import math
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
index 31466faa37..f06437eb6c 100644
--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/benchmark/tensorflow/image/smallnet_mnist_cifar.py b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
index 1a625134a6..558c68575f 100644
--- a/benchmark/tensorflow/image/smallnet_mnist_cifar.py
+++ b/benchmark/tensorflow/image/smallnet_mnist_cifar.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/benchmark/tensorflow/rnn/reader.py b/benchmark/tensorflow/rnn/reader.py
index f538329a15..9660d3c22b 100755
--- a/benchmark/tensorflow/rnn/reader.py
+++ b/benchmark/tensorflow/rnn/reader.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os.path
 import io
 import numpy as np
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 913f711aff..6320b17520 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,85 +1,27 @@
 # Find the CBlas and lapack libraries
 #
-# It will search MKL, atlas, OpenBlas, reference-cblas in order.
+# It will search MKLML, atlas, OpenBlas, reference-cblas in order.
 #
 # If any cblas implementation found, the following variable will be set.
-#    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
+#    CBLAS_PROVIDER  # one of MKLML, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
 #    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
-#
-# User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
-# during cmake. If none of them set, it will try to find cblas implementation in
-# system paths.
-#
 
 set(CBLAS_FOUND OFF)
 
-## Find MKL First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
-
-find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_ROOT}/include)
-find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_ROOT}/include)
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
-
-if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
-  set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
-
-  add_definitions(-DPADDLE_USE_MKL)
-  add_definitions(-DLAPACK_FOUND)
-
-  message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
-  return()
-endif()
-
-## Then find atlas.
-set(ATLAS_ROOT $ENV{ATLAS_ROOT} CACHE PATH "Folder contains Atlas")
-set(ATLAS_INCLUDE_SEARCH_PATHS
-        ${ATLAS_ROOT}/include
-        /usr/include
-        /usr/include/atlas)
-set(ATLAS_LIB_SEARCH_PATHS
-        ${ATLAS_ROOT}/lib
-        /usr/lib
-        /usr/lib/blas/atlas
-        /usr/lib/atlas
-        /usr/lib/atlas-base   # special for ubuntu 14.04.
-    )
-find_path(ATLAS_INC_DIR NAMES cblas.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
-  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
-  PATHS ${ATLAS_LIB_SEARCH_PATHS})
-
-if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
+## Find MKLML First.
+if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
   set(CBLAS_FOUND ON)
-  set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
+  set(CBLAS_PROVIDER MKLML)
+  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKLML_LIB})
 
-  add_definitions(-DPADDLE_USE_ATLAS)
+  add_definitions(-DPADDLE_WITH_MKLML)
   add_definitions(-DLAPACK_FOUND)
 
-  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
+  message(STATUS "Found cblas and lapack in MKLML "
+    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   return()
 endif()
 
@@ -150,3 +92,10 @@ if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
   message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
+
+if(IOS_USE_VECLIB_FOR_BLAS AND VECLIB_FOUND)
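+  # Note: CBLAS_LIBRARIES is intentionally left unset here; vecLib is assumed
+  # to be linked through the Accelerate framework elsewhere in the build.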
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER vecLib)
+  set(CBLAS_INC_DIR ${VECLIB_INC_DIR})
+  add_definitions(-DPADDLE_USE_VECLIB)
+endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 5e507e78f7..5c6bcfde76 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -24,10 +24,23 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
+
+if(WITH_TESTING)
+    add_definitions(-DPADDLE_WITH_TESTING)
+endif(WITH_TESTING)
+
 if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
 
+if(USE_EIGEN_FOR_BLAS)
+    add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
+endif(USE_EIGEN_FOR_BLAS)
+
 if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
@@ -40,20 +53,25 @@ if(NOT CMAKE_CROSSCOMPILING)
     endif()
 endif()
 
+if(NOT WITH_GOLANG)
+    add_definitions(-DPADDLE_WITHOUT_GOLANG)
+endif(NOT WITH_GOLANG)
+
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
+
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
     endif()
 
     if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
+        message(FATAL_ERROR "Paddle needs cudnn to compile")
     endif()
 
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
@@ -63,5 +81,63 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
+
+if(WITH_GOLANG)
+  # We need to symlink the Paddle directory into GOPATH. If we
+  # don't, `go get ./...` will download a fresh Paddle repo from
+  # GitHub, without the changes in our current Paddle repo that we
+  # want to build.
+  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
+  file(MAKE_DIRECTORY ${GOPATH})
+  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
+  file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
+  set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
+
+  add_custom_target(go_path)
+  add_custom_command(TARGET go_path
+    # Symlink Paddle directory into GOPATH
+    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}
+    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
+    # Automatically get all dependencies specified in the source code
+    # We can't run `go get -d ./...` for every target, because
+    # multiple `go get` commands cannot run concurrently, while make
+    # needs to be able to run with multiple jobs.
+    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
+  )
+
+  if (GLIDE_INSTALL)
+    if(EXISTS $ENV{GOPATH}/bin/glide)
+      set(GLIDE "$ENV{GOPATH}/bin/glide")
+    else()
+      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
+    endif()
+
+    # This command runs only when the file it depends on is missing
+    # or has changed, or when the output is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
+      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      COMMAND touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
+      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
+      )
+
+    # depends on the custom command which outputs
+    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
+  endif()
+
+endif(WITH_GOLANG)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 48f705818b..4823dc3e91 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -25,8 +25,10 @@ set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
 set(IGNORE_PATTERN
     .*ImportanceSampler.*
     .*cblas\\.h.*
-    .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*\\.pb\\.txt
+    .*MultiDataProvider.*
+    .*pb.*
+    .*pybind.h)
 
 # add_style_check_target
 #
@@ -40,27 +42,21 @@ macro(add_style_check_target TARGET_NAME)
     if(WITH_STYLE_CHECK)
         set(SOURCES_LIST ${ARGN})
         list(REMOVE_DUPLICATES SOURCES_LIST)
-        list(SORT SOURCES_LIST)
-
         foreach(filename ${SOURCES_LIST})
-            set(LINT ON)
             foreach(pattern ${IGNORE_PATTERN})
                 if(filename MATCHES ${pattern})
-                    message(STATUS "DROP LINT ${filename}")
-                    set(LINT OFF)
+                    list(REMOVE_ITEM SOURCES_LIST ${filename})
                 endif()
             endforeach()
-            if(LINT MATCHES ON)
-                get_filename_component(base_filename ${filename} NAME)
-                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
-                    PRE_BUILD
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}"
-                                "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
-                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-            endif()
         endforeach()
+
+        if(SOURCES_LIST)
+            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
+                        "--filter=${STYLE_FILTER}"
+                        ${SOURCES_LIST}
+                COMMENT "cpplint: Checking source code style"
+                WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
     endif()
 endmacro()
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 9724c16122..84219cfa55 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -20,6 +20,7 @@
 # The supported variables are listed belows:
 # 
 # ANDROID_STANDALONE_TOOLCHAIN
+# ANDROID_TOOLCHAIN
 # ANDROID_ABI
 # ANDROID_NATIVE_API_LEVEL
 # ANDROID_ARM_MODE
@@ -57,6 +58,10 @@ IF(NOT DEFINED CMAKE_SYSTEM_VERSION AND ANDROID_NATIVE_API_LEVEL)
     ENDIF()
 ENDIF()
 
+IF(NOT DEFINED ANDROID_TOOLCHAIN)
+    SET(ANDROID_TOOLCHAIN clang)
+ENDIF()
+
 IF(NOT DEFINED ANDROID_ABI)
     SET(ANDROID_ABI "armeabi-v7a")
 ENDIF()
@@ -82,6 +87,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             "${CMAKE_VERSION}), when cross-compiling for Android.")
 
     IF(ANDROID_STANDALONE_TOOLCHAIN)
+        # Use standalone toolchain
         SET(CMAKE_SYSROOT "${ANDROID_STANDALONE_TOOLCHAIN}/sysroot")
 
         IF(NOT CMAKE_SYSTEM_VERSION)
@@ -96,22 +102,44 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
         ENDIF()
 
         # Toolchain
-        SET(ANDROID_TOOLCHAIN "gcc")
         SET(ANDROID_TOOLCHAIN_ROOT ${ANDROID_STANDALONE_TOOLCHAIN})
-        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-            SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
-            IF(ANDROID_ABI STREQUAL "armeabi")
-                SET(CMAKE_SYSTEM_PROCESSOR armv5te)
-            ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
-                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
-            ENDIF()
+    ELSE(ANDROID_NDK)
+        # TODO: use android ndk
+    ENDIF()
+
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(ANDROID_TOOLCHAIN_NAME arm-linux-androideabi)
+        IF(ANDROID_ABI STREQUAL "armeabi")
+            SET(CMAKE_SYSTEM_PROCESSOR armv5te)
+            SET(ANDROID_CLANG_TRIPLE armv5te-none-linux-androideabi)
+        ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
+            SET(ANDROID_CLANG_TRIPLE armv7-none-linux-androideabi)
         ENDIF()
-        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+        SET(ANDROID_CLANG_TRIPLE aarch64-none-linux-android)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android ABI: ${ANDROID_ABI}.")
+    ENDIF()
+    SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
+
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
+        SET(ANDROID_C_COMPILER_NAME clang)
+        SET(ANDROID_CXX_COMPILER_NAME clang++)
+        SET(CMAKE_C_COMPILER_TARGET   ${ANDROID_CLANG_TRIPLE})
+        SET(CMAKE_CXX_COMPILER_TARGET ${ANDROID_CLANG_TRIPLE})
+    ELSEIF(ANDROID_TOOLCHAIN STREQUAL gcc)
+        SET(ANDROID_C_COMPILER_NAME gcc)
+        SET(ANDROID_CXX_COMPILER_NAME g++)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Invalid Android toolchain: ${ANDROID_TOOLCHAIN}")
     ENDIF()
 
     # C compiler
     IF(NOT CMAKE_C_COMPILER)
-        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}gcc")
+        SET(ANDROID_C_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_C_COMPILER_NAME}")
     ELSE()
         GET_FILENAME_COMPONENT(ANDROID_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
     ENDIF()
@@ -121,7 +149,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
 
     # CXX compiler
     IF(NOT CMAKE_CXX_COMPILER)
-        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}g++")
+        SET(ANDROID_CXX_COMPILER "${ANDROID_TOOLCHAIN_PREFIX}${ANDROID_CXX_COMPILER_NAME}")
     ELSE()
         GET_FILENAME_COMPONENT(ANDROID_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
     ENDIF()
@@ -133,7 +161,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
     SET(CMAKE_CXX_COMPILER ${ANDROID_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
 
     # Toolchain and ABI specific flags.
-    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections -finline-limit=64")
+    SET(ANDROID_COMPILER_FLAGS "-ffunction-sections -fdata-sections")
     SET(ANDROID_LINKER_FLAGS "-Wl,--gc-sections")
 
     IF(ANDROID_ABI STREQUAL "armeabi")
@@ -141,8 +169,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
              -march=armv5te
              -mtune=xscale
              -msoft-float)
-    ENDIF()
-    IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+    ELSEIF(ANDROID_ABI STREQUAL "armeabi-v7a")
         LIST(APPEND ANDROID_COMPILER_FLAGS
              -march=armv7-a
              -mfloat-abi=softfp)
@@ -152,6 +179,8 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             LIST(APPEND ANDROID_COMPILER_FLAGS -mfpu=vfpv3-d16)
         ENDIF()
         LIST(APPEND ANDROID_LINKER_FLAGS -Wl,--fix-cortex-a8)
+    ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
     ENDIF()
 
     IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
@@ -160,6 +189,18 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
         ELSE()
             LIST(APPEND ANDROID_COMPILER_FLAGS -mthumb)
         ENDIF()
+        IF(ANDROID_TOOLCHAIN STREQUAL clang)
+            # Disable integrated-as for better compatibility.
+            LIST(APPEND ANDROID_COMPILER_FLAGS -fno-integrated-as)
+        ENDIF()
+    ENDIF()
+
+    IF(ANDROID_TOOLCHAIN STREQUAL clang)
+        # CMake automatically forwards all compiler flags to the linker,
+        # and clang doesn't like having -Wa flags being used for linking.
+        # To prevent CMake from doing this would require meddling with
+        # the CMAKE_<LANG>_COMPILE_OBJECT rules, which would get quite messy.
+        LIST(APPEND ANDROID_LINKER_FLAGS -Qunused-arguments)
     ENDIF()
 
     STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
@@ -186,6 +227,10 @@ ELSE()
         SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
     ENDIF()
     SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
diff --git a/cmake/cross_compiling/ios.cmake b/cmake/cross_compiling/ios.cmake
new file mode 100644
index 0000000000..d3f5bf6852
--- /dev/null
+++ b/cmake/cross_compiling/ios.cmake
@@ -0,0 +1,347 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# This is a toolchain file for cross-compiling for iOS, and the
+# configuration largely refers to public toolchain file:
+#    https://raw.githubusercontent.com/leetal/ios-cmake/master/ios.toolchain.cmake
+# and
+#    https://github.com/cristeab/ios-cmake
+#
+# Supports options:
+# IOS_PLATFORM = OS (default) or SIMULATOR
+#   This decides if SDKS will be selected from the iPhoneOS.platform or iPhoneSimulator.platform folders
+#   OS - the default, used to build for iPhone and iPad physical devices, which have an arm arch.
+#   SIMULATOR - used to build for the Simulator platforms, which have an x86 arch.
+# IOS_ARCH
+#   The architectures to support, such as "arm64" or "armv7;arm64"
+# IOS_DEPLOYMENT_TARGET
+#   The minimum iOS deployment version, such as "7.0"
+# IOS_ENABLE_BITCODE = ON (default) or OFF
+# IOS_USE_VECLIB_FOR_BLAS = OFF (default) or ON
+# IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder
+#   By default this location is automatically chosen based on the IOS_PLATFORM value above.
+#   If set manually, it will override the default location and force the use of a particular Developer Platform
+# IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder
+#   By default this location is automatically chosen based on the IOS_DEVELOPER_ROOT value.
+#   In this case it will always be the most up-to-date SDK found in the IOS_DEVELOPER_ROOT path.
+#   If set manually, this will force the use of a specific SDK version
+
+# Macros:
+# set_xcode_property (TARGET XCODE_PROPERTY XCODE_VALUE)
+#  A convenience macro for setting Xcode-specific properties on targets
+#  example: set_xcode_property (myioslib IPHONEOS_DEPLOYMENT_TARGET "3.1")
+# find_host_package (PROGRAM ARGS)
+#  A macro used to find executable programs on the host system, not within the iOS environment.
+#  Thanks to the android-cmake project for providing the command
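+#
+# Example invocation (a sketch with illustrative values only; the project is
+# expected to set IOS=ON and the usual CMake variables elsewhere):
+#   cmake -DCMAKE_TOOLCHAIN_FILE=cmake/cross_compiling/ios.cmake \
+#         -DIOS_PLATFORM=OS -DIOS_ARCH="armv7;arm64" \
+#         -DIOS_DEPLOYMENT_TARGET=7.0 ..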
+
+if(NOT IOS)
+  return()
+endif()
+
+set(CMAKE_SYSTEM_NAME Darwin)
+
+# Get the Xcode version being used.
+execute_process(COMMAND xcodebuild -version
+                OUTPUT_VARIABLE XCODE_VERSION
+                RESULT_VARIABLE XCODE_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT ${XCODE_VERSION_RESULT})
+  string(REGEX MATCH "Xcode [0-9\\.]+" XCODE_VERSION "${XCODE_VERSION}")
+  string(REGEX REPLACE "Xcode ([0-9\\.]+)" "\\1" XCODE_VERSION "${XCODE_VERSION}")
+  message(STATUS "Building with Xcode version: ${XCODE_VERSION}")
+else()
+  message(FATAL_ERROR "Cannot execute xcodebuild, please check whether xcode is installed.")
+endif()
+
+# Required as of cmake 2.8.10
+set(CMAKE_OSX_DEPLOYMENT_TARGET "" CACHE STRING "Force unset of the deployment target for iOS" FORCE)
+
+# Setup iOS platform unless specified manually with IOS_PLATFORM
+if(NOT DEFINED IOS_PLATFORM)
+  set(IOS_PLATFORM "OS")
+endif()
+set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
+
+# Set the architecture for iOS
+if(NOT DEFINED IOS_ARCH)
+  if(IOS_PLATFORM STREQUAL "OS")
+    set(IOS_ARCH "armv7;armv7s;arm64")
+  elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
+    set(IOS_ARCH "i386;x86_64")
+  endif()
+endif()
+set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE STRING "Build architecture for iOS")
+
+# Specify minimum iOS deployment version
+if(NOT DEFINED IOS_DEPLOYMENT_TARGET)
+  set(IOS_DEPLOYMENT_TARGET "7.0")
+endif()
+set(IOS_DEPLOYMENT_TARGET ${IOS_DEPLOYMENT_TARGET} CACHE STRING "Minimum iOS version")
+
+# Whether to enable bitcode
+if(NOT DEFINED IOS_ENABLE_BITCODE)
+  set(IOS_ENABLE_BITCODE ON)
+endif()
+set(IOS_ENABLE_BITCODE ${IOS_ENABLE_BITCODE} CACHE BOOL "Whether to enable bitcode")
+
+if(NOT DEFINED IOS_USE_VECLIB_FOR_BLAS)
+  set(IOS_USE_VECLIB_FOR_BLAS OFF)
+endif()
+set(IOS_USE_VECLIB_FOR_BLAS ${IOS_USE_VECLIB_FOR_BLAS} CACHE BOOL "Whether to use veclib")
+
+# Check the platform selection and setup for developer root
+if(${IOS_PLATFORM} STREQUAL "OS")
+  set(IOS_PLATFORM_LOCATION "iPhoneOS.platform")
+  set(XCODE_IOS_PLATFORM iphoneos)
+
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphoneos")
+elseif(${IOS_PLATFORM} STREQUAL "SIMULATOR")
+  set(IOS_PLATFORM_LOCATION "iPhoneSimulator.platform")
+  set(XCODE_IOS_PLATFORM iphonesimulator)
+
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-iphonesimulator")
+elseif(${IOS_PLATFORM} STREQUAL "WATCHOS")
+  set(IOS_PLATFORM_LOCATION "WatchOS.platform")
+  set(XCODE_IOS_PLATFORM watchos)
+
+  # This causes the installers to properly locate the output libraries
+  set(CMAKE_XCODE_EFFECTIVE_PLATFORMS "-watchos")
+else()
+  message(FATAL_ERROR "Unsupported IOS_PLATFORM value selected. Please set to\n"
+          "\t OS, SIMULATOR, or WATCHOS.")
+endif()
+
+# Check iOS developer toolchain
+if(NOT DEFINED IOS_DEVELOPER_ROOT)
+  # Setup iOS developer location
+  execute_process(COMMAND xcode-select -print-path
+                  OUTPUT_VARIABLE XCODE_DEVELOPER_DIR
+                  RESULT_VARIABLE XCODE_DEVELOPER_DIR_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  # Xcode 4.3 changed the installation location, choose the most recent one available
+  if(${XCODE_VERSION} VERSION_LESS "4.3.0")
+    set(IOS_DEVELOPER_ROOT "/Developer/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  else()
+    set(IOS_DEVELOPER_ROOT "${XCODE_DEVELOPER_DIR}/Platforms/${IOS_PLATFORM_LOCATION}/Developer")
+  endif()
+endif()
+if(EXISTS ${IOS_DEVELOPER_ROOT})
+  set(IOS_DEVELOPER_ROOT ${IOS_DEVELOPER_ROOT} CACHE PATH "Location of iOS Platform")
+else()
+  message(FATAL_ERROR "Invalid IOS_DEVELOPER_ROOT: ${IOS_DEVELOPER_ROOT} does not exist.")
+endif()
+
+# Check iOS SDK
+if(NOT DEFINED IOS_SDK_ROOT)
+  # Find and use the most recent iOS sdk
+  file(GLOB IOS_SDK_LISTS "${IOS_DEVELOPER_ROOT}/SDKs/*")
+  if(IOS_SDK_LISTS)
+    list(SORT IOS_SDK_LISTS)
+    list(REVERSE IOS_SDK_LISTS)
+    list(GET IOS_SDK_LISTS 0 IOS_SDK_ROOT)
+  else(IOS_SDK_LISTS)
+    message(FATAL_ERROR "No iOS SDK's found in default search path ${IOS_DEVELOPER_ROOT}."
+            " Please manually set IOS_SDK_ROOT or install the iOS SDK.")
+  endif(IOS_SDK_LISTS)
+endif()
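+# NOTE: list(SORT)/list(REVERSE) pick the lexicographically greatest SDK
+# directory name, which is intended to be the most recent installed SDK.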
+if(EXISTS ${IOS_SDK_ROOT})
+  set(IOS_SDK_ROOT ${IOS_SDK_ROOT} CACHE PATH "Location of the selected iOS SDK")
+  message(STATUS "iOS toolchain: ${IOS_SDK_ROOT}")
+else()
+  message(FATAL_ERROR "Invalid IOS_SDK_ROOT: ${IOS_SDK_ROOT} does not exist.")
+endif()
+
+# Set the sysroot default to the most recent SDK
+set(CMAKE_OSX_SYSROOT ${IOS_SDK_ROOT} CACHE PATH "Sysroot used for iOS support")
+
+# Get version of iOS SDK
+execute_process(COMMAND xcodebuild -sdk ${CMAKE_OSX_SYSROOT} -version SDKVersion
+                OUTPUT_VARIABLE IOS_SDK_VERSION
+                RESULT_VARIABLE IOS_SDK_VERSION_RESULT
+                ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(${IOS_SDK_VERSION_RESULT})
+  string(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" IOS_SDK_VERSION "${IOS_SDK_ROOT}")
+endif()
+if(NOT IOS_SDK_VERSION)
+  message(WARNING "Cannot get SDK's version.")
+  set(IOS_SDK_VERSION 1)
+endif()
+set(CMAKE_SYSTEM_VERSION ${IOS_SDK_VERSION})
+
+# Find the C & C++ compilers for the specified SDK.
+if(NOT CMAKE_C_COMPILER)
+  # Default to clang
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang
+                  OUTPUT_VARIABLE IOS_C_COMPILER
+                  RESULT_VARIABLE IOS_C_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_C_COMPILER_RESULT})
+    get_filename_component(IOS_C_COMPILER clang PROGRAM)
+  endif()
+else(NOT CMAKE_C_COMPILER)
+  # The user may set it on the cmake command line
+  get_filename_component(IOS_C_COMPILER ${CMAKE_C_COMPILER} PROGRAM)
+endif(NOT CMAKE_C_COMPILER)
+if(NOT EXISTS ${IOS_C_COMPILER})
+  message(FATAL_ERROR "Cannot find C compiler: ${IOS_C_COMPILER}")
+endif()
+
+if(NOT CMAKE_CXX_COMPILER)
+  # Default to clang++
+  execute_process(COMMAND xcrun -sdk ${CMAKE_OSX_SYSROOT} -find clang++
+                  OUTPUT_VARIABLE IOS_CXX_COMPILER
+                  RESULT_VARIABLE IOS_CXX_COMPILER_RESULT
+                  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${IOS_CXX_COMPILER_RESULT})
+    get_filename_component(IOS_CXX_COMPILER clang++ PROGRAM)
+  endif()
+else(NOT CMAKE_CXX_COMPILER)
+  # The user may set it on the cmake command line
+  get_filename_component(IOS_CXX_COMPILER ${CMAKE_CXX_COMPILER} PROGRAM)
+endif(NOT CMAKE_CXX_COMPILER)
+if(NOT EXISTS ${IOS_CXX_COMPILER})
+  message(FATAL_ERROR "Cannot find CXX compiler: ${IOS_CXX_COMPILER}")
+endif()
+
+set(CMAKE_C_COMPILER ${IOS_C_COMPILER} CACHE PATH "C compiler" FORCE)
+set(CMAKE_CXX_COMPILER ${IOS_CXX_COMPILER} CACHE PATH "CXX compiler" FORCE)
+
+set(CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG "-compatibility_version ")
+set(CMAKE_C_OSX_CURRENT_VERSION_FLAG "-current_version ")
+set(CMAKE_CXX_OSX_COMPATIBILITY_VERSION_FLAG "${CMAKE_C_OSX_COMPATIBILITY_VERSION_FLAG}")
+set(CMAKE_CXX_OSX_CURRENT_VERSION_FLAG "${CMAKE_C_OSX_CURRENT_VERSION_FLAG}")
+
+# Set iOS specific C/C++ flags
+if(IOS_PLATFORM STREQUAL "OS")
+  if(XCODE_VERSION VERSION_LESS "7.0")
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-version-min=${IOS_DEPLOYMENT_TARGET}")
+  else()
+    # Xcode 7.0+ uses flags we can build directly from XCODE_IOS_PLATFORM.
+    set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}")
+  endif()
+else()
+  set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-mios-simulator-version-min=${IOS_DEPLOYMENT_TARGET}")
+endif()
+
+if(IOS_ENABLE_BITCODE)
+  set(XCODE_IOS_BITCODE_FLAGS "${IOS_COMPILER_FLAGS} -fembed-bitcode")
+else()
+  set(XCODE_IOS_BITCODE_FLAGS "")
+endif()
+
+set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_FLAGS}")
+
+# Hidden visibility is required for C++ on iOS
+set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+
+set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
+
+if(IOS_USE_VECLIB_FOR_BLAS)
+  # Find vecLib for iOS
+  set(VECLIB_SEARCH_DIRS
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks
+      ${IOS_SDK_ROOT}/System/Library/Frameworks/Accelerate.framework/Frameworks
+      )
+  find_path(VECLIB_INC_DIR vecLib.h PATHS ${VECLIB_SEARCH_DIRS}/vecLib.framework/Headers)
+
+  include(FindPackageHandleStandardArgs)
+  find_package_handle_standard_args(vecLib DEFAULT_MSG VECLIB_INC_DIR)
+
+  if(VECLIB_FOUND)
+    if(VECLIB_INC_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
+      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework vecLib")
+      message(STATUS "Found standalone vecLib.framework")
+    else()
+      set(IOS_LINK_FLAGS ${IOS_LINK_FLAGS} -lcblas "-framework Accelerate")
+      message(STATUS "Found vecLib as part of Accelerate.framework")
+    endif()
+
+  endif()
+endif()
+
+set(CMAKE_C_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_C_LINK_FLAGS}")
+set(CMAKE_CXX_LINK_FLAGS "${IOS_LINK_FLAGS} ${CMAKE_CXX_LINK_FLAGS}")
+
+set(CMAKE_PLATFORM_HAS_INSTALLNAME 1)
+if(NOT IOS_ENABLE_BITCODE)
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib -headerpad_max_install_names")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle -headerpad_max_install_names")
+else()
+  set(CMAKE_SHARED_LIBRARY_CREATE_C_FLAGS "-dynamiclib")
+  set(CMAKE_SHARED_MODULE_CREATE_C_FLAGS "-bundle")
+endif()
+set(CMAKE_SHARED_MODULE_LOADER_C_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_SHARED_MODULE_LOADER_CXX_FLAG "-Wl,-bundle_loader,")
+set(CMAKE_FIND_LIBRARY_SUFFIXES ".dylib" ".so" ".a")
+
+# hack: if a new cmake (which uses CMAKE_INSTALL_NAME_TOOL) runs on an old build tree
+# (where install_name_tool was hardcoded) and where CMAKE_INSTALL_NAME_TOOL isn't in the cache
+# and still cmake didn't fail in CMakeFindBinUtils.cmake (because it isn't rerun)
+# hardcode CMAKE_INSTALL_NAME_TOOL here to install_name_tool, so it behaves as it did before, Alex
+if(NOT DEFINED CMAKE_INSTALL_NAME_TOOL)
+  find_program(CMAKE_INSTALL_NAME_TOOL install_name_tool)
+endif()
+
+# Set the find root to the iOS developer roots and to user defined paths
+set(CMAKE_FIND_ROOT_PATH ${IOS_DEVELOPER_ROOT} ${IOS_SDK_ROOT} ${CMAKE_PREFIX_PATH}
+    CACHE STRING "iOS find search path root")
+
+# default to searching for frameworks first
+set(CMAKE_FIND_FRAMEWORK FIRST)
+
+# set up the default search directories for frameworks
+set(CMAKE_SYSTEM_FRAMEWORK_PATH
+    ${IOS_SDK_ROOT}/System/Library/Frameworks
+    ${IOS_SDK_ROOT}/System/Library/PrivateFrameworks
+    ${IOS_SDK_ROOT}/Developer/Library/Frameworks
+    )
+
+# only search the iOS sdks, not the remainder of the host filesystem
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+message(STATUS "iOS: Targeting iOS '${CMAKE_SYSTEM_VERSION}', "
+        "building for '${IOS_PLATFORM}' platform, with architecture '${CMAKE_OSX_ARCHITECTURES}'")
+message(STATUS "System CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}")
+message(STATUS "System CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}")
+
+# Used in ExternalProject command
+string(REPLACE ";" "\\$<SEMICOLON>" EXTERNAL_IOS_ARCHITECTURES "${CMAKE_OSX_ARCHITECTURES}")
+set(EXTERNAL_OPTIONAL_ARGS
+    -DCMAKE_OSX_SYSROOT=${CMAKE_OSX_SYSROOT}
+    -DCMAKE_OSX_ARCHITECTURES=${EXTERNAL_IOS_ARCHITECTURES})
+
+# This little macro lets you set any Xcode-specific property
+macro(set_xcode_property TARGET XCODE_PROPERTY XCODE_VALUE)
+  set_property (TARGET ${TARGET} PROPERTY XCODE_ATTRIBUTE_${XCODE_PROPERTY} ${XCODE_VALUE})
+endmacro(set_xcode_property)
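+# e.g. (hypothetical target name):
+#   set_xcode_property(paddle_capi IPHONEOS_DEPLOYMENT_TARGET "${IOS_DEPLOYMENT_TARGET}")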
+
+# This macro lets you find executable programs on the host system
+macro(find_host_package)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY NEVER)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE NEVER)
+  set(IOS FALSE)
+
+  find_package(${ARGN})
+
+  set(IOS TRUE)
+  set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+  set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+endmacro(find_host_package)
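+# e.g. find_host_package(Git) would locate the host's git installation
+# instead of searching inside the iOS SDK sysroot (illustrative usage).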
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
new file mode 100644
index 0000000000..6bea7cf302
--- /dev/null
+++ b/cmake/cuda.cmake
@@ -0,0 +1,188 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
+set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
+set(paddle_known_gpu_archs7 "30 35 50 52")
+set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
+
+######################################################################################
+# A function for automatic detection of installed GPUs (if autodetection is enabled)
+# Usage:
+#   detect_installed_gpus(out_variable)
+function(detect_installed_gpus out_variable)
+  if(NOT CUDA_gpu_detect_output)
+    set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
+
+    file(WRITE ${cufile} ""
+      "#include <cstdio>\n"
+      "int main() {\n"
+      "  int count = 0;\n"
+      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
+      "  if (count == 0) return -1;\n"
+      "  for (int device = 0; device < count; ++device) {\n"
+      "    cudaDeviceProp prop;\n"
+      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
+      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
+      "  }\n"
+      "  return 0;\n"
+      "}\n")
+
+    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
+                    "--run" "${cufile}"
+                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
+                    RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
+                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    if(nvcc_res EQUAL 0)
+      # only keep the last line of nvcc_out
+      STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
+      STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
+      list(GET nvcc_out -1 nvcc_out)
+      string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
+      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
+    endif()
+  endif()
+
+  if(NOT CUDA_gpu_detect_output)
+    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
+    set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+  else()
+    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
+  endif()
+endfunction()
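+# For illustration: on a hypothetical machine with one 6.1 (Pascal) and one
+# 3.5 (Kepler) device, the probe program prints "6.1 3.5 " and that string
+# is cached as CUDA_gpu_detect_output.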
+
+
+########################################################################
+# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
+# Usage:
+#   select_nvcc_arch_flags(out_variable)
+function(select_nvcc_arch_flags out_variable)
+  # List of arch names
+  # NOTE: "Volta" is included so the arch-70 branch below is reachable.
+  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
+  set(archs_name_default "All")
+  if(NOT CMAKE_CROSSCOMPILING)
+    list(APPEND archs_names "Auto")
+  endif()
+
+  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in cmake-gui)
+  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
+  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
+  mark_as_advanced(CUDA_ARCH_NAME)
+
+  # verify CUDA_ARCH_NAME value
+  if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
+    string(REPLACE ";" ", " archs_names "${archs_names}")
+    message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
+    set(CUDA_ARCH_PTX "50"                     CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
+    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
+  else()
+    unset(CUDA_ARCH_BIN CACHE)
+    unset(CUDA_ARCH_PTX CACHE)
+  endif()
+
+  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
+    set(cuda_arch_bin "30 35")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
+    set(cuda_arch_bin "50")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
+    set(cuda_arch_bin "60 61")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
+    set(cuda_arch_bin "70")
+  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
+    set(cuda_arch_bin ${paddle_known_gpu_archs})
+  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
+    detect_installed_gpus(cuda_arch_bin)
+  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
+    set(cuda_arch_bin ${CUDA_ARCH_BIN})
+  endif()
+
+  # remove dots and convert to lists
+  string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
+  string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
+  string(REGEX MATCHALL "[0-9]+"   cuda_arch_ptx "${cuda_arch_ptx}")
+  list(REMOVE_DUPLICATES cuda_arch_bin)
+  list(REMOVE_DUPLICATES cuda_arch_ptx)
+
+  set(nvcc_flags "")
+  set(nvcc_archs_readable "")
+
+  # Tell NVCC to add binaries for the specified GPUs
+  foreach(arch ${cuda_arch_bin})
+    if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
+      # User explicitly specified PTX for the concrete BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
+      list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
+    else()
+      # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
+      list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
+      list(APPEND nvcc_archs_readable sm_${arch})
+    endif()
+  endforeach()
+
+  # Tell NVCC to add PTX intermediate code for the specified architectures
+  foreach(arch ${cuda_arch_ptx})
+    list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
+    list(APPEND nvcc_archs_readable compute_${arch})
+  endforeach()
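+  # Worked example: cuda_arch_bin "30 52(50)" and cuda_arch_ptx "50" produce
+  #   -gencode arch=compute_30,code=sm_30      (PTX=BIN assumed)
+  #   -gencode arch=compute_50,code=sm_52      (explicit PTX 50 for BIN 52)
+  #   -gencode arch=compute_50,code=compute_50 (PTX only, JIT-compiled at runtime)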
+
+  string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
+  set(${out_variable}          ${nvcc_flags}          PARENT_SCOPE)
+  set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
+endfunction()
+
+message(STATUS "CUDA detected: " ${CUDA_VERSION})
+if (${CUDA_VERSION} VERSION_LESS "7.0")
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
+elseif (${CUDA_VERSION} VERSION_LESS "8.0") # CUDA 7.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+elseif (${CUDA_VERSION} VERSION_LESS "9.0") # CUDA 8.x
+  set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
+  list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
+  list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
+  # CUDA 8 may complain that sm_20 is no longer supported. Suppress the
+  # warning for now.
+  list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
+endif()
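+# (The _MWAITXINTRIN_H_INCLUDED / __STRICT_ANSI__ definitions above are a
+# common workaround for nvcc's incompatibility with newer glibc/gcc headers.)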
+
+include_directories(${CUDA_INCLUDE_DIRS})
+list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+if(NOT WITH_DSO)
+    list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
+endif(NOT WITH_DSO)
+
+# setting nvcc arch flags
+select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
+list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
+message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
+
+# Set C++11 support; host C++ flags are not propagated to nvcc, so the
+# needed flags are added explicitly below.
+set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+# Release/Debug flags (e.g. -O3, -g, -DNDEBUG) are set by cmake per build
+# type and appended below, so don't set them here.
+list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
+list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
+list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
+# Set --expt-relaxed-constexpr to suppress Eigen warnings
+list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
+
+if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
+elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
+    list(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
+endif()
+
+mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
+mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index 69f40df516..2c84061ff5 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -2,7 +2,7 @@ if(NOT WITH_GPU)
     return()
 endif()
 
-set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
+set(CUDNN_ROOT "/usr" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
     PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
     $ENV{CUDNN_ROOT} $ENV{CUDNN_ROOT}/include ${CUDA_TOOLKIT_INCLUDE}
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
index 62eea42692..85cce80b70 100644
--- a/cmake/external/any.cmake
+++ b/cmake/external/any.cmake
@@ -2,13 +2,13 @@ INCLUDE(ExternalProject)
 
 SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
 
-INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
+INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
 
 ExternalProject_Add(
-    linb_any
+    extern_lib_any
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
-    GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
+    GIT_REPOSITORY  "https://github.com/PaddlePaddle/any.git"
+    GIT_TAG         "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
     PREFIX          ${ANY_SOURCE_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
@@ -17,5 +17,15 @@ ExternalProject_Add(
     TEST_COMMAND      ""
 )
 
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(lib_any STATIC ${dummyfile})
+else()
+    add_library(lib_any INTERFACE)
+endif()
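+# NOTE: add_dependencies() cannot be used on INTERFACE libraries before
+# CMake 3.3, which is why older CMake gets a dummy static library instead.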
+
+add_dependencies(lib_any extern_lib_any)
+
 add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
-LIST(APPEND external_project_dependencies linb_any)
\ No newline at end of file
+LIST(APPEND external_project_dependencies lib_any)
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
new file mode 100644
index 0000000000..c70d83b3f4
--- /dev/null
+++ b/cmake/external/boost.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+include(ExternalProject)
+
+set(BOOST_PROJECT       "extern_boost")
+set(BOOST_VER           "1.41.0")
+set(BOOST_TAR           "boost_1_41_0")
+set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOOST_VER}/${BOOST_TAR}.tar.gz")
+set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
+set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
+set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+
+include_directories(${BOOST_INCLUDE_DIR})
+
+ExternalProject_Add(
+    ${BOOST_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DOWNLOAD_DIR          ${BOOST_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${BOOST_URL} -c -q -O ${BOOST_TAR}.tar.gz
+                          && tar zxf ${BOOST_TAR}.tar.gz
+    DOWNLOAD_NO_PROGRESS  1
+    PREFIX                ${BOOST_SOURCES_DIR}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ""
+    INSTALL_COMMAND       ""
+    UPDATE_COMMAND        ""
+)
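+# Boost is consumed header-only here: configure/build/install are all no-ops
+# and only the extracted headers under ${BOOST_INCLUDE_DIR} are used.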
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";")
+    add_library(boost STATIC ${dummyfile})
+else()
+    add_library(boost INTERFACE)
+endif()
+
+add_dependencies(boost ${BOOST_PROJECT})
+list(APPEND external_project_dependencies boost)
+set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake
new file mode 100644
index 0000000000..aec51410b3
--- /dev/null
+++ b/cmake/external/cares.cmake
@@ -0,0 +1,45 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: c-ares is needed when linking with grpc.
+
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make -j8
+    INSTALL_COMMAND make install
+)
+
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
new file mode 100644
index 0000000000..d49c8d6011
--- /dev/null
+++ b/cmake/external/eigen.cmake
@@ -0,0 +1,36 @@
+INCLUDE(ExternalProject)
+
+SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
+SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
+INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
+
+ExternalProject_Add(
+    extern_eigen3
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
+    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
+    PREFIX          ${EIGEN_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
+    file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
+    add_library(eigen3 STATIC ${dummyfile})
+else()
+    add_library(eigen3 INTERFACE)
+endif()
+
+add_dependencies(eigen3 extern_eigen3)
+
+LIST(APPEND external_project_dependencies eigen3)
+
+IF(NOT WITH_C_API AND WITH_FLUID)
+    INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
+    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
+    INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
+ENDIF()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index a0d0a892c4..6094630454 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -18,9 +18,9 @@ SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
 SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
-    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ELSE(WIN32)
-    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+  set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
 ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
@@ -29,19 +29,21 @@ ExternalProject_Add(
     extern_gflags
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    GIT_TAG         77592648e3f3be87d6c7123eb81cbad75f9aef5a
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DBUILD_TESTING=OFF
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
@@ -49,3 +51,12 @@ SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
 ADD_DEPENDENCIES(gflags extern_gflags)
 
 LIST(APPEND external_project_dependencies gflags)
+
+IF(WITH_C_API OR WITH_FLUID)
+  INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
+  IF(ANDROID)
+    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index b70e94a170..382fbda3b5 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -19,37 +19,60 @@ SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
 SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
 
 IF(WIN32)
-    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
 ELSE(WIN32)
-    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+  SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
 ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 
+IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+  # Using the unofficial glog for Android API < 21
+  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
+  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
+ELSE()
+  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
+  SET(GLOG_TAG "v0.3.5")
+ENDIF()
+
 ExternalProject_Add(
     extern_glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
-    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_REPOSITORY  ${GLOG_REPOSITORY}
+    GIT_TAG         ${GLOG_TAG}
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=ON
-    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
-    CMAKE_ARGS      -DBUILD_TESTING=OFF
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+                    -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DWITH_GFLAGS=ON
+                    -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
+                    -DBUILD_TESTING=OFF
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog)
+ADD_DEPENDENCIES(glog extern_glog gflags)
+LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
+
+IF(WITH_C_API OR WITH_FLUID)
+  INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
+  IF(ANDROID)
+    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake
new file mode 100644
index 0000000000..79b2449fe6
--- /dev/null
+++ b/cmake/external/grpc.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+
+include (ExternalProject)
+
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+ENDIF()
+
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.8.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise compilation fails on MacOS.
+    # It seems this cannot be configured through the make command line,
+    # so dry-run make, strip `-Werror` with sed, and pipe the result to a shell.
+    BUILD_COMMAND  ${BUILD_CMD}
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+
+# FIXME(typhoonzero): hack to get the static lib paths; try a better way, like merging them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 77e06e983e..5a4aa7a5b7 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -34,26 +34,33 @@ IF(WITH_TESTING)
             "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
     ENDIF(WIN32)
 
+    IF(WITH_MKLML)
+        # wait for the mklml download to complete
+        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
+    ENDIF()
+
     ExternalProject_Add(
         extern_gtest
         ${EXTERNAL_PROJECT_LOG_ARGS}
+        DEPENDS         ${GTEST_DEPENDS}
         GIT_REPOSITORY  "https://github.com/google/googletest.git"
         GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
-        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-        CMAKE_ARGS      -DBUILD_GMOCK=ON
-        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
-        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
-        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
+                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                        -DBUILD_GMOCK=ON
+                        -Dgtest_disable_pthreads=ON
+                        -Dgtest_force_shared_crt=ON
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                        ${EXTERNAL_OPTIONAL_ARGS}
         CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=Release
+                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
     )
 
     ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000..89fc34796a
--- /dev/null
+++ b/cmake/external/mkldnn.cmake
@@ -0,0 +1,92 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLDNN})
+  return()
+ENDIF(NOT ${WITH_MKLDNN})
+
+INCLUDE(ExternalProject)
+
+SET(MKLDNN_PROJECT        "extern_mkldnn")
+SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
+SET(MKLDNN_INSTALL_DIR    ${THIRD_PARTY_PATH}/install/mkldnn)
+SET(MKLDNN_INC_DIR        "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with MKLDNN in Paddle yet. "
+        "Forcing WITH_MKLDNN=OFF.")
+    SET(WITH_MKLDNN OFF CACHE STRING "Disable MKLDNN in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+SET(MKLDNN_LIB "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+
+INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
+
+IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
+    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
+ENDIF()
+
+SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
+SET(MKLDNN_CXXFLAG "${CMAKE_CXX_FLAGS} -Wno-error=strict-overflow")
+ExternalProject_Add(
+    ${MKLDNN_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ${MKLDNN_DEPENDS}
+    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_TAG             "v0.11"
+    PREFIX              ${MKLDNN_SOURCES_DIR}
+    UPDATE_COMMAND      ""
+    CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
+    CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
+    CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
+    CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
+)
+
+ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
+ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_WITH_MKLDNN)
+LIST(APPEND external_project_dependencies shared_mkldnn)
+
+# generate a static dummy target to track mkldnn dependencies
+# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(mkldnn STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+
+# Copy the real libmkldnn.so.0 to the install dir so that it can be
+# shipped directly in the wheel or the C-API package.
+SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+    COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+    DEPENDS mkldnn)
+ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+
+IF(WITH_C_API)
+  INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
+ENDIF()
+
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
new file mode 100644
index 0000000000..15a07ea3da
--- /dev/null
+++ b/cmake/external/mklml.cmake
@@ -0,0 +1,72 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLML})
+  return()
+ENDIF(NOT ${WITH_MKLML})
+
+IF(WIN32 OR APPLE)
+    MESSAGE(WARNING
+        "Windows or Mac is not supported with MKLML in Paddle yet. "
+        "Forcing WITH_MKLML=OFF.")
+    SET(WITH_MKLML OFF CACHE STRING "Disable MKLML package in Windows and MacOS" FORCE)
+    return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(MKLML_PROJECT       "extern_mklml")
+SET(MKLML_VER           "mklml_lnx_2018.0.1.20171007")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.11/${MKLML_VER}.tgz")
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
+SET(MKLML_DST_DIR       "mklml")
+SET(MKLML_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
+SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
+SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
+SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
+SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+
+FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
+  "PROJECT(MKLML)\n"
+  "cmake_minimum_required(VERSION 3.0)\n"
+  "install(DIRECTORY ${MKLML_VER}\n"
+  "        DESTINATION ${MKLML_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${MKLML_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${MKLML_SOURCE_DIR}
+    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate ${MKLML_URL} -c -q -O ${MKLML_VER}.tgz 
+                          && tar zxf ${MKLML_VER}.tgz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
+ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+LIST(APPEND external_project_dependencies mklml)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
new file mode 100644
index 0000000000..fc43766efa
--- /dev/null
+++ b/cmake/external/nccl.cmake
@@ -0,0 +1,67 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_GPU)
+  return()
+endif()
+
+include(ExternalProject)
+
+set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+
+include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
+
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+  # Note: CUDA 8.0 is needed to build nccl.
+  # If CUDA is not installed in a system directory, set CUDA_HOME to your CUDA root.
+  set(NCCL_BUILD_COMMAND "make -j 8")
+  set(NCCL_INSTALL_COMMAND  "make install PREFIX=${NCCL_INSTALL_DIR}")
+endif()
+
+ExternalProject_Add(
+    extern_nccl
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
+    GIT_TAG         "v1.3.4-1"
+    PREFIX          "${NCCL_SOURCE_DIR}"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
+    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
+    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
+    TEST_COMMAND      ""
+)
+
+if(WITH_DSO)
+  if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  add_library(nccl STATIC IMPORTED GLOBAL)
+  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
+               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
+endif()
+
+add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/nnpack.cmake b/cmake/external/nnpack.cmake
new file mode 100644
index 0000000000..d42bcb0f32
--- /dev/null
+++ b/cmake/external/nnpack.cmake
@@ -0,0 +1,30 @@
+# Find the NNPACK library
+#  NNPACK_ROOT - where to find NNPACK include and library.
+#
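+# Example (hypothetical install prefix):
+#   cmake -DNNPACK_ROOT=/opt/nnpack ..
+# NNPACK_ROOT may also be supplied through the NNPACK_ROOT environment
+# variable, which is read below.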
+
+set(NNPACK_FOUND OFF)
+set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder containing NNPACK")
+find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
+find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
+find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
+
+if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
+  set(NNPACK_FOUND ON)
+  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
+else()
+  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
+endif()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 2341e3785b..4012a164be 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -1,17 +1,21 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(USE_EIGEN_FOR_BLAS)
+    return()
+ENDIF(USE_EIGEN_FOR_BLAS)
+
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
@@ -21,30 +25,49 @@ IF(NOT ${CBLAS_FOUND})
     SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
     SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
 
-    SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}"
+    SET(CBLAS_LIBRARIES
+        "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)
 
-    SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs)
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+    SET(OPENBLAS_COMMIT "v0.2.20")
 
     IF(CMAKE_CROSSCOMPILING)
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
+        GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
+        SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
         IF(ANDROID)
-            # arm_soft_fp_abi branch of OpenBLAS to support softfp
-            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
-            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+                # use softfp
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
+            ENDIF()
+        ELSEIF(IOS)
+            IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
+                SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
+                SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+            ELSE()
+                MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                       "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
+            ENDIF()
         ELSEIF(RPI)
             # use hardfp
-            SET(OPENBLAS_COMMIT "v0.2.19")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
         ENDIF()
     ELSE()
-        SET(OPENBLAS_COMMIT "v0.2.19")
+        IF(APPLE)
+            SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
+        ENDIF()
         SET(OPTIONAL_ARGS "")
         IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
             SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
         ENDIF()
     ENDIF()
 
+    SET(COMMON_ARGS CC=${OPENBLAS_CC} NO_SHARED=1 NO_LAPACK=1 libs)
+
     ExternalProject_Add(
         extern_openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
@@ -58,14 +81,48 @@ IF(NOT ${CBLAS_FOUND})
         UPDATE_COMMAND      ""
         CONFIGURE_COMMAND   ""
     )
+    SET(CBLAS_PROVIDER openblas)
+    IF(WITH_C_API)
+        INSTALL(DIRECTORY ${CBLAS_INC_DIR} DESTINATION third_party/openblas)
+        # Because libopenblas.a is a symbolic link to another library, we need to
+        # install the whole directory.
+        IF(ANDROID)
+            SET(TMP_INSTALL_DIR third_party/openblas/lib/${ANDROID_ABI})
+        ELSE()
+            SET(TMP_INSTALL_DIR third_party/openblas/lib)
+        ENDIF()
+        INSTALL(CODE "execute_process(
+            COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
+                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
+            )"
+        )
+        INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
+                \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
+            )"
+        )
+        INSTALL(CODE "execute_process(
+            COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
+                    ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
+            )"
+        )
+    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
 
 MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 
-ADD_LIBRARY(cblas STATIC IMPORTED)
-SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
+# FIXME(gangliao): generate cblas target to track all high performance
+# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
+FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
+ADD_LIBRARY(cblas STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
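+# With the dummy target above, a downstream rule such as the (hypothetical)
+#   cc_library(math_function SRCS math_function.cc DEPS cblas)
+# transitively links whichever BLAS provider CBLAS_LIBRARIES resolved to.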
+
 IF(NOT ${CBLAS_FOUND})
     ADD_DEPENDENCIES(cblas extern_openblas)
     LIST(APPEND external_project_dependencies cblas)
+ELSE()
+    IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+        ADD_DEPENDENCIES(cblas mklml)
+    ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 7340394b1e..365a370a9c 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -13,12 +13,122 @@
 # limitations under the License.
 
 INCLUDE(ExternalProject)
+# Always invoke `FIND_PACKAGE(Protobuf)` to import the function protobuf_generate_cpp.
+FIND_PACKAGE(Protobuf QUIET)
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
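+# FIND_PACKAGE(Protobuf) above may have cached results from a system
+# protobuf; clear both the CACHE entries and the normal variables so the
+# logic below can pick the protobuf to use deliberately.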
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
+
+if(NOT COMMAND protobuf_generate_python)  # before CMake 3.4, protobuf_generate_python is not defined.
+    function(protobuf_generate_python SRCS)
+        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+        if(NOT ARGN)
+            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+            return()
+        endif()
+
+        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            # Create an include path for each file specified
+            foreach(FIL ${ARGN})
+                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        else()
+            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
+
+        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+        endif()
+
+        if(DEFINED Protobuf_IMPORT_DIRS)
+            foreach(DIR ${Protobuf_IMPORT_DIRS})
+                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        endif()
+
+        set(${SRCS})
+        foreach(FIL ${ARGN})
+            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+            get_filename_component(FIL_WE ${FIL} NAME_WE)
+            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+                if(FIL_DIR)
+                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+                endif()
+            endif()
+
+            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+            add_custom_command(
+                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
+                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                    VERBATIM )
+        endforeach()
+
+        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+    endfunction()
+endif()
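+# A usage sketch (proto file name is hypothetical):
+#   protobuf_generate_python(PROTO_PY example.proto)
+# fills PROTO_PY with the path of the generated example_pb2.py.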
 
+# Print and set the protobuf library information, then
+# finish this CMake process and return from this file.
 macro(PROMPT_PROTOBUF_LIB)
+    SET(protobuf_DEPS ${ARGN})
+
     MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}")
     MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}")
     MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}")
     INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+    # Assuming that all the protobuf libraries are of the same type.
+    IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$")
+        SET(protobuf_LIBTYPE STATIC)
+    ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$")
+        SET(protobuf_LIBTYPE SHARED)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}")
+    ENDIF()
+
+    ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY})
+
+    ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY})
+
+    ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY})
+
+    ADD_EXECUTABLE(protoc IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE})
+    # FindProtobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`;
+    # set it to make `protobuf_generate_cpp` happy.
+    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
+    FOREACH(dep ${protobuf_DEPS})
+        ADD_DEPENDENCIES(protobuf ${dep})
+        ADD_DEPENDENCIES(protobuf_lite ${dep})
+        ADD_DEPENDENCIES(libprotoc ${dep})
+        ADD_DEPENDENCIES(protoc ${dep})
+    ENDFOREACH()
+
+    LIST(APPEND external_project_dependencies protobuf)
     RETURN()
 endmacro()
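+# e.g. PROMPT_PROTOBUF_LIB(extern_protobuf); any arguments are added as
+# dependencies of the imported protobuf/protoc targets.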
 macro(SET_PROTOBUF_VERSION)
@@ -28,11 +138,11 @@ endmacro()
 
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
     if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
         message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
         SET_PROTOBUF_VERSION()
@@ -43,22 +153,23 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
 endif()
 
 FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
-    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME})
-    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME})
+    STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}")
+    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME})
+    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME})
 
     SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
     SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE)
     SET(${TARGET_NAME}_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}"
          PARENT_SCOPE)
     SET(${TARGET_NAME}_PROTOC_EXECUTABLE
-        "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}"
+        "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}"
          PARENT_SCOPE)
 
     SET(OPTIONAL_CACHE_ARGS "")
@@ -72,46 +183,54 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             "-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}"
             "-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}"
             "-Dprotobuf_WITH_ZLIB=ON"
-            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}")
+            "-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}"
+            ${EXTERNAL_OPTIONAL_ARGS})
         SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
     ENDIF()
 
+    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+    IF(MOBILE_INFERENCE)
+        # The reason why the official version is not used is described in
+        # https://github.com/PaddlePaddle/Paddle/issues/6114
+        SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
+        SET(PROTOBUF_TAG "v3.2.0")
+        IF(NOT BUILD_FOR_HOST)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
+        ENDIF()
+    ENDIF()
+
     ExternalProject_Add(
         ${TARGET_NAME}
         ${EXTERNAL_PROJECT_LOG_ARGS}
         PREFIX          ${PROTOBUF_SOURCES_DIR}
         UPDATE_COMMAND  ""
         DEPENDS         zlib
-        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
-        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        GIT_REPOSITORY  ${PROTOBUF_REPO}
+        GIT_TAG         ${PROTOBUF_TAG}
         CONFIGURE_COMMAND
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
             ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_INSTALL_LIBDIR=lib
         CMAKE_CACHE_ARGS
             -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
             -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
             ${OPTIONAL_CACHE_ARGS}
     )
 ENDFUNCTION()
 
-SET(PROTOBUF_VERSION 3.1)
-IF(NOT CMAKE_CROSSCOMPILING)
-    FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
-
-    IF(PROTOBUF_FOUND)
-        SET_PROTOBUF_VERSION()
-        IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0")
-            SET(PROTOBUF_FOUND OFF)
-        ENDIF()
-    ENDIF(PROTOBUF_FOUND)
+IF(NOT MOBILE_INFERENCE)
+    SET(PROTOBUF_VERSION 3.1)
 ELSE()
+    SET(PROTOBUF_VERSION 3.2)
+ENDIF()
+IF(CMAKE_CROSSCOMPILING)
     build_protobuf(protobuf_host TRUE)
     LIST(APPEND external_project_dependencies protobuf_host)
 
@@ -120,18 +239,31 @@ ELSE()
 ENDIF()
 
 IF(NOT PROTOBUF_FOUND)
-    build_protobuf(protobuf FALSE)
-    LIST(APPEND external_project_dependencies protobuf)
+    build_protobuf(extern_protobuf FALSE)
 
-    SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR}
+    SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR}
         CACHE PATH "protobuf include directory." FORCE)
-    IF(NOT CMAKE_CROSSCOMPILING)
-        SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE}
+    SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY}
+        CACHE FILEPATH "protobuf lite library." FORCE)
+    SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY}
+        CACHE FILEPATH "protobuf library." FORCE)
+    SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
+        CACHE FILEPATH "protoc library." FORCE)
+
+    IF(WITH_C_API OR WITH_FLUID)
+        INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
+        IF(ANDROID)
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
+        ELSE()
+            INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib)
+        ENDIF()
+    ENDIF()
+
+    IF(CMAKE_CROSSCOMPILING)
+        PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf)
+    ELSE()
+        SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE}
             CACHE FILEPATH "protobuf executable." FORCE)
+        PROMPT_PROTOBUF_LIB(extern_protobuf)
     ENDIF()
-    SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE)
-    SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE)
-    SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE)
 ENDIF(NOT PROTOBUF_FOUND)
-
-PROMPT_PROTOBUF_LIB()
\ No newline at end of file
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
new file mode 100644
index 0000000000..4e87dc49d8
--- /dev/null
+++ b/cmake/external/pybind11.cmake
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_PYTHON)
+    return()
+endif()
+
+include(ExternalProject)
+
+set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+include_directories(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+
+ExternalProject_Add(
+        extern_pybind
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+        GIT_TAG         "v2.1.1"
+        PREFIX          ${PYBIND_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+)
+
+if(${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+
+add_dependencies(pybind extern_pybind)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index f4d0daab06..46c68cce32 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -12,16 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-INCLUDE(ExternalProject)
+IF(NOT WITH_PYTHON)
+    return()
+ENDIF()
+
 INCLUDE(python_module)
 
 FIND_PACKAGE(PythonInterp 2.7)
-IF(WITH_PYTHON)
-    FIND_PACKAGE(PythonLibs 2.7)
-ENDIF(WITH_PYTHON)
+FIND_PACKAGE(PythonLibs 2.7)
+# FIXME: FIND_PACKAGE may find a static library; determine SHARED/STATIC from what FIND_PACKAGE returns.
+ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 
 SET(py_env "")
-SET(USE_VIRTUALENV_FOR_TEST 1)
 IF(PYTHONINTERP_FOUND)
     find_python_module(pip REQUIRED)
     find_python_module(numpy REQUIRED)
@@ -32,198 +35,7 @@ IF(PYTHONINTERP_FOUND)
         MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
         "please use pip to upgrade protobuf. pip install -U protobuf")
     ENDIF()
-ELSE(PYTHONINTERP_FOUND)
-    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
-    ##################################### PYTHON ########################################
-    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
-    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
-    SET(_python_DIR ${PYTHON_INSTALL_DIR})
-
-    IF(UNIX)
-        SET(PYTHON_FOUND ON)
-        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
-        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
-        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
-        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
-    ELSEIF(WIN32)
-        SET(PYTHON_FOUND ON)
-        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
-        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
-        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
-        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
-    ELSE()
-        MESSAGE(FATAL_ERROR "Unknown system !")
-    ENDIF()
-
-    IF(APPLE)
-        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
-            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
-            )
-    ENDIF()
-
-    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
-
-    # Force Python build to "Release".
-    IF(CMAKE_CONFIGURATION_TYPES)
-        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
-        SET(CMAKE_CFG_INTDIR "Release")
-    ELSE()
-        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
-            -DCMAKE_BUILD_TYPE:STRING=Release
-            )
-    ENDIF()
-
-    ExternalProject_Add(python
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
-        PREFIX            ${PYTHON_SOURCES_DIR}
-        UPDATE_COMMAND    ""
-        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
-        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-        CMAKE_CACHE_ARGS
-            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
-            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
-            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
-            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
-            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
-            -DDOWNLOAD_SOURCES:BOOL=ON
-            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
-            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
-            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
-        DEPENDS zlib
-    )
-
-    SET(py_env
-        PATH=${PYTHON_INSTALL_DIR}/bin
-        PYTHONHOME=${PYTHON_INSTALL_DIR}
-        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
-    ####################################################################################
-
-    ##################################### SETUPTOOLS ###################################
-    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
-    ExternalProject_Add(setuptools
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
-        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
-        BUILD_IN_SOURCE     1
-        PATCH_COMMAND       ""
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-        INSTALL_COMMAND     ""
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS             python zlib
-    )
-    #####################################################################################
-
-    ##################################### SIX ###########################################
-    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
-    ExternalProject_Add(six
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX              ${SIX_SOURCES_DIR}
-        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
-        BUILD_IN_SOURCE     1
-        PATCH_COMMAND       ""
-        UPDATE_COMMAND      ""
-        CONFIGURE_COMMAND   ""
-        INSTALL_COMMAND     ""
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS             python setuptools
-    )
-    #####################################################################################
-
-    ##################################### CYTHON ########################################
-    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
-    ExternalProject_Add(cython
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        PREFIX                ${CYTHON_SOURCES_DIR}
-        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
-        GIT_TAG               0.25.2
-        BUILD_IN_SOURCE       1
-        CONFIGURE_COMMAND     ""
-        PATCH_COMMAND         ""
-        UPDATE_COMMAND        ""
-        INSTALL_COMMAND       ""
-        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS               python
-    )
-    ####################################################################################
-
-    ##################################### NUMPY ########################################
-    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
-    SET(NUMPY_TAG_VERSION "v1.11.3")
-    SET(NUMPY_VERSION "1.11.3")
-
-    SET(EGG_NAME "")
-    SET(PYTHON_NUMPY_INCLUDE_DIR "")
-    IF(WIN32)
-        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
-    ELSE(WIN32)
-        IF(APPLE)
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
-        ELSE(APPLE)
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-        ENDIF(APPLE)
-
-        FOREACH(suffix x86_64 intel fat64 fat32 universal)
-            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
-        ENDFOREACH()
-    ENDIF(WIN32)
-
-    ExternalProject_Add(numpy
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      https://github.com/numpy/numpy.git
-        GIT_TAG             ${NUMPY_TAG_VERSION}
-        CONFIGURE_COMMAND   ""
-        UPDATE_COMMAND      ""
-        PREFIX              ${NUMPY_SOURCES_DIR}
-        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        BUILD_IN_SOURCE     1
-        DEPENDS             python setuptools cython
-    )
-    ####################################################################################
-
-    ##################################### WHEEL ########################################
-    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
-    ExternalProject_Add(wheel
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
-        PREFIX              ${WHEEL_SOURCES_DIR}
-        CONFIGURE_COMMAND   ""
-        UPDATE_COMMAND      ""
-        BUILD_COMMAND       ""
-        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        BUILD_IN_SOURCE     1
-        DEPENDS             python setuptools
-    )
-    ####################################################################################
-
-    ################################### PROTOBUF #######################################
-    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
-    ExternalProject_Add(python-protobuf
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
-        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
-        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
-        BUILD_IN_SOURCE       1
-        PATCH_COMMAND         ""
-        CONFIGURE_COMMAND     ""
-        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        DEPENDS               python setuptools six
-    )
-    ####################################################################################
-
-    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
-
 ENDIF(PYTHONINTERP_FOUND)
 
-IF(WITH_PYTHON)
-    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-ELSE()
-    SET(PYTHON_LIBRARIES "")
-ENDIF()
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 744c766ee7..9db457c7b2 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -1,17 +1,21 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(NOT WITH_SWIG_PY)
+    return()
+ENDIF()
+
 FIND_PACKAGE(SWIG)
 
 IF(NOT SWIG_FOUND)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index 2d7daed9bc..7cb4efa7bf 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -12,29 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)
 
 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
 SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
-SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
-
-INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
 
-SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
-
-IF(WIN32)
-    SET(WARPCTC_LIBRARIES
-        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
-ELSE(WIN32)
-    IF(APPLE)
-        SET(_warpctc_SHARED_SUFFIX dylib)
-    ELSE(APPLE)
-        SET(_warpctc_SHARED_SUFFIX so)
-    ENDIF(APPLE)
-
-    SET(WARPCTC_LIBRARIES
-        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
-ENDIF(WIN32)
+SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include"
+    CACHE PATH "Warp-ctc Directory" FORCE)
+# Used in unit test test_WarpCTCLayer
+SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib"
+    CACHE PATH "Warp-ctc Library Directory" FORCE)
+SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}"
+    CACHE FILEPATH "Warp-ctc Library" FORCE)
 
 IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )
     SET(USE_OMP OFF)
@@ -46,26 +39,31 @@ ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
-    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
-    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
-    CMAKE_ARGS      -DWITH_TORCH=OFF
-    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
-    CMAKE_ARGS      -DBUILD_SHARED=ON
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                    -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+                    -DWITH_GPU=${WITH_GPU}
+                    -DWITH_OMP=${USE_OMP}
+                    -DWITH_TORCH=OFF
+                    -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+                    -DBUILD_SHARED=ON
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
 
-ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
+MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+
+ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)
 
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 45ca5542b7..1638cd8fdf 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -34,18 +34,30 @@ ExternalProject_Add(
     GIT_TAG         "v1.2.8"
     PREFIX          ${ZLIB_SOURCES_DIR}
     UPDATE_COMMAND  ""
-    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
-    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
-    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
-    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                    -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+                    -DBUILD_SHARED_LIBS=OFF
+                    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                    -DCMAKE_MACOSX_RPATH=ON
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                    ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
+
+IF(WITH_C_API)
+  INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)
+  IF(ANDROID)
+    INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib/${ANDROID_ABI})
+  ELSE()
+    INSTALL(FILES ${ZLIB_LIBRARIES} DESTINATION third_party/zlib/lib)
+  ENDIF()
+ENDIF()
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7a996dea92..1120677a37 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -109,7 +109,11 @@ set(COMMON_FLAGS
     -Wno-unused-function
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in pybind11
+    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
+    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
+)
 
 set(GPU_COMMON_FLAGS
     -fPIC
@@ -122,11 +126,14 @@ set(GPU_COMMON_FLAGS
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.
+    -Wno-error=array-bounds # Warnings in Eigen::array
 )
 
 if (APPLE)
-    # On Mac OS X build fat binaries with x86_64 architectures by default.
-    set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    if(NOT CMAKE_CROSSCOMPILING)
+        # On Mac OS X build fat binaries with x86_64 architectures by default.
+        set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
+    endif()
 else()
     set(GPU_COMMON_FLAGS
         -Wall
@@ -144,57 +151,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-
-
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-
-if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE  STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS  ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-          if(${cuda_arch} STREQUAL ${capability})
-            list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-          endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-endif()
-
-# Custom gpu architecture
-set(CUDA_ARCH)
-
-if(CUDA_ARCH)
-  specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
-
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 43cd6b398b..18770fe286 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -11,56 +11,205 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#
 
 
-# To simplify the build process of PaddlePaddle, we defined couple of
-# fundamental abstractions, e.g., how to build library, binary and
-# test in C++, CUDA and Go.
+# generic.cmake defines CMake functions that look like Bazel's
+# build rules (https://bazel.build/).
+#
 #
 # -------------------------------------------
-#    C++	      CUDA C++	      Go
+#     C++        CUDA C++       Go
 # -------------------------------------------
-# cc_library	 nv_library	  go_library
-# cc_binary  	 nv_binary	  go_binary
-# cc_test        nv_test	  go_test
+# cc_library    nv_library   go_library
+# cc_binary     nv_binary    go_binary
+# cc_test       nv_test      go_test
 # -------------------------------------------
 #
-# cmake_parse_arguments can help us to achieve this goal.
-# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
+# To build a static library example.a from example.cc using the system
+# compiler (like GCC):
+#
+#   cc_library(example SRCS example.cc)
+#
+# To build a static library example.a from multiple source files
+# example{1,2,3}.cc:
+#
+#   cc_library(example SRCS example1.cc example2.cc example3.cc)
+#
+# To build a shared library example.so from example.cc:
+#
+#   cc_library(example SHARED SRCS example.cc)
+#
+# To build a library using Nvidia's NVCC from .cu file(s), use the nv_
+# prefixed version:
+#
+#   nv_library(example SRCS example.cu)
+#
+# To specify that a library new_example.a depends on other libraries:
+#
+#   cc_library(new_example SRCS new_example.cc DEPS example)
+#
+# Static libraries can be composed of other static libraries:
+#
+#   cc_library(composed DEPS dependent1 dependent2 dependent3)
+#
+# To build an executable binary file from some source files and
+# dependent libraries:
 #
+#   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
+#
+# To build an executable binary file using NVCC, use the nv_ prefixed
+# version:
+#
+#   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
+#
+# To build a unit test binary, which is an executable binary with
+# GoogleTest linked:
+#
+#   cc_test(example_test SRCS example_test.cc DEPS example)
+#
+# To build a unit test binary using NVCC, use the nv_ prefixed version:
+#
+#   nv_test(example_test SRCS example_test.cu DEPS example)
+#
+# Quite often, executable and test binaries depend on
+# pre-defined external libraries like glog and gflags defined in
+# /cmake/external/*.cmake:
+#
+#   cc_test(example_test SRCS example_test.cc DEPS example glog gflags)
+#
+# To build a go static library using Golang, use the go_ prefixed version:
+#
+#   go_library(example STATIC)
+#
+# To build a go shared library using Golang, use the go_ prefixed version:
+#
+#   go_library(example SHARED)
+#
+
+# Include the binary directory for generated headers.
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
-if(NOT APPLE)
+if(NOT APPLE AND NOT ANDROID)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-endif(NOT APPLE)
-
-# cc_library parses tensor.cc and figures out that target also depend on tensor.h.
-# cc_library(tensor
-#   SRCS
-#   tensor.cc
-#   DEPS
-#   variant)
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -pthread -ldl -lrt")
+endif(NOT APPLE AND NOT ANDROID)
+
+function(merge_static_libs TARGET_NAME)
+  set(libs ${ARGN})
+  list(REMOVE_DUPLICATES libs)
+
+  # Get all propagation dependencies from the merged libraries
+  foreach(lib ${libs})
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
+  endforeach()
+  list(REMOVE_DUPLICATES libs_deps)
+
+  # To produce a library we need at least one source file.
+  # It is created by add_custom_command below and also helps
+  # to track dependencies.
+  set(target_SRCS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  if(APPLE) # Use OSX's libtool to merge archives
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs})
+
+    # Generate the dummy static lib
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
+      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles}
+      )
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+    set(target_DIR ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}.dir)
+
+    foreach(lib ${libs})
+      set(objlistfile ${target_DIR}/${lib}.objlist) # list of objects in the input library
+      set(objdir ${target_DIR}/${lib}.objdir)
+
+      add_custom_command(OUTPUT ${objdir}
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})
+
+      add_custom_command(OUTPUT ${objlistfile}
+        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
+        COMMAND ${CMAKE_AR} -t "$<TARGET_FILE:${lib}>" > ${objlistfile}
+        DEPENDS ${lib} ${objdir}
+        WORKING_DIRECTORY ${objdir})
+
+      list(APPEND target_OBJS "${objlistfile}")
+    endforeach()
+
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${target_SRCS}
+      COMMAND ${CMAKE_COMMAND} -E touch ${target_SRCS}
+      DEPENDS ${libs} ${target_OBJS})
+
+    # Generate the dummy static lib
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
+    add_library(${TARGET_NAME} STATIC ${target_SRCS})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    # Get the file name of the generated library
+    set(target_LIBNAME "$<TARGET_FILE:${TARGET_NAME}>")
+
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+        COMMAND ${CMAKE_AR} crs ${target_LIBNAME} `find ${target_DIR} -name '*.o'`
+        COMMAND ${CMAKE_RANLIB} ${target_LIBNAME}
+        WORKING_DIRECTORY ${target_DIR})
+  endif()
+endfunction(merge_static_libs)
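+# A usage sketch (library names are hypothetical):
+#   merge_static_libs(paddle_whole lib_a lib_b)
+# yields libpaddle_whole.a containing every object file of lib_a and lib_b.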
+
 function(cc_library TARGET_NAME)
-  set(options OPTIONAL)
+  set(options STATIC static SHARED shared)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${cc_library_OPTIONAL} STREQUAL "SHARED")
-    add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
-  else()
-    add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
-  endif()
-  if (cc_library_DEPS)
-    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
-  endif()
+  if (cc_library_SRCS)
+    if (cc_library_SHARED OR cc_library_shared) # build *.so
+      add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
+    else()
+      add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
+    endif()
+    if (cc_library_DEPS)
+      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
+    endif()
+    
+    # cpplint code style
+    foreach(source_file ${cc_library_SRCS})
+      string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+      if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        list(APPEND cc_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+      endif()
+    endforeach()
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS} ${cc_library_HEADERS})
+
+  else(cc_library_SRCS)
+    if(cc_library_DEPS)
+      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
+    else()
+      message(FATAL_ERROR "Please specify source file or library in cc_library.")
+    endif()
+  endif(cc_library_SRCS)
 endfunction(cc_library)
 
-# cc_binary parses tensor.cc and figures out that target also depend on tensor.h.
-# cc_binary(tensor
-#   SRCS
-#   tensor.cc)
 function(cc_binary TARGET_NAME)
-  set(options OPTIONAL)
+  set(options "")
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -71,48 +220,56 @@ function(cc_binary TARGET_NAME)
   endif()
 endfunction(cc_binary)
 
-# The dependency to target tensor implies that if any of
-# tensor{.h,.cc,_test.cc} is changed, tensor_test need to be re-built.
-# cc_test(tensor_test
-#   SRCS
-#   tensor_test.cc
-#   DEPS
-#   tensor)
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS)
+    set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_test(${TARGET_NAME} ${TARGET_NAME})
+    # Support linking flags: --whole-archive (Linux) / -force_load (MacOS)
+    target_circle_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    if("${cc_test_DEPS}" MATCHES "ARCHIVE_START")
+      list(REMOVE_ITEM cc_test_DEPS ARCHIVE_START ARCHIVE_END)
+    endif()
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_test(NAME ${TARGET_NAME}
+             COMMAND ${TARGET_NAME} ${cc_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
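+# e.g. cc_test(example_test SRCS example_test.cc DEPS example ARGS --gtest_filter=Foo.*)
+# (names are hypothetical); ARGS is forwarded verbatim to the test command line.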
 
-# Suppose that ops.cu includes global functions that take Tensor as
-# their parameters, so ops depend on tensor. This implies that if
-# any of tensor.{h.cc}, ops.{h,cu} is changed, ops need to be re-built.
-# nv_library(ops
-#   SRCS
-#   ops.cu
-#   DEPS
-#   tensor)
 function(nv_library TARGET_NAME)
   if (WITH_GPU)
-    set(options OPTIONAL)
+    set(options STATIC static SHARED shared)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-    if (${nv_library_OPTIONAL} STREQUAL "SHARED")
-      cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-    else()
-      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-    endif()
-    if (nv_library_DEPS)
-      add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
-    endif()
+    if(nv_library_SRCS)
+      if (nv_library_SHARED OR nv_library_shared) # build *.so
+        cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
+      else()
+          cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+      endif()
+      if (nv_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${nv_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND nv_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${nv_library_SRCS} ${nv_library_HEADERS})
+    else(nv_library_SRCS)
+      if (nv_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${nv_library_DEPS})
+      else()
+        message(FATAL_ERROR "Please specify source file or library in nv_library.")
+      endif()
+    endif(nv_library_SRCS)
   endif()
 endfunction(nv_library)
 
@@ -130,13 +287,6 @@ function(nv_binary TARGET_NAME)
   endif()
 endfunction(nv_binary)
 
-# The dependency to target tensor implies that if any of
-# ops{.h,.cu,_test.cu} is changed, ops_test need to be re-built.
-# nv_test(ops_test
-#   SRCS
-#   ops_test.cu
-#   DEPS
-#   ops)
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
     set(options "")
@@ -144,50 +294,72 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(nv_test)
 
-set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-file(MAKE_DIRECTORY ${GOPATH})
-
-# Because api.go defines a GO wrapper to ops and tensor, it depends on
-# both.  This implies that if any of tensor.{h,cc}, ops.{h,cu}, or
-# api.go is changed, api need to be re-built.
-# go_library(api
-#   SRCS
-#   api.go
-#   DEPS
-#   tensor # Because ops depend on tensor, this line is optional.
-#   ops)
 function(go_library TARGET_NAME)
-  set(options OPTIONAL)
+  set(options STATIC static SHARED shared)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
   cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${go_library_OPTIONAL} STREQUAL "SHARED")
+
+  if (go_library_SHARED OR go_library_shared)
     set(BUILD_MODE "-buildmode=c-shared")
-    if(APPLE)
-      set(LIB_NAME "lib${TARGET_NAME}.dylib")
-    else()
-      set(LIB_NAME "lib${TARGET_NAME}.so")
-    endif()
+    set(${TARGET_NAME}_LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   else()
     set(BUILD_MODE "-buildmode=c-archive")
-    set(LIB_NAME "lib${TARGET_NAME}.a")
+    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   endif()
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS})
-  add_library(${TARGET_NAME} STATIC IMPORTED)
-  set_property(TARGET ${TARGET_NAME} PROPERTY
-    IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}")
-  add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib)
+
+  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  # This custom command will always run because its OUTPUT file is
+  # never actually created.
+  add_custom_command(
+    OUTPUT dummy_rebuild_${TARGET_NAME}
+    COMMAND cmake -E touch ${dummyfile}
+    )
+  # Create a custom target that depends on the custom command output
+  # file, so the custom command can be referenced as a dependency by
+  # `add_dependencies`.
+  add_custom_target(rebuild_${TARGET_NAME}
+    DEPENDS dummy_rebuild_${TARGET_NAME}
+    )
+
+  # Add dummy code so that `make target_name` works from the command line.
+  file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
+  if (go_library_SHARED OR go_library_shared)
+    add_library(${TARGET_NAME} SHARED ${dummyfile})
+  else()
+    add_library(${TARGET_NAME} STATIC ${dummyfile})
+  endif()
+  if(go_library_DEPS)
+    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
+  endif(go_library_DEPS)
+
+  # The "source file" of the library is `${dummyfile}` which never
+  # change, so the target will never rebuild. Make the target depends
+  # on the custom command that touches the library "source file", so
+  # rebuild will always happen.
+  add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME})
+
+  set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")
+
+  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
+    # Build the Go source code
+    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    -o "${${TARGET_NAME}_LIB_PATH}"
+    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
+    # must run under GOPATH
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_dependencies(${TARGET_NAME} go_vendor)
 endfunction(go_library)
 
 function(go_binary TARGET_NAME)
@@ -195,32 +367,156 @@ function(go_binary TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
   add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
     -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
+    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
   install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)
 
 function(go_test TARGET_NAME)
   set(options OPTIONAL)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
   cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
     -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_test_SRCS}
+    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_test(NAME ${TARGET_NAME}
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
-  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
 
-# go_extern will download extern go project.
-# go_extern(target_name extern_source)
-# go_extern(go_redis github.com/hoisie/redis)
-function(go_extern TARGET_NAME)
-  add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN})
-endfunction(go_extern)
+# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support
+# Usage:
+#   paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
+
+function(paddle_protobuf_generate_cpp SRCS HDRS)
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    return()
+  endif()
+
+  set(${SRCS})
+  set(${HDRS})
+
+  if (MOBILE_INFERENCE)
+      set(EXTRA_FLAG "lite:")  
+  else()
+      set(EXTRA_FLAG "") 
+  endif()
+
+  foreach(FIL ${ARGN})
+    get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+    get_filename_component(FIL_WE ${FIL} NAME_WE)
+    
+    set(_protobuf_protoc_src "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.cc")
+    set(_protobuf_protoc_hdr "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}.pb.h")
+    list(APPEND ${SRCS} "${_protobuf_protoc_src}")
+    list(APPEND ${HDRS} "${_protobuf_protoc_hdr}")
+    
+    add_custom_command(
+      OUTPUT "${_protobuf_protoc_src}"
+             "${_protobuf_protoc_hdr}"
+
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+      -I${CMAKE_CURRENT_SOURCE_DIR}
+      --cpp_out "${EXTRA_FLAG}${CMAKE_CURRENT_BINARY_DIR}" ${ABS_FIL}
+      DEPENDS ${ABS_FIL} protoc
+      COMMENT "Running C++ protocol buffer compiler on ${FIL}"
+      VERBATIM )
+  endforeach()
+
+  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
+endfunction()
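+# A minimal usage sketch (hypothetical .proto file name): this generates
+# framework.pb.cc/framework.pb.h in the current binary dir and returns the
+# generated paths through PROTO_SRCS and PROTO_HDRS:
+#   paddle_protobuf_generate_cpp(PROTO_SRCS PROTO_HDRS framework.proto)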
+
+
+function(proto_library TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS DEPS)
+  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(proto_srcs)
+  set(proto_hdrs)
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+endfunction()
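+# A minimal usage sketch (hypothetical names): this compiles the .proto file
+# and wraps the generated sources into a cc_library target:
+#   proto_library(framework_proto SRCS framework.proto)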
+
+function(py_proto_compile TARGET_NAME)
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(py_srcs)
+  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
+endfunction()
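+# A minimal usage sketch (hypothetical names): this creates a target that
+# generates the Python modules for the given .proto files:
+#   py_proto_compile(framework_py_proto SRCS framework.proto)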
+
+function(py_test TARGET_NAME)
+  if(WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS ARGS)
+    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_test(NAME ${TARGET_NAME}
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
+             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+  endif()
+endfunction()
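+# A minimal usage sketch (hypothetical names and flags): this registers a
+# ctest entry that runs the script with the built Python package on
+# PYTHONPATH:
+#   py_test(test_recognize_digits SRCS test_recognize_digits.py ARGS --use_gpu=0)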
+
+# grpc_library generates gRPC code using grpc_cpp_plugin and protoc,
+# then builds the generated protobuf and gRPC code together with your
+# implementation sources. Use the SRCS argument for your implementation
+# source files and the PROTO argument for your .proto files.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+
+  # FIXME(typhoonzero): the gRPC generated code does not declare virtual
+  # destructors, so demote those diagnostics from errors to warnings. We
+  # should also try to remove the warnings entirely.
+  set_source_files_properties(
+    ${grpc_grpc_srcs}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+
+  set_source_files_properties(
+    ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
index a9241b0e3e..4f9f5546b9 100644
--- a/cmake/make_resource.py
+++ b/cmake/make_resource.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import re
 import sys
diff --git a/cmake/package.cmake b/cmake/package.cmake
index ff49a2d08e..79e02147f3 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
 set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
-set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
 include (CMakePackageConfigHelpers)
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 46035a908b..53c2de332e 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -1,27 +1,28 @@
 # This file is used to check all supported levels of AVX on your machine
 # so that PaddlePaddle can unleash the vectorization power of multicore.
 
-INCLUDE(CheckCXXSourceRuns)
-INCLUDE(CheckCXXSourceCompiles)
+include(CheckCXXSourceRuns)
+include(CheckCXXSourceCompiles)
 
-IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
     set(SSE2_FLAG "-msse2")
     set(SSE3_FLAG "-msse3")
-    SET(AVX_FLAG "-mavx")
-    SET(AVX2_FLAG "-mavx2")
-ELSEIF(MSVC)
+    set(AVX_FLAG "-mavx")
+    set(AVX2_FLAG "-mavx2")
+elseif(MSVC)
     set(MMX_FLAG "/arch:MMX")
     set(SSE2_FLAG "/arch:SSE2")
     set(SSE3_FLAG "/arch:SSE3")
     SET(AVX_FLAG "/arch:AVX")
     SET(AVX2_FLAG "/arch:AVX2")
-ENDIF()
+endif()
 
 set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
 
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
+set(MMX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <mmintrin.h>
 int main()
@@ -32,6 +33,7 @@ int main()
 
 # Check SSE2
 set(CMAKE_REQUIRED_FLAGS ${SSE2_FLAG})
+set(SSE2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <emmintrin.h>
 int main()
@@ -42,6 +44,7 @@ int main()
 
 # Check SSE3
 set(CMAKE_REQUIRED_FLAGS ${SSE3_FLAG})
+set(SSE3_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <pmmintrin.h>
 int main()
@@ -55,6 +58,7 @@ int main()
 
 # Check AVX
 set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
@@ -67,6 +71,7 @@ int main()
 
 # Check AVX 2
 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
 CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 904652413e..396bd1a079 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -24,15 +24,15 @@ IF(WIN32)
     SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
     IF(APPLE)
-        EXEC_PROGRAM (sw_vers ARGS -productVersion OUTPUT_VARIABLE MACOSX_VERSION)
-        STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
-        SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
-        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+        EXEC_PROGRAM(sw_vers ARGS -productVersion OUTPUT_VARIABLE HOST_SYSTEM_VERSION)
+        STRING(REGEX MATCH "[0-9]+.[0-9]+" MACOS_VERSION "${HOST_SYSTEM_VERSION}")
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
             # Set cache variable - end user may change this during ccmake or cmake-gui configure.
             SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
                 "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
         ENDIF()
+        set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
     ELSE(APPLE)
 
         IF(EXISTS "/etc/issue")
@@ -48,6 +48,8 @@ ELSE(WIN32)
             ELSEIF(LINUX_ISSUE MATCHES "Fedora")
                 SET(HOST_SYSTEM "fedora")
             ENDIF()
+
+            STRING(REGEX MATCH "(([0-9]+)\\.)+([0-9]+)" HOST_SYSTEM_VERSION "${LINUX_ISSUE}")
         ENDIF(EXISTS "/etc/issue")
 
         IF(EXISTS "/etc/redhat-release")
@@ -69,7 +71,7 @@ CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
 
 MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 
-MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}, version: ${HOST_SYSTEM_VERSION}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 
 # configuration for cross-compiling
@@ -81,27 +83,12 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
     ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "RPi")
         SET(RPI TRUE)
         INCLUDE(cross_compiling/raspberry_pi)
+    ELSEIF(${CMAKE_SYSTEM_NAME} STREQUAL "iOS")
+        SET(IOS TRUE)
+        INCLUDE(cross_compiling/ios)
     ENDIF()
 ENDIF()
 
-# prefix and suffix on different os
-IF(WIN32)
-    SET(LIBRARY_PREFIX "")
-    SET(SHARED_LIBRARY_SUFFIX ".dll")
-    SET(STATIC_LIBRARY_SUFFIX ".lib")
-    SET(EXECUTABLE_SUFFIX ".exe")
-ELSE(WIN32)
-    SET(LIBRARY_PREFIX "lib")
-    IF(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".dylib")
-    ELSE(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".so")
-    ENDIF(APPLE)
-
-    SET(STATIC_LIBRARY_SUFFIX ".a")
-    SET(EXECUTABLE_SUFFIX "")
-ENDIF(WIN32)
-
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 8c91434622..0dc33ce385 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -25,7 +25,9 @@ function(target_circle_link_libraries TARGET_NAME)
             endif()
         endforeach()
         if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
-            list(APPEND LIBS "-undefined dynamic_lookup")
+            if(NOT IOS_ENABLE_BITCODE)
+                list(APPEND LIBS "-undefined dynamic_lookup")
+            endif()
         endif()
         list(REVERSE libsInArgn)
         target_link_libraries(${TARGET_NAME}
@@ -71,29 +73,52 @@ function(link_paddle_exe TARGET_NAME)
         generate_rdma_links()
     endif()
 
-    target_circle_link_libraries(${TARGET_NAME}
-        ARCHIVE_START
-        paddle_gserver
-        paddle_function
-        ARCHIVE_END
-        paddle_pserver
-        paddle_trainer_lib
-        paddle_network
-        paddle_math
-        paddle_utils
-        paddle_parameter
-        paddle_proto
-        paddle_cuda
-        ${EXTERNAL_LIBS}
-        ${CMAKE_THREAD_LIBS_INIT}
-        ${CMAKE_DL_LIBS}
-        ${RDMA_LD_FLAGS}
-        ${RDMA_LIBS})
+    if(MOBILE_INFERENCE)
+        target_circle_link_libraries(${TARGET_NAME}
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    else()
+        target_circle_link_libraries(${TARGET_NAME}
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_pserver
+            paddle_trainer_lib
+            paddle_network
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            paddle_optimizer
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    endif()
 
     if(ANDROID)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)
 
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+      target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
+    endif()
+
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
@@ -117,7 +142,6 @@ endfunction()
 macro(add_unittest_without_exec TARGET_NAME)
     add_executable(${TARGET_NAME} ${ARGN})
     link_paddle_test(${TARGET_NAME})
-    add_style_check_target(${TARGET_NAME} ${ARGN})
 endmacro()
 
 # add_unittest
@@ -141,17 +165,6 @@ endmacro()
 function(create_resources res_file output_file)
   add_custom_command(
     OUTPUT ${output_file}
-    COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file}
-    DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py)
-endfunction()
-
-
-# Create a python unittest using run_python_tests.sh,
-# which takes care of making correct running environment
-function(add_python_test TEST_NAME)
-  add_test(NAME ${TEST_NAME}
-        COMMAND env PADDLE_PACKAGE_DIR=${PADDLE_PYTHON_PACKAGE_DIR}
-        bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
-        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
+    COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
+    DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()
diff --git a/cmake/version.cmake b/cmake/version.cmake
index ac1583a24c..cde650128a 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -4,7 +4,7 @@ set(tmp_version "HEAD")
 while ("${PADDLE_VERSION}" STREQUAL "")
   execute_process(
     COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
-    WORKING_DIRECTORY ${PROJ_ROOT}
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_TAG_NAME
     RESULT_VARIABLE GIT_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 6fa42fd0c7..94dd3457fb 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_docs
-  gen_proto_py)
-
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md
deleted file mode 100644
index 3bf030004d..0000000000
--- a/doc/about/index_cn.md
+++ /dev/null
@@ -1,11 +0,0 @@
-关于PaddlePaddle
-================
-
-PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台,兼备易用性、高效性、灵活性和可扩展性,目前已被百度内部多个产品线广泛使用。
-PaddlePaddle目前已经开放源码, 但是远未完善,我们希望能在这个基础上不断的改进、扩展和延伸。
-同时我们希望广大开发者积极提供反馈和贡献源代码,建立一个活跃的开源社区。
-
-致谢
---------
-
-在此,特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst
deleted file mode 100644
index 065c430cde..0000000000
--- a/doc/about/index_en.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-ABOUT
-=======
-
-PaddlePaddle is an easy-to-use, efficient, flexible and scalable deep learning platform,
-which was originally developed by Baidu scientists and engineers for the purpose of applying deep learning to many products at Baidu.
-
-PaddlePaddle is now open source but far from complete; it is intended to be built upon, improved, scaled, and extended.
-We hope to build an active open source community both by providing feedback and by actively contributing to the source code.
-
-
-Credits
---------
-
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 9be0b370ee..84f9097a6c 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -7,3 +7,4 @@ API
     模型配置 <v2/model_configs.rst>
     数据访问 <v2/data.rst>
     训练与应用 <v2/run_logic.rst>
+    v2/fluid.rst
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 25c1dd00b9..e6f632e1a5 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
     v2/model_configs.rst
     v2/data.rst
     v2/run_logic.rst
+    v2/fluid.rst
diff --git a/doc/api/v1/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst
deleted file mode 100644
index d08c6b3efa..0000000000
--- a/doc/api/v1/data_provider/dataprovider_cn.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-.. _api_dataprovider:
-
-DataProvider的介绍
-==================
-
-DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存,让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 :ref:`api_pydataprovider2` ,来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 ``DataProvider`` 。
-
-PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider,并且在DataProvider中实现如何访问训练文件列表(train.list)或测试文件列表(test.list)。
-
-- train.list和test.list存放在本地(推荐直接存放到训练目录,以相对路径引用)。一般情况下,两者均为纯文本文件,其中每一行对应一个数据文件地址:
-  
-  - 如果数据文件存于本地磁盘,这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
-  - 地址也可以为hdfs文件路径,或者数据库连接路径等。
-  - 由于这个地址会被DataProvider使用,因此,如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
-- 如果没有设置test.list,或设置为None,那么在训练过程中不会执行测试操作;否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。
diff --git a/doc/api/v1/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst
deleted file mode 100644
index 96efbb1da9..0000000000
--- a/doc/api/v1/data_provider/dataprovider_en.rst
+++ /dev/null
@@ -1,34 +0,0 @@
-Introduction
-==============
-DataProvider is a module that loads training or testing data into CPU or GPU
-memory for the subsequent training or testing process.
-
-For simple use, users can use the Python :code:`PyDataProvider` to dynamically read
-the original data in any format or form, and then transform it into the
-data format PaddlePaddle requires. The process is extremely flexible and highly
-customizable, sacrificing only a little efficiency. This is extremely
-useful when you have to dynamically generate certain kinds of data according to,
-for example, the training performance.
-
-Besides, users can also customize a C++ :code:`DataProvider` for more
-complex usage, or for higher efficiency.
-
-The following parameters are required to be defined in the PaddlePaddle network
-configuration file (trainer_config.py): which DataProvider is chosen to be used,
-and the specific parameters for the DataProvider, including the training file list
-(train.list) and the testing file list (test.list).
-
-Train.list and test.list are simply two plain text files, which define the paths
-of the training or testing data. It is recommended to place them directly into
-the training directory and to reference them by a relative path (
-relative to the PaddlePaddle program).
-
-Testing or evaluating will not be performed during training if test.list is
-not set or is set to None. Otherwise, PaddlePaddle will evaluate the trained model
-on the specified testing data while training, once every testing period (a user
-defined command line parameter in PaddlePaddle), to prevent over-fitting.
-
-Each line of train.list and test.list is an absolute or relative path (relative
-to the PaddlePaddle program runtime) of a data file. Moreover, each line
-can also be an HDFS file path or a SQL connection string, as long as the user
-defines how to access each file in the DataProvider.
diff --git a/doc/api/v1/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst
deleted file mode 100644
index 8f9db31cfb..0000000000
--- a/doc/api/v1/data_provider/pydataprovider2_cn.rst
+++ /dev/null
@@ -1,229 +0,0 @@
-..  _api_pydataprovider2:
-
-PyDataProvider2的使用
-=====================
-
-PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据,并提供了简单的Cache功能;同时可以使用户只关注如何从文件中读取每一条数据,而不用关心数据如何传输,如何存储等等。
-
-..  contents::
-
-MNIST的使用场景
----------------
-
-我们以MNIST手写识别为例,来说明PyDataProvider2的简单使用场景。
-
-样例数据
-++++++++
-
-MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下:
-
-..  literalinclude:: src/mnist_train.txt
-
-其中每行数据代表一张图片,行内使用 ``;`` 分成两部分。第一部分是图片的标签,为0-9中的一个数字;第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字:
-
-..  literalinclude:: src/train.list
-
-dataprovider的使用
-++++++++++++++++++
-
-..  literalinclude:: src/mnist_provider.dict.py
-
-- 首先,引入PaddlePaddle的PyDataProvider2包。
-- 其次,定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。
-  
-  - `input_types`_:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
-
-    ..  literalinclude:: src/mnist_config.py
-         :lines: 9-10
-
-  - 注意:如果用户不显示指定返回数据的对应关系,那么PaddlePaddle会根据layer的声明顺序,来确定对应关系。但这个关系可能不正确,所以推荐使用显式指定的方式来设置input_types。
-- 最后,实现数据输入函数(如本例的 ``process`` 函数)。
-
-  - 该函数的功能是:打开文本文件,读取每一行,将行中的数据转换成与input_types一致的格式,然后返回给PaddlePaddle进程。注意,
-    
-    - 返回的顺序需要和input_types中定义的顺序一致。
-    - 返回时,必须使用Python关键词 ``yield`` ,相关概念是 ``generator`` 。
-    - 一次yield调用,返回一条完整的样本。如果想为一个数据文件返回多条样本,只需要在函数中调用多次yield即可(本例中使用for循环进行多次调用)。
-  
-  - 该函数具有两个参数:
-  
-    - settings:在本例中没有使用,具体可以参考 `init_hook`_ 中的说明。
-    - filename:为 ``train.list`` 或 ``test.list`` 中的一行,即若干数据文件路径的某一个。
-
-网络配置中的调用
-++++++++++++++++
-
-在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如,
-
-..  literalinclude:: src/mnist_config.py
-     :lines: 1-7
-
-训练数据是 ``train.list`` ,没有测试数据,调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
-
-小结
-+++++
-
-至此,简单的PyDataProvider2样例就说明完毕了。对用户来说,仅需要知道如何从 **一个文件** 中读取 **一条样本** ,就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作:
-
-* 将数据组合成Batch进行训练
-* 对训练数据进行Shuffle
-* 多线程的数据读取
-* 缓存训练数据到内存(可选)
-* CPU->GPU双缓存
-
-是不是很简单呢?
-
-时序模型的使用场景
-------------------
-样例数据
-++++++++
-
-时序模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息,不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列数据。
-
-本例采用英文情感分类的数据,即将一段英文文本数据,分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下:
-
-..  literalinclude:: src/sentimental_train.txt
-
-dataprovider的使用
-++++++++++++++++++
-
-相对MNIST而言,这个dataprovider较复杂,主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的,它会在dataprovider创建的时候执行。
-
-- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 ``integer_value_sequence`` 类型来设置。
-- 将 ``dictionary`` 存入settings对象,在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。
-
-..  literalinclude:: src/sentimental_provider.py
-
-网络配置中的调用
-++++++++++++++++
-
-调用这个PyDataProvider2的方法,基本上和MNIST样例一致,除了
-
-* 在配置中需要读取外部字典。
-* 在声明DataProvider的时候传入dictionary作为参数。
-
-..  literalinclude:: src/sentimental_config.py
-     :emphasize-lines: 12-14
-
-参考(Reference)
----------------
-
-@provider
-+++++++++
-
-``@provider`` 是一个Python的 `Decorator`_ ,可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系,只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
-
-*  input_types:数据输入格式。具体的格式说明,请参考 `input_types`_ 。
-*  should_shuffle:是不是要对数据做Shuffle。训练时默认shuffle,测试时默认不shuffle。
-*  min_pool_size:设置内存中最小暂存的数据条数,也是PaddlePaddle所能够保证的shuffle粒度。如果为-1,则会预先读取全部数据到内存中。
-*  pool_size: 设置内存中暂存的数据条数。如果为-1(默认),则不在乎内存暂存多少条数据。如果设置,则推荐大于训练时batch size的值,并且在内存足够的情况下越大越好。
-*  can_over_batch_size:是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题,一般推荐设置成True。
-*  calc_batch_size:可以传入一个函数,用于自定义每条数据的batch size(默认为1)。
-*  cache: 数据缓存的策略,具体请参考 `cache`_ 。
-*  init_hook:初始化时调用的函数,具体请参考 `init_hook`_ 。
-*  check:如果为true,会根据input_types检查数据的合法性。
-*  check_fail_continue:如果为true,那么当check出数据不合法时,会扔到这条数据,继续训练或预测。(对check=false的情况,没有作用)
-
-input_types
-+++++++++++
-
-PaddlePaddle的数据包括四种主要类型,和三种序列模式。
-
-四种数据类型:
-
-* dense_vector:稠密的浮点数向量。
-* sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。
-* sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。
-* integer:整数标签。
-
-三种序列模式:
-
-* SequenceType.NO_SEQUENCE:不是一条序列
-* SequenceType.SEQUENCE:是一条时间序列
-* SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同,列表如下:
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中,f代表一个浮点数,i代表一个整数。
-
-注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如,
-
-- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。
-- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
-
-init_hook
-+++++++++
-
-init_hook可以传入一个函数。该函数在初始化的时候会被调用,其参数如下:
-
-* 第一个参数是settings对象,它和数据传入函数的第一个参数(如本例中 ``process`` 函数的 ``settings`` 参数)必须一致。该对象具有以下两个属性:
-    * settings.input_types:数据输入格式,具体请参考 `input_types`_ 。
-    * settings.logger:一个logging对象。
-* 其他参数使用 ``kwargs`` (key word arguments)传入,包括以下两种:
-    * PaddlePaddle定义的参数: 1)is_train:bool型参数,表示用于训练或预测;2)file_list:所有文件列表。
-    * 用户定义的参数:使用args在网络配置中设置。
-
-注意:PaddlePaddle保留添加参数的权力,因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
-
-cache
-+++++
-
-PyDataProvider2提供了两种简单的Cache策略:
-
-* CacheType.NO_CACHE:不缓存任何数据,每次都会从python端读取数据
-* CacheType.CACHE_PASS_IN_MEM:第一个pass会从python端读取数据,剩下的pass会直接从内存里
-  读取数据。 
-
-
-注意事项
---------
-
-可能的内存泄露问题
-++++++++++++++++++
-
-PaddlePaddle将train.list中的每一行都传递给process函数,从而生成多个generator。当训练数据非常多时,就会生成非常多的generator。
-
-虽然每个generator在没有调用的时候,是几乎不占内存的;但当调用过一次后,generator便会存下当前的上下文(Context),而这个Context可能会非常大。并且,generator至少需要调用两次才会知道是否停止。所以,即使process函数里面只有一个yield,也需要两次随机选择到相同generator的时候,才会释放该段内存。
-
-..  code-block:: python
-
-    def func():
-        yield 0
-
-    f = func()  # 创建generator
-    tmp = next(f)  # 调用一次,返回0
-    tmp = next(f)  # 调用第二次的时候,才会Stop Iteration
-
-由于顺序调用这些generator不会出现上述问题,因此有两种解决方案:
-
-1. **最佳推荐**:将样本的地址放入另一个文本文件,train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
-2. 在generator的上下文中尽量留下非常少的变量引用,例如
-
-..  code-block:: python
-
-    def real_process(fn):
-        # ... read from fn
-        return result   # 当函数返回的时候,python可以解除掉内部变量的引用。
-
-    def process(fn):
-        yield real_process(fn)
-
-注意:这个问题是PyDataProvider读数据时候的逻辑问题,很难整体修正。
-
-内存不够用的情况
-++++++++++++++++
-
-PyDataProvider2会尽可能多的使用内存。因此,对于内存较小的机器,推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
-
diff --git a/doc/api/v1/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst
deleted file mode 100644
index e8fb629277..0000000000
--- a/doc/api/v1/data_provider/pydataprovider2_en.rst
+++ /dev/null
@@ -1,249 +0,0 @@
-..  _api_pydataprovider2:
-
-PyDataProvider2
-===============
-
-We highly recommend users to use PyDataProvider2 to provide training or testing
-data to PaddlePaddle. The user only needs to focus on how to read a single
-sample from the original data file by using PyDataProvider2, leaving all of the
-trivial work, including transferring data into CPU/GPU memory, shuffling, and binary
-serialization, to PyDataProvider2. PyDataProvider2 uses multithreading and a
-fascinating but simple cache strategy to optimize the efficiency of the data
-providing process.
-
-DataProvider for the non-sequential model
------------------------------------------
-
-Here we use the MNIST handwriting recognition data as an example to illustrate
-how to write a simple PyDataProvider.
-
-MNIST is a handwriting classification data set. It contains 70,000 digital
-grayscale images. Labels of the training samples range from 0 to 9. All the
-images have been size-normalized and centered into images with the same size
-of 28 x 28 pixels.
-
-A small part of the original data is shown below as an example:
-
-.. literalinclude:: src/mnist_train.txt
-
-Each line of the data contains two parts, separated by :code:`;`. The first part is
-the label of an image. The second part contains 28x28 pixel float values.
-
-Just write the path of the above data into train.list. It looks like this:
-
-.. literalinclude:: src/train.list
-
-The corresponding dataprovider is shown below:
-
-.. literalinclude:: src/mnist_provider.dict.py
-
-The first line imports the PyDataProvider2 package.
-The main function is the process function, which has two parameters.
-The first parameter is the settings, which is not used in this example.
-The second parameter is the filename, which is exactly one line of train.list.
-This parameter is passed to the process function by PaddlePaddle.
-
-:code:`@provider` is a Python
-`Decorator <http://www.learnpython.org/en/Decorators>`_ .
-It sets some properties of the DataProvider, and constructs a real PaddlePaddle
-DataProvider from a very simple user-implemented python function. It does not
-matter if you are not familiar with `Decorator`_. You can keep it simple by
-just taking :code:`@provider` as a fixed mark above the provider function you
-implemented.
-
-`input_types`_ defines the data format that a DataProvider returns.
-In this example, it is set to a 28x28-dimensional dense vector and an integer
-scalar, whose value ranges from 0 to 9.
-`input_types`_ can be set to several kinds of input formats; please refer to the
-documentation of `input_types`_ for more details.
-
-
-The process method is the core part of constructing a real DataProvider in
-PaddlePaddle. It implements how to open the text file, how to read one sample
-from the original text file, convert it into `input_types`_, and give it
-back to the PaddlePaddle process at line 23.
-Note that data yielded by the process function must follow the same order in which
-`input_types`_ are defined.
-
-
-With the help of PyDataProvider2, the user can focus on how to generate ONE training
-sample by using the keyword :code:`yield`.
-:code:`yield` is a python keyword, and a related concept is the
-:code:`generator`.
-
-Only a few lines of code need to be added into the training configuration file;
-you can take this as an example.
-
-.. literalinclude:: src/mnist_config.py
-
-Here we specify the training data by :code:`train.list`, and no testing data is specified.
-The method which actually provides data is :code:`process`.
-
-Users can also use another style to provide data, which defines the
-:code:`data_layer`'s name explicitly when `yield`-ing. For example,
-the :code:`dataprovider` is shown below.
-
-.. literalinclude:: src/mnist_provider.dict.py
-   :linenos:
-
-If the user didn't give the :code:`data_layer`'s name, PaddlePaddle will use
-the order of the :code:`data_layer` definitions roughly to determine which feature goes to
-which :code:`data_layer`. This order may not be correct, so TO DEFINE THE
-:code:`data_layer`'s NAMES EXPLICITLY IS THE RECOMMENDED WAY TO PROVIDE DATA.
-
-Now, this simple example of using PyDataProvider is finished.
-The only thing that the user should know is how to generate **one sample** from
-**one data file**.
-And PaddlePaddle will do all of the rest things\:
-
-* Form a training batch
-* Shuffle the training data
-* Read data with multithreading
-* Cache the training data (Optional)
-* CPU->GPU double buffering.
-
-Is this cool?
-
-..  _api_pydataprovider2_sequential_model:
-
-DataProvider for the sequential model
--------------------------------------
-A sequence model takes sequences as its input. A sequence is made up of several
-timesteps. The so-called timestep does not necessarily have anything to do
-with time; it only means that the order of the data is taken into
-consideration in model design and training.
-For example, a sentence can be interpreted as a kind of sequence data in NLP
-tasks.
-
-Here is an example of a data provider for English sentiment classification data.
-The original input data are simple English text, labeled as positive or
-negative sentiment (marked by 0 and 1 respectively).
-
-A small sample of the original data can be found in the path below:
-
-.. literalinclude:: src/sentimental_train.txt
-
-The corresponding data provider can be found in the path below:
-
-.. literalinclude:: src/sentimental_provider.py
-
-This data provider for the sequential model is a little more complex than that
-for the MNIST dataset.
-A new initialization method is introduced here.
-The method :code:`on_init` is configured into the DataProvider by :code:`@provider`'s
-:code:`init_hook` parameter, and it will be invoked once the DataProvider is
-initialized. The :code:`on_init` function has the following parameters:
-
-* The first parameter is the settings object.
-* The rest of the parameters are passed as keyword arguments. Some of them are passed
-  by PaddlePaddle; see the reference for `init_hook`_.
-  The :code:`dictionary` object is a python dict object passed from the trainer
-  configuration file, and it maps word strings to word ids.
-
-To pass these parameters into the DataProvider, the following lines should be added
-into the trainer configuration file.
-
-.. literalinclude:: src/sentimental_config.py
-
-The definition is basically the same as in the MNIST example, except:
-
-* Load the dictionary in this configuration
-* Pass it as a parameter to the DataProvider
-
-The `input_types`_ is configured in the method :code:`on_init`. It has the same
-effect as configuring it via :code:`@provider`'s :code:`input_types` parameter.
-However, the :code:`input_types` is set at runtime, so we can set it to
-different types according to the input data. The input of the neural network is a
-sequence of word ids, so set :code:`seq_type` to :code:`integer_value_sequence`.
-
-During :code:`on_init`, we save the :code:`dictionary` variable to
-:code:`settings`, and it will be used in :code:`process`. Note that the settings
-parameter for the process function and for the on_init function are the same
-object.
-
-The basic processing logic is the same as in MNIST's :code:`process` method. Each
-sample in the data file is given back to the PaddlePaddle process.
-
-This covers the basic usage of PyDataProvider.
-Please refer to the following reference section for details.
-
-Reference
----------
-
-@provider
-+++++++++
-
-.. autofunction:: paddle.trainer.PyDataProvider2.provider
-
-input_types
-+++++++++++
-
-PaddlePaddle has four data types and three sequence types.
-The four data types are:
-
-* :code:`dense_vector`: dense float vector.
-* :code:`sparse_binary_vector`: sparse binary vector; most of the values are 0, and
-  the non-zero elements are fixed to 1.
-* :code:`sparse_float_vector`: sparse float vector; most of the values are 0, and some
-  non-zero elements can be any float value. They are given by the user.
-* :code:`integer`: an integer scalar, which is especially used for a label or word index.
-
-The three sequence types are:
-
-* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
-* :code:`SequenceType.SEQUENCE` means the sample is a sequence.
-* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence, where each timestep of
-  the input sequence is also a sequence.
-
-Each input type has a different input format. Their formats are shown
-in the table below.
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-where f represents a float value, and i represents an integer value.
-
-init_hook
-+++++++++
-
-init_hook is a function that is invoked once the data provider is initialized.
-Its parameters are listed as follows:
-
-* The first parameter is a settings object, which is the same as the :code:`settings`
-  in the :code:`process` method. The object contains several attributes, including:
-
-  * :code:`settings.input_types`: the input types. Reference `input_types`_.
-  * :code:`settings.logger`: a logging object.
-
-* The rest of the parameters are keyword arguments. They are made up of PaddlePaddle
-  pre-defined parameters and user-defined parameters.
-
-  * PaddlePaddle-defined parameters include:
-
-    * :code:`is_train` is a bool parameter that indicates whether the DataProvider is used for
-      training or testing.
-    * :code:`file_list` is the list of all files.
-
-  * User-defined parameters can be set via args in the training configuration.
-
-Note that PaddlePaddle reserves the right to add pre-defined parameters, so please
-use :code:`**kwargs` in init_hook to ensure compatibility by accepting the
-parameters which your init_hook does not use.
-
-cache
-+++++
-DataProvider provides two simple cache strategies. They are:
-
-* :code:`CacheType.NO_CACHE` means no data is cached; data is read at runtime by
-  the user-implemented python module every pass.
-* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data via the
-  user-implemented python module, and the rest of the passes read data directly
-  from memory.
diff --git a/doc/api/v1/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py
deleted file mode 100644
index 429338c57f..0000000000
--- a/doc/api/v1/data_provider/src/mnist_config.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(
-    train_list='train.list',
-    test_list=None,
-    module='mnist_provider',
-    obj='process')
-
-img = data_layer(name='pixel', size=784)
-label = data_layer(name='label', size=10)
diff --git a/doc/api/v1/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py
deleted file mode 100644
index 2ba0b126a0..0000000000
--- a/doc/api/v1/data_provider/src/mnist_provider.dict.py
+++ /dev/null
@@ -1,24 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-
-
-# Define a py data provider
-@provider(
-    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
-def process(settings, filename):  # settings is not used currently.
-    f = open(filename, 'r')  # open one of training file
-
-    for line in f:  # read each line
-        label, pixel = line.split(';')
-
-        # get features and label
-        pixels_str = pixel.split(' ')
-
-        pixels_float = []
-        for each_pixel_str in pixels_str:
-            pixels_float.append(float(each_pixel_str))
-
-        # give data to paddle.
-        yield {"pixel": pixels_float, 'label': int(label)}
-
-    f.close()  # close file
diff --git a/doc/api/v1/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt
deleted file mode 100644
index 34be718ad9..0000000000
--- a/doc/api/v1/data_provider/src/mnist_train.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-5;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.215686 0.533333 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.67451 0.992157 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.070588 0.886275 0.992157 0 0 0 0 0 0 0 0 0 0 0.192157 0.070588 0 0 0 0 0 0 0 0 0 0 0 0 0 0.670588 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0.117647 0.933333 0.858824 0.313725 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.858824 0.992157 0.831373 0 0 0 0 0 0 0 0 0 0.141176 0.992157 0.992157 0.611765 0.054902 0 0 0 0 0 0 0 0 0 0 0.258824 0.992157 0.992157 0.529412 0 0 0 0 0 0 0 0 0 0.368627 0.992157 0.992157 0.419608 0.003922 0 0 0 0 0 0 0 0 0 0.094118 0.835294 0.992157 0.992157 0.517647 0 0 0 0 0 0 0 0 0 0.603922 0.992157 0.992157 0.992157 0.603922 0.545098 0.043137 0 0 0 0 0 0 0 0.447059 0.992157 0.992157 0.956863 0.062745 0 0 0 0 0 0 0 0 0.011765 0.666667 0.992157 0.992157 0.992157 0.992157 0.992157 0.745098 0.137255 0 0 0 0 0 0.152941 0.866667 0.992157 0.992157 0.521569 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.992157 0.803922 0.352941 0.745098 0.992157 0.945098 0.317647 0 0 0 0 0.580392 0.992157 0.992157 0.764706 0.043137 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.776471 0.043137 0 0.007843 0.27451 0.882353 0.941176 0.176471 0 0 0.180392 0.898039 0.992157 0.992157 0.313725 0 0 0 0 0 0 0 0 0 0 0.070588 0.992157 0.992157 0.713725 0 0 0 0 0.627451 0.992157 0.729412 0.062745 0 0.509804 0.992157 0.992157 0.776471 0.035294 0 0 0 0 0 0 0 0 0 0 0.494118 0.992157 0.992157 0.968627 0.168627 0 0 0 0.423529 0.992157 0.992157 0.364706 0 0.717647 0.992157 0.992157 0.317647 0 0 0 0 0 0 0 0 0 0 0 0.533333 0.992157 0.984314 0.945098 0.603922 0 0 0 0.003922 0.466667 0.992157 0.988235 0.976471 0.992157 0.992157 0.788235 0.007843 0 0 0 0 0 0 0 0 0 0 0 0.686275 0.882353 0.364706 0 0 0 0 0 0 0.098039 0.588235 0.992157 0.992157 0.992157 0.980392 0.305882 0 0 0 0 0 0 0 0 0 0 0 0 0.101961 0.67451 0.321569 0 0 0 0 0 0 0 0.105882 0.733333 0.976471 0.811765 0.713725 0 0 0 0 0 0 0 0 0 0 0 0 0 0.65098 0.992157 0.321569 0 0 0 0 0 0 0 0 0 0.25098 0.007843 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0.94902 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.968627 0.764706 0.152941 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.498039 0.25098 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
-0;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.298039 0.333333 0.333333 0.333333 0.337255 0.333333 0.333333 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.027451 0.223529 0.776471 0.964706 0.988235 0.988235 0.988235 0.992157 0.988235 0.988235 0.780392 0.098039 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.14902 0.698039 0.988235 0.992157 0.988235 0.901961 0.87451 0.568627 0.882353 0.976471 0.988235 0.988235 0.501961 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.188235 0.647059 0.988235 0.988235 0.745098 0.439216 0.098039 0 0 0 0.572549 0.988235 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.941176 0.247059 0 0 0 0 0 0 0.188235 0.898039 0.992157 0.992157 0 0 0 0 0 0 0 0 0 0 0 0.039216 0.639216 0.933333 0.988235 0.913725 0.278431 0 0 0 0 0 0 0 0.113725 0.843137 0.988235 0.988235 0 0 0 0 0 0 0 0 0 0 0 0.235294 0.988235 0.992157 0.988235 0.815686 0.07451 0 0 0 0 0 0 0 0.333333 0.988235 0.988235 0.552941 0 0 0 0 0 0 0 0 0 0 0.211765 0.878431 0.988235 0.992157 0.701961 0.329412 0.109804 0 0 0 0 0 0 0 0.698039 0.988235 0.913725 0.145098 0 0 0 0 0 0 0 0 0 0.188235 0.890196 0.988235 0.988235 0.745098 0.047059 0 0 0 0 0 0 0 0 0 0.882353 0.988235 0.568627 0 0 0 0 0 0 0 0 0 0.2 0.933333 0.992157 0.992157 0.992157 0.447059 0.294118 0 0 0 0 0 0 0 0 0.447059 0.992157 0.768627 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.988235 0.988235 0.988235 0.992157 0.47451 0 0 0 0 0 0 0 0.188235 0.933333 0.87451 0.509804 0 0 0 0 0 0 0 0 0 0 0.992157 0.988235 0.937255 0.792157 0.988235 0.894118 0.082353 0 0 0 0 0 0 0.027451 0.647059 0.992157 0.654902 0 0 0 0 0 0 0 0 0 0 0 0.623529 0.988235 0.913725 0.329412 0.376471 0.184314 0 0 0 0 0 0 0.027451 0.513725 0.988235 0.635294 0.219608 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.929412 0.988235 0.988235 0.741176 0.309804 0 0 0 0 0 0 0.529412 0.988235 0.678431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.223529 0.992157 0.992157 1 0.992157 0.992157 0.992157 0.992157 1 0.992157 0.992157 0.882353 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.023529 0.478431 0.654902 0.658824 0.952941 0.988235 0.988235 0.988235 0.992157 0.988235 0.729412 0.278431 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.196078 0.647059 0.764706 0.764706 0.768627 0.580392 0.047059 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
-4;0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.180392 0.470588 0.623529 0.623529 0.623529 0.588235 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.243137 0.494118 0.862745 0.870588 0.960784 0.996078 0.996078 0.996078 0.996078 0.992157 0.466667 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.317647 0.639216 0.639216 0.639216 0.639216 0.639216 0.470588 0.262745 0.333333 0.929412 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.811765 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.184314 0.992157 0.694118 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.192157 0.996078 0.384314 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.454902 0.980392 0.219608 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.564706 0.941176 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.588235 0.776471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.945098 0.560784 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.054902 0.952941 0.356863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.337255 0.917647 0.109804 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.698039 0.701961 0.019608 0.4 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.662745 0.376471 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.090196 0.639216 0.972549 0.945098 0.913725 0.996078 0.996078 0.996078 0.996078 1 0.996078 0.996078 1 0.996078 0 0 0 0 0 0 0 0 0 0 0.007843 0.105882 0.717647 0.776471 0.905882 0.996078 0.996078 0.988235 0.980392 0.862745 0.537255 0.223529 0.223529 0.368627 0.376471 0.6 0.6 0.6 0 0 0 0 0 0 0 0 0.262745 0.470588 0.6 0.996078 0.996078 0.996078 0.996078 0.847059 0.356863 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.909804 0.705882 0.823529 0.635294 0.490196 0.219608 0.113725 0.062745 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.152941 0.152941 0.156863 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0;
diff --git a/doc/api/v1/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py
deleted file mode 100644
index 7ce71608a2..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_config.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-dictionary = dict()
-...  #  read dictionary from outside
-
-define_py_data_sources2(
-    train_list='train.list',
-    test_list=None,
-    module='sentimental_provider',
-    obj='process',
-    # above codes same as mnist sample.
-    args={  # pass to provider.
-        'dictionary': dictionary
-    })
diff --git a/doc/api/v1/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py
deleted file mode 100644
index 14bd0e05a9..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_provider.py
+++ /dev/null
@@ -1,43 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-
-
-def on_init(settings, dictionary, **kwargs):
-    # on_init will invoke when data provider is initialized. The dictionary
-    # is passed from trainer_config, and is a dict object with type
-    # (word string => word id).
-
-    # set input types in runtime. It will do the same thing as
-    # @provider(input_types) will do, but it is set dynamically during runtime.
-    settings.input_types = {
-        # The text is a sequence of integer values, and each value is a word id.
-        # The whole sequence is the sentences that we want to predict its
-        # sentimental.
-        'data': integer_value_sequence(len(dictionary)),  # text input
-        'label': integer_value(2)  # label positive/negative
-    }
-
-    # save dictionary as settings.dictionary. 
-    # It will be used in process method.
-    settings.dictionary = dictionary
-
-
-@provider(init_hook=on_init)
-def process(settings, filename):
-    f = open(filename, 'r')
-
-    for line in f:  # read each line of file
-        label, sentence = line.split('\t')  # get label and sentence
-        words = sentence.split(' ')  # get words
-
-        # convert word string to word id
-        # the word not in dictionary will be ignored.
-        word_ids = []
-
-        for each_word in words:
-            if each_word in settings.dictionary:
-                word_ids.append(settings.dictionary[each_word])
-
-        # give data to paddle.
-        yield word_ids, int(label)
-
-    f.close()
diff --git a/doc/api/v1/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt
deleted file mode 100644
index 0060ac267c..0000000000
--- a/doc/api/v1/data_provider/src/sentimental_train.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-0       I saw this movie at the AFI Dallas festival . It all takes place at a lake house and it looks wonderful .
-1       This documentary makes you travel all around the globe . It contains rare and stunning sequels from the wilderness .
-...
diff --git a/doc/api/v1/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list
deleted file mode 100644
index 92bdc0a8b4..0000000000
--- a/doc/api/v1/data_provider/src/train.list
+++ /dev/null
@@ -1 +0,0 @@
-mnist_train.txt
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
deleted file mode 100644
index 3718cd73a2..0000000000
--- a/doc/api/v1/index_cn.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-API中文手册
-============
-
-DataProvider API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider_cn.rst
-    data_provider/pydataprovider2_cn.rst
-
-..  _api_trainer_config:
-
-Model Config API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst
deleted file mode 100644
index 10c297a71d..0000000000
--- a/doc/api/v1/index_en.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-API
-===
-
-DataProvider API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider_en.rst
-    data_provider/pydataprovider2_en.rst
-
-..  _api_trainer_config:
-
-Model Config API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    predict/swig_py_paddle_en.rst
diff --git a/doc/api/v1/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py
deleted file mode 100644
index 51349250e8..0000000000
--- a/doc/api/v1/predict/src/predict_sample.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-
-TEST_DATA = [[[
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.215686, 0.533333, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.67451, 0.992157, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0.070588, 0.886275, 0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.192157,
-    0.070588, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.670588, 0.992157,
-    0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.117647, 0.933333, 0.858824, 0.313725,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.090196, 0.858824, 0.992157, 0.831373, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0.141176, 0.992157, 0.992157, 0.611765, 0.054902, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0.258824, 0.992157, 0.992157, 0.529412, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0.368627, 0.992157, 0.992157, 0.419608, 0.003922, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0.094118, 0.835294, 0.992157, 0.992157, 0.517647, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0.603922, 0.992157, 0.992157, 0.992157, 0.603922,
-    0.545098, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0.447059, 0.992157, 0.992157,
-    0.956863, 0.062745, 0, 0, 0, 0, 0, 0, 0, 0, 0.011765, 0.666667, 0.992157,
-    0.992157, 0.992157, 0.992157, 0.992157, 0.745098, 0.137255, 0, 0, 0, 0, 0,
-    0.152941, 0.866667, 0.992157, 0.992157, 0.521569, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0.070588, 0.992157, 0.992157, 0.992157, 0.803922, 0.352941, 0.745098,
-    0.992157, 0.945098, 0.317647, 0, 0, 0, 0, 0.580392, 0.992157, 0.992157,
-    0.764706, 0.043137, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.070588, 0.992157, 0.992157,
-    0.776471, 0.043137, 0, 0.007843, 0.27451, 0.882353, 0.941176, 0.176471, 0,
-    0, 0.180392, 0.898039, 0.992157, 0.992157, 0.313725, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0.070588, 0.992157, 0.992157, 0.713725, 0, 0, 0, 0, 0.627451,
-    0.992157, 0.729412, 0.062745, 0, 0.509804, 0.992157, 0.992157, 0.776471,
-    0.035294, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.494118, 0.992157, 0.992157,
-    0.968627, 0.168627, 0, 0, 0, 0.423529, 0.992157, 0.992157, 0.364706, 0,
-    0.717647, 0.992157, 0.992157, 0.317647, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0.533333, 0.992157, 0.984314, 0.945098, 0.603922, 0, 0, 0, 0.003922,
-    0.466667, 0.992157, 0.988235, 0.976471, 0.992157, 0.992157, 0.788235,
-    0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.686275, 0.882353, 0.364706, 0,
-    0, 0, 0, 0, 0, 0.098039, 0.588235, 0.992157, 0.992157, 0.992157, 0.980392,
-    0.305882, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.101961, 0.67451, 0.321569,
-    0, 0, 0, 0, 0, 0, 0, 0.105882, 0.733333, 0.976471, 0.811765, 0.713725, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.65098, 0.992157, 0.321569, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0.25098, 0.007843, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
-    0.94902, 0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0.968627, 0.764706, 0.152941, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.498039, 0.25098, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0
-]], [[
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0.298039, 0.333333, 0.333333, 0.333333, 0.337255,
-    0.333333, 0.333333, 0.109804, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0.027451, 0.223529, 0.776471, 0.964706, 0.988235, 0.988235, 0.988235,
-    0.992157, 0.988235, 0.988235, 0.780392, 0.098039, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0.14902, 0.698039, 0.988235, 0.992157, 0.988235, 0.901961,
-    0.87451, 0.568627, 0.882353, 0.976471, 0.988235, 0.988235, 0.501961, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.188235, 0.647059, 0.988235, 0.988235,
-    0.745098, 0.439216, 0.098039, 0, 0, 0, 0.572549, 0.988235, 0.988235,
-    0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2, 0.933333, 0.992157,
-    0.941176, 0.247059, 0, 0, 0, 0, 0, 0, 0.188235, 0.898039, 0.992157,
-    0.992157, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.039216, 0.639216, 0.933333,
-    0.988235, 0.913725, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0.113725, 0.843137,
-    0.988235, 0.988235, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.235294, 0.988235,
-    0.992157, 0.988235, 0.815686, 0.07451, 0, 0, 0, 0, 0, 0, 0, 0.333333,
-    0.988235, 0.988235, 0.552941, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.211765,
-    0.878431, 0.988235, 0.992157, 0.701961, 0.329412, 0.109804, 0, 0, 0, 0, 0,
-    0, 0, 0.698039, 0.988235, 0.913725, 0.145098, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0.188235, 0.890196, 0.988235, 0.988235, 0.745098, 0.047059, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0.882353, 0.988235, 0.568627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.2,
-    0.933333, 0.992157, 0.992157, 0.992157, 0.447059, 0.294118, 0, 0, 0, 0, 0,
-    0, 0, 0, 0.447059, 0.992157, 0.768627, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0.623529, 0.988235, 0.988235, 0.988235, 0.988235, 0.992157, 0.47451, 0, 0,
-    0, 0, 0, 0, 0, 0.188235, 0.933333, 0.87451, 0.509804, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0.992157, 0.988235, 0.937255, 0.792157, 0.988235, 0.894118,
-    0.082353, 0, 0, 0, 0, 0, 0, 0.027451, 0.647059, 0.992157, 0.654902, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0.623529, 0.988235, 0.913725, 0.329412, 0.376471,
-    0.184314, 0, 0, 0, 0, 0, 0, 0.027451, 0.513725, 0.988235, 0.635294,
-    0.219608, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.929412, 0.988235,
-    0.988235, 0.741176, 0.309804, 0, 0, 0, 0, 0, 0, 0.529412, 0.988235,
-    0.678431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.223529, 0.992157,
-    0.992157, 1, 0.992157, 0.992157, 0.992157, 0.992157, 1, 0.992157, 0.992157,
-    0.882353, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.023529,
-    0.478431, 0.654902, 0.658824, 0.952941, 0.988235, 0.988235, 0.988235,
-    0.992157, 0.988235, 0.729412, 0.278431, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0.196078, 0.647059, 0.764706, 0.764706, 0.768627,
-    0.580392, 0.047059, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0
-]]]
-
-
-def main():
-    conf = parse_config("./mnist_model/trainer_config.py", "")
-    print conf.data_config.load_data_args
-    network = swig_paddle.GradientMachine.createFromConfigProto(
-        conf.model_config)
-    assert isinstance(network, swig_paddle.GradientMachine)  # For code hint.
-    network.loadParameters("./mnist_model/")
-    converter = DataProviderConverter([dense_vector(784)])
-    inArg = converter(TEST_DATA)
-    print network.forwardTest(inArg)
-
-
-if __name__ == '__main__':
-    swig_paddle.initPaddle("--use_gpu=0")
-    main()
diff --git a/doc/api/v1/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst
deleted file mode 100644
index 42f333dba2..0000000000
--- a/doc/api/v1/predict/swig_py_paddle_cn.rst
+++ /dev/null
@@ -1,58 +0,0 @@
-.. _api_swig_py_paddle:
-
-基于Python的预测
-================
-
-预测流程
---------
-
-PaddlePaddle使用swig对常用的预测接口进行了封装,通过编译会生成py_paddle软件包,安装该软件包就可以在python环境下实现模型预测。可以使用python的 ``help()`` 函数查询软件包相关API说明。
-
-基于Python的模型预测,主要包括以下五个步骤。
-
-1. 初始化PaddlePaddle环境
-
-   在程序开始阶段,通过调用 ``swig_paddle.initPaddle()`` 并传入相应的命令行参数初始化PaddlePaddle。
-
-2. 解析模型配置文件
-   
-   初始化之后,可以通过调用 ``parse_config()`` 解析训练模型时用的配置文件。注意预测数据通常不包含label, 同时预测网络通常直接输出最后一层的结果而不是像训练网络一样再接一层cost layer,所以一般需要对训练用的模型配置文件稍作相应修改才能在预测时使用。
-
-3. 构造paddle.GradientMachine
-  
-   通过调用 ``swig_paddle.GradientMachine.createFromConfigproto()`` 传入上一步解析出来的模型配置就可以创建一个 ``GradientMachine``。
-
-4. 准备预测数据
-  
-   swig_paddle中的预测接口的参数是自定义的C++数据类型,py_paddle里面提供了一个工具类 ``DataProviderConverter`` 可以用于接收和PyDataProvider2一样的输入数据并转换成预测接口所需的数据类型。
-
-5. 模型预测
-  
-   通过调用 ``forwardTest()`` 传入预测数据,直接返回计算结果。
-
-
-预测Demo
---------
-
-如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。
-
-..  literalinclude:: src/predict_sample.py
-    :language: python
-    :lines: 15-18,121-136
-
-
-Demo预测输出如下,其中value即为softmax层的输出。由于TEST_DATA包含两条预测数据,所以输出的value包含两个向量 。
-
-..  code-block:: text
-
-    [{'id': None, 'value': array(
-      [[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
-          1.43630644e-02,   1.51111044e-13,   9.85625684e-01,
-          2.08823112e-10,   2.32777140e-08,   2.00186201e-09,
-          1.15501715e-08],
-       [  9.99982715e-01,   1.27787406e-10,   1.72296313e-05,
-          1.49316648e-09,   1.36540484e-11,   6.93137714e-10,
-          2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
-          4.48684503e-08]], dtype=float32)}]
-
-
diff --git a/doc/api/v1/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst
deleted file mode 100644
index 1c628e6971..0000000000
--- a/doc/api/v1/predict/swig_py_paddle_en.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-Python Prediction
-==================
-
-PaddlePaddle offers a set of clean prediction interfaces for python with the help of
-SWIG. The main steps of predict values in python are:
-
-* Parse training configurations
-* Construct GradientMachine
-* Prepare data
-* Predict
-
-Here is a sample python script that shows the typical prediction process for the
-MNIST classification problem. A complete sample code could be found at
-:code:`src_root/doc/ui/predict/predict_sample.py`.
-
-..  literalinclude:: src/predict_sample.py
-    :language: python
-    :lines: 15-18,90-100,101-104
-
-The module that does the most of the job is py_paddle.swig_paddle, it's
-generated by SWIG and has complete documents, for more details you can use
-python's :code:`help()` function. Let's walk through the above python script:
-
-* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
-  PaddlePaddle with command line arguments, for more about command line arguments
-  see :ref:`cmd_detail_introduction` .
-* Parse the configuration file that is used in training with :code:`parse_config()`.
-  Because data to predict with always have no label, and output of prediction work
-  normally is the output layer rather than the cost layer, so you should modify
-  the configuration file accordingly before using it in the prediction work.
-* Create a neural network with
-  :code:`swig_paddle.GradientMachine.createFromConfigproto()`, which takes the
-  parsed configuration :code:`conf.model_config` as argument. Then load the
-  trained parameters from the model with :code:`network.loadParameters()`.
-* Create a data converter object of utility class :code:`DataProviderConverter`.
-    - Note: As swig_paddle can only accept C++ matrices, we offer a utility
-      class DataProviderConverter that can accept the same input data with
-      PyDataProvider2, for more information please refer to document
-      of :ref:`api_pydataprovider2` .
-* Do the prediction with :code:`forwardTest()`, which takes the converted
-  input data and outputs the activations of the output layer.
-
-Here is a typical output:
-
-..  code-block:: text
-
-    [{'id': None, 'value': array([[  5.53018653e-09,   1.12194102e-05,   1.96644767e-09,
-          1.43630644e-02,   1.51111044e-13,   9.85625684e-01,
-          2.08823112e-10,   2.32777140e-08,   2.00186201e-09,
-          1.15501715e-08],
-       [  9.99982715e-01,   1.27787406e-10,   1.72296313e-05,
-          1.49316648e-09,   1.36540484e-11,   6.93137714e-10,
-          2.70634608e-08,   3.48565123e-08,   5.25639710e-09,
-          4.48684503e-08]], dtype=float32)}]
-
-:code:`value` is the output of the output layer, each row represents result of
-the corresponding row in the input data, each element represents activation of
-the corresponding neuron in the output layer.
-
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
index eca3ce03bc..5317e66b64 100644
--- a/doc/api/v2/config/activation.rst
+++ b/doc/api/v2/config/activation.rst
@@ -99,3 +99,10 @@ STanh
 ..  automodule:: paddle.v2.activation
     :members: STanh
     :noindex:
+
+SoftSign
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftSign
+    :noindex:
diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst
index 39db51fa4a..9ac972fb19 100644
--- a/doc/api/v2/config/evaluators.rst
+++ b/doc/api/v2/config/evaluators.rst
@@ -99,3 +99,12 @@ value_printer
 ..  automodule:: paddle.v2.evaluator
     :members:  value_printer
     :noindex:
+
+Detection
+=========
+
+detection_map
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  detection_map
+    :noindex:
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 1efa74ecda..ddf0b055a9 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,18 +54,23 @@ img_conv
 
 ..  _api_v2.layer_context_projection:
 
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
 
+row_conv
+--------
+..  autoclass:: paddle.v2.layer.row_conv
+    :noindex:
+
 Image Pooling Layer
 ===================
 
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 
 spp
 ---
@@ -77,6 +82,11 @@ maxout
 ..  autoclass:: paddle.v2.layer.maxout
     :noindex:
 
+roi_pool
+--------
+..  autoclass:: paddle.v2.layer.roi_pool
+    :noindex:
+
 Norm Layer
 ==========
 
@@ -94,12 +104,17 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
     :noindex:
-    
+
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
     :noindex:
-    
+
+row_l2_norm
+-----------
+..  autoclass:: paddle.v2.layer.row_l2_norm
+    :noindex:
+
 Recurrent Layers
 ================
 
@@ -130,7 +145,7 @@ recurrent_group
 ---------------
 ..  autoclass:: paddle.v2.layer.recurrent_group
     :noindex:
-    
+
 lstm_step
 ---------
 ..  autoclass:: paddle.v2.layer.lstm_step
@@ -145,12 +160,12 @@ beam_search
 ------------
 ..  autoclass:: paddle.v2.layer.beam_search
     :noindex:
-    
+
 get_output
 ----------
 ..  autoclass:: paddle.v2.layer.get_output
     :noindex:
-    
+
 Mixed Layer
 ===========
 
@@ -193,6 +208,10 @@ identity_projection
 ..  autoclass:: paddle.v2.layer.identity_projection
     :noindex:
 
+slice_projection
+-------------------
+..  autoclass:: paddle.v2.layer.slice_projection
+    :noindex:
 
 table_projection
 ----------------
@@ -203,7 +222,7 @@ trans_full_matrix_projection
 ----------------------------
 ..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
-    
+
 Aggregate Layers
 ================
 
@@ -233,6 +252,11 @@ first_seq
 ..  autoclass:: paddle.v2.layer.first_seq
     :noindex:
 
+sub_seq
+---------
+..  autoclass:: paddle.v2.layer.sub_seq
+    :noindex:
+
 concat
 ------
 ..  autoclass:: paddle.v2.layer.concat
@@ -243,6 +267,21 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
     :noindex:
 
+seq_slice
+---------
+..  autoclass:: paddle.v2.layer.seq_slice
+    :noindex:
+
+kmax_sequence_score
+-------------------
+..  autoclass:: paddle.v2.layer.kmax_sequence_score
+    :noindex:
+
+sub_nested_seq
+--------------
+..  autoclass:: paddle.v2.layer.sub_nested_seq
+    :noindex:
+
 Reshaping Layers
 ================
 
@@ -301,6 +340,16 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dot_prod
+---------
+..  autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+..  autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
 power
 -----
 ..  autoclass:: paddle.v2.layer.power
@@ -311,6 +360,16 @@ scaling
 ..  autoclass:: paddle.v2.layer.scaling
     :noindex:
 
+clip
+----
+..  autoclass:: paddle.v2.layer.clip
+    :noindex:
+
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
@@ -328,11 +387,21 @@ cos_sim
 ..  autoclass:: paddle.v2.layer.cos_sim
     :noindex:
 
+l2_distance
+-----------
+..  autoclass:: paddle.v2.layer.l2_distance
+    :noindex:
+
 trans
 -----
 ..  autoclass:: paddle.v2.layer.trans
     :noindex:
 
+scale_shift
+-----------
+..  autoclass:: paddle.v2.layer.scale_shift
+    :noindex:
+
 Sampling Layers
 ===============
 
@@ -346,6 +415,19 @@ sampling_id
 ..  autoclass:: paddle.v2.layer.sampling_id
     :noindex:
 
+multiplex
+---------
+..  autoclass:: paddle.v2.layer.multiplex
+    :noindex:
+
+Factorization Machine Layer
+============================
+
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Slicing and Joining Layers
 ==========================
 
@@ -374,9 +456,14 @@ multi_binary_label_cross_entropy_cost
 ..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
     :noindex:
 
-huber_cost
-----------
-..  autoclass:: paddle.v2.layer.huber_cost
+huber_regression_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_regression_cost
+    :noindex:
+
+huber_classification_cost
+-------------------------
+..  autoclass:: paddle.v2.layer.huber_classification_cost
     :noindex:
 
 lambda_cost
@@ -384,9 +471,9 @@ lambda_cost
 ..  autoclass:: paddle.v2.layer.lambda_cost
     :noindex:
 
-mse_cost
---------
-..  autoclass:: paddle.v2.layer.mse_cost
+square_error_cost
+-----------------
+..  autoclass:: paddle.v2.layer.square_error_cost
     :noindex:
 
 rank_cost
@@ -434,10 +521,44 @@ smooth_l1_cost
 ..  autoclass:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
-Check Layer 
+multibox_loss
+--------------
+..  autoclass:: paddle.v2.layer.multibox_loss
+    :noindex:
+
+Check Layer
 ============
 
 eos
 ---
 ..  autoclass:: paddle.v2.layer.eos
     :noindex:
+
+Miscs
+=====
+
+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+
+Activation with learnable parameter
+===================================
+
+prelu
+--------
+..  autoclass:: paddle.v2.layer.prelu
+    :noindex:
+
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+
+Detection output Layer
+======================
+
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst
index b2a617fff1..048379cf01 100644
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,11 +125,8 @@ simple_attention
     :members: simple_attention
     :noindex:
 
-Miscs
-=====
-
-dropout_layer
---------------
+dot_product_attention
+---------------------
 ..  automodule:: paddle.v2.networks
-    :members: dropout_layer
+    :members: dot_product_attention
     :noindex:
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
index fef87c4fbd..b56c7332cc 100644
--- a/doc/api/v2/data.rst
+++ b/doc/api/v2/data.rst
@@ -2,112 +2,9 @@
 Data Reader Interface and DataSets
 ==================================
 
+..  toctree::
+    :maxdepth: 1
 
-DataTypes
-=========
-
-..  automodule:: paddle.v2.data_type
-    :members:
-    :noindex:
-
-DataFeeder
-==========
-
-..  automodule:: paddle.v2.data_feeder
-    :members:
-    :noindex:
-
-Reader
-======
-
-..  automodule:: paddle.v2.reader
-    :members:
-    :noindex:
-
-..  automodule:: paddle.v2.reader.creator
-    :members:
-    :noindex:
-
-minibatch
-=========
-
-..  automodule:: paddle.v2.minibatch
-    :members:
-    :noindex:
-
-Dataset
-=======
-
-..  automodule:: paddle.v2.dataset
-    :members:
-    :noindex:
-
-mnist
-+++++
-
-..  automodule:: paddle.v2.dataset.mnist
-    :members:
-    :noindex:
-
-cifar
-+++++
-
-..  automodule:: paddle.v2.dataset.cifar
-    :members:
-    :noindex:
-
-conll05
-+++++++
-
-..  automodule:: paddle.v2.dataset.conll05
-    :members: get_dict,get_embedding,test
-    :noindex:
-
-imdb
-++++
-
-..  automodule:: paddle.v2.dataset.imdb
-    :members:
-    :noindex:
-
-imikolov
-++++++++
-
-..  automodule:: paddle.v2.dataset.imikolov
-    :members:
-    :noindex:
-
-movielens
-+++++++++
-
-..  automodule:: paddle.v2.dataset.movielens
-    :members:
-    :noindex:
-
-..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
-    :noindex:
-    
-..  autoclass:: paddle.v2.dataset.movielens.UserInfo
-    :noindex:
-
-sentiment
-+++++++++
-
-..  automodule:: paddle.v2.dataset.sentiment
-    :members:
-    :noindex:
-
-uci_housing
-+++++++++++
-
-..  automodule:: paddle.v2.dataset.uci_housing
-    :members:
-    :noindex:
-
-wmt14
-+++++
-
-..  automodule:: paddle.v2.dataset.wmt14
-    :members:
-    :noindex:
-
+    data/data_reader.rst
+    data/image.rst
+    data/dataset.rst
diff --git a/doc/api/v2/data/data_reader.rst b/doc/api/v2/data/data_reader.rst
new file mode 100644
index 0000000000..2ccfec9c28
--- /dev/null
+++ b/doc/api/v2/data/data_reader.rst
@@ -0,0 +1,36 @@
+=====================
+Data Reader Interface
+=====================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
new file mode 100644
index 0000000000..6a8ecc5bb1
--- /dev/null
+++ b/doc/api/v2/data/dataset.rst
@@ -0,0 +1,75 @@
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
diff --git a/doc/api/v2/data/image.rst b/doc/api/v2/data/image.rst
new file mode 100644
index 0000000000..97651ffa6b
--- /dev/null
+++ b/doc/api/v2/data/image.rst
@@ -0,0 +1,5 @@
+Image Interface
+===============
+
+..  automodule:: paddle.v2.image
+    :members:
diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst
new file mode 100644
index 0000000000..5f15cad2b5
--- /dev/null
+++ b/doc/api/v2/fluid.rst
@@ -0,0 +1,18 @@
+======================
+Fluid
+======================
+
+..  toctree::
+    :maxdepth: 1
+
+    fluid/layers.rst
+    fluid/data_feeder.rst
+    fluid/executor.rst
+    fluid/initializer.rst
+    fluid/evaluator.rst
+    fluid/nets.rst
+    fluid/optimizer.rst
+    fluid/param_attr.rst
+    fluid/profiler.rst
+    fluid/regularizer.rst
+    fluid/io.rst
diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst
new file mode 100644
index 0000000000..0fa78f7dfb
--- /dev/null
+++ b/doc/api/v2/fluid/data_feeder.rst
@@ -0,0 +1,9 @@
+===========
+DataFeeder
+===========
+
+DataFeeder
+-----------
+..  automodule:: paddle.v2.fluid.data_feeder
+    :members: DataFeeder
+    :noindex:
diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst
new file mode 100644
index 0000000000..a23f3301d0
--- /dev/null
+++ b/doc/api/v2/fluid/evaluator.rst
@@ -0,0 +1,9 @@
+===========
+Evaluator
+===========
+
+Evaluator
+-----------
+..  automodule:: paddle.v2.fluid.evaluator
+    :members: Evaluator
+    :noindex:
diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst
new file mode 100644
index 0000000000..3a283538c1
--- /dev/null
+++ b/doc/api/v2/fluid/executor.rst
@@ -0,0 +1,9 @@
+===========
+Executor
+===========
+
+Executor
+-----------
+..  automodule:: paddle.v2.fluid.executor
+    :members: Executor
+    :noindex:
diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst
new file mode 100644
index 0000000000..8f587837e9
--- /dev/null
+++ b/doc/api/v2/fluid/initializer.rst
@@ -0,0 +1,50 @@
+===========
+Initializer
+===========
+
+
+
+Initializer
+-----------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: Initializer
+    :noindex:
+
+
+
+ConstantInitializer
+-------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: ConstantInitializer
+    :noindex:
+
+
+
+UniformInitializer
+------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: UniformInitializer
+    :noindex:
+
+
+
+NormalInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: NormalInitializer
+    :noindex:
+
+
+XavierInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: XavierInitializer
+    :noindex:
+
+
+MSRAInitializer
+---------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: MSRAInitializer
+    :noindex:
+
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
new file mode 100644
index 0000000000..67f68c4e9e
--- /dev/null
+++ b/doc/api/v2/fluid/io.rst
@@ -0,0 +1,10 @@
+===========
+IO
+===========
+
+
+
+is_parameter
+------------
+..  autofunction:: paddle.v2.fluid.io.is_parameter
+    :noindex:
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
new file mode 100644
index 0000000000..231ec2d4ba
--- /dev/null
+++ b/doc/api/v2/fluid/layers.rst
@@ -0,0 +1,546 @@
+==========
+Layers
+==========
+
+
+fc
+---
+..  autofunction:: paddle.v2.fluid.layers.fc
+    :noindex:
+
+embedding
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+
+dynamic_lstm
+------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+    :noindex:
+
+dynamic_lstmp
+-------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstmp
+    :noindex:
+
+dynamic_gru
+-----------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_gru
+    :noindex:
+
+data
+----
+..  autofunction:: paddle.v2.fluid.layers.data
+    :noindex:
+
+mean
+----
+..  autofunction:: paddle.v2.fluid.layers.mean
+    :noindex:
+
+mul
+---
+..  autofunction:: paddle.v2.fluid.layers.mul
+    :noindex:
+
+elementwise_add
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+    :noindex:
+
+elementwise_sub
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+    :noindex:
+
+elementwise_mul
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+    :noindex:
+
+elementwise_div
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+    :noindex:
+
+
+dropout
+-------
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+
+
+reshape
+--------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+
+
+sigmoid
+---------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
+    :noindex:
+
+
+scale
+---------
+..  autofunction:: paddle.v2.fluid.layers.scale
+    :noindex:
+
+
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
+
+
+sigmoid_cross_entropy_with_logits
+---------------------------------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid_cross_entropy_with_logits
+    :noindex:
+
+
+cast
+----
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+
+
+concat
+-------
+..  autofunction:: paddle.v2.fluid.layers.concat
+    :noindex:
+
+
+sums
+----
+..  autofunction:: paddle.v2.fluid.layers.sums
+    :noindex:
+
+
+linear_chain_crf
+----------------
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+
+
+assign
+-------
+..  autofunction:: paddle.v2.fluid.layers.assign
+    :noindex:
+
+
+split_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+    :noindex:
+
+
+merge_lod_tensor
+----------------
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+    :noindex:
+
+cos_sim
+--------
+..  autofunction:: paddle.v2.fluid.layers.cos_sim
+    :noindex:
+
+
+cross_entropy
+-------------
+..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+    :noindex:
+
+
+
+square_error_cost
+-----------------
+..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+    :noindex:
+
+
+accuracy
+---------
+..  autofunction:: paddle.v2.fluid.layers.accuracy
+    :noindex:
+
+
+sequence_conv
+-------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+    :noindex:
+
+
+conv2d
+------
+..  autofunction:: paddle.v2.fluid.layers.conv2d
+    :noindex:
+
+
+sequence_pool
+-------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+    :noindex:
+
+
+sequence_first_step
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_first_step
+    :noindex:
+
+
+sequence_last_step
+------------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_last_step
+    :noindex:
+
+
+pool2d
+------
+..  autofunction:: paddle.v2.fluid.layers.pool2d
+    :noindex:
+
+
+batch_norm
+----------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+
+
+beam_search_decode
+------------------
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+    :noindex:
+
+
+lod_rank_table
+--------------
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+    :noindex:
+
+
+max_sequence_len
+----------------
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+    :noindex:
+
+
+topk
+-----
+..  autofunction:: paddle.v2.fluid.layers.topk
+    :noindex:
+
+
+lod_tensor_to_array
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+    :noindex:
+
+
+
+array_to_lod_tensor
+-------------------
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+    :noindex:
+
+
+
+
+fill_constant
+-------------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
+    :noindex:
+
+
+
+fill_constant_batch_size_like
+-----------------------------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+
+
+ones
+----
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+
+
+zeros
+-----
+..  autofunction:: paddle.v2.fluid.layers.zeros
+    :noindex:
+
+
+increment
+---------
+..  autofunction:: paddle.v2.fluid.layers.increment
+    :noindex:
+
+
+array_write
+-----------
+..  autofunction:: paddle.v2.fluid.layers.array_write
+    :noindex:
+
+
+
+create_array
+------------
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+
+
+less_than
+---------
+..  autofunction:: paddle.v2.fluid.layers.less_than
+    :noindex:
+
+
+array_read
+----------
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+
+
+shrink_memory
+--------------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+
+
+array_length
+-------------
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+
+
+conv2d_transpose
+----------------
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+    :noindex:
+
+
+sequence_expand
+---------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_expand
+    :noindex:
+
+
+gru_unit
+--------
+..  autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+
+
+lstm_unit
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
+
+
+sequence_softmax
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_softmax
+    :noindex:
+
+
+reduce_sum
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_sum
+    :noindex:
+
+
+reduce_mean
+-----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_mean
+    :noindex:
+
+
+reduce_max
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_max
+    :noindex:
+
+
+reduce_min
+----------
+..  autofunction:: paddle.v2.fluid.layers.reduce_min
+    :noindex:
+
+
+split
+-----
+..  autofunction:: paddle.v2.fluid.layers.split
+    :noindex:
+
+
+matmul
+------
+..  autofunction:: paddle.v2.fluid.layers.matmul
+    :noindex:
+
+logsigmoid
+----------
+..  autofunction:: paddle.v2.fluid.layers.logsigmoid
+    :noindex:
+
+exp
+---
+..  autofunction:: paddle.v2.fluid.layers.exp
+    :noindex:
+
+relu
+----
+..  autofunction:: paddle.v2.fluid.layers.relu
+    :noindex:
+
+tanh
+----
+..  autofunction:: paddle.v2.fluid.layers.tanh
+    :noindex:
+
+tanh_shrink
+-----------
+..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
+    :noindex:
+
+softshrink
+----------
+..  autofunction:: paddle.v2.fluid.layers.softshrink
+    :noindex:
+
+sqrt
+----
+..  autofunction:: paddle.v2.fluid.layers.sqrt
+    :noindex:
+
+abs
+----
+..  autofunction:: paddle.v2.fluid.layers.abs
+    :noindex:
+
+ceil
+----
+..  autofunction:: paddle.v2.fluid.layers.ceil
+    :noindex:
+
+floor
+-----
+..  autofunction:: paddle.v2.fluid.layers.floor
+    :noindex:
+
+round
+-----
+..  autofunction:: paddle.v2.fluid.layers.round
+    :noindex:
+
+reciprocal
+----------
+..  autofunction:: paddle.v2.fluid.layers.reciprocal
+    :noindex:
+
+log
+---
+..  autofunction:: paddle.v2.fluid.layers.log
+    :noindex:
+
+square
+------
+..  autofunction:: paddle.v2.fluid.layers.square
+    :noindex:
+
+softplus
+--------
+..  autofunction:: paddle.v2.fluid.layers.softplus
+    :noindex:
+
+softsign
+---------
+..  autofunction:: paddle.v2.fluid.layers.softsign
+    :noindex:
+
+brelu
+-----
+..  autofunction:: paddle.v2.fluid.layers.brelu
+    :noindex:
+
+leaky_relu
+----------
+..  autofunction:: paddle.v2.fluid.layers.leaky_relu
+    :noindex:
+
+soft_relu
+---------
+..  autofunction:: paddle.v2.fluid.layers.soft_relu
+    :noindex:
+
+elu
+----
+..  autofunction:: paddle.v2.fluid.layers.elu
+    :noindex:
+
+relu6
+-----
+..  autofunction:: paddle.v2.fluid.layers.relu6
+    :noindex:
+
+pow
+----
+..  autofunction:: paddle.v2.fluid.layers.pow
+    :noindex:
+
+hard_shrink
+-----------
+..  autofunction:: paddle.v2.fluid.layers.hard_shrink
+    :noindex:
+
+thresholded_relu
+----------------
+..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
+    :noindex:
+
+hard_sigmoid
+-------------
+..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
+    :noindex:
+
+swish
+------
+..  autofunction:: paddle.v2.fluid.layers.swish
+    :noindex:
+
+im2sequence
+-----------
+..  autofunction:: paddle.v2.fluid.layers.im2sequence
+    :noindex:
+
+edit_distance
+---------------
+..  autofunction:: paddle.v2.fluid.layers.edit_distance
+    :noindex:
+
+ctc_greedy_decoder
+------------------
+..  autofunction:: paddle.v2.fluid.layers.ctc_greedy_decoder
+    :noindex:
+
+l2_normalize
+------------
+..  autofunction:: paddle.v2.fluid.layers.l2_normalize
+    :noindex:
+
+sequence_reshape
+----------------
+..  autofunction:: paddle.v2.fluid.layers.sequence_reshape
+    :noindex:
+
+row_conv
+--------
+..  autofunction:: paddle.v2.fluid.layers.row_conv
+    :noindex:
+
+multiplex
+---------
+..  autofunction:: paddle.v2.fluid.layers.multiplex
+    :noindex:
diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst
new file mode 100644
index 0000000000..500019bc50
--- /dev/null
+++ b/doc/api/v2/fluid/nets.rst
@@ -0,0 +1,33 @@
+===========
+Nets
+===========
+
+simple_img_conv_pool
+--------------------
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+    :noindex:
+
+
+img_conv_group
+---------------
+..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+    :noindex:
+
+
+sequence_conv_pool
+------------------
+..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+    :noindex:
+
+
+glu
+---
+..  autofunction:: paddle.v2.fluid.nets.glu
+    :noindex:
+
+
+scaled_dot_product_attention
+----------------------------
+..  autofunction:: paddle.v2.fluid.nets.scaled_dot_product_attention
+    :noindex:
+
diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst
new file mode 100644
index 0000000000..19b4940f08
--- /dev/null
+++ b/doc/api/v2/fluid/optimizer.rst
@@ -0,0 +1,54 @@
+===========
+Optimizer
+===========
+
+Optimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: Optimizer
+    :noindex:
+
+
+SGDOptimizer
+------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: SGDOptimizer
+    :noindex:
+
+
+
+MomentumOptimizer
+-----------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: MomentumOptimizer
+    :noindex:
+
+
+
+AdagradOptimizer
+----------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdagradOptimizer
+    :noindex:
+
+
+AdamOptimizer
+-------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamOptimizer
+    :noindex:
+
+
+AdamaxOptimizer
+---------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+
+
+DecayedAdagradOptimizer
+-----------------------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
+
diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst
new file mode 100644
index 0000000000..ca0c8af9e8
--- /dev/null
+++ b/doc/api/v2/fluid/param_attr.rst
@@ -0,0 +1,11 @@
+===========
+ParamAttr
+===========
+
+
+
+ParamAttr
+-----------
+..  automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst
new file mode 100644
index 0000000000..7d4042d1f4
--- /dev/null
+++ b/doc/api/v2/fluid/profiler.rst
@@ -0,0 +1,10 @@
+===========
+Profiler
+===========
+
+
+
+Profiler
+-----------
+..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst
new file mode 100644
index 0000000000..868e225ed3
--- /dev/null
+++ b/doc/api/v2/fluid/regularizer.rst
@@ -0,0 +1,25 @@
+===========
+Regularizer
+===========
+
+WeightDecayRegularizer
+----------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+
+
+L2DecayRegularizer
+------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+
+
+
+L1DecayRegularizer
+-------------------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
+    :noindex:
+
+
diff --git a/doc/design/api.md b/doc/design/api.md
index 8185d2af0e..e6a4638d91 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -3,7 +3,7 @@
 ## Ingredients
 
 As our design principle is starting from the essence: how could we
-allow users to express and solve their problems at neural networks.
+allow users to express and solve their problems as neural networks.
 Some essential concepts that our API have to provide include:
 
 1. A *topology* is an expression of *layers*.
@@ -233,7 +233,7 @@ paddle.dist_train(model,
                   num_parameter_servers=15)
 ```
 
-The pseudo code if `paddle.dist_train` is as follows:
+The pseudo code of `paddle.dist_train` is as follows:
 
 ```python
 def dist_train(topology, parameters, trainer, reader, ...):
diff --git a/doc/design/auto_gradient_check.md b/doc/design/auto_gradient_check.md
new file mode 100644
index 0000000000..f9991541bc
--- /dev/null
+++ b/doc/design/auto_gradient_check.md
@@ -0,0 +1,146 @@
+# Auto Gradient Checker Design
+
+## Background
+- Generally, it is easy to check whether the forward computation of an Operator is correct. However, backpropagation is a notoriously difficult algorithm to debug and get right:
+  1. You have to derive the correct backpropagation formula from the forward computation.
+  2. You have to implement it correctly in C++.
+  3. It is difficult to prepare test data.
+
+- Auto gradient checking computes a numerical gradient with the forward Operator only and uses it as a reference for the backward Operator's result. It has several advantages:
+  1. The numerical gradient checker only needs the forward operator.
+  2. Users only need to prepare the input data for the forward Operator.
+
+## Mathematical Theory
+The following two documents from Stanford provide a detailed explanation of how to compute the numerical gradient and why it is useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numeric Gradient Implementation
+### Python Interface
+```python
+def get_numerical_gradient(op,
+                           input_values,
+                           output_name,
+                           input_to_check,
+                           delta=0.005,
+                           local_scope=None):
+    """
+    Get the numerical gradient for an operator's input.
+
+    :param op: C++ operator instance; could be a network.
+    :param input_values: The input variables. Should be a dictionary whose
+        keys are variable names and whose values are numpy arrays.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable with respect to which to
+        compute the gradient.
+    :param delta: The perturbation value for the numerical gradient method.
+        The smaller delta is, the more accurate the result will be; but if
+        delta is too small, the computation will suffer from numerical
+        stability problems.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+```
+
+### Explanation
+
+- Why is `output_name` needed?
+  - An Operator may have multiple outputs, and an independent gradient can be obtained from each output. So the caller should specify the name of the output variable.
+
+- Why is `input_to_check` needed?
+  - An operator may have multiple inputs. The gradient Op can calculate the gradients of all these inputs at the same time, but the numerical gradient has to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input; if you need the gradients of multiple inputs, call `get_numeric_gradient` multiple times.
+
+
+### Core Algorithm Implementation
+
+
+```python
+    # We only compute the gradient of one element at a time, using
+    # a for loop to cover every element of the input tensor.
+    for i in xrange(tensor_size):
+        # get one input element by its index i.
+        origin = tensor_to_check.get_float_element(i)
+
+        # add delta to it, run op and then get the new value of the result tensor.
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        # subtract delta from it, run op and get the new value of the result tensor.
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        # restore old value
+        tensor_to_check.set_float_element(i, origin)
+
+        # compute the gradient of this element and store it into a numpy array.
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # reshape the gradient result to the shape of the source tensor.
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+```
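+
+As a standalone illustration of the same central-difference scheme, here is a minimal NumPy sketch checked against a function whose analytic gradient is known. The helper name `numeric_gradient` is ours, for illustration only; it is not the framework's implementation:
+
+```python
+import numpy as np
+
+
+def numeric_gradient(f, x, delta=0.005):
+    # central-difference gradient of a scalar-valued function f at point x
+    grad = np.zeros_like(x)
+    flat_x = x.reshape(-1)  # a view, so writes are visible through x
+    flat_g = grad.reshape(-1)
+    for i in range(flat_x.size):
+        origin = flat_x[i]
+        flat_x[i] = origin + delta
+        y_pos = f(x)
+        flat_x[i] = origin - delta
+        y_neg = f(x)
+        flat_x[i] = origin  # restore the original value
+        flat_g[i] = (y_pos - y_neg) / delta / 2
+    return grad
+
+
+# y = sum(x ** 2) has the analytic gradient 2 * x
+x = np.random.rand(3, 4)
+assert np.allclose(numeric_gradient(lambda t: np.sum(t ** 2), x), 2 * x, atol=1e-4)
+```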
+
+## Auto Gradient Checker Framework
+
+Each Operator Kernel has three kinds of Gradient:
+
+1. Numerical gradient
+2. CPU kernel gradient
+3. GPU kernel gradient (if supported)
+
+The numerical gradient only relies on the forward Operator, so we use it as the reference value. Gradient checking is then performed in the following three steps:
+
+1. Calculate the numerical gradient.
+2. Calculate the CPU kernel gradient with the backward Operator and compare it with the numerical gradient.
+3. Calculate the GPU kernel gradient with the backward Operator and compare it with the numerical gradient (if supported).
+
+#### Python Interface
+
+```python
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create the backward_op
+        :param input_vars: numpy values of the input variables. The following
+            computation will use these variables.
+        :param inputs_to_check: the input variables with respect to which to
+            compute the gradients.
+        :param output_name: The final output variable name.
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when creating the backward ops
+        :param only_cpu: only compute and check the gradient on the CPU kernel.
+        :return:
+        """
+```
+
+### How to check whether two numpy arrays are close enough
+If `abs_numerical_grad` is nearly zero, we use the absolute error for `numerical_grad` instead of the relative error:
+
+```python
+numerical_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numerical_grad = numpy.abs(numerical_grad)
+# if abs_numerical_grad is nearly zero, then use abs error for numeric_grad, not relative
+# error.
+abs_numerical_grad[abs_numerical_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(numerical_grad - operator_grad) / abs_numerical_grad
+max_diff = numpy.max(diff_mat)
+```
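+
+Wrapped as a reusable helper (an illustrative sketch; `assert_grads_close` is our name, not a framework API), the same logic connects directly to the `max_relative_error` parameter of `check_grad`:
+
+```python
+import numpy as np
+
+
+def assert_grads_close(numerical_grad, operator_grad, max_relative_error=0.005):
+    abs_numerical = np.abs(numerical_grad)
+    # where the numerical gradient is nearly zero, fall back to absolute error
+    abs_numerical[abs_numerical < 1e-3] = 1
+    diff_mat = np.abs(numerical_grad - operator_grad) / abs_numerical
+    max_diff = np.max(diff_mat)
+    assert max_diff <= max_relative_error, (
+        "gradient check failed, max relative diff: %g" % max_diff)
+```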
+
+
+#### Notes:
+The input data for the auto gradient checker should be reasonable, to avoid numerical stability problems.
+
+
+#### Refs:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
diff --git a/doc/design/backward.md b/doc/design/backward.md
new file mode 100644
index 0000000000..20fda7a98f
--- /dev/null
+++ b/doc/design/backward.md
@@ -0,0 +1,158 @@
+# Backward Building
+
+## Motivation
+
+In neural networks, most models are currently solved by the backpropagation algorithm (known as **BP**). Technically, BP calculates the gradient of the loss function, then propagates it back through the network following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So the framework needs a mechanism that can complete the model's backward part automatically according to the given forward part.
+
+When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes the gradients of its corresponding `op`'s outputs and calculates the gradients of the `op`'s inputs. While building a model's backward part, the framework creates each forward `op`'s `grad_op` and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
+
+## Challenges
+
+The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested, which means that the `op`s and `variable`s are scattered across different blocks rather than gathered in a single graph. Our backward building algorithm therefore has to visit blocks recursively and be able to insert `grad_op`s and newly created `variable`s in the right places.
+
+## Usage
+
+Although the whole algorithm comprises many functions, only one is exposed as an API:
+
+```python
+def append_backward(loss, parameter_list=None, no_grad_set=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by the cost function.
+        parameter_list(list): Parameters that need to be updated by optimizers.
+            If None, it means all parameters need to be updated.
+
+        no_grad_set(set): Variables that have no gradients in Block 0.
+            If None, the set will be generated inside the function and
+            contains all variables with `stop_gradient=True` from all blocks.
+
+    Return:
+        (list[Variable]): list of (parameters, gradients) pair.
+    """
+```
+
+By invoking this API, the framework appends the backward part to the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating; if it is `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradients; if all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
+
+This API will be invoked automatically before optimizer building.
+As a result, in most cases, users do not need to invoke the API by themselves to append the backward part.
+
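+For reference, a minimal sketch of invoking it by hand might look as follows. This assumes the `paddle.v2.fluid` package layout used elsewhere in this repository and that `append_backward` lives in its `backward` module; the layer names are only illustrative:
+
+```python
+import paddle.v2.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+loss = fluid.layers.mean(x=cost)
+
+# appends grad ops and grad variables to the program holding `loss`,
+# and returns a list of (parameter, gradient) pairs
+params_grads = fluid.backward.append_backward(loss)
+```
+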
+## Implementation
+
+The implementation of the backward building algorithm is in the `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
+
+### Creating `grad_op`s
+
+The creation of `grad_op`s is implemented by:
+
+```python
+def _append_backward_ops_(target,
+                          block,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        target(Variable): the target variable of forward pass
+        block(Block): the block where forward ops are
+        target_block(Block): the block which is going to hold new generated grad ops
+        no_grad_dict(dict):
+            key(int): block index
+            val(set): a set of variable names. These variables have no gradients
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+    """
+```
+
+Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, then appends it to `target_block`.
+
+However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, `grad_op` creation has to be recursive.
+
+During the reverse traversal, we check whether each `op` has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose parent is the one in the `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to the parameter `target_block` and the one in the `op`'s attribute to `block`. The *pseudo-code* shows this process:
+
+```
+******* pseudo-code ********
+for op in reversed(block.ops):
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Create a new block(`grad_s_block`), whose father is `s_block`.
+        Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
+
+    Invoke `core.get_grad_op_desc()` to get op's grad_op.
+    Insert the name correspondences between the grad_op's variables and their gradients into grad_to_var.
+    Assign grad_s_block to grad_op as its 'sub_block' attribute.
+    Append grad_op to current target_block.
+```
+
+The first invocation of `_append_backward_ops_()` is initiated by `append_backward()`, in which the parameters `block` and `target_block` are both assigned the root block (the block with index 0).
+
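+The recursion can be pictured with a few toy Python structures. These are hypothetical stand-ins for the real `Block` and `Op` classes, not the framework code:
+
+```python
+class Op(object):
+    def __init__(self, op_type, sub_block=None):
+        self.op_type = op_type
+        self.sub_block = sub_block  # set for ops like while_op / if_else_op
+
+
+class Block(object):
+    def __init__(self, ops=None, parent=None):
+        self.ops = ops or []
+        self.parent = parent
+
+
+def append_backward_ops(block, target_block):
+    # visit forward ops in reverse order, recursing into sub-blocks first
+    for op in reversed(block.ops):
+        grad_sub_block = None
+        if op.sub_block is not None:
+            grad_sub_block = Block(parent=op.sub_block)
+            append_backward_ops(op.sub_block, grad_sub_block)
+        target_block.ops.append(Op(op.op_type + '_grad', grad_sub_block))
+
+
+forward = Block(ops=[Op('mul'), Op('while', Block(ops=[Op('add')])), Op('mean')])
+backward = Block()
+append_backward_ops(forward, backward)
+print([op.op_type for op in backward.ops])  # ['mean_grad', 'while_grad', 'mul_grad']
+```
+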
+### Corner Cases of `grad_op` Creating
+
+In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases the conventional algorithm is not enough to get the correct result, and additional handling is required. These additional processes run after the algorithm mentioned above and make special adjustments to its output `grad_op`s.
+
+#### Shared Variables
+
+If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the backward pass. To make the gradient result the sum of all the `grad_op`s' outputs instead of only the last one written, we assign each output a temporary variable and then add a `sum_op` to add them up.
+
+For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`, and so on.
+
+See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
+
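+The renaming idea can be sketched with toy op records. This is illustrative only; the real `_addup_repetitive_outputs_` manipulates op descriptions and inserts each `sum_op` right after the last writer of the gradient:
+
+```python
+def addup_repetitive_outputs(grad_ops):
+    # grad_ops: toy records such as {'type': 'mul_grad', 'outputs': ['w@GRAD']}
+    writers = {}
+    for op in grad_ops:
+        for i, name in enumerate(op['outputs']):
+            writers.setdefault(name, []).append((op, i))
+    sum_ops = []
+    for name, sites in writers.items():
+        if len(sites) <= 1:
+            continue  # written only once, no accumulation needed
+        renamed = []
+        for k, (op, i) in enumerate(sites):
+            tmp = '%s@RENAME@%d' % (name, k)
+            op['outputs'][i] = tmp
+            renamed.append(tmp)
+        sum_ops.append({'type': 'sum', 'inputs': renamed, 'outputs': [name]})
+    return grad_ops + sum_ops
+
+
+ops = [{'type': 'mul_grad', 'outputs': ['w@GRAD']},
+       {'type': 'fc_grad', 'outputs': ['w@GRAD']}]
+print(addup_repetitive_outputs(ops)[-1])
+# {'type': 'sum', 'inputs': ['w@GRAD@RENAME@0', 'w@GRAD@RENAME@1'], 'outputs': ['w@GRAD']}
+```
+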
+#### No Gradient Variables
+
+In our framework, variables can be marked as *no_gradient*, which means that the gradient of the variable is unnecessary and can be considered zero in model training. Obviously, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in the backward pass.
+
+Another situation is when all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered zeros. Since `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all the gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
+
+It should be noted that all these zero gradients still need to be created and initialized by something; otherwise, the following `grad_op`s that take these gradients as inputs run the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them to all zeros.
+
+These features are implemented in the function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one by one, removes the ones that can be skipped, and inserts `fill_zeros_like_op` where necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict`, or generate it on the fly by scanning all variables' `no_gradient` attribute (True or False).
+
+### Creating Backward Variables
+
+Up to now, we have completed all the creation and adjustment of `grad_op`s. However, the backward variables have not been created yet; so far they are only represented by the `grad_op`s' input and output arguments. The backward variable creation job will be done by:
+
+```python
+def _append_backward_vars_(block, 
+                           start_op_idx, 
+                           grad_to_var, 
+                           grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
+    """
+```
+
+Given a `block`, this function traverses all the `grad_op`s in it (the argument `start_op_idx` indicates where the `grad_op` sequence starts) and creates all the not-yet-created outputs. The *pseudo-code* shows this process:
+
+```
+for op in block.ops[start_op_idx : ]:
+
+    if op has an attribute named 'sub_block':
+        Get the sub-block(`s_block`) from op's attribute.
+        Invoke _append_backward_vars_(), with `block=s_block`
+
+    for var_name in op.all_output_names():
+        if block.has_var_recursive(var_name) or var_name is the name of empty variable:
+            continue
+        create a new variable named 'var_name' in block
+        if grad_to_var.has_key(var_name):
+            set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
+
+    do op's var type inference
+    do op's shape inference
+```
diff --git a/doc/design/block.md b/doc/design/block.md
new file mode 100644
index 0000000000..907a2def55
--- /dev/null
+++ b/doc/design/block.md
@@ -0,0 +1,336 @@
+# Design Doc: Block and Scope
+
+## The Representation of Computation
+
+Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
+
+- Caffe, Torch, and Paddle: sequences of layers.
+- TensorFlow, Caffe2, MXNet: graphs of operators.
+- PaddlePaddle: nested blocks, like C++ and Java programs.
+
+## Block in Programming Languages and Deep Learning
+
+In programming languages, a block is a pair of curly braces that includes local variable definitions and a sequence of instructions or operators.
+
+Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
+
+| programming languages | PaddlePaddle          |
+|-----------------------|-----------------------|
+| for, while loop       | RNN, WhileOp          |
+| if, if-else, switch   | IfElseOp, SwitchOp    |
+| sequential execution  | a sequence of layers  |
+
+A key difference is that a C++ program describes a one-pass computation, whereas a deep learning program describes both the forward and backward passes.
+
+## Stack Frames and the Scope Hierarchy
+
+The existence of the backward pass makes the execution of a block of PaddlePaddle different from traditional programs:
+
+| programming languages | PaddlePaddle                    |
+|-----------------------|---------------------------------|
+| stack                 | scope hierarchy                 |
+| stack frame           | scope                           |
+| push at entering block| push at entering block          |
+| pop at leaving block  | destroy when minibatch completes|
+
+1. In traditional programs:
+
+   - When the execution enters the left curly brace of a block, the runtime pushes a frame into the stack, where it realizes local variables.
+   - After the execution leaves the right curly brace, the runtime pops the frame.
+   - The maximum number of frames in the stack is the maximum depth of nested blocks.
+
+1. In PaddlePaddle
+
+   - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - The height of the highest tree is the maximum depth of nested blocks.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
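+
+The following toy sketch (illustrative Python, not the actual C++ `Scope` API) shows this behavior: a scope is pushed when entering a block, and lookups walk up the hierarchy, but nothing is popped until the minibatch completes.
+
+```python
+class Scope(object):
+    def __init__(self, parent=None):
+        self.parent = parent
+        self.vars = {}
+
+    def new_scope(self):
+        # "push" at entering a block; nothing is popped when the block exits
+        return Scope(parent=self)
+
+    def find_var(self, name):
+        # lookup walks up the scope hierarchy
+        if name in self.vars:
+            return self.vars[name]
+        return self.parent.find_var(name) if self.parent else None
+
+global_scope = Scope()
+step_scope = global_scope.new_scope()  # entering, e.g., an RNN step block
+# the forward pass fills step_scope; the backward pass reads it later;
+# the whole hierarchy is destroyed only after the minibatch completes
+```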
+
+## Use Blocks in C++ and PaddlePaddle Programs
+
+Let us consolidate the discussion by presenting some examples.
+
+### Blocks with `if-else` and `IfElseOp`
+
+The following C++ program shows how blocks are used with the `if-else` structure:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int z = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
+```
+
+An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
+
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
+
+
+### Blocks with `for` and `RNNOp`
+
+The following RNN model in PaddlePaddle is from the [RNN design doc](./rnn.md):
+
+```python
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  h_prev = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, h_prev)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
+o1, o2 = rnn()
+```
+Its equivalent C++ program is as follows:
+
+```c++
+float x[] = {10, 20, 30};
+float m = 0;
+float W = 0.314;
+float U = 0.375;
+
+float mem[sizeof(x) / sizeof(x[0]) + 1];
+float o1[sizeof(x) / sizeof(x[0]) + 1];
+float o2[sizeof(x) / sizeof(x[0]) + 1];
+for (int i = 1; i <= sizeof(x) / sizeof(x[0]); ++i) {
+  float xi = x[i - 1];
+  if (i == 1) mem[0] = m;
+  float a = W * xi;
+  float b = U * mem[i - 1];
+  float s = a + b;
+  float act = sigmoid(s);
+  mem[i] = act;
+  o1[i] = a;
+  o2[i] = b;
+}
+```
+
+## Compilation and Execution
+
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
+
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
+
+## The "Binary Executable File Format"
+
+The definition of the protobuf message is as follows:
+
+```protobuf
+message BlockDesc {
+  repeated VarDesc vars = 1;
+  repeated OpDesc ops = 2;
+}
+```
+
+The step net in the above RNN example would look like:
+
+```
+BlockDesc {
+  vars = {
+    VarDesc {...} // x
+    VarDesc {...} // h
+    VarDesc {...} // a
+    VarDesc {...} // b
+    VarDesc {...} // s
+    VarDesc {...} // act
+  }
+  ops = {
+    OpDesc {...} // matmul
+    OpDesc {...} // add_two
+    OpDesc {...} // sigmoid
+  }
+};
+```
+
+Also, the RNN operator in the above example is serialized into a protobuf message of type `OpDesc` and would look like:
+
+```
+OpDesc {
+  inputs = {0} // the index of x in vars of BlockDesc above
+  outputs = {5, 3} // indices of act and b in vars of BlockDesc above
+  attrs {
+    "states" : {1} // the index of h
+    "step_net" : <above step net>
+  }
+};
+```
+
+This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing the global block.
+
+
+## The Compilation of Blocks
+
+During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
+
+Each VarDesc in a block should have its own name scope so that local variables do not affect the parent block's name scope.
+A child block's name scope should inherit the parent's, so that an OpDesc in a child block can reference a VarDesc stored in the parent block. For example:
+
+```python
+a = pd.Variable(shape=[20, 20])
+b = pd.fc(a, params=["fc.w", "fc.b"])
+
+rnn = pd.create_rnn()
+with rnn.stepnet():
+    x = a.as_step_input()
+    # reuse fc's parameter
+    fc_without_b = pd.get_variable("fc.w")
+    rnn.output(fc_without_b)
+
+out = rnn()
+```
+The method `pd.get_variable` helps retrieve a Variable by name. The Variable may be stored in a parent block but retrieved in a child block, so a block should have a variable scope that supports inheritance.
+
+In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
+
+To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
+
+`SymbolTable` can do the following:
+
+- store the definitions (some names and attributes) of variables and operators,
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+
+
+```c++
+// Information in SymbolTable is enough to trace the dependency graph. So maybe
+// it is enough for the Eval() interface to take a SymbolTable.
+class SymbolTable {
+ public:
+  SymbolTable(SymbolTable* parent) : parent_(parent) {}
+
+  OpDesc* NewOp(const string& name="");
+
+  // TODO determine whether name is generated by python or C++.
+  // Currently assume that a unique name will be generated by C++ if the
+  // argument name is left default.
+  VarDesc* Var(const string& name="");
+
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
+  // recursively.
+  // this interface is introduced to support InferShape, find protobuf messages
+  // of variables and operators, pass pointers into InferShape.
+  //
+  // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
+  VarDesc* FindVar(const string& name, bool recursive=true);
+
+  OpDesc* FindOp(const string& name);
+
+  BlockDesc Compile() const;
+
+ private:
+  SymbolTable* parent_;
+
+  map<string, OpDesc> ops_;
+  map<string, VarDesc> vars_;
+};
+```
+
+After all the descriptions of variables and operators have been added into the SymbolTable,
+the block has enough information to run.
+
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
+
+
+```c++
+class Block : public OperatorBase {
+ public:
+  explicit Block(const BlockDesc& desc) : desc_(desc) {}
+
+  void InferShape(const framework::Scope& scope) const override {
+    if (!symbols_ready_) {
+      CreateVariables(scope);
+      CreateOperators();
+      symbols_ready_ = true;
+    }
+    // should run InferShape first.
+    for (auto& op : runtime_table_.ops()) {
+      op->InferShape(scope);
+    }
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    PADDLE_ENFORCE(symbols_ready_, "operators and variables should be created first.");
+    for (auto& op : runtime_table_.ops()) {
+      op->Run(scope, place);
+    }
+  }
+
+  void CreateVariables(const framework::Scope& scope);
+  void CreateOperators();
+
+  // some other necessary interfaces of NetOp are listed below
+  // ...
+
+ private:
+  BlockDesc desc_;
+  // mutable so that the const InferShape can lazily create the variables
+  // and operators; the runtime_table_ member is omitted here for brevity.
+  mutable bool symbols_ready_{false};
+};
+```
+
+## The Execution of Blocks
+
+Block inherits from OperatorBase, which has a `Run` method.
+Block's `Run` method runs its operators sequentially.
+
+There is another important interface called `Eval`, which takes some arguments called targets, generates a minimal graph that treats the targets as end points, and creates a new Block from it. After `Run`, `Eval` fetches the latest values of the targets and returns them.
+
+The definition of Eval is as follows:
+
+```c++
+// Prune a block description by targets using the corresponding dependency graph,
+// returning a new BlockDesc with a minimal number of operators.
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
+// to a cluster.
+BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
+
+void Block::Eval(const vector<string>& targets,
+                 const framework::Scope& scope,
+                 const platform::DeviceContext& dev_ctx) {
+  BlockDesc min_desc = Prune(desc_, targets);
+  Block min_block(min_desc);
+  min_block.Run(scope, dev_ctx);
+}
+```
diff --git a/doc/design/build_system/README.md b/doc/design/build_system/README.md
index 310739f37a..bf0e4dddc1 100644
--- a/doc/design/build_system/README.md
+++ b/doc/design/build_system/README.md
@@ -105,3 +105,48 @@ shared_library(api
 ### Implementation
 
 As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph.  It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
+
+### Using Package Manager For Go
+
+Building Go binaries and libraries requires satisfying their dependencies. Generally
+we can run `go get ./...` to download and compile all external dependencies. The
+problems are:
+
+1. `go get` will always fetch the latest code from the default branch of the
+    remote repo, so changes in dependencies might break the build. This is very
+    different from what we already have in `cmake/external`, which downloads a
+    specific version or commit id of each dependency.
+1. Some locations can not access external dependencies through the internet, as mentioned
+   in https://github.com/PaddlePaddle/Paddle/issues/2605. Package management
+   tools can bundle the dependencies as a "vendor" package, which can be mirrored
+   on many cloud file hosting services, so users who want to compile Paddle by themselves can
+   download this "vendor" package from a mirror site.
+
+#### Choose A Suitable Tool
+
+As mentioned by @wangkuiyi, [this page](https://github.com/golang/go/wiki/PackageManagementTools)
+lists dozens of Go package managers. We choose a tool using the following principles:
+
+- Prefer the most "active" projects: more stars, more pull requests or commits
+- Prefer widely used projects
+
+After comparing all these projects, we narrowed the choice down to the two most popular
+tools: Godep and Glide.
+
+Here's a brief comparison between Godep and
+Glide: https://github.com/Masterminds/glide/wiki/Go-Package-Manager-Comparison. There are
+also many complaints about using `Godep`. A new "official" package
+management tool has been started at https://github.com/golang/dep to resolve
+such problems, but it is currently at the alpha stage. So the best choice now is
+obviously Glide.
+
+#### Manage Go Packages
+
+- Dependencies: `go/glide.yaml` stores the dependencies and their versions that
+  are directly imported by Paddle. `go/glide.lock` stores all dependencies recursively,
+  with their commit ids. Builds will "lock" to these packages unless we run `glide up`
+  to update them.
+- Vendor package: the `go/vendor` directory will be generated when running the `cmake` command. `cmake`
+  will download the code corresponding to `go/glide.lock`. If we put a vendor folder
+  under `go/`, cmake will just check the commit ids of the packages under that folder;
+  if the commit ids match, there will be no download at all.
diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png
new file mode 100644
index 0000000000..232762b82a
Binary files /dev/null and b/doc/design/ci_build_whl.png differ
diff --git a/doc/design/cluster_train/README.md b/doc/design/cluster_train/README.md
index 74961f8005..177a5f5d54 100644
--- a/doc/design/cluster_train/README.md
+++ b/doc/design/cluster_train/README.md
@@ -54,17 +54,18 @@ The life cycle of a single task is illustrated below:
 <img src="src/paddle-task-states.png"/>
 
 1. When a new pass of training starts, all tasks will be placed in the todo queue.
-1. The master server will dispatch few tasks to each trainer at a time, puts them in the pending queue and waits for completion.
-1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
-1. If a task timeout. the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. Upon a trainer's request for a new task, the master server will dispatch a task from the todo queue to it, put the task in the pending queue, and wait for completion.
+1. The trainer will work on its task, tell the master server once the task is completed, and ask for a new task. The master server will then dispatch a new task to that trainer.
+1. If a task fails for any reason on the trainer, or takes longer than a specific period of time, the master server will move the task back to the todo queue and increase the task's timeout count by one. If the timeout count rises above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
 1. The master server will move completed task to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to todo queue and reset the timeout counter of all tasks to zero.
 
 ### Trainer Process
 
 The trainer process will:
 
-- Receive tasks from the master.
-- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+- Request tasks from the master.
+- Work on the tasks.
+- Upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
 
 ### Parameter Server Process
 
@@ -119,8 +120,8 @@ When the master is started by the Kubernetes, it executes the following steps at
 
 1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
 1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
-1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
-1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
+1. Writes its IP address to */master/addr* so that trainers can discover it.
+1. Listens for trainers' task requests, dispatches a task upon each request, and updates the task queue using an etcd transaction to ensure the lock is held during the update.
 
 When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in few minutes.
 
@@ -128,13 +129,11 @@ When the master server process is dead for any reason, Kubernetes will restart i
 
 When the trainer is started by the Kubernetes, it executes the following steps at startup:
 
-1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
-1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
-1. Waits for tasks from the master to start training.
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count stored in */ps_desired*.
+1. Finds and watches */master/addr* to get the master's address.
+1. Requests tasks from the master to start training.
 
-If trainer's etcd lease expires, it will try set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
-
-When a trainer fails, Kuberentes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
+When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from the master and go on training.
 
 ### Parameter Server Process
 
diff --git a/doc/design/cluster_train/large_model_dist_train.md b/doc/design/cluster_train/large_model_dist_train.md
new file mode 100644
index 0000000000..0c4b5bc24c
--- /dev/null
+++ b/doc/design/cluster_train/large_model_dist_train.md
@@ -0,0 +1,101 @@
+# Analysis of large model distributed training in Paddle
+
+***NOTE: This is only a note on how we implemented this scheme in V1, not a new design.***
+
+## What is it
+
+We often encounter cases where the embedding layer parameters (sparse) are so large that we cannot store them in the trainer's memory during training. So we need to place them on several servers and fetch them row by row instead of fetching all of the parameters.
+
+## How to use
+
+Specify command-line arguments like `--loadsave_parameters_in_pserver=true --ports_num_for_sparse=1 --use_old_updater=1` when starting the Paddle trainer, and also add something like `--ports_num_for_sparse=1 --pserver_num_threads=5` when starting the pserver processes.
+
+Accordingly, configure your embedding layers like:
+
+```python
+SPARSE_REMOTE=True
+
+w1 = data_layer(name="w1", size=dict_size)
+emb1 = embedding_layer(input=w1, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+w2 = data_layer(name="w2", size=dict_size)
+emb2 = embedding_layer(input=w2, size=32, param_attr=ParameterAttribute(sparse_update=SPARSE_REMOTE))
+...
+```
+
+## Implementation details
+
+```c++
+enum MatType {
+  MAT_NORMAL,
+  MAT_NORMAL_SHARED,
+  MAT_VALUE_SHARED,
+  MAT_SPARSE_ROW_IDS,
+  MAT_SPARSE_ROW_AUTO_GROW,
+  MAT_CACHE_ROW,
+  MAT_SPARSE_ROW,
+  MAT_SPARSE_ROW_PREFETCH,
+  MAT_SPARSE_ROW_PREFETCH_FULL_SIZE,
+};
+```
+
+`MAT_SPARSE_ROW_PREFETCH` is what we use when configured to fetch only the needed rows of the matrix during training.
+
+In `trainer_internal.cpp:L93 trainOneBatch`:
+
+```c++
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    REGISTER_TIMER("prefetch");
+    gradientMachine_->prefetch(inArgs);
+    parameterUpdater_->getParametersRemote();
+  }
+```
+
+During the actual network forward and backward passes, at the beginning of each batch, the trainer will try to download the rows of data it needs from the pserver.
+
+In `trainer/RemoteParameterUpdater.cpp`: `parameterUpdater_->getParametersRemote();`:
+
+```c++
+if (fullSize) {
+  ...
+} else {
+  getParams = [&] {
+    parameterClient_->getParameterSparse(
+        /* recvParameterType= */ PARAMETER_VALUE, sendBackParameterType);
+  };
+  applyL1 = [](Parameter& para, real decayRate) {
+    para.getMat(PARAMETER_VALUE)->applyL1(/*lr=*/1.0f, decayRate);
+  };
+}
+```
+
+Calling `parameterClient_->getParameterSparse` will do remote call to pserver's `getParameterSparse`:
+
+```c++
+void ParameterServer2::getParameterSparse(const SendParameterRequest& request,
+                                          std::vector<Buffer>& inputBuffers,
+                                          SendParameterResponse* response,
+                                          std::vector<Buffer>* outputBuffers) {
+  (void)inputBuffers;
+  auto& buffer = *readWriteBuffer_;
+  size_t numReals = 0;
+  for (const auto& block : request.blocks()) {
+    numReals += getParameterConfig(block).dims(1);
+  }
+  buffer.resize(numReals);
+
+  VLOG(3) << "pserver: getParameterSparse, numReals=" << numReals;
+
+  ReadLockGuard guard(parameterMutex_);
+  size_t offset = 0;
+  for (const auto& block : request.blocks()) {
+    size_t width = getParameterConfig(block).dims(1);
+    Buffer buf = {buffer.data() + offset, width};
+    int type = request.send_back_parameter_type();
+    sendBackParameterSparse(block, type, response, &buf, width, outputBuffers);
+    offset += width;
+  }
+}
+```
+
+`getParameterConfig(block).dims(1)` returns the width of the current "parameter block" (a shard of the parameter object),
+and the `getParameterSparse` remote call then returns only the requested rows of data to the client.
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
index b3e4079010..474b8c572c 100644
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -74,14 +74,25 @@ typedef enum {
 typedef struct {
   char*               name;
   paddle_element_type element_type;
-  void*               content;
+  unsigned char*      content;
   int                 content_len;
 } paddle_parameter, paddle_gradient;
 
-typedef struct paddle_pserver_client paddle_pserver_client;
+typedef int paddle_pserver_client;
 
-paddle_pserver_client* paddle_new_pserver_client();
-void paddle_pserver_client_release(paddle_pserver_client* client);
+/**
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
 
 /**
  * @brief paddle_begin_init_params begins to initialize parameters on
@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
  * @return 1 if the trainer is selected to initialize parameter
  * servers, otherwise 0.
  */
-int paddle_begin_init_params(paddle_pserver_client* client);
+int paddle_begin_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_init_param initializes the parameter on parameter
@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 
 /**
  * @brief paddle_finish_init_params tells parameter servers client has
@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_finish_init_params(paddle_pserver_client* client);
+int paddle_finish_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_send_grads sends gradients to parameter servers for
@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
  * @param learning_rate the learning rate for the gradients.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
 
 /**
  * @brief paddle_get_params gets parameters from parameter servers.
@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
  * paddle_get_params will block until parameters are initialized on
  * the parameter servers.
  *
- * @param names the array of names of the parameters to get.
- * @param dst the destination array of parameters to save to.
+ * @param dst the destination array of parameter pointers to save to.
+ * The parameter pointer must be pre-populated with the required parameter name,
+ * and the content of the parameter must be pre-allocated to the size of the required
+ * parameter on the pserver.
  * @param len the length of the names array and the paddle_parameter
  * array.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 
 /**
  * @brief paddle_save_model indicates parameters to save the parameter
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
  * @param path the path to save parameters.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md
new file mode 100644
index 0000000000..6e8e593845
--- /dev/null
+++ b/doc/design/cluster_train/remote_parameter_updater.md
@@ -0,0 +1,21 @@
+# Design Doc: Remote Parameter Updater for Cluster Train
+
+For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we discuss the parameter updater that uses the parameter server cclient (see [The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters.
+
+## Parameter Updater
+
+The parameter updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we only discuss the remote parameter updater here.
+
+### Remote Parameter Updater
+
+The remote parameter updater manages parameters on remote parameter servers through the client that communicates with the pserver ([The Client Library of Parameter Server Design Doc](pserver_client.md)).
+
+In the PaddlePaddle Python V2 API, the trainer is implemented in Python, and it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG.
+
+#### Sparse Remote Parameter Updater
+
+Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in the next stage.
+
+### Interface Design
+
+TBD
diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
new file mode 100644
index 0000000000..b755185c81
--- /dev/null
+++ b/doc/design/cluster_train/save_model.md
@@ -0,0 +1,111 @@
+# Design Doc: Save Model
+
+## Overview
+
+The model is the output of the training process. There are two
+ways for the user to obtain a model:
+
+- Save model triggered by user code: user code asks PaddlePaddle to
+  save a model.
+- Convert model from the checkpoint: the model is converted from the
+  pservers' periodic checkpoints. In this way, the user can cancel a
+  job at any time and still have a relatively fresh model (we
+  checkpoint around every 5 minutes).
+
+### Trainer Saving Model vs. Pservers Saving Model
+
+Both trainers and pservers have access to the model. So the model can
+be saved from a trainer or pservers. We need to decide where the model
+is saved from.
+
+#### Dense Update vs. Sparse Update
+
+There are two types of model update methods: dense update and sparse
+update (when the model parameter is configured to be sparse).
+
+- Dense update
+
+  Every trainer has its own full copy of the model. Every model
+  update will update the entire model.
+
+- Sparse update
+
+  The training input is sparse, and the trainer does not have the
+  entire model. It will only download the sub-model related
+  to the input. When updating the model, only the sub-model related to
+  the training input is updated.
+
+
+#### Pservers Saving Model
+
+The benefit of letting the pservers save the model is that they have the entire
+model all the time. However, since pservers are on different nodes, a
+merging process is required to merge the model shards into a single
+model. This requires the pservers to write the model to a distributed
+filesystem, making the checkpoint shards visible to the merge program.
+
+#### Trainer Saving Model
+
+The benefit of letting one trainer save the model is that it does not
+require a distributed filesystem, and it reuses the same model-saving
+logic as local training - except that when doing sparse update, the
+trainer needs to download the entire model during the saving process.
+
+#### Conclusion
+
+Given that trainer-side saving does not require a distributed filesystem
+and is an intuitive extension of saving the model when training
+locally, we decide to let the trainer save the model when doing
+distributed training.
+
+
+### Convert Model from Checkpoint
+
+TODO
+
+
+## Timeline
+
+We first implement the trainer saving the model. Converting the latest
+snapshot to a model will be a TODO for the future.
+
+
+## Trainer Save Model
+
+### Trainer Election
+
+One trainer will be elected as the one to save the model. When using
+etcd, the trainer ID is a randomly generated UUID; the trainer will
+contact the master server requesting to save the model, and find out
+whether it is elected. When the master server is not used, unique
+trainer IDs will be assigned by the administrator, and the trainer whose ID
+is "0" is elected to save the model.
+
+### Model Save Path
+
+Each trainer will be given a directory to save the model in. The
+elected trainer will save the model to
+`given-directory/trainerID`. Since the trainer ID is unique, this
+prevents concurrent saves to the same file when multiple trainers
+are elected to save the model during a split-brain situation.
+
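+A hedged sketch of this election-and-save flow (the `master_client.request_save_model` call and the `save_fn` routine are hypothetical):
+
+```python
+import os
+import uuid
+
+def maybe_save_model(master_client, save_fn, save_dir, use_master=True, trainer_id="0"):
+    if use_master:
+        my_id = str(uuid.uuid4())  # randomly generated trainer ID
+        elected = master_client.request_save_model(my_id)  # hypothetical RPC
+    else:
+        my_id = trainer_id         # unique ID assigned by the administrator
+        elected = (my_id == "0")
+    if elected:
+        # a unique per-trainer sub-directory prevents concurrent saves to the
+        # same file when a split-brain situation elects multiple trainers
+        save_fn(os.path.join(save_dir, my_id))
+```
+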
+### What Happens When Model Is Saving
+
+It takes some time to save a model, so we need to define what will happen
+while the model is being saved.
+
+When doing dense update, the trainer uses its local model. Pservers
+do not need to pause model updates.
+
+When doing sparse update, the trainer needs to download the entire
+model while saving. To get the most accurate model, the model update
+needs to be paused before the download starts and resumed after the
+download finishes. Otherwise, the trainer gets a model that is
+"polluted": some part of the model is old, some part of the model is
+new.
+
+It is unclear whether the "polluted" model will be inferior, due to the
+stochastic nature of deep learning, and pausing the model update would
+add more complexity to the system. Since supporting sparse update is a
+TODO item, we defer the evaluation of whether to pause model updates
+while saving to the future.
diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle
index 56681ae5bb..f973dc9b9d 100644
Binary files a/doc/design/cluster_train/src/paddle-etcd.graffle and b/doc/design/cluster_train/src/paddle-etcd.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png
index 4f9c9762b3..57981ceb4b 100644
Binary files a/doc/design/cluster_train/src/paddle-etcd.png and b/doc/design/cluster_train/src/paddle-etcd.png differ
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
index 42384a3f05..43415ed8cf 100644
Binary files a/doc/design/cluster_train/src/trainer.graffle and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md
new file mode 100644
index 0000000000..afc65e831d
--- /dev/null
+++ b/doc/design/concurrent_programming.md
@@ -0,0 +1,163 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs.  This allows us to create a concurrent/parallel AI application.  An interesting question is: **how does a `ProgramDesc` represent a concurrent program?**
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program.  So users can write a concurrent program just as they would in any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go:
+
+| Go | Fluid |
+|----|-------|
+|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
+| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
+| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
+| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+## An Example Concurrent Program
+
+To review all the above concepts with an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function; the latter, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+  paddlepaddle()
+  fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, L, Y],
+    ops = [
+      read(output = X)
+      kube_get_workers_addrs(output = L)
+      Y = tensor_array(len(L))
+      parallel_for(input = X, output = Y, 
+                   attrs = {L, block_id(1)}) # referring to block 1
+    ]
+  }
+  
+  block[1] = Block {
+    parent = 0,
+    vars = [x, y, index],
+    ops = [
+      slice(input = [X, index], output = x) # index is initialized by parallel_for
+      send(input = x, attrs = L[index])
+      recv(outputs = y, attrs = L[index])
+      assign(input = y, output = Y[index])
+    ]
+  }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() {  //// block 0
+  X = fluid.read(...)
+  L = fluid.k8s.get_worker_addrs()
+  Y = fluid.tensor_array(len(L))
+  fluid.parallel_for(X, L, 
+                     func(index int) {  //// block 1
+                       x = X[index]
+                       fluid.send(L[index], x)
+                       y = fluid.recv(L[index])
+                       Y[index] = y
+                     })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to the Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IPs and ports of all pods of the current job except for the current one (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h).  `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed, 
+
+  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread  
+     1. creates an Executor instance, and
+     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+- Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0. A toy model of this `parallel_for` pattern is sketched below.
+
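+A toy model of the `parallel_for` pattern (plain Python threads standing in for the ThreadPool, and a dict standing in for a scope; not the actual Fluid implementation):
+
+```python
+import threading
+
+def parallel_for(n, run_block):
+    threads = []
+    for index in range(n):
+        scope = {"index": index}  # one scope per concurrent run of block 1
+        t = threading.Thread(target=run_block, args=(scope,))
+        threads.append(t)
+        t.start()
+    for t in threads:
+        t.join()  # wait until all concurrent runs finish
+
+# each run of the "block" reads its private index from its own scope
+parallel_for(4, lambda scope: print("running block 1 with index", scope["index"]))
+```
+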
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+  W = Tensor(...)
+  x = fluid.listen_and_do(
+        fluid.k8s.self_addr(),
+        func(input Tensor) {
+          output = fluid.mult(input, W)
+        })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+  2. once a connection is established,
+     1. creates a scope of two parameters, "input" and "output",
+     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summary
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+   2. calling the `fluid.run` function, which runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+1. The function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+1. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+1. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
+1. Intrinsics/operators' `Run` methods might create threads.  For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+1. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by the ThreadPool.  Multiple green threads might run on the same OS thread.  An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/design/csp.md b/doc/design/csp.md
new file mode 100644
index 0000000000..ba9cacfdea
--- /dev/null
+++ b/doc/design/csp.md
@@ -0,0 +1,96 @@
+# Design Doc: CSP in PaddlePaddle Fluid
+
+## Motivation
+
+Concurrent programming is important for deep learning.  A few example applications are:
+
+1.  The main thread keeps reading the next mini-batch while another thread uses the GPU for computing.
+2.  The main thread performs the computation while another thread uploads the local gradients from each trainer to the parameter server.
+
+Most DL systems, including TensorFlow, Caffe2, and MxNet, can asynchronously execute operators in a graph. However, Fluid doesn't have the concept of a graph at all, as the design goal of Fluid is that of a programming language.
+
+## Concurrent Programming Models
+
+There are many concurrent programming models, implemented in various forms:
+
+| concurrent programming model | implementation |
+|-----|-----|
+| mutex | types and functions in standard libraries |
+| semaphore | types and functions in standard libraries |
+| communicating sequential processes (CSP) | Go programming language |
+| actor model | Erlang programming language |
+| message passing | MPI |
+| bulk synchronous parallel (BSP) | Pregel distributed programming framework |
+
+Since Fluid was designed to be a programming language, we would like to implement CSP in Fluid.
+
+### CSP v.s. Actor Model
+
+A well-known implementation of the Actor Model is the Erlang programming language.  In the Actor Model, *processes* can send messages to and receive messages from other processes, given the process IDs.  We can find the three ingredients, processes with IDs, send, and recv, in MPI too.  Indeed, we can rewrite Erlang programs in Python + MPI with possibly fewer lines of code.  Our concern with the Actor Model is that it doesn't seem reasonable to implement process management in a programming language's runtime library; instead, it should be the operating system's responsibility to manage processes, with libraries like MPI handling send/recv.
+
+## CSP in Fluid
+
+Fluid has two fundamental control-flows: *if-else* and *while*.  If we are to implement CSP, we need the following:
+
+1. a new data type: *channel* and operators *send* and *recv*,
+1. *goroutine* or thread, and
+1. a new control-flow: select.
+
+We also need Python wrappers for the above components.
+
+The type *channel* is conceptually a blocking queue.  In Go, it is implemented as a [blocking circular queue](https://github.com/golang/go/blob/68ce117cf17b8debf5754bfd476345779b5b6616/src/runtime/chan.go#L31-L50), which supports send and recv.
+
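+For intuition, a buffered channel behaves like a bounded blocking queue, which we can model in Python with the standard library (this is illustration only, not the proposed Fluid API):
+
+```python
+import queue
+import threading
+
+ch = queue.Queue(maxsize=100)  # like make(chan int, 100)
+
+def producer():
+    for i in range(10):
+        ch.put(i)    # blocks when the buffer is full, like a send
+
+threading.Thread(target=producer).start()
+for _ in range(10):
+    print(ch.get())  # blocks when the buffer is empty, like a recv
+```
+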
+The `select` operation has been in OS kernels long before the Go language.  All Unix kernels implement the system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call, *kqueue*.  Go's Linux implementation uses epoll.
+
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+
+### Type Channel
+
+Fluid supports many data types:
+
+1. Tensor,
+1. Row-sparse Tensor,
+1. LoD Tensor,
+1. Tensor array, etc.
+
+Each data type is registered in the [`framework.proto`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L117-L127) as an enum value.  To add a new type channel, we need to add a new type enum.
+
+To expose a C++ type to Python, we need to edit the [`pybind.cc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc) file.  [Here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/pybind/pybind.cc#L120-L164) is an example of how we expose the C++ class LoDTensor.
+
+## Syntax Design
+
+### Create Channel
+
+In Go, we create a channel by specifying the element type and buffer size:
+
+```go
+ch  := make(chan int)       // a channel without buffer
+ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch  = fluid.make_chan(dtype=INT)                # a channel without buffer
+ch1 = fluid.make_chan(dtype=INT, capacity=100)  # a channel that can buffer 100 ints.
+```
+
+In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
+
+```python
+ch = fluid.make_chan(dtype=Tensor, etype=float16)
+```
+
+or Tensors of Tensors of float16 etc.
+
+The point here is that we need a consistent way to compose types, like in C++ we can have `Tensor<Tensor<...<float16>...> >`.
+
+### Send and Recv
+
+### Select
+
+## Example Programs
+
+### 1. RPC between Trainers and Parameter Servers
+
+### 2. Concurrent Minibatch Loading
diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png
new file mode 100644
index 0000000000..15e8e290a1
Binary files /dev/null and b/doc/design/dcgan.png differ
diff --git a/doc/design/dist_refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md
new file mode 100644
index 0000000000..9368c5780d
--- /dev/null
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -0,0 +1,197 @@
+# Design Doc: Distributed Training Architecture
+
+## Abstract
+
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:
+
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
+
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
+
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
+
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
+
+## Analysis
+
+The assumption is that the user writes the trainer program in either Python or C++.
+
+### Limitation 1
+
+There are two basic functionalities in the trainer program:
+
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
+  optimizer.
+
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
+
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
+
+### Limitation 2
+
+Model parallelism means that a single model is partitioned into different components, and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.
+
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
+
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
+
+<img src="src/compiler.png"/>
+
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
+
+<img src="src/paddle-compile.png"/>
+
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.
+
+### Limitation 3
+
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked into the parameter server, and the user can not specify it explicitly.
+
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side's.
+For a detailed explanation, refer to this document:
+[Design Doc: Parameter Server](./parameter_server.md)
+
+## Distributed Training Architecture
+
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
+
+<img src="src/distributed_architecture.png"/>
+
+The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
+
+### Python API
+
+The Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.
+
+```Python
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+    for data in train_reader():
+        loss, acc = exe.run(trainer_prog,
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost])
+```
+
+The code above is a typical local training program; the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify the desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed:
+
+1. The user only needs to change `Executor` to `RemoteExecutor` to turn a local program into a distributed one.
+1. `RemoteExecutor` calls the `Distributed Transpiler` to "transpile" the user's program into several IRs representing a
+   distributed training program:
+   1. Parse configurations from `RemoteExecutor`.
+   1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism, or Streaming.
+   1. Partition the `ProgramDesc` according to the type and add `send` / `recv` OP pairs on the boundaries. Taking
+      DataParallelism as an example, the transpiler removes the optimization operators, adds a `send` OP to the
+      "trainer" role, and then adds the optimization operators to the parameter server role behind the `recv` OP.
+1. Dispatch the partitioned IRs to different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
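+One could imagine driving these steps as follows (a hypothetical API sketch; the class name and method signatures are illustrative, not final):
+
+```python
+# Hypothetical usage of the distributed transpiler described above.
+t = fluid.DistributeTranspiler()
+t.transpile(trainer_id=0,
+            program=fluid.default_main_program(),
+            pservers="192.168.0.1:6174,192.168.0.2:6174",
+            trainers=2)
+trainer_prog = t.get_trainer_program()                     # runs on trainer nodes
+pserver_prog = t.get_pserver_program("192.168.0.1:6174")   # runs on one pserver
+```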
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the parameter `fetch_list` to interactively fetch variables back to the local process for
+log printing.
+
+The Python `RemoteExecutor` is derived from `Executor` class.
+
+```python
+exe = RemoteExecutor(
+    feed=feeder.feed(data),
+    fetch_list=[avg_cost],
+    job_desc=JobDesc(
+      jobname,
+      num_trainer,
+      num_pserver,
+      cpu_per_trainer,
+      gpu_per_trainer,
+      mem_per_trainer,
+      cpu_per_pserver,
+      mem_per_pserver
+    ))
+for data in train_reader():
+    loss, acc = exe.run(trainer_prog,
+                        feed=feeder.feed(data),
+                        fetch_list=[avg_cost])
+```
+
+The `JobDesc` object describes the distributed job's resource specification to run in the
+cluster environment.
+
+<img src="src/remote_executor.png" width="500" align="center" />
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs to run the different roles of the `ProgramDesc` from the `ConfigMap`.
+
+
+### Placement Algorithm
+
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
+
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
+
+
+### Local Training Architecture
+
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
+
+<img src="src/local_architecture.png"/>
+
+
+### Training Data
+
+In PaddlePaddle v0.10.0, training data is typically read
+with [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training distributedly, since the Python
+process no longer runs on the same node as the trainer processes:
+the Python reader will need to read from the distributed filesystem
+(assuming it has access) and send the data to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. Users are encouraged to use
+the data-reading OPs.
+
+
+## References:
+
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
+
+[2] [TensorFlow: A System for Large-Scale Machine Learning](https://www.usenix.org/system/files/conference/osdi16/osdi16-abadi.pdf)
diff --git a/doc/design/dist_refactor/multi_cpu.md b/doc/design/dist_refactor/multi_cpu.md
new file mode 100644
index 0000000000..a8d8ee0422
--- /dev/null
+++ b/doc/design/dist_refactor/multi_cpu.md
@@ -0,0 +1,43 @@
+# Design Doc: Execute the Program with Multi CPU
+
+## Abstract
+
+This design doc proposes an approach to run the user-defined Op graph
+on multiple CPUs: we will use an auto transpiler to convert the user-defined
+Op graph to a multi-CPU Op graph, and use the `ParallelDo` Op to run the graph.
+
+## Transpiler
+
+<img src="src/multi-threads/single-thread@3x.png" width="300">
+
+After conversion:
+
+<img src="src/multi-threads/multi-threads@3x.png" width="1000">
+
+## Implementation
+
+- `Multi-CPU Transpiler` will convert the graph to a multi-CPU graph
+  which would be executed with multi-threads.
+- `BlockingCounter` will `Init/Decrement` an atomic counter, and its blocking `Wait`
+  waits for the atomic counter to become `0`:
+  ```cpp
+  BlockingCounter bc(thread_count);
+  for (int i = 0; i < thread_count; ++i) {
+    thread_pool->Start([&bc] { bc.DecrementCount(); });
+  }
+  bc.Wait();
+  ```
+- `ParallelDo` Operator
+  - Initialize a thread pool, which is a singleton.
+  - Take a block id as input, and run the specified Block in an independent
+    scope on each thread.
+  - Initialize a `BlockingCounter` instance and wait until all threads are done.
+- `Split` Operator will split the input Tensor into a TensorArray.
+- `Merge` merges all the gradients calculated in different threads
+  with a `mean/sum/max/min...` method, and then runs the Optimizer Op to optimize `W`
+  (see the sketch after this list).
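+
+A framework-free sketch of the split/parallel/merge idea, using plain Python
+threads and scalar gradients purely for illustration:
+
+```python
+import threading
+
+def parallel_do(batch, num_threads, grad_fn):
+    # Split: partition the input batch into one shard per thread.
+    shards = [batch[i::num_threads] for i in range(num_threads)]
+    grads = [None] * num_threads
+
+    def worker(i):
+        # Run the block on an independent shard; grad_fn is assumed to
+        # return a scalar gradient for simplicity.
+        grads[i] = grad_fn(shards[i])
+
+    threads = [threading.Thread(target=worker, args=(i,))
+               for i in range(num_threads)]
+    for t in threads: t.start()
+    for t in threads: t.join()   # plays the role of BlockingCounter::Wait
+
+    # Merge: combine per-thread gradients, here with the mean.
+    return sum(grads) / float(num_threads)
+
+print(parallel_do(range(8), 4, lambda shard: float(sum(shard))))
+```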
+
+## TODO
+
+- Improve the optimizer stage with multi-threading, since we could
+  assign the parameters to different threads and execute the
+  optimizer with multiple threads.
diff --git a/doc/design/dist_refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md
new file mode 100644
index 0000000000..805dd13048
--- /dev/null
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -0,0 +1,96 @@
+# Design Doc: Parameter Server
+
+## Abstract
+
+We propose an approach to implement the parameter server. In this
+approach, there is no fundamental difference between the trainer and
+the parameter server: they both run subgraphs, but subgraphs of
+different purposes.
+
+## Background
+
+The previous implementations of the parameter server do not run a
+fluid sub-program. Parameter initialization, optimizer computation, network
+communication and checkpointing are implemented twice, once on the
+trainer and once on the parameter server.
+
+It would be great if we could write code once and use it on both the
+trainer and the parameter server, since this reduces code duplication and
+improves extensibility. Given that after the current refactoring, we are
+representing everything as a computation graph on the
+trainer, representing everything as a computation graph on the parameter
+server becomes a natural extension.
+
+## Design
+
+### Distributed Transpiler
+
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
+steps:
+
+1. OP placement: the OPs will be placed on different nodes according
+   to a heuristic that minimizes the estimated total computation
+   time. Currently we will use a simple heuristic that puts parameter
+   variables on parameter server workers and everything else on trainer
+   workers.
+1. Add communication OPs to enable the communication between nodes.
+
+We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
+
+Below is an example of converting the user-defined graph into the
+subgraphs for the trainer and the parameter server:
+
+<img src="src/local-graph.png" width="300"/>
+
+After conversion:
+
+<img src="src/dist-graph.png" width="700"/>
+
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
+   - *Send* sends data to the connected *Recv* operator.  The
+     scheduler on the receiving node will only schedule the *Recv* operator
+     to run after the *Send* operator has run (the *Send* OP will mark
+     the *Recv* OP runnable automatically).
+   - *Enqueue* enqueues the input variable; it can block until space
+     becomes available in the queue.
+   - *Dequeue* outputs a configurable number of tensors from the
+     queue. It will block until the queue has the required number of
+     tensors (see the sketch after this list).
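+
+A minimal Python sketch of these queue semantics (illustrative only, not the
+actual OP implementation):
+
+```python
+import threading
+
+class TensorQueue(object):
+    """Blocking queue with the Enqueue/Dequeue semantics described above."""
+    def __init__(self, capacity):
+        self.capacity = capacity
+        self.items = []
+        self.cond = threading.Condition()
+
+    def enqueue(self, tensor):
+        with self.cond:
+            while len(self.items) >= self.capacity:
+                self.cond.wait()          # block until space becomes available
+            self.items.append(tensor)
+            self.cond.notify_all()
+
+    def dequeue(self, min_count):
+        with self.cond:
+            while len(self.items) < min_count:
+                self.cond.wait()          # block until enough tensors arrived
+            out = self.items[:min_count]
+            del self.items[:min_count]
+            self.cond.notify_all()
+            return out
+```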
+
+
+### Benefits
+
+- Model parallelism becomes easier to implement: it is an extension of
+  the trainer-parameter server approach. We can have several "Transpilers"
+  to achieve different goals.
+- A user-defined optimizer is easier to add -- users can now express it as
+  a sub-program.
+- No more duplicated logic inside the trainer and the parameter
+  server, as mentioned in the background section.
+
+### Challenges
+
+- It is important to balance the parameter shards across multiple
+  parameter servers. If a single parameter is very big (for example, the
+  word-embedding, fully-connected, or softmax layer), we need to
+  automatically partition the single parameter onto different
+  parameter servers when possible (only element-wise optimizers depend
+  on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
+  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+  details about concurrent program in Fluid.
+
+### Discussion
+
+- Can the Enqueue OP be implemented under our current tensor design
+  (i.e., put the input tensor into the queue tensor)?
+- The *Dequeue* OP will have a variable number of outputs (depending on the
+  `min_count` attribute); does our current design support this? (A similar
+  question holds for the *Add* OP.)
+
+
+### References:
+[1] [TensorFlow: Large-Scale Machine Learning on Heterogeneous Distributed Systems](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/45166.pdf)
diff --git a/doc/design/dist_refactor/src/compiler.graffle b/doc/design/dist_refactor/src/compiler.graffle
new file mode 100644
index 0000000000..8cc678fea3
Binary files /dev/null and b/doc/design/dist_refactor/src/compiler.graffle differ
diff --git a/doc/design/dist_refactor/src/compiler.png b/doc/design/dist_refactor/src/compiler.png
new file mode 100644
index 0000000000..65d34f841a
Binary files /dev/null and b/doc/design/dist_refactor/src/compiler.png differ
diff --git a/doc/design/dist_refactor/src/dist-graph.graffle b/doc/design/dist_refactor/src/dist-graph.graffle
new file mode 100644
index 0000000000..941399c6ce
Binary files /dev/null and b/doc/design/dist_refactor/src/dist-graph.graffle differ
diff --git a/doc/design/dist_refactor/src/dist-graph.png b/doc/design/dist_refactor/src/dist-graph.png
new file mode 100644
index 0000000000..3546b09f1c
Binary files /dev/null and b/doc/design/dist_refactor/src/dist-graph.png differ
diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/dist_refactor/src/distributed_architecture.graffle
new file mode 100644
index 0000000000..d1b6014134
Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.graffle differ
diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/dist_refactor/src/distributed_architecture.png
new file mode 100644
index 0000000000..29c7b0c078
Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.png differ
diff --git a/doc/design/dist_refactor/src/local-graph.graffle b/doc/design/dist_refactor/src/local-graph.graffle
new file mode 100644
index 0000000000..19e509bd9a
Binary files /dev/null and b/doc/design/dist_refactor/src/local-graph.graffle differ
diff --git a/doc/design/dist_refactor/src/local-graph.png b/doc/design/dist_refactor/src/local-graph.png
new file mode 100644
index 0000000000..ada51200f7
Binary files /dev/null and b/doc/design/dist_refactor/src/local-graph.png differ
diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/dist_refactor/src/local_architecture.graffle
new file mode 100644
index 0000000000..49fcc663eb
Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.graffle differ
diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/dist_refactor/src/local_architecture.png
new file mode 100644
index 0000000000..14adc9fd72
Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.png differ
diff --git a/doc/design/dist_refactor/src/multi-threads.graffle b/doc/design/dist_refactor/src/multi-threads.graffle
new file mode 100644
index 0000000000..e71173715f
Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads.graffle differ
diff --git a/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png
new file mode 100644
index 0000000000..e40a869987
Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png differ
diff --git a/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png
new file mode 100644
index 0000000000..4083aebfdd
Binary files /dev/null and b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png differ
diff --git a/doc/design/dist_refactor/src/paddle-compile.graffle b/doc/design/dist_refactor/src/paddle-compile.graffle
new file mode 100644
index 0000000000..a6348cc3db
Binary files /dev/null and b/doc/design/dist_refactor/src/paddle-compile.graffle differ
diff --git a/doc/design/dist_refactor/src/paddle-compile.png b/doc/design/dist_refactor/src/paddle-compile.png
new file mode 100644
index 0000000000..e0f13d551a
Binary files /dev/null and b/doc/design/dist_refactor/src/paddle-compile.png differ
diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle
new file mode 100644
index 0000000000..41b2067311
Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.graffle differ
diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png
new file mode 100644
index 0000000000..744e2fb2e0
Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.png differ
diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md
new file mode 100644
index 0000000000..58aa73b8cd
--- /dev/null
+++ b/doc/design/error_clip.md
@@ -0,0 +1,92 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent gradient explosion. It applies specific rules to adjust variables' gradients and prevent them from becoming too large. With it, the values of a gradient will be checked before they are taken by the next `grad_op`, and shrunk if necessary.
+
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should be an object of one of `BaseErrorClipAttr`'s derived classes. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximal and minimal clip thresholds, respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min`, respectively. When `min` is `None`, the minimal threshold will be set to `-max` automatically.
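+
+Numerically, with `max=5.0` (so `min` defaults to `-5.0`), the gradient is
+clamped elementwise; in numpy terms:
+
+```python
+import numpy as np
+
+grad = np.array([-7.5, -2.0, 0.3, 6.1], dtype="float32")
+clipped = np.clip(grad, -5.0, 5.0)  # what the appended clip_op computes
+print(clipped)                      # [-5.  -2.   0.3  5. ]
+```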
+
+So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc.set_attr("max", self.max)
+```
+
+`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is declared as virtual in the base class. All derived classes must implement their own versions of this function.
+
+These `clip_op`s should be inserted after the `grad_op`s whose output gradients need to be clipped. That is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+    new_op_desc = target_block.desc.append_op()
+    new_op_desc.copy_from(op_desc)
+    callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time a `grad_op` is added to the `target_block`, the callback function is invoked. The logic of appending `clip_op`s can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` to the `block`.
diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md
new file mode 100644
index 0000000000..11cc129d56
--- /dev/null
+++ b/doc/design/evaluator.md
@@ -0,0 +1,58 @@
+## Evaluator Design
+
+### Problem Statement
+
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator-based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics over the N passes/batches the user specifies.
+
+### Evaluator Design
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
+
+1. Initialize the metric state and add it into the block.
+
+2. Calculate the concerned metrics for every mini-batch. A single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for one minibatch of data each time it runs.
+
+
+3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training or multi-GPU training, aggregate the values from different devices.
+
+### Implementation
+This design is shown in the Python API.
+Each metric operator needs to calculate the metric statistics and return the batch-aware states. The Python side is responsible for accumulating the states over each pass.
+
+```python
+class Evaluator(object):
+    """
+    Evaluator Base class.
+    """
+    def __init__(self, name, **kwargs):
+        """
+        Different evaluators may have different metric states. E.g., Accuracy
+        needs two variables, the total and the correct sample counts. Auc needs
+        four variables: `true_positives`, `true_negatives`, `false_positives`
+        and `false_negatives`. So every evaluator should create its needed
+        variables and append them to the main_program.
+
+        The initialization of an Evaluator is responsible for creating the
+        metric states and appending them to the main_program.
+        """
+        pass
+
+    def _update_ops(self, input, label, **kwargs):
+        """
+        Add the mini-batch metric calculation operators to the main_program.
+        Add increment operators to accumulate the metric states.
+        """
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset the metric states at the beginning of each pass or a
+        user-specified batch interval. Execute the reset_program to reset
+        the states.
+        """
+
+    def eval(self, executor, eval_program=None):
+        """
+        Merge the mini-batch statistics to form the evaluation result for
+        multiple mini-batches. Execute the eval_program and return the result.
+        """
+        return eval_result
+```
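+
+A framework-free sketch of this three-step protocol for an accuracy metric
+(plain Python, purely illustrative of the design, not the Fluid implementation):
+
+```python
+class AccuracyEvaluator(object):
+    def __init__(self):
+        self.reset()
+
+    def reset(self):
+        # Step 1: initialize the metric states.
+        self.correct = 0
+        self.total = 0
+
+    def update(self, predictions, labels):
+        # Step 2: accumulate per-mini-batch statistics into the states.
+        self.correct += sum(int(p == l) for p, l in zip(predictions, labels))
+        self.total += len(labels)
+
+    def eval(self):
+        # Step 3: merge the accumulated statistics into the final result.
+        return float(self.correct) / max(self.total, 1)
+
+acc = AccuracyEvaluator()
+acc.update(predictions=[1, 0, 1], labels=[1, 1, 1])
+print(acc.eval())  # 2 correct out of 3 -> 0.666...
+```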
diff --git a/doc/design/executor.md b/doc/design/executor.md
new file mode 100644
index 0000000000..2d4b371cc5
--- /dev/null
+++ b/doc/design/executor.md
@@ -0,0 +1,29 @@
+# Executor Design Doc
+
+## Motivation
+In [fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), we encourage the user to use deep learning programming paradigms to describe the training process. When the user-written Python program is executed, it will first create a protobuf message
+[`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+The executor runs the `ProgramDesc` like an interpreter: the `ProgramDesc` contains the intrinsics (operators in this case) and the variables to be used, and the executor explicitly executes this stored, precompiled code.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators in the block. The `block_id` specifies the entry block, and the `Scope` is the container of all the variable instances, which persists across different runs.
+
+## Executor
+
+The `Executor` explicitly executes all the intrinsics (operators here) in the `block_id`-th block of a `ProgramDesc`. Essentially, it instantiates the Variables and Operators, then runs all the operators in sequence, one by one.
+This is very similar to pushing a stack frame when entering a block; the executor then cleans up all the temporary variables when a mini-batch is finished. It does not, however, have the stack-frame pop process.
+
+### The interface
+```c++
+  Executor(places);
+```
+An executor does not own any computing resources; a user can only construct an executor with the specified places.
+
+### Running an Executor
+
+```c++
+  void Run(ProgramDesc, Scope, block_id, create_local_scope);
+```
+An `Executor` only provides a unified way to execute a `ProgramDesc`. The `ProgramDesc` is the target to be executed, the `Scope` specifies the variable container, the `block_id` indicates the entry block, and `create_local_scope` is a boolean that states whether the temporary variables will be destroyed after the execution finishes.
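+
+A hedged usage sketch from the Python side (API details of the era may differ
+slightly; the tiny program here exists only so the example is self-contained):
+
+```python
+import numpy
+import paddle.v2.fluid as fluid
+
+# Build a trivial program: y = fc(x), cost = mean(y).
+x = fluid.layers.data(name="x", shape=[13], dtype="float32")
+y = fluid.layers.fc(input=x, size=1)
+avg_cost = fluid.layers.mean(x=y)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+exe.run(fluid.default_startup_program())   # run the "run-once" initializers
+
+# Run one mini-batch of the main program and fetch the cost.
+cost, = exe.run(fluid.default_main_program(),
+                feed={"x": numpy.random.rand(32, 13).astype("float32")},
+                fetch_list=[avg_cost])
+```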
diff --git a/doc/design/float16.md b/doc/design/float16.md
new file mode 100644
index 0000000000..1ea95ed6b5
--- /dev/null
+++ b/doc/design/float16.md
@@ -0,0 +1,105 @@
+# Design Doc: float16
+
+## Why float16
+Half precision (float16) is a binary floating-point format that occupies 16 bits of memory. float16 is half the size of the traditional 32-bit single-precision format (float), and has lower precision and a smaller range.
+
+When high-precision computation is not required, using the float16 data type could potentially
+
+- reduce storage space, memory bandwidth, and power usage;
+- increase the chance of data fitting into a smaller cache of lower latency;
+- provide an arithmetic speed-up if supported by hardware.
+
+## Survey of current float16 support
+A brief survey of float16 support on different compilers, hardware, and libraries can be found below. Interested readers can refer to [link1](https://github.com/PaddlePaddle/Paddle/issues/4853) and [link2](https://github.com/Xreki/Xreki.github.io/blob/master/multi_data_types_in_dl_framework/ppt/float16_and_quantized_type.md) for more info.
+
+The goal of float16 is to serve as a key for the executor to find and run the correct version of the compute method specialized for float16 in an operator kernel. It should be compatible with various natively supported float16 implementations, including `__half` for CUDA, `float16_t` for ARM, and `Eigen::half` for Eigen, to make writing customized float16 kernels easier.
+
+### Compiler
+- nvcc supports `__half` data type after CUDA 7.5.
+- `__fp16` or `float16_t` is supported as storage type for gcc >= 6.1 and clang >= 3.4.
+- `__fp16` or `float16_t` is supported as arithmetic type for gcc >= 7.1 and clang >= 3.9.
+
+### Hardware
+- `__half` is supported on GPU with compute capability >= 5.3.
+- `__fp16` is supported as storage type for ARMv7-A, ARMv8-A, and above.
+- `__fp16` is supported as an arithmetic type from ARMv8.2-A onward (currently, the only microarchitecture implementing ARMv8.2-A is ARM Cortex-A75, which was announced in May 2017. There seem to be no application processors currently available on the market that adopt this architecture. It is reported that Qualcomm Snapdragon 845 uses the Cortex-A75 design and will be available in mobile devices in early 2018).
+
+### Libraries
+- [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
+- [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+
+### CUDA version issue
+There are currently three versions of CUDA that support the `__half` data type, namely CUDA 7.5, 8.0, and 9.0.
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+
+
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+
+typedef __half half;
+
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
+
+## Implementation
+The float16 class holds a 16-bit `uint16_t` value internally.
+```
+struct float16 {
+  uint16_t x;
+};
+``` 
+
+float16 supports the following features:
+  - constructors / assignment operators that take input from primitive data types including bool, integers of various lengths, float, and double.
+  - constructors / assignment operators that take input from `__half` on CUDA, `float16_t` on ARM, and `Eigen::half` on Eigen.
+  - conversion operators to primitive data types and to the half precision data types on CUDA, ARM and Eigen.
+  - overloaded arithmetic operators for CUDA, ARM, and non-ARM CPU, respectively. These operators take advantage of the CUDA and ARM intrinsics on the corresponding hardware.
+To support the above features, two fundamental conversion functions are provided:
+```
+float16 float_to_half_rn(float f);  // convert to half precision in round-to-nearest-even mode
+float half_to_float(float16 h);
+```
+which provide one-to-one conversion between float32 and float16. These two functions use different conversion routines depending on the current hardware: CUDA/ARM intrinsics are used when the corresponding hardware is available. If the hardware or compiler does not support float32-to-float16 conversion, software emulation is performed instead.
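+
+For intuition, numpy's software float16 performs the same round-to-nearest-even
+conversion, so the semantics can be previewed from Python:
+
+```python
+import numpy as np
+
+f = np.float32(0.1)
+h = f.astype(np.float16)     # float32 -> float16, round-to-nearest-even
+back = h.astype(np.float32)  # float16 -> float32 is exact
+print(h, back)               # 0.1 is not exactly representable in 16 bits
+```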
+
+## To do
+After the float16 class is available, some of the future work items are listed below:
+
+- Update pybind/tensor_py.h to bind c++ float16 with numpy float16. 
+
+- Modify `GetKernelType()` method in `framework/operator.h` to make it compatible with float16.
+
+- Create a type-casting operator that can convert the data type in tensor between float16 and other types.
diff --git a/doc/design/fluid-compiler.graffle b/doc/design/fluid-compiler.graffle
new file mode 100644
index 0000000000..c933df2cb8
Binary files /dev/null and b/doc/design/fluid-compiler.graffle differ
diff --git a/doc/design/fluid-compiler.png b/doc/design/fluid-compiler.png
new file mode 100644
index 0000000000..1b0ffed203
Binary files /dev/null and b/doc/design/fluid-compiler.png differ
diff --git a/doc/design/fluid.md b/doc/design/fluid.md
new file mode 100644
index 0000000000..2acc168007
--- /dev/null
+++ b/doc/design/fluid.md
@@ -0,0 +1,114 @@
+# Design Doc: PaddlePaddle Fluid
+
+## Why Fluid
+
+When Baidu developed PaddlePaddle in 2013, the only well-known open source deep learning system at the time was Caffe.  However, when PaddlePaddle was open-sourced in 2016, many other choices were available. There was a challenge -- what is the need for open sourcing yet another deep learning framework?
+
+Fluid is the answer.  Fluid is similar to PyTorch and TensorFlow Eager Execution, in that it describes the "process" of training or inference rather than a static model.  In fact, in PyTorch, TensorFlow Eager Execution and Fluid, there is no concept of a model at all. The details are covered in the sections below. Fluid is currently more extreme in this idea than PyTorch and Eager Execution, and we are trying to push Fluid towards the direction of a compiler and a new programming language for deep learning.
+
+## The Evolution of Deep Learning Systems
+
+Deep learning infrastructure is one of the fastest evolving technologies. Within four years, there have already been three generations of technologies invented.
+
+| Existed since | model as sequence of layers | model as graph of operators | No model |
+|--|--|--|--|
+| 2013 | Caffe, Theano, Torch, PaddlePaddle | | |
+| 2015 | | TensorFlow, MxNet, Caffe2, ONNX, n-graph | |
+| 2016 | | | PyTorch, TensorFlow Eager Execution, PaddlePaddle Fluid |
+
+From the above table, we see that deep learning technology is evolving towards getting rid of the concept of a model.  To understand the reasons behind this direction, a comparison of the *programming paradigms*, or the ways to program deep learning applications using these systems, would be helpful. The following section goes over these.
+
+## Deep Learning Programming Paradigms
+
+With the systems listed as the first or second generation, e.g., Caffe or TensorFlow, an AI application training program looks like the following:
+
+```python
+x = layer.data("image")
+l = layer.data("label")
+f = layer.fc(x, W)
+s = layer.softmax(f)
+c = layer.mse(l, s)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    forward({input=x, data=m}, minimize=c)
+    backward(...)
+
+print W # print the trained model parameters.
+```
+
+The above program includes two parts:
+
+1. The first part describes the model, and
+2. The second part describes the training process (or inference process) for the model.
+
+This paradigm has a well-known problem that limits the productivity of programmers. If the programmer makes a mistake in configuring the model, the error messages won't show up until the second part is executed and `forward` and `backward` propagation is performed. This makes it difficult for the programmer to debug and locate a mistake that is blocks away from the actual error prompt.
+
+This problem of being hard to debug and iterate quickly on a program is the primary reason that programmers, in general, prefer PyTorch over the older systems.  Using PyTorch, we would write the above program as follows:
+
+```python
+W = tensor(...)
+
+for i in xrange(1000): # train for 1000 iterations
+    m = read_minibatch()
+    x = m["image"]
+    l = m["label"]
+    f = layer.fc(x, W)
+    s = layer.softmax(f)
+    c = layer.mse(l, s)
+    backward()
+
+print W # print the trained model parameters.
+```
+
+We can see that the main difference is moving the model configuration part (the first step) into the training loop.  This change allows mistakes in the model configuration to be reported where they actually appear in the program.  It also represents the model, or rather its forward pass, more faithfully by keeping the configuration process in the training loop.
+
+## Describe Arbitrary Models for the Future
+
+Describing the process instead of the model also gives Fluid the flexibility to define non-standard models that haven't been invented yet.
+
+As we write out the program for the process, we can write an RNN as a loop, instead of an RNN as a layer or as an operator.  A PyTorch example would look like the following:
+
+```python
+for i in xrange(1000):
+    m = read_minibatch()
+    x = m["sentence"]
+    for t in xrange(x.len()):
+        h[t] = the_step(x[t])
+```        
+
+With Fluid, the training loop and the RNN in the above program are not really Python loops, but just a "loop structure" provided by Fluid and implemented in C++ as the following:
+
+```python
+train_loop = layers.While(cond)
+with train_loop.block():
+  m = read_minibatch()
+  x = m["sentence"]
+  rnn = layers.While(...)
+  with rnn.block():
+    h[t] = the_step(input[t])
+```    
+
+An actual Fluid example is described  [here](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44).
+
+From the example, the Fluid programs look very similar to their PyTorch equivalent programs, except that Fluid's loop structure, wrapped with Python's `with` statement, could run much faster than just a Python loop.
+
+We have more examples of the [`if-then-else`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/if_else_op.md) structure of Fluid.
+
+## Turing Completeness
+
+In computability theory, a system of data-manipulation rules, such as a programming language, is said to be Turing complete if it can be used to simulate any Turing machine.  For a programming language, if it provides if-then-else and loops, it is Turing complete.  From the above examples, Fluid seems to be Turing complete; however, it is worth noting that there is a slight difference between the `if-then-else` of Fluid and that of a programming language: the former runs both of its branches and splits the input mini-batch into two -- one for the True condition and another for the False condition. It hasn't been researched in depth whether this is equivalent to the `if-then-else` in programming languages that makes them Turing complete.  Based on a conversation with [Yuan Yu](https://research.google.com/pubs/104812.html), it seems to be the case, but this needs to be looked into in depth.
+
+## The Execution of a Fluid Program
+
+There are two ways to execute a Fluid program: interpretation and compilation.  In either case, when a program is executed, it creates a protobuf message [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/a91efdde6910ce92a78e3aa7157412c4c88d9ee8/paddle/framework/framework.proto#L145) that describes the process and is conceptually like an [abstract syntax tree](https://en.wikipedia.org/wiki/Abstract_syntax_tree).
+
+There is a C++ class [`Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h), which runs a `ProgramDesc`, similar to how an interpreter runs a Python program.
+
+Fluid is moving towards the direction of a compiler, which is explained in [fluid_compiler.md](fluid_compiler.md).
+
+## Backward Compatibility of Fluid
+
+Given all the advantages from the removal of the concept of a *model*, hardware manufacturers might still prefer the existence of the concept of a model, so it would be easier for them to support multiple frameworks all at once and could run a trained model during inference.  For example, Nervana, a startup company acquired by Intel, has been working on an XPU that reads the models in the format known as [n-graph](https://github.com/NervanaSystems/ngraph).  Similarly, [Movidius](https://www.movidius.com/) is producing a mobile deep learning chip that reads and runs graphs of operators.  The well-known [ONNX](https://github.com/onnx/onnx) is also a file format of graphs of operators.
+
+For Fluid, we can write a converter that extracts the parts in the `ProgramDesc` protobuf message, converts them into a graph of operators, and exports the graph into the ONNX or n-graph format.
diff --git a/doc/design/fluid_compiler.md b/doc/design/fluid_compiler.md
new file mode 100644
index 0000000000..2a6beafc52
--- /dev/null
+++ b/doc/design/fluid_compiler.md
@@ -0,0 +1,110 @@
+# PaddlePaddle Fluid: Towards a Compiled Programming Language
+
+As described in [fluid.md](fluid.md), when a Fluid application program
+runs, it generates a `ProgramDesc` protobuf message as an intermediate
+representation of itself.  The C++ class `Executor` can run this
+protobuf message as an interpreter.  This article describes the Fluid
+compiler.
+
+![](fluid-compiler.png)
+
+## ProgramDesc
+
+Before we go deeper into the idea of compiled language, let us take a
+look at a simple example Fluid application.
+
+```python
+import "fluid"
+
+func paddlepaddle() {
+  X = fluid.read(...)
+  W = fluid.Tensor(...)
+  Y = fluid.mult(X, W)
+}
+```
+
+This program consists of a [block](block.md) of three operators --
+`read`, `assign`, and `mult`.  Its `ProgramDesc` message looks like
+the following:
+
+```protobuf
+message ProgramDesc {
+  block[0] = Block {
+    vars = [X, W, Y],
+    ops = [
+      read(output = X)
+      assign(input = ..., output = W)
+      mult(input = {X, W}, output = Y)
+    ],
+  }
+}
+```
+ 
+## Transpilers
+
+We can write a transpiler program that takes a `ProgramDesc`, e.g.,
+the above one, and outputs another `ProgramDesc`.  Let us take some
+examples:
+
+1. *Memory optimization transpiler*: We can write a transpiler that
+   inserts some `FreeMemoryOp`s into the above example `ProgramDesc` so
+   as to free memory early, before the end of an iteration, keeping the
+   memory footprint small (a toy sketch follows this list).
+
+1. *Distributed training transpiler*: We can write a transpiler that
+   converts a `ProgramDesc` into its distributed version of two
+   `ProgramDesc`s -- one to be run by the trainer processes and the
+   other by the parameter server.
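+
+To make the idea concrete, here is a runnable toy transpiler over a
+list-of-tuples "program"; plain Python stands in for the real `ProgramDesc`
+API, which this sketch does not use:
+
+```python
+def memory_optimize(ops):
+    # ops is a list of (op_type, arg_names); insert a "free" op right
+    # after the last op that touches each variable.
+    last_use = {}
+    for i, (_, args) in enumerate(ops):
+        for v in args:
+            last_use[v] = i
+    out = []
+    for i, op in enumerate(ops):
+        out.append(op)
+        out += [("free", (v,)) for v, j in last_use.items() if j == i]
+    return out
+
+prog = [("read", ("X",)), ("assign", ("W",)), ("mult", ("X", "W", "Y"))]
+print(memory_optimize(prog))
+```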
+
+In the rest of this article, we talk about a special kind of
+transpiler, *Native code generator*, which takes a `ProgramDesc` and
+generates a `.cu` (or `.cc`) file, which could be built by C++
+compilers (gcc, nvcc, icc) into binaries.
+
+## Native Code Generator
+
+For the above example, the native code generator transpiler, say, the
+CUDA code generator, should generate a `main` function:
+
+```c++
+int main() {
+  auto X = fluid_cuda_read(...);
+  auto W = fluid_cuda_create_tensor(...);
+  auto Y = fluid_cuda_mult(X, W);
+}
+```
+
+and the definitions of functions `fluid_cuda_read`,
+`fluid_cuda_create_tensor`, and `fluid_cuda_mult`.  Please be aware
+that each function could just define a C++ instance of an operator and
+run it.  For example
+
+```c++
+paddle::Tensor fluid_cuda_read(...) {
+  paddle::Tensor t;
+  paddle::operator::Read r(&t, ...);
+  r.Run();
+  return t;
+}
+```
+
+For computational operators that have multiple *kernels*, each for a
+specific hardware platform, for example, the `mult` operator, the
+generated code should call its CUDA kernel:
+
+```c++
+paddle::Tensor fluid_cuda_mult(const paddle::Tensor& a,
+                               const paddle::Tensor& b) {
+  paddle::Tensor t;
+  paddle::operator::Mult m(a, b, &t, ...);
+  m.Run(cuda_context);
+  return t;
+}
+```
+
+where `cuda_context` could be a global variable of type
+`paddle::CUDADeviceContext`.
+
+## Multi-Block Code Generation
+
+Most Fluid application programs may have more than one block.  To
+execute them, we need to trace [scopes](scope.md).
diff --git a/doc/design/functions_operators_layers.md b/doc/design/functions_operators_layers.md
new file mode 100644
index 0000000000..984b59f4c6
--- /dev/null
+++ b/doc/design/functions_operators_layers.md
@@ -0,0 +1,100 @@
+# Design Doc: Functions, Operators, and Layers
+
+In a DL system, we can compose one or more fine grained operators into a coarse grained one.  For example, the FC layer can be composed of a multiplication operator and an add operator.
+
+Historically, some fine grained operations are known as operators, and some coarse level ones are known as layers.  But we need a well-defined separation.
+
+In general, operators are those very fine grained operations, e.g., mul and add. In the implementation, we can write them as C++ functions:
+
+```c++
+template <typename T> T add(T x, T y) { return x + y; }
+template <typename T> T mul(T x, T y) { return x * y; }
+```
+
+Then we can wrap them into operators which are C++ classes and can be created from Python bindings by name.  A C macro can do this. For example, the following macro invocation
+
+```c++
+#define MAKE_FUNCTION_OPERATOR(mul);
+```
+
+generates
+
+```c++
+template <typename T> class mulOp : public OperatorBase {...};
+REGISTER_OP(mulOp<float32>, "mul");
+```
+
+so that in Python we can create operator mul by:
+
+```python
+X1 = Var()
+X2 = Var()
+Y = Var()
+paddle.cpp.create_operator("mul", input=[X1, X2], output=Y)
+```
+
+Also, at the same time, we can compose a coarse level C++ operator class by composing functions `mul` and `add`:
+
+```c++
+template <typename T>
+class FCOp : public OperatorBase {
+ public:
+  void Run(...) {
+    add(mul(Input<T>("X"), Input<T>("W")), Input<T>("b"));
+  }
+};
+REGISTER_OP(FCOp, "fc");
+```
+
+We need to support such composition in Python as well.  To do so, we need a higher level Python wrapping of operator creation than `paddle.cpp.create_operator`.  This higher level operator API should be compatible with the layer API.
+
+Let's explain using an example.  Suppose that we are going to compose the FC using mul and add in Python, we'd like to have Python functions `mul` and `add` defined in module `operator`:
+
+```python
+def operator.mul(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("mul", input=[X1, X2], output=O)
+    return O
+
+def operator.add(X1, X2):
+    O = Var()
+    paddle.cpp.create_operator("add", input=[X1, X2], output=O)
+    return O
+```
+
+The above code snippets are automatically generated.  Given them, users can define
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    return operator.add(operator.mul(X, W), b)
+```
+
+If we didn't have `operator.mul` and `operator.add`, the definition of `layer.fc` would be more complicated:
+
+```python
+def layer.fc(X):
+    W = Var()
+    b = Var()
+    O1 = Var()
+    paddle.cpp.create_operator("mul", input=[X, W], output=O1)
+    O2 = Var()
+    paddle.cpp.create_operator("add", input=[O1, b], output=O2)
+    return O2
+```
+
+We'd like to have Python bindings to operators in the package `paddle.operator`, and Python compositions of operators in the package `paddle.layer`.  So we have the following concepts in the above illustrative example:
+
+
+| C++ functions/functors | mul          | add          |             |          |
+|------------------------|--------------|--------------|-------------|----------|
+| C++ operator class     | mulOp        | addOp        | FCOp        |          |
+| Python binding         | operator.mul | operator.add | operator.fc |          |
+| Python function        |              |              |             | layer.fc |
+
+
+This is how we differentiate layers and operators in PaddlePaddle:
+
+- those defined in C++ that have a lightweight Python wrapper in the module `operators` are operators; whereas
+- those that don't have a C++ implementation but are Python functions that compose C++ operators are known as layers.
diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md
new file mode 100644
index 0000000000..fb41df8615
--- /dev/null
+++ b/doc/design/gan_api.md
@@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (Generative Adversarial Network [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+
+In our GAN design, we wrap it as a user-friendly, easily customized Python API to design different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example, due to its good performance on image generation.
+
+<p align="center">
+<img src="./test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+
+The operators, layers and functions required/optional to build a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+<p align="center">
+<img src="./dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+
+## The Conditional-GAN might be a class.
+In this design, we adopt the popular open source designs in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structures:
+
+- DCGAN(object): contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize the hyper-parameters (like the conv dimensions and so forth), and declare the model parameters of both the discriminator and the generator.
+
+- generator(z, y=None): Generate a fake image from the input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one.
+Returns a 0/1 binary label.
+
+- build_model(self):
+Build the whole GAN model, and define the training losses for both the generator and the discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependencies in the engine executor. (Very critical; otherwise the GAN can't be trained correctly.)
+- Different optimizers are responsible for optimizing different losses.
+
+To be more detailed, we introduce our design of the DCGAN as follows:
+
+### Class member function: Initializer
+- Set up hyper-parameters, including the conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None, z_dim=100):
+
+    # hyper parameters
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variables also support initialization from numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128))
+    self.D_W2 = pd.Variable(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128))
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128))
+    self.G_W2 = pd.Variable(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member function: Generator
+- Given a noise input z, returns a fake image.
+- Concatenation, batch-norm, and FC operations are required;
+- A deconv layer is required, which is missing for now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+    
+    if self.y_dim:  # conditional GAN concatenates the label with the noise
+      z = pd.layer.concat(1, [z, y])
+
+    G_h0 = pd.layer.fc(z, self.G_W0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_W1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2)
+    G_im = pd.layer.tanh(G_h2)
+    return G_im
+```
+
+### Class member function: Discriminator
+- Given an image, returns the binary logit of whether it is real or fake.
+- Concatenation, convolution, batch-norm, FC, and Leaky-ReLU operations are required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+
+    D_h0 = pd.layer.conv2d(image, w=self.D_W0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(D_h0)
+    D_h0_relu = pd.layer.lrelu(D_h0_bn)
+
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_W1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_W2, b=self.D_b2)
+    return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build the generator and discriminator;
+- Define two training losses for the discriminator and the generator, respectively.
+
+If we have an execution dependency engine that can back-trace all tensors, the module building our GAN model will look like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(pd.float32, [None, self.z_size])
+    
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.G)
+    
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_size)))
+```
+
+If we do not have dependency engine but blocks, the module building our GAN model will be like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(pd.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_size)))
+    
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size)))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size)))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small confusions and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice if it is shared by the graphs.
+- It requires the ability to create a block anytime, rather than only inside if-else or RNN constructs;
+
+## Main function for the demo:
+Generally, the user of a GAN just needs to do the following things:
+- Define a DCGAN object;
+- Build the DCGAN model;
+- Specify two optimizers for the two different losses, with respect to different parameters.
+```python
+# pd for short, should be more concise.
+import paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = load_mnist()
+    
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block().g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = g_optim.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+    
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+        if batch_id % 2 == 0:
+          sess.run(d_step, 
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine vs. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage, and then train the discriminator in the second stage?
diff --git a/doc/design/graph.md b/doc/design/graph.md
new file mode 100644
index 0000000000..7519a65df8
--- /dev/null
+++ b/doc/design/graph.md
@@ -0,0 +1,70 @@
+# Design Doc: Computations as a Graph
+
+A primary goal of the refactorization of PaddlePaddle is a more flexible representation of deep learning computation, in particular, a graph of operators and variables, instead of sequences of layers as before.
+
+This document explains the construction of a graph in three steps:
+
+- construct the forward part
+- construct the backward part
+- construct the optimization part
+
+## The Construction of a Graph
+
+Let us take the problem of image classification as a simple example.  The application program that trains the model looks like:
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+### Forward Part
+
+The first four lines of the above program build the forward part of the graph.
+
+![](images/graph_construction_example_forward_only.png)
+
+In particular, the first line `x = layer.data("images")` creates variable x and a Feed operator that copies a column from the minibatch to x.  `y = layer.fc(x)` creates not only the FC operator and output variable y, but also two parameters, W and b, and the initialization operators.
+
+Initialization operators are a kind of "run-once" operator -- the `Run` method increments a class data member counter so that it runs at most once.  By doing so, a parameter wouldn't be initialized repeatedly, say, in every minibatch.
+
+In this example, all operators are created as `OpDesc` protobuf messages, and all variables are `VarDesc`.  These protobuf messages are saved in a `BlockDesc` protobuf message.
+
+### Backward Part
+
+The fifth line `optimize(cost)` calls two functions, `ConstructBackwardGraph` and `ConstructOptimizationGraph`.
+
+`ConstructBackwardGraph` traverses the forward graph in the `BlockDesc` protobuf message and builds the backward part.
+
+![](images/graph_construction_example_forward_backward.png)
+
+According to the chain rule of gradient computation, `ConstructBackwardGraph` would (see the toy sketch after this list)
+
+1. create a gradient operator G for each operator F,
+1. make all inputs, outputs, and the outputs' gradients of F the inputs of G,
+1. create gradients for all inputs of F, except for those that don't have gradients, like x and l, and
+1. make all these gradients the outputs of G.
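+
+A toy rendering of these four rules, with plain Python dicts standing in for
+`OpDesc` (the exception in rule 3 for gradient-free inputs is omitted for
+brevity):
+
+```python
+def grad_op(op):
+    # op = {"type": ..., "inputs": [...], "outputs": [...]}
+    return {
+        "type": op["type"] + "_grad",
+        # rule 2: inputs, outputs, and output gradients of F feed G
+        "inputs": op["inputs"] + op["outputs"]
+                  + [o + "@GRAD" for o in op["outputs"]],
+        # rules 3-4: G emits the gradients of F's inputs
+        "outputs": [i + "@GRAD" for i in op["inputs"]],
+    }
+
+fc = {"type": "fc", "inputs": ["x", "W", "b"], "outputs": ["y"]}
+print(grad_op(fc))
+```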
+
+### Optimization Part
+
+For each parameter, like W and b created by `layer.fc` and marked as double circles in the above graphs, `ConstructOptimizationGraph` creates an optimization operator to apply its gradient.  This results in the complete graph:
+
+![](images/graph_construction_example_all.png)
+
+## Block and Graph
+
+The words block and graph are interchangeable in the design of PaddlePaddle.  A [Block](https://github.com/PaddlePaddle/Paddle/pull/3708) is a metaphor of the code and local variables in a pair of curly braces in programming languages, where operators are like statements or instructions.  A graph of operators and variables is a representation of the block.
+
+A Block keeps operators in an array `BlockDesc::ops`
+
+```protobuf
+message BlockDesc {
+  repeated OpDesc ops = 1;
+  repeated VarDesc vars = 2;
+}
+```
+
+in the order that they appear in user programs, like the Python program at the beginning of this article.  We can imagine that in `ops`,  we have some forward operators, followed by some gradient operators, and then some optimization operators.
diff --git a/doc/design/graph_survey.md b/doc/design/graph_survey.md
new file mode 100644
index 0000000000..6c6db08f46
--- /dev/null
+++ b/doc/design/graph_survey.md
@@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write the network topology conveniently. This doc mainly focuses on the symbolic API in the most popular neural network frameworks, and tries to find out how to parse the symbolic configuration to a portable file, such as protobuf or JSON.
+
+### Mxnet
+
+The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++ and exports it to Python using the C API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is a help class used to represent the operator node in Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+
+A Variable here is actually a Symbol. Every basic Symbol corresponds to one Node, and every Node has its own NodeAttr. The NodeAttr class has an op field; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, `std::vector<NodeEntry> outputs`, and NodeEntry contains a pointer to Node. We can follow the Node pointers to recover the whole Graph.
+
+A Symbol can also be saved to a JSON file.
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+
+>>> fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of the symbolic API is `Tensor`. TensorFlow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+
+A simple example is as follows:
+
+```python
+  # Build a dataflow graph.
+  c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+  d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+  e = tf.matmul(c, d)
+
+  # Construct a `Session` to execute the graph.
+  sess = tf.Session()
+
+  # Execute the graph and store the value that `e` represents in `result`.
+  result = sess.run(e)
+```
+
+
+The main methods of `Tensor` are as follows:
+
+```python
+@property
+def op(self):
+  """The `Operation` that produces this tensor as an output."""
+  return self._op
+
+@property
+def dtype(self):
+   """The `DType` of elements in this tensor."""
+  return self._dtype
+
+@property
+def graph(self):
+  """The `Graph` that contains this tensor."""
+  return self._op.graph
+
+@property
+def name(self):
+  """The string name of this tensor."""
+  if not self._op.name:
+    raise ValueError("Operation was not named: %s" % self._op)
+  return "%s:%d" % (self._op.name, self._value_index)
+
+@property
+def device(self):
+  """The name of the device on which this tensor will be produced, or None."""
+  return self._op.device
+```
+
+
+A Tensor can be taken as a target to run by a session. A Tensor contains all the information of the Graph and tracks its data dependencies.
+
+
+Here is a detailed example:
+
+
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+```
+
+### Dynet
+
+
+The core concept of the symbolic API is `Expression`, and Dynet defines the `Expression` class in C++.
+
+
+A simple example is as follows:
+
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+
+The input data and parameters are also represented by Expression. Every basic Expression corresponds to a Node, and input data is also a Node.
+
+Expression has a data member ComputationGraph, and the ComputationGraph is modified while users configure the network. An Expression can be a running target, because an Expression contains all of its dependencies.
+
+
+Here is a detailed example:
+
+Write the topology in C++:
+
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+
+Expression pred = W * xs[i];
+cg.print_graphviz();
+
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+
+Compile and print:
+
+```
+# first print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+}
+# third print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+  N2 [label="v2 = -1.88387 - v1"];
+  N1 -> N2;
+  N3 [label="v3 = -v2"];
+  N2 -> N3;
+  N4 [label="v4 = square(v3)"];
+  N3 -> N4;
+}
+```
+
+### Conclusion
+
+
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use the unified name Expression here; this level of concept has the following features:
+
+- Users write the topology with a symbolic API, and every return value is an Expression, including input data and parameters.
+- An Expression corresponds to a global Graph, and Expressions can also be composed.
+- An Expression tracks all of its dependencies and can be taken as a run target.
diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md
new file mode 100644
index 0000000000..26d140f06d
--- /dev/null
+++ b/doc/design/if_else_op.md
@@ -0,0 +1,51 @@
+# The `IfElse` Operator
+
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
+
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition; instances corresponding to true values go to the true branch, and those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
+
+```python
+import paddle as pd
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+A challenge in implementing the `IfElse` operator is to infer the variables to be split, or, in other words, to identify the variables that hold the mini-batch or are derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = d;
+  o2 = pd::layer::softmax(d);  // mirror the true block of the Python example
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d + 1;
+}
+```
diff --git a/doc/design/images/asgd.gif b/doc/design/images/asgd.gif
new file mode 100644
index 0000000000..4a0da7bf6d
Binary files /dev/null and b/doc/design/images/asgd.gif differ
diff --git a/doc/design/images/control_flow_graph.png b/doc/design/images/control_flow_graph.png
new file mode 100644
index 0000000000..3579998e58
Binary files /dev/null and b/doc/design/images/control_flow_graph.png differ
diff --git a/doc/design/images/dataflow_equations.png b/doc/design/images/dataflow_equations.png
new file mode 100644
index 0000000000..c10f7f69f4
Binary files /dev/null and b/doc/design/images/dataflow_equations.png differ
diff --git a/doc/design/images/deep_learning.png b/doc/design/images/deep_learning.png
new file mode 100644
index 0000000000..026becc4d9
Binary files /dev/null and b/doc/design/images/deep_learning.png differ
diff --git a/doc/design/images/duplicate_op.graffle b/doc/design/images/duplicate_op.graffle
new file mode 100644
index 0000000000..5979f792e2
Binary files /dev/null and b/doc/design/images/duplicate_op.graffle differ
diff --git a/doc/design/images/duplicate_op.png b/doc/design/images/duplicate_op.png
new file mode 100644
index 0000000000..f299c5d37f
Binary files /dev/null and b/doc/design/images/duplicate_op.png differ
diff --git a/doc/design/images/duplicate_op2.graffle b/doc/design/images/duplicate_op2.graffle
new file mode 100644
index 0000000000..5cec3bc64d
Binary files /dev/null and b/doc/design/images/duplicate_op2.graffle differ
diff --git a/doc/design/images/duplicate_op2.png b/doc/design/images/duplicate_op2.png
new file mode 100644
index 0000000000..21cdd5cabf
Binary files /dev/null and b/doc/design/images/duplicate_op2.png differ
diff --git a/doc/design/images/feed_forward.png b/doc/design/images/feed_forward.png
new file mode 100644
index 0000000000..d312371a04
Binary files /dev/null and b/doc/design/images/feed_forward.png differ
diff --git a/doc/design/images/feed_forward_regularized.png b/doc/design/images/feed_forward_regularized.png
new file mode 100644
index 0000000000..677e99bfd9
Binary files /dev/null and b/doc/design/images/feed_forward_regularized.png differ
diff --git a/doc/design/images/graph_construction_example.bash b/doc/design/images/graph_construction_example.bash
new file mode 100755
index 0000000000..35e6997abd
--- /dev/null
+++ b/doc/design/images/graph_construction_example.bash
@@ -0,0 +1,11 @@
+cat ./graph_construction_example.dot | \
+    sed 's/color=red/color=red, style=invis/g' | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_only.png
+
+cat ./graph_construction_example.dot | \
+    sed 's/color=green/color=green, style=invis/g' | \
+    dot -Tpng > graph_construction_example_forward_backward.png
+
+cat ./graph_construction_example.dot | \
+    dot -Tpng > graph_construction_example_all.png
diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot
new file mode 100644
index 0000000000..e115f9844b
--- /dev/null
+++ b/doc/design/images/graph_construction_example.dot
@@ -0,0 +1,68 @@
+digraph ImageClassificationGraph {
+        ///////// The forward part /////////
+        FeedX [label="Feed", color=blue, shape=box];
+        FeedY [label="Feed", color=blue, shape=box];
+        InitW [label="Init", color=blue, shape=diamond];
+        Initb [label="Init", color=blue, shape=diamond];
+        FC [label="FC", color=blue, shape=box];
+        MSE [label="MSE", color=blue, shape=box];
+
+        x [label="x", color=blue, shape=oval];
+        l [label="l", color=blue, shape=oval];
+        y [label="y", color=blue, shape=oval];
+        W [label="W", color=blue, shape=doublecircle];
+        b [label="b", color=blue, shape=doublecircle];
+        cost [label="cost", color=blue, shape=oval];
+
+        FeedX -> x -> FC -> y -> MSE -> cost [color=blue];
+        FeedY -> l [color=blue];
+        InitW -> W [color=blue];
+        Initb -> b [color=blue];
+        W -> FC [color=blue];
+        b -> FC [color=blue];
+        l -> MSE [color=blue];
+
+        ////////// The backward part /////////
+        MSE_Grad [label="MSE_grad", color=red, shape=box];
+        FC_Grad [label="FC_grad", color=red, shape=box];
+
+        d_cost [label="d cost", color=red, shape=oval];
+        d_y [label="d y", color=red, shape=oval];
+        d_b [label="d b", color=red, shape=oval];
+        d_W [label="d W", color=red, shape=oval];
+
+        cost -> MSE_Grad [color=red];
+        d_cost -> MSE_Grad [color=red];
+        l -> MSE_Grad [color=red];
+        y -> MSE_Grad -> d_y [color=red];
+
+        x -> FC_Grad [color=red];
+        y -> FC_Grad [color=red];
+        d_y -> FC_Grad [color=red];
+        W -> FC_Grad -> d_W [color=red];
+        b -> FC_Grad -> d_b [color=red];
+
+        ////////// The optimization part //////////
+
+        OPT_W [label="SGD", color=green, shape=box];
+        OPT_b [label="SGD", color=green, shape=box];
+
+        W -> OPT_W [color=green];
+        b -> OPT_b [color=green];
+        d_W -> OPT_W -> W [color=green];
+        d_b -> OPT_b -> b [color=green];
+
+        ////////// Groupings //////////
+
+        subgraph clusterMSE {
+                style=invis;
+                MSE;
+                MSE_Grad;
+        }
+
+        subgraph clusterFC {
+                style=invis;
+                FC;
+                FC_Grad;
+        }
+}
diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png
new file mode 100644
index 0000000000..261611a572
Binary files /dev/null and b/doc/design/images/graph_construction_example_all.png differ
diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png
new file mode 100644
index 0000000000..4c69687f4a
Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png
new file mode 100644
index 0000000000..e668c16e0c
Binary files /dev/null and b/doc/design/images/graph_construction_example_forward_only.png differ
diff --git a/doc/design/images/l1_regularization.png b/doc/design/images/l1_regularization.png
new file mode 100644
index 0000000000..e1b9c7a44f
Binary files /dev/null and b/doc/design/images/l1_regularization.png differ
diff --git a/doc/design/images/l2_regularization.png b/doc/design/images/l2_regularization.png
new file mode 100644
index 0000000000..d5c2fcbc2c
Binary files /dev/null and b/doc/design/images/l2_regularization.png differ
diff --git a/doc/design/images/loss_equation.png b/doc/design/images/loss_equation.png
new file mode 100644
index 0000000000..14212ec8d3
Binary files /dev/null and b/doc/design/images/loss_equation.png differ
diff --git a/doc/design/images/multigpu_allreduce.graffle b/doc/design/images/multigpu_allreduce.graffle
new file mode 100644
index 0000000000..cb5bc420ce
Binary files /dev/null and b/doc/design/images/multigpu_allreduce.graffle differ
diff --git a/doc/design/images/multigpu_allreduce.png b/doc/design/images/multigpu_allreduce.png
new file mode 100644
index 0000000000..87a1b3e8f6
Binary files /dev/null and b/doc/design/images/multigpu_allreduce.png differ
diff --git a/doc/design/images/multigpu_before_convert.graffle b/doc/design/images/multigpu_before_convert.graffle
new file mode 100644
index 0000000000..6c35ab1b21
Binary files /dev/null and b/doc/design/images/multigpu_before_convert.graffle differ
diff --git a/doc/design/images/multigpu_before_convert.png b/doc/design/images/multigpu_before_convert.png
new file mode 100644
index 0000000000..9c8f771116
Binary files /dev/null and b/doc/design/images/multigpu_before_convert.png differ
diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png
new file mode 100644
index 0000000000..d57b71ca88
Binary files /dev/null and b/doc/design/images/profiler.png differ
diff --git a/doc/design/images/theta_star.gif b/doc/design/images/theta_star.gif
new file mode 100644
index 0000000000..dd24d33e12
Binary files /dev/null and b/doc/design/images/theta_star.gif differ
diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md
new file mode 100644
index 0000000000..d9d5397bec
--- /dev/null
+++ b/doc/design/infer_var_type.md
@@ -0,0 +1,78 @@
+# Design Doc: InferVarType
+
+## The Problem Posed
+
+A variable in our design can hold different types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+
+For example, a `lookup table` operator takes two `LoDTensor`s: one is a float tensor serving as the embedding table, and the other is an int tensor holding word IDs. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs, and will generate a `LoDTensor` if any of its inputs is a `LoDTensor`; otherwise, the `sum` operator will generate a `SelectedRows` as its output.
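+
+As a minimal sketch, the type-inference rule of the `sum` operator described above can be modeled in Python as follows; the string type names are illustrative only:
+
+```python
+def sum_infer_var_type(input_types):
+    # the output is a LoDTensor if any input is a LoDTensor,
+    # otherwise it is a SelectedRows
+    if any(t == "LoDTensor" for t in input_types):
+        return "LoDTensor"
+    return "SelectedRows"
+
+assert sum_infer_var_type(["SelectedRows", "LoDTensor"]) == "LoDTensor"
+assert sum_infer_var_type(["SelectedRows", "SelectedRows"]) == "SelectedRows"
+```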
+
+The variable type is constant at runtime. Every variable's type is either set by the user (for input data and parameters) or inferred by the operator at compile time.
+
+## Proposed Solution
+
+`InferVarType` is a compile-time function which is registered for each operator. The interface of that function is:
+
+
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+
+It takes an operator description as its input, infers the types of the output variables, and stores them in the block description.
+
+An `InferVarTypeFN` will be registered in `OpInfo`, stored in its `infer_var_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+
+The default `InferVarType` sets the output type to `LoDTensor`. It is returned by `GetInferVarType()` when no custom function has been registered.
+
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+
+## Register InferVarType
+
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+};
+```
+
+Operator developers can write a specialized `VarTypeInferer` as follows.
+
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+};
+```
+
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+
+```cpp
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
diff --git a/doc/design/kernel_hint_design.md b/doc/design/kernel_hint_design.md
new file mode 100644
index 0000000000..a54b7da045
--- /dev/null
+++ b/doc/design/kernel_hint_design.md
@@ -0,0 +1,57 @@
+## Problem
+In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one operator may have multiple kernels. Users may have a personal preference for a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to express this.
+
+In the current design, we use KernelType to describe one kernel.
+
+```cpp
+struct KernelType {
+  Place place_;
+  DataType data_type_;
+  LayoutType layout_;
+};
+```
+`place_`, `data_type_`, and `layout_` can be obtained from the input tensors of the operator. `GetActualKernelType(inputs)` uses the inputs to infer the proper kernel key that fits the incoming data, but users cannot configure it directly.
+
+The [design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) also provides a virtual method `GetExpectedKernelType` that users can override to choose the KernelType they want to use.
+
+So we should pass the user-defined information in the proto message to `GetExpectedKernelType` for choosing a kernel.
+
+The problem is: how should we define and pass the information for `GetExpectedKernelType` to use?
+
+## Solution
+
+### Potential choice
+1. Do nothing: let users add the information they want to the operator's attributes and read it inside `GetExpectedKernelType`. This works, but users may define many kinds of hints for the same purpose, such as `force_cpu`, `use_cpu`, and `cpu_kernel` to choose a CPU kernel, or `use_cudnn`, `force_cudnn`, and `cudnn_kernel` to choose a CUDNN kernel.
+
+2. Pre-define all the needed options under a single attribute key such as `kernel_hint`. This is not flexible enough if users want to define some new kind of hint.
+
+### Final choice
+To provide enough flexibility while avoiding confusing definitions, we can define some global constants for these attribute names, such as `force_cpu`, `use_cudnn`, and `use_mkldnn`, for users to choose from.
+
+In C++
+
+```cpp
+const std::string kForceCPU = "force_cpu";
+const std::string kUseCUDNN = "use_cudnn";
+const std::string kUseMKLDNN = "use_mkldnn";
+
+KernelType GetExpectedKernelType() {
+  if (Attr<bool>(kForceCPU)) {
+    return KernelType(CPUPlace, ...)
+  } else {
+    ...
+  }
+}
+```
+
+In Python code
+
+```python
+FORCE_CPU = core.kForceCPU()
+
+def xx_layer(..., force_cpu=False):
+  layer_helper = LayerHelper(...)
+  layer_helper.append_op(
+    type="xx",
+    attr={FORCE_CPU: force_cpu})
+```
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md
new file mode 100644
index 0000000000..1f68cef4cc
--- /dev/null
+++ b/doc/design/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In one of his lectures, Andrew Ng attributes the recent success of AI to a combination of the following:
+
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
+
+The following graph shows the details:
+
+![](images/deep_learning.png)
+
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator: 
+
+$y = \max(x, 0)$
+
+If variable x is not used by any other operator, we can perform the operation in place. In other words, the memory blocks of variable y and variable x will be the same. In-place operations save 50% of the memory occupancy immediately.
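+
+A small NumPy sketch of the in-place relu described above; writing the result back into x's buffer means no new memory is allocated for y:
+
+```python
+import numpy as np
+
+x = np.array([-1.0, 2.0, -3.0, 4.0], dtype="float32")
+y = np.maximum(x, 0, out=x)  # compute relu directly into x's buffer
+assert y is x                # y and x share the same memory block
+```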
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+Following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can share the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+Basic strategies alone are not enough. The prerequisite of memory optimization is to know whether a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. 
+
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register if a and b are never "in use" at the same time. Thus, many temporary variables can fit into few registers; if they don't all fit, the excess temporary variables can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. 
+
+We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analysis on a program, it is often useful to build a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+Liveness of a variable "flows" along the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* contains all the predecessors of node n, and *succ[n]* is the set of successors.
+In the control flow graph above, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of node 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We define the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines, and similarly for the *use* of a variable or graph node. In the control flow graph above, *def(3)* = {c} and *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following are the dataflow equations:
+
+![](images/dataflow_equations.png)
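+
+A sketch of solving these equations by fixed-point iteration, following the standard textbook algorithm; `succ`, `use`, and `defs` map each node to a set, as defined above:
+
+```python
+def solve_liveness(nodes, succ, use, defs):
+    live_in = {n: set() for n in nodes}
+    live_out = {n: set() for n in nodes}
+    changed = True
+    while changed:  # iterate until a fixed point is reached
+        changed = False
+        for n in nodes:
+            # live_out[n] is the union of live_in over all successors of n
+            out = set()
+            for s in succ[n]:
+                out |= live_in[s]
+            # live_in[n] = use[n] | (live_out[n] - def[n])
+            inn = use[n] | (out - defs[n])
+            if inn != live_in[n] or out != live_out[n]:
+                live_in[n], live_out[n] = inn, out
+                changed = True
+    return live_in, live_out
+```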
+
+### Memory optimization transpiler
+
+Finally, we combine the basic strategies with the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+class ControlFlowGraph(object):
+    def __init__(self, program):
+        self._successors = defaultdict(set)
+        self._predecessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+        self._program = program
+
+    def build(self):
+        # build nodes and edges from the operators in the program
+        pass
+
+    def dataflow_analysis(self):
+        # solve the dataflow equations to fill self._live_in/_live_out
+        pass
+
+    def memory_optimization(self):
+        # rewrite the program so that dead variables' memory is reused
+        pass
+
+    def get_program(self):
+        return self._program
+```
+
+#### Make dataflow analysis
+
+We follow the approach from compilers and solve the dataflow equations to get the liveness of every variable. If a variable is in the live-in set of an operator node but not in its live-out set, it dies at that node, and its memory can be reused.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, variables b and c are no longer live, so their memory can be recycled; after op2, variable a can be recycled; after op3, variables d and f can be recycled.
+
+#### memory sharing policy
+
+A memory pool is maintained during the memory optimization stage. Each operator node is scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy is taken to handle its input/output variables (a Python sketch follows the policy below):
+
+```
+if op.support_inplace():
+    i --> pool
+    pool --> o
+else:
+    pool --> o
+    i --> pool
+```
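+
+A Python sketch of this policy, where `pool` is a free list of memory blocks, and `op_supports_inplace` and `new_block` are hypothetical helpers:
+
+```python
+def assign_memory(op, inp, out, pool):
+    if op_supports_inplace(op):
+        pool.append(inp.block)  # i --> pool
+        out.block = pool.pop()  # pool --> o, may reuse i's block (in-place)
+    else:
+        # take a block for o before releasing i, so o never aliases i
+        out.block = pool.pop() if pool else new_block()  # pool --> o
+        pool.append(inp.block)  # i --> pool
+```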
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/design/mkl/image/engine.png b/doc/design/mkl/image/engine.png
new file mode 100644
index 0000000000..1f5f65c2cc
Binary files /dev/null and b/doc/design/mkl/image/engine.png differ
diff --git a/doc/design/mkl/image/gradients.png b/doc/design/mkl/image/gradients.png
new file mode 100644
index 0000000000..f031bcf8e4
Binary files /dev/null and b/doc/design/mkl/image/gradients.png differ
diff --git a/doc/design/mkl/image/layers.png b/doc/design/mkl/image/layers.png
new file mode 100644
index 0000000000..306f79b7a8
Binary files /dev/null and b/doc/design/mkl/image/layers.png differ
diff --git a/doc/design/mkl/image/matrix.png b/doc/design/mkl/image/matrix.png
new file mode 100644
index 0000000000..c33ce9cf03
Binary files /dev/null and b/doc/design/mkl/image/matrix.png differ
diff --git a/doc/design/mkl/image/overview.png b/doc/design/mkl/image/overview.png
new file mode 100644
index 0000000000..8fb7bbb9dd
Binary files /dev/null and b/doc/design/mkl/image/overview.png differ
diff --git a/doc/design/mkl/mkl_packed.md b/doc/design/mkl/mkl_packed.md
new file mode 100644
index 0000000000..0123315ad4
--- /dev/null
+++ b/doc/design/mkl/mkl_packed.md
@@ -0,0 +1,108 @@
+# Intel® MKL Packed on PaddlePaddle: Design Doc
+
+
+## Contents
+
+- [Overview](#overview)
+- [Key Points](#key-points) 
+   - [Background](#background)
+   - [Solution](#solution)
+- [Actions](#actions)
+    - [CMake](#cmake)
+    - [Layers](#layers)
+    - [Unit Tests](#unit-tests)
+    - [Python API](#python-api)
+    - [Benchmarking](#benchmarking)
+
+
+## Overview
+We plan to integrate the GEMM Packed APIs\[[1](#references)\] introduced in Intel® MKL into PaddlePaddle, to fully exploit the advantages of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
+The current optimization mainly targets the Recurrent Neural Network (RNN) related layers (including `RecurrentLayer`, `GatedRecurrentLayer` and `LstmLayer`), as well as the PaddlePaddle V1 API.
+
+## Key Points
+
+### Background
+PaddlePaddle currently uses the [cblas_?gemm](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm) function from the Intel® MKL library, which itself converts the input data into an internal format better suited to Intel platforms before the computation.
+
+1. Conversion overhead \
+This data format conversion (packing) is relatively time-consuming when the computation itself is small. For example, in the vanilla RNN part of DeepSpeech2 \[[2](#references)\], the matrix size is `batch_size * 2048`.
+2. Redundant conversion \
+In some existing scenarios (e.g. RNN), multiple calls to cblas_?gemm use the same source data, so repacking the same data on every call becomes redundant.
+
+To minimize the packing overhead of repeated cblas_?gemm calls, Intel® MKL introduces the following four APIs:
+   * [cblas_?gemm_alloc](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-alloc)
+   * [cblas_?gemm_pack](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-pack)
+   * [cblas_?gemm_compute](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-compute)
+   * [cblas_?gemm_free](https://software.intel.com/en-us/mkl-developer-reference-c-cblas-gemm-free)
+
+With these APIs, we can pack the source data first and then pass the packed data to the gemm_compute functions that reuse the same data, thereby avoiding redundant packing.
+
+### Solution
+In the RNN case, all time steps within one forward/backward pass share the same weight. During inference, successive forward passes also use the same weight, so there is no need to repack the weight at every time step of every forward pass.
+
+With the newly introduced GEMM Packed APIs, we pack the weights once at layer initialization, reuse the packed weights in the forward and backward passes, and repack the weights after each weight update for the next iteration.
+
+* Before the optimization, for a model with sequence length `T`, the number of conversions in `N` iterations is:
+  - `inference`: `N * T`
+  - `training`: `2 * N * T`
+* After the optimization, for the same model, the number of conversions is reduced to:
+  - `inference`: `1`
+  - `training`: `2 * N`
+
+## Actions
+
+The files and directory structure to be added are as follows:
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+└── paddle/
+    ├── ...
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   ├── MKLPackedRecurrentLayer.*
+        |   ├── MKLPackedGatedRecurrentLayer.*
+        |   ├── MKLPackedLstmLayer.*
+        |   └── MKLPackedGemm.h
+        └── tests/
+            ├── ...
+            └── test_MKLPacked.cpp
+```
+
+### CMake
+In the corresponding `CMakeLists.txt`, whether the MKL Packed features are enabled is decided by whether `WITH_MKL` is turned on.
+
+### Layers
+All `MKLPacked*Layer`s inherit from PaddlePaddle's base class `Layer`, and include the header file `MKLPackedGemm.h`, which wraps the relevant GEMM Packed APIs.
+
+### Unit Tests
+We will add `test_MKLPacked.cpp` to test the layers optimized with MKL Packed.
+For each newly added RNN layer, we will compare the following two aspects:
+1. the results of the optimized layer itself in sequence mode (`rnn_use_batch=false`) versus batch mode (`rnn_use_batch=true`);
+2. the results of the optimized layer versus the corresponding original PaddlePaddle layer, both in batch mode.
+
+### Python API
+We plan to add a `use_mkl_packed` flag in `paddle/utils.Flags` to choose whether to use this feature; it defaults to `true` when compiled with `WITH_MKL=ON`.
+
+Meanwhile, a `use_mkl_packed` option will be added to the corresponding layers in `python/paddle/trainer/config_parser.py`, so that users can enable this feature from the Python side.
+
+For example:
+
+```python
+use_mkl_packed = bool(int(g_command_config_args.get("use_mkl_packed", 0)))
+if use_mkl_packed:
+    self.layer_type = mkl_packed_*
+```
+
+All related `layer_type`s will start with *mkl_packed_*; this is guaranteed when the `MKLPacked*Layer`s register their layers, to distinguish them from the original layers.
+
+
+### Benchmarking
+Corresponding scripts will be added to test and compare the network performance before and after using the MKL Packed recurrent layers.
+
+## References 
+1. [Introducing the new Packed APIs for GEMM](https://software.intel.com/en-us/articles/introducing-the-new-packed-apis-for-gemm)
+2. [DeepSpeech2 on PaddlePaddle](https://github.com/PaddlePaddle/DeepSpeech#deepspeech2-on-paddlepaddle)
+
diff --git a/doc/design/mkl/mkldnn.md b/doc/design/mkl/mkldnn.md
new file mode 100644
index 0000000000..e2fe1e6b26
--- /dev/null
+++ b/doc/design/mkl/mkldnn.md
@@ -0,0 +1,210 @@
+# Intel® MKL-DNN on PaddlePaddle: Design Doc
+
+We plan to integrate the Intel deep neural network math library [Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks) into PaddlePaddle,
+to fully exploit the advantages of Intel platforms and effectively improve PaddlePaddle's performance on Intel architectures.
+
+<div align="center">
+<img src="image/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+
+Short-term goals:
+
+- Implement the commonly used layers with MKL-DNN.
+- Implement the common deep neural networks VGG, GoogLeNet and ResNet with MKL-DNN.
+
+The current optimization mainly targets the PaddlePaddle code framework before the refactorization, as well as the V1 API.
+The detailed completion status can be found [here](https://github.com/PaddlePaddle/Paddle/projects/21).
+
+## Contents
+
+- [Overview](#overview)
+- [Actions](#actions)
+    - [CMake](#cmake)
+    - [Matrix](#matrix)
+    - [Layers](#layers)
+    - [Activations](#activations)
+    - [Parameters](#parameters)
+    - [Gradients](#gradients)
+    - [Unit Tests](#unit-tests)
+    - [Python API](#python-api)
+    - [Benchmarking](#benchmarking)
+    - [Others](#others)
+- [Design Concerns](#design-concerns)
+
+## Overview
+
+MKL-DNN will be integrated into PaddlePaddle as a third-party library; like the other third-party libraries, it will be downloaded and built while compiling PaddlePaddle.
+
+In addition, to further speed up the basic math operations in PaddlePaddle, we will also integrate MKLML (the MKL small library\[[1](#references)\])
+as another third-party library; it only contains prebuilt dynamic libraries and header files.
+
+The relationship among MKL, MKLML and MKL-DNN is shown in the table below:
+
+| Name        |  Open Source     | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
+|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+
+MKLML can be used together with MKL-DNN to achieve the best performance.
+
+<div align="center">
+<img src="image/engine.png"><br/>
+Figure 2. PaddlePaddle with MKL Engines
+</div>
+
+## Actions
+
+The files and directory structure to be added are as follows:
+
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
+
+### CMake
+A master MKL switch, `WITH_MKL`, is provided in `CMakeLists.txt`; it decides whether MKLML and MKL-DNN are used at build time.
+
+- `WITH_MKLML` controls whether the MKLML library is used.
+When `WITH_MKL` is on, MKLML is automatically used as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML's performance.
+The corresponding header files and libraries are placed under `build/third_party/install/mklml/*` at build time.
+The MKLML libraries are currently all dynamic libraries, mainly `libiomp5.so` and `libmklml_intel.so`.
+- `WITH_MKLDNN` controls whether MKL-DNN is used.
+When `WITH_MKL` is on, whether to build MKL-DNN is decided automatically according to the hardware configuration [[2](#references)].
+The corresponding header files and libraries are placed under `build/third_party/install/mkldnn/*` at build time.
+MKL-DNN currently provides only the dynamic library `libmkldnn.so`.
+
+### Matrix
+Currently, data in PaddlePaddle is stored in the `NCHW` format, but MKL-DNN supports more than one data layout.
+So we define an `MKLDNNMatrix` to manage the different MKL-DNN data formats and the conversions between them.
+
+<div align="center">
+<img src="image/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
+
+### Layers
+All MKL-DNN layers inherit from `MKLDNNLayer`, which in turn inherits from PaddlePaddle's base class `Layer`.
+`MKLDNNLayer` provides the necessary interfaces and functions, and implements the basic `forward` and `backward` logic;
+subclasses only need to implement the concrete functionality using the predefined interfaces.
+
+<div align="center">
+<img src="image/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+
+Each MKLDNNLayer holds a set of MKLDNNMatrix objects for internal and external storage:
+
+- Internal memory: `inVal_`, `inGrad_`, `outVal_` and `outGrad_`, which hold the input value, input gradient, output value and output gradient, respectively.
+- External memory: all prefixed with ext, such as `extInVal_` and `extInGrad_`. They are mainly used to convert memory
+when the data format does not match PaddlePaddle's default `NCHW` format.
+Note that PaddlePaddle's activations directly use `output_.value` and `output_.grad`,
+so `extOutVal_` and `extOutGrad_` must share memory with `output_.value` and `output_.grad`, respectively;
+if no external storage is needed for conversion, the corresponding internal storage shares the memory with them instead.
+- Conversion functions (resetXXX): `resetInValue`, `resetInGrad`, `resetOutValue` and `resetOutGrad`,
+which convert the input value, input gradient, output value and output gradient, respectively.
+These functions reset the internal and external storage according to their arguments; the two may also be identical, meaning no conversion is needed.
+
+Note: each subclass of `MKLDNNLayer` only needs to use the internal storage; all the external conversion work is prepared by the reset functions.
+
+### Activations
+In PaddlePaddle before the refactorization, activation functions are a concept independent of `Layer`, and their input and output share the same memory,
+so we add a corresponding `MKLDNNActivation`, implemented in a way similar to `MKLDNNLayer`.
+
+### Parameters
+For layers with parameters, we guarantee that the parameters used by `MKLDNNLayer` share memory with the buffers allocated by PaddlePaddle.
+If the data layouts differ, we convert the parameters to the format MKL-DNN expects before training starts,
+and save them back in PaddlePaddle's format when training ends, so no conversion is needed during the whole training process.
+This keeps the saved parameter format consistent with PaddlePaddle while avoiding unnecessary conversions.
+
+### Gradients
+MKL-DNN operations always overwrite their outputs; that is, results are not accumulated onto the existing data.
+The benefit is that we do not need to keep clearing memory, which saves unnecessary operations.
+Note, however, that when the network has branches, the gradients coming from different layers must be accumulated during `backward`.
+So a merge method is implemented in `MKLDNNLayer`: the `Input Gradient` of each branch
+is first stored temporarily in an `MKLDNNMatrix`, and the layer at the branching point sums them and puts the result into its own `output_.grad`.
+Overall, subclass implementations therefore do not need to care about branching.
+
+<div align="center">
+<img src="image/gradients.png"><br/>
+Figure 5. Merge Gradients
+</div>
+
+### Unit Tests
+We will add `test_MKLDNN.cpp` and `MKLDNNTester.*` for MKL-DNN tests.
+The tests consist of unit tests for each layer (or activation) and end-to-end tests on simple networks.
+Each test compares the CPU results computed by PaddlePaddle with the MKL-DNN results; a test passes if the difference is below a small threshold.
+
+### Python API
+Only the **v1 API** is considered for now.
+
+We plan to add a `use_mkldnn` option in `python/paddle/trainer/config_parser.py`, so that users can choose to use the MKL-DNN layers.
+
+For example:
+
+```python
+use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+if use_mkldnn:
+    self.layer_type = mkldnn_*
+```
+
+All MKL-DNN `layer_type`s will start with *mkldnn_*; this is guaranteed when the `MKLDNN*Layer`s register their layers, to distinguish them from the original layers.
+
+Meanwhile, a `use_mkldnn` flag will be added in `paddle/utils.Flags` to choose whether to use the MKL-DNN related features.
+
+### Benchmarking
+Corresponding scripts will be added [here](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image) to test and compare the CNN performance before and after using MKL-DNN.
+The benchmark results will be published in [IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md).
+
+### Others
+1. When MKL-DNN is used, CPU buffers are aligned to 4096 bytes; see [memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673) in MKL-DNN for details.
+2. Dig into PaddlePaddle for other optimization opportunities, e.g. using OpenMP to improve the update performance of SGD.
+
+## Design Concerns
+
+To better conform to PaddlePaddle's coding style\[[3](#references)\] while sacrificing as little MKL-DNN performance as possible\[[4](#references)\],
+we summarize the following points that need special attention:
+
+1. Use **deviceId_**. To add as few variables or functions as possible to the base class Layer,
+we reuse the existing `deviceId_` variable to distinguish layer attributes, defining `-2` as the device ID dedicated to `MKLDNNLayer`.
+2. Override the base class Layer's **init** function and set `deviceId_` to `-2`, indicating that this layer runs in the MKL-DNN environment.
+3. Create `MKLDNNBase`, which defines classes and functions beyond those related to layers and memory,
+including `MKLDNNStream` and `CPUEngine` used by MKL-DNN, and possibly `FPGAEngine` in the future.
+4. If an MKL-DNN layer is followed by a CPU device, `output_.value` shares memory with `extOutVal_`,
+and the data format is `NCHW`, so that the next CPU device receives correct data.
+When ordinary CPU layers are present, the format of `extOutVal_` and `extOutGrad_` is always `NCHW` or `NC`.
+
+## References
+1. The [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application) is a subset of [Intel MKL](https://software.intel.com/en-us/mkl).
+It mainly contains the math primitives and operations related to deep learning, and is usually updated together with [new releases](https://github.com/01org/mkl-dnn/releases) of MKL-DNN.
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements).
+Currently, PaddlePaddle uses MKL-DNN only on machines that support the AVX2 instruction set or above.
+3. The [original proposal](https://github.com/PaddlePaddle/Paddle/pull/3096) would introduce **nextLayer** information.
+However, in PaddlePaddle, neither the layers before the refactorization nor the operators after it are supposed to know about the next layer/op.
+4. MKL-DNN's high-performance formats differ from PaddlePaddle's native `NCHW` (the cuDNN part of PaddlePaddle also uses `NCHW`, so this problem does not arise there).
+Therefore a conversion mechanism is needed, and the format should be converted only when necessary, to fully exploit MKL-DNN's performance.
diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md
new file mode 100644
index 0000000000..bef126f3f0
--- /dev/null
+++ b/doc/design/mkl/mkldnn_fluid.md
@@ -0,0 +1,149 @@
+# Design Doc: Add MKLDNN Kernel in Fluid Operator
+
+## Principles
+
+First of all, we should follow some basic principles:
+1.  [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc.
+2.  [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h).
+3.  [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure data synchronization between different kernel types, which is discussed in this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override the `GetExpectedKernelType` and `trans` functions to support switching kernels.
+4.  [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal concept which records the `Place`, `Library`, `DataType` and `Layout`.
+
+## Solution
+
+In general, there are four steps to run an MKL-DNN primitive:
+-  Create a primitive descriptor that describes this operator
+-  Create the primitive itself from the primitive descriptor and the engine
+-  Create all the memory buffers that the primitive needs
+-  Launch a stream to execute the created primitive
+
+More details can be found [here](http://01org.github.io/mkl-dnn).
+
+It's better to avoid reinitializing primitives and memory handles (the first three steps) in every iteration. \
+So we plan to create a map to record all the `primitive` and `memory` objects, which should not take too much memory, as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822).
+
+We assume that the following three conditions are satisfied:
+1. There is a unique key for each operator instance, which may be the actual name of the `Output Tensor`.
+2. The `Input Tensor` inside the `Compute` function is the one that has already been converted.
+3. We can get the phase (e.g. `is_test`) inside the `Compute` function; otherwise we need to expose this attribute to users.
+
+### Compute
+The algorithm of `Compute` is described below, taking conv as an example.
+
+```c++
+
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace.");
+  PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library.");
+
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+
+  // find primitive by unique key from mkldnn context
+  // the op_key should be a unique name of this op instance
+  auto& p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  // assuming the input tensor inside this compute function has already been converted;
+  // this should be guaranteed by another mechanism
+  auto& i = dev_ctx.findMemory(op_key + "_input");
+  
+  if (p == nullptr || i == nullptr || inputSizeChanged(p, i))  {
+    auto fwd_primitive_desc = createPrimitiveDesc(ctx);
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+    shared_ptr<mkldnn::memory> in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data<T>()));
+    shared_ptr<mkldnn::memory> wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data<T>()));
+    shared_ptr<mkldnn::memory> out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data<T>(ctx.GetPlace())));
+    shared_ptr<mkldnn::conv_fwd> fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out));
+
+    dev_ctx.addMemory(op_key+"_input", in);
+    dev_ctx.addMemory(op_key+"_output", out);
+    dev_ctx.addMemory(op_key+"_filer", wgt);
+    dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive);
+    dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_fwd");
+
+  PADDLE_ENFORCE(p, "Should have forward Primitive");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_input"), "Should have input memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_output"), "Should have output memory");
+  PADDLE_ENFORCE(dev_ctx.findMemory(op_key+"_filter"), "Should have filter memory");
+  PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_key+"_fwd_PD"), "Should have forward PrimitiveDesc");
+  dev_ctx.submit(p);
+  dev_ctx.execute();  // the conversion primitive should already be included.
+
+```
+
+The `createPrimitiveDesc` function returns the primitive descriptor of this operator, and would look like this:
+```c++
+  auto* input = ctx.Input<Tensor>("Input");
+  auto* filter = ctx.Input<Tensor>("Filter");
+  auto* output = ctx.Output<Tensor>("Output");
+  std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+  int groups = ctx.Attr<int>("groups");
+  algorithm algo = static_cast<algorithm>(ctx.Attr<int>("convolution_algorithm_option"));
+  prop_kind pk = ctx.Attr<bool>("is_test") ? prop_kind::forward_inference : prop_kind::forward_training;
+    
+  auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/);
+  shared_ptr<mkldnn::conv_fwd::primitive_desc> fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine()));
+
+  return fwd_primitive_desc;
+```
+
+### MKLDNNDeviceContext
+`MKLDNNDeviceContext` is very straightforward; it should contain some basic information such as the `stream`, the `engine`, and the maps needed.
+
+
+### mkldnn_helper
+Some functions would be put in `paddle/platform/mkldnn_helper.h`.
+- create MKLDNN memories
+- create MKLDNN primitives
+- error check function
+- etc
+
+
+### Kernel Switch
+We should `reorder` data whose layout differs when it comes from or goes to another device. The `GetExpectedKernelType` and `trans` functions can help us implement this.
+
+`GetExpectedKernelType` receives the context, so that the operator can return the best `KernelType`.
+`trans` would look like this:
+
+```c++
+void trans(inputs, ctx) override {
+  if (NoNeedTrans()) {
+    return;
+  }
+  // find reorder primitive by op_key from context
+  auto& dev_ctx = ctx.template device_context<platform::MKLDNNDeviceContext>();
+  auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  auto& i = dev_ctx.findMemory(op_key + "_src_input");
+
+  if (p == nullptr || i == nullptr || changeSized(i, input)) {
+    auto prim = createPrimitiveDesc(ctx);
+    auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data);
+    auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes());
+    auto dst = createMemory(p->expected_desc(), newbuffer->data);
+    auto reorder_primitive(new mkldnn::reorder(src, dst));
+
+    dev_ctx.addMemory(op_key+"_src_input", src);
+    dev_ctx.addMemory(op_key+"_input", dst);
+    dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive);
+  }
+
+  p = dev_ctx.findPrimitive(op_key + "_reorder_input");
+  PADDLE_ENFORCE(p, "Should have Reorder Primitive");
+  dev_ctx.submit(p);
+  if (! this->isMKLDNNKernel()) {
+    // execute immediately only if this is not mkldnn kernel function.
+    // otherwise, it can be executed with the operator primitive in Compute
+    dev_ctx.stream();
+  }
+  // after submit, the input tensor in the ExecutionContext should be replaced by the converted one;
+  // another mechanism should ensure this
+}
+```
+
+### Unit Test
+All the functions should be tested correspondingly.
+TBD
diff --git a/doc/design/model_format.md b/doc/design/model_format.md
new file mode 100644
index 0000000000..e29129fddf
--- /dev/null
+++ b/doc/design/model_format.md
@@ -0,0 +1,36 @@
+# Design Doc: Model Format
+
+## Motivation
+
+A model is an output of the training process. One complete model consists of two parts, the **topology** and the **parameters**. In order to support industrial deployment, the model format must be self-complete and must not expose any training source code.
+
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model. We must support large parameters and efficient serialization/deserialization of parameters.
+
+## Implementation
+
+The topology is saved as plain text in a detailed, self-contained protobuf file.
+
+The parameters are saved as a binary file. A protobuf message has a size limit of [64M](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We have done a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), which shows that protobuf is not fit for this task.
+
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), described by a proto message [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte-string header. It contains all the necessary information, such as the `dims` and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a continuous memory buffer. For speed, we dump the raw memory to disk and save it as the byte-string content. The binary format of one tensor is therefore as follows.
+
+The table below shows a tensor's byte view in detail. Note that all the signed values are written in the little-endian format.
+
+|field name  | type | description |
+| --- | --- | --- |
+| version | uint32_t | Version of saved file. Always 0 now. |
+| tensor desc length | uint32_t | TensorDesc(Protobuf message) length in bytes. |
+| tensor desc | void* | TensorDesc protobuf binary message |
+| tensor data | void* | Tensor's data in binary format. The length of `tensor_data` is decided by `TensorDesc.dims()` and `TensorDesc.data_type()` |
+| lod_level | uint64_t | Level of LoD |
+| length of lod[0] | uint64_t | [Optional] length of lod[0] in bytes. |
+| data of lod[0] | uint64_t*  | [Optional] lod[0].data() |
+| ... | ... | ... |
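+
+A minimal Python sketch (not the actual Fluid saver) of writing one tensor in this byte layout; `desc_bytes` is the serialized TensorDesc protobuf message and `data_bytes` is the raw tensor memory:
+
+```python
+import struct
+
+def save_tensor(f, desc_bytes, data_bytes, lod=()):
+    f.write(struct.pack("<I", 0))                # version, always 0 now
+    f.write(struct.pack("<I", len(desc_bytes)))  # tensor desc length
+    f.write(desc_bytes)                          # TensorDesc binary message
+    f.write(data_bytes)                          # raw tensor data
+    f.write(struct.pack("<Q", len(lod)))         # lod_level
+    for level in lod:                            # optional LoD levels
+        level_bytes = struct.pack("<%dQ" % len(level), *level)
+        f.write(struct.pack("<Q", len(level_bytes)))  # length of lod[i] in bytes
+        f.write(level_bytes)                          # lod[i].data()
+```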
+
+
+
+## Summary
+
+- We introduce a model format.
+- The model, represented by its forward-pass computation procedure, is saved in a **ProgramDesc** protobuf message.
+- A bunch of binary tensors in the specified format describe the **parameters**.
diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md
new file mode 100644
index 0000000000..f86e6b7a56
--- /dev/null
+++ b/doc/design/operator_kernel_type.md
@@ -0,0 +1,91 @@
+# Design Doc: The Keys of Operator Kernel Type
+## Problem
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  proto::DataType data_type_;
+};
+```
+For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
+
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
+
+We often implement an operator's kernel with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence: a device can support many computing libraries, and a computing library can also support different devices.
+
+For example, the Eigen library supports Nvidia GPU, AMD GPU and CPU, while the MKLDNN library supports Intel CPU and Intel FPGA. Both `Place` and `Library` should be keys of `OpKernelType`.
+
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
+
+## Solution
+
+There are four keys to determine a kernel type of an operator: `Place`/`Library`/`DataType`/`Layout`.
+
+```cpp
+struct OpKernelType {
+  platform::Place place_;
+  platform::Library library_;
+  proto::DataType data_type_;
+  framework::Layout layout_;
+};
+```
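+
+Conceptually, kernel registration and selection behave like a dictionary keyed by the tuple of these four fields. A Python sketch of the idea (all names are hypothetical):
+
+```python
+kernels = {}  # one such map per operator type
+
+def register_kernel(place, library, data_type, layout, fn):
+    kernels[(place, library, data_type, layout)] = fn
+
+def choose_kernel(place, library, data_type, layout):
+    # all four keys must match for a kernel to be found
+    return kernels[(place, library, data_type, layout)]
+
+register_kernel("CUDAPlace", "CUDNN", "fp32", "kNCHW", lambda x: x)
+kernel = choose_kernel("CUDAPlace", "CUDNN", "fp32", "kNCHW")
+```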
+
+The details are as follows:
+
+### Place
+
+`Place` is defined as:
+
+```cpp
+typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
+```
+
+`Place` represents the device memory where data is located.
+
+
+### Library
+
+One operator kernel is usually implemented based on one library. `Library` is defined as an enum variable:
+
+```cpp
+enum Library { Plain, MKLDNN, CUDNN };
+```
+
+We use the `Plain` enumerator to represent the default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains the handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle, and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
+
+If we want to support a new library, a new enumerator needs to be added to `Library`, and a corresponding new `LibraryDeviceContext` needs to be created.
+
+
+### DataType
+
+
+`DataType` is defined in [framework.proto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). Currently, int32/int64/fp32/fp64 are supported.
+
+### Layout
+
+Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also need some other descriptions of this block of memory, such as shape (ddim), stride, and layout.
+
+Different layouts lead to different implementations of the operator kernel. There are mainly 4 principles we have to follow to support layouts in our Fluid framework.
+
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, the memory formats in MKLDNN will also be added into this enum variable.
+
+- Users have to set the layout for input data, and some operators, like fill_constant/random, also have to set the layout for the data they generate. Of course, we can have some default layout, like NCHW.
+
+- The inference of Layout is at run-time, not at compile-time.
+
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
+
+`Layout` is also defined as an enum variable:
+
+```cpp
+enum Layout {
+  kNCHW,
+  kNHWC,
+#ifdef PADDLE_WITH_MKLDNN
+  knChw8c
+  ...
+#endif
+};
+```
diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot
new file mode 100644
index 0000000000..5d77865061
--- /dev/null
+++ b/doc/design/ops/images/2_level_rnn.dot
@@ -0,0 +1,56 @@
+digraph G {
+
+  rnn [label="1st level RNN" shape=box]
+
+  subgraph cluster0 {
+    label = "time step 0"
+
+    sent0 [label="sentence"]
+    sent1 [label="sentence"]
+
+    rnn1 [label="2nd level RNN" shape=box]
+
+    sent0 -> rnn1
+    sent1 -> rnn1
+  }
+
+  subgraph cluster1 {
+    label = "time step 1"
+
+    sent2 [label="sentence"]
+    sent3 [label="sentence"]
+
+    rnn2 [label="2nd level RNN" shape=box]
+
+    sent2 -> rnn2
+    sent3 -> rnn2
+  }
+
+  subgraph cluster2 {
+    label = "time step 2"
+
+    sent4 [label="sentence"]
+    sent5 [label="sentence"]
+
+    rnn3 [label="2nd level RNN" shape=box]
+
+    sent4 -> rnn3
+    sent5 -> rnn3
+  }
+
+
+  para0 [label="paragraph info 0"]
+  para1 [label="paragraph info 1"]
+  para2 [label="paragraph info 2"]
+
+  rnn1 -> para0
+  rnn2 -> para1
+  rnn3 -> para2
+
+  para0 -> rnn
+  para1 -> rnn
+  para2 -> rnn
+
+  chapter [label="chapter info"]
+  rnn -> chapter
+}
diff --git a/doc/design/ops/images/2_level_rnn.png b/doc/design/ops/images/2_level_rnn.png
new file mode 100644
index 0000000000..0537a75beb
Binary files /dev/null and b/doc/design/ops/images/2_level_rnn.png differ
diff --git a/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg
new file mode 100644
index 0000000000..8b0d90f7b9
Binary files /dev/null and b/doc/design/ops/images/LOD-and-shape-changes-during-decoding.jpg differ
diff --git a/doc/design/ops/images/rnn.dot b/doc/design/ops/images/rnn.dot
new file mode 100644
index 0000000000..c1141cd9c9
--- /dev/null
+++ b/doc/design/ops/images/rnn.dot
@@ -0,0 +1,87 @@
+digraph G {
+  label = "simple RNN implementation" 
+
+  ranksep=2;
+
+  //graph [nodesep=1, ranksep=1];
+
+  node[nodesep=1]
+
+  subgraph cluster0 {
+    label = "global scope"
+    rankdir = TB
+    W
+    boot_memory
+    input
+    output
+  }
+
+  subgraph cluster1 {
+    label = "step-scope 0"
+    rankdir = TB
+    memory0[label="memory"]
+    prememory0[label="pre-memory"]
+    step_input0[label="step input"]
+    step_output0[label="step output"]
+  }
+
+  subgraph cluster2 {
+    label = "step-scope 1"
+    rankdir = TB
+    memory1[label="memory"]
+    prememory1[label="pre-memory"]
+    step_input1[label="step input"]
+    step_output1[label="step output"]
+  }
+
+  subgraph cluster3 {
+    label = "step-scope 2"
+    rankdir = TB
+    memory2[label="memory"]
+    prememory2[label="pre-memory"]
+    step_input2[label="step input"]
+    step_output2[label="step output"]
+  }
+
+  stepnet [shape=box]
+  stepnet0 [shape=box, style=dashed]
+  stepnet1 [shape=box, style=dashed]
+  stepnet2 [shape=box, style=dashed]
+
+
+  edge[color=blue]
+  boot_memory -> prememory0 [label="init" color="blue"]
+  memory0 -> prememory1  [label="copy/reference" color="blue"]
+  memory1 -> prememory2 [label="copy/reference" color="blue"]
+
+  edge[color=black]
+  W -> stepnet0[constraint=false, style=dashed]
+  W -> stepnet1[constraint=false, style=dashed]
+  W -> stepnet2[constraint=false, style=dashed]
+
+  memory0 -> stepnet0[style=dashed]
+  prememory0 -> stepnet0 -> step_output0[style=dashed]
+
+  memory1 -> stepnet1[style=dashed]
+  prememory1 -> stepnet1 -> step_output1[style=dashed]
+
+  memory2 -> stepnet2[style=dashed]
+  prememory2 -> stepnet2 -> step_output2[style=dashed]
+
+  input -> step_input0
+  input -> step_input1
+  input -> step_input2
+
+  step_input0 -> stepnet0 [style=dashed]
+  step_input1 -> stepnet1[style=dashed]
+  step_input2 -> stepnet2[style=dashed]
+
+  step_output0 -> output
+  step_output1 -> output
+  step_output2 -> output
+
+  stepnet0 -> stepnet[style=dashed]
+  stepnet1 -> stepnet[style=dashed]
+  stepnet2 -> stepnet[style=dashed]
+
+}
diff --git a/doc/design/ops/images/rnn.jpg b/doc/design/ops/images/rnn.jpg
new file mode 100644
index 0000000000..9867e404cf
Binary files /dev/null and b/doc/design/ops/images/rnn.jpg differ
diff --git a/doc/design/ops/images/rnn.png b/doc/design/ops/images/rnn.png
new file mode 100644
index 0000000000..e139e373fe
Binary files /dev/null and b/doc/design/ops/images/rnn.png differ
diff --git a/doc/design/ops/images/rnn_2level_data.dot b/doc/design/ops/images/rnn_2level_data.dot
new file mode 100644
index 0000000000..1d85ae2617
--- /dev/null
+++ b/doc/design/ops/images/rnn_2level_data.dot
@@ -0,0 +1,75 @@
+digraph G {
+  chapter [label="chapter"]
+
+  subgraph cluster0 {
+    label = "paragraph 0"
+
+    top_rnn0[label="top rnn step 0" shape=box]
+
+    p0 [label="paragraph 0"]
+    p1 [label="paragraph 1"]
+  }
+
+  subgraph cluster1{
+    label = "paragraph 1"
+
+    top_rnn1[label="top rnn step 1" shape=box]
+
+    p2 [label="paragraph 0"]
+    p3 [label="paragraph 1"]
+  }
+
+  subgraph cluster_p0 {
+    label = "sentence 0"
+
+    low_rnn0 [label="low rnn step 0" shape=box]
+    s00 [label="sentence 0"]
+    s01 [label="sentence 1"]
+
+    low_rnn0 -> s00
+    low_rnn0 -> s01
+  }
+
+  subgraph cluster_p1 {
+    label = "sentence 1"
+    low_rnn1 [label="low rnn step 1" shape=box]
+    s10 [label="sentence 0"]
+    s11 [label="sentence 1"]
+    low_rnn1 -> s10
+    low_rnn1 -> s11
+  }
+
+  subgraph cluster_p2 {
+    label = "sentence 1"
+    low_rnn2 [label="low rnn step 0" shape=box]
+    s20 [label="sentence 0"]
+    s21 [label="sentence 1"]
+    low_rnn2 -> s20
+    low_rnn2 -> s21
+  }
+
+  subgraph cluster_p3 {
+    label = "sentence 1"
+    low_rnn3 [label="low rnn step 1" shape=box]
+    s30 [label="sentence 0"]
+    s31 [label="sentence 1"]
+    low_rnn3 -> s30
+    low_rnn3 -> s31
+  }
+
+
+  chapter -> top_rnn0
+  chapter -> top_rnn1
+
+  top_rnn0 -> p0
+  top_rnn0 -> p1
+  top_rnn1 -> p2
+  top_rnn1 -> p3
+
+
+  p0 -> low_rnn0
+  p1 -> low_rnn1
+  p2 -> low_rnn2
+  p3 -> low_rnn3
+
+}
diff --git a/doc/design/ops/images/rnn_2level_data.png b/doc/design/ops/images/rnn_2level_data.png
new file mode 100644
index 0000000000..4be81b2430
Binary files /dev/null and b/doc/design/ops/images/rnn_2level_data.png differ
diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md
new file mode 100644
index 0000000000..2f4854793f
--- /dev/null
+++ b/doc/design/ops/rnn.md
@@ -0,0 +1,153 @@
+# RNNOp design
+
+This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future.
+
+## RNN Algorithm Implementation
+
+<p align="center">
+<img src="./images/rnn.jpg"/>
+</p>
+
+The above diagram shows an RNN unrolled into a full network.
+
+There are several important concepts here:
+
+- *step-net*: the sub-graph that runs at each step.
+- *memory*: $h_t$, the state of the current step.
+- *ex-memory*: $h_{t-1}$, the state of the previous step.
+- *initial memory value*: the memory of the first (initial) step.
+
+### Step-scope
+
+There could be local variables defined in each step-net.  PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step.
+
+<p align="center">
+<img src="./images/rnn.png"/><br/>
+Figure 2 illustrates the RNN's data flow
+</p>
+
+Please be aware that every step runs the same step-net.  Each step does the following:
+
+1. Creates the step-scope.
+2. Initializes the local variables including step-outputs, in the step-scope.
+3. Runs the step-net, which uses the above mentioned variables.
+
+The RNN operator will compose its output from step outputs in each of the step scopes.
+
+### Memory and Ex-memory
+
+Let's give more details about memory and ex-memory using a simple example:
+
+$$
+h_t = U h_{t-1} + W x_t
+$$
+
+where $h_t$ and $h_{t-1}$ are the memory and the ex-memory (previous memory) of step $t$ respectively.
+
+In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step,
+or copy the memory value of the previous step to the current ex-memory variable.
+
+### Usage in Python
+
+For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+We can define an RNN's step-net using a Block:
+
+```python
+import paddle as pd
+
+X = some_op() # X is some operator's output and is a LoDTensor
+a = some_op()
+
+# declare parameters
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+rnn = pd.create_rnn_op(output_num=1)
+with rnn.stepnet():
+    x = rnn.add_input(X)
+    # declare a memory (the RNN's state)
+    h = rnn.add_memory(init=a)
+    # h.pre_state(), the previous memory of rnn
+    new_state = pd.add_two(pd.matmul(W, x), pd.matmul(U, h.pre_state()))
+    # update current memory
+    h.update(new_state)
+    # indicate that h variables in all step scopes should be merged
+    rnn.add_outputs(h)
+
+out = rnn()
+```
+
+Python API functions in the above example:
+
+- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs.
+- `rnn.add_memory`: creates a variable used as the memory.
+- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output.
+
+### Nested RNN and LoDTensor
+
+An RNN whose step-net includes other RNN operators is known as a *nested RNN*.
+
+For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level.
+
+The following figure illustrates feeding text into the lower level, one sentence per step, and feeding the lower level's step outputs into the top level. The final top-level output describes the whole text.
+
+<p align="center">
+<img src="./images/2_level_rnn.png"/>
+</p>
+
+```python
+import paddle as pd
+
+W = pd.Variable(shape=[20, 30])
+U = pd.Variable(shape=[20, 30])
+
+W0 = pd.Variable(shape=[20, 30])
+U0 = pd.Variable(shape=[20, 30])
+
+# a is output of some op
+a = some_op()
+
+# chapter_data is a set of 128-dim word vectors
+# the first level of LoD is sentence
+# the second level of LoD is a chapter
+chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2)
+
+def lower_level_rnn(paragraph):
+    '''
+    paragraph: the input paragraph data
+    '''
+    rnn = pd.create_rnn_op(output_num=1)
+    with rnn.stepnet():
+        sentence = rnn.add_input(paragraph, level=0)
+        h = rnn.add_memory(shape=[20, 30])
+        h.update(
+            pd.matmul(W, sentence) + pd.matmul(U, h.pre_state()))
+        # get the last state as sentence's info
+        rnn.add_outputs(h)
+    return rnn
+
+top_level_rnn = pd.create_rnn_op(output_num=1)
+with top_level_rnn.stepnet():
+    paragraph_data = top_level_rnn.add_input(chapter_data, level=1)
+    low_rnn = lower_level_rnn(paragraph_data)
+    paragraph_out = low_rnn()
+
+    h = top_level_rnn.add_memory(init=a)
+    h.update(
+        pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state()))
+    top_level_rnn.add_outputs(h)
+
+# output the last step
+chapter_out = top_level_rnn(output_all_steps=False)
+```
+
+In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoDTensor. The top level RNN segments the input text data into paragraphs, and the lower level RNN segments each paragraph into sentences.
+
+By default, the `RNNOp` will concatenate the outputs from all the time steps.
+If `output_all_steps` is set to False, it will only output the final time step.
+
+
+<p align="center">
+<img src="images/rnn_2level_data.png"/>
+</p>
diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md
new file mode 100644
index 0000000000..c4a9bbeeef
--- /dev/null
+++ b/doc/design/ops/sequence_decoder.md
@@ -0,0 +1,229 @@
+# Design: Sequence Decoder Generating LoDTensors
+In tasks such as machine translation and visual captioning,
+a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time.
+
+This documentation describes how to implement the sequence decoder as an operator.
+
+## Beam Search based Decoder
+The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set.
+
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search. Due to the complexity involved, the implementation relies on a lot of special data structures that are hard for users to customize.
+
+There are a lot of heuristic tricks in sequence generation tasks, so the flexibility of the sequence decoder is very important to users.
+
+During the refactoring of PaddlePaddle, some new concepts were proposed, such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md), that can better support sequence usage; they can also help make the implementation of the beam search based sequence decoder **more transparent and modular**.
+
+For example, the RNN states, candidate IDs and probabilities of beam search can all be represented as `LoDTensors`;
+the selected candidates' IDs in each time step can be stored in a `TensorArray`, and `Pack`ed into the translated sentences.
+
+## Changing LoD's absolute offsets to relative offsets
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers, each of which represents a level.
+
+The integers in each level represent the begin and end (not inclusive) offsets of a sequence **in the underlying tensor**;
+let's call this format the **absolute-offset LoD** for clarity.
+
+The absolute-offset LoD can retrieve any sequence very quickly, but fails to represent empty sequences. For example, consider the following two-level LoD:
+```python
+[[0, 3, 9]
+ [0, 2, 3, 3, 3, 9]]
+```
+The first level tells us that there are two sequences:
+- the first's offset is `[0, 3)`
+- the second's offset is `[3, 9)`
+
+while on the second level, there are several empty sequences that both begin and end at `3`.
+It is impossible to tell how many empty second-level sequences belong to each first-level sequence.
+
+There are many scenarios that rely on empty sequence representation; for example, in machine translation or visual captioning, an instance may have no translation, or the candidate set of a prefix may be empty.
+
+So let's introduce another format of LoD,
+the **relative-offset** LoD, which stores **the offsets of the lower level sequences**.
+
+For example, the same sequences as above are represented as
+
+```python
+[[0, 2, 5]
+ [0, 2, 3, 3, 3, 9]]
+```
+
+the first level represents that there are two sequences,
+whose offsets in the second-level LoD are `[0, 2)` and `[2, 5)`.
+
+The second level is the same as in the absolute-offset LoD because its lower level is the underlying tensor.
+It is now easy to see that the second sequence in the first-level LoD contains two empty sequences.
+
+The following examples are based on relative-offset LoD.
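+
+A small Python sketch showing how, under the relative-offset convention, the empty second-level sequences become visible:
+
+```python
+lod = [[0, 2, 5],           # first level: offsets into the level below
+       [0, 2, 3, 3, 3, 9]]  # second level: offsets into the tensor
+
+def sequences(level):
+    return [(level[i], level[i + 1]) for i in range(len(level) - 1)]
+
+# the second top-level sequence covers lower-level sequences [2, 5)
+begin, end = sequences(lod[0])[1]
+empties = [s for s in sequences(lod[1])[begin:end] if s[0] == s[1]]
+print(empties)  # [(3, 3), (3, 3)] -- the two empty sequences
+```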
+
+## Usage in a simple machine translation model
+Let's start with a simple machine translation model, simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation), to draw a blueprint of what a sequence decoder can do and how to use it.
+
+The model has an encoder that learns the semantic vector from a sequence, and a decoder that uses the semantic vector to generate new sentences.
+
+**Encoder**
+```python
+import paddle as pd
+
+dict_size = 8000
+source_dict_size = dict_size
+target_dict_size = dict_size
+word_vector_dim = 128
+encoder_dim = 128
+decoder_dim = 128
+beam_size = 5
+max_length = 120
+
+# encoder
+src_word_id = pd.data(
+    name='source_language_word',
+    type=pd.data.integer_value_sequence(source_dict_size))
+src_embedding = pd.embedding(size=source_dict_size, dim=word_vector_dim)
+
+src_word_vec = pd.lookup(src_embedding, src_word_id)
+
+encoder_out_seq = pd.gru(input=src_word_vec, size=encoder_dim)
+
+encoder_ctx = pd.last_seq(encoder_out_seq)
+# encoder_ctx_proj is the learned semantic vector
+encoder_ctx_proj = pd.fc(
+    encoder_ctx, size=decoder_dim, act=pd.activation.Tanh(), bias=None)
+```
+
+**Decoder**
+
+```python
+def generate():
+    decoder = pd.while_loop()
+    with decoder.step():
+        decoder_mem = decoder.memory(init=encoder_ctx)  # mark the memory
+        generated_ids = decoder.memory() # TODO init to batch_size <s>s
+        generated_scores = decoder.memory() # TODO init to batch_size 1s or 0s
+
+        target_word = pd.lookup(trg_embedding, generated_ids)
+        # expand encoder_ctx's batch to fit target_word's lod
+        # for example
+        # decoder_mem.lod is
+        # [[0 1 3],
+        #  [0 1 3 6]]
+        # its tensor content is [a1 a2 a3 a4 a5]
+        # which means there are 2 sentences to translate
+        #   - the first sentence has 1 translation prefix, its offset is [0, 1)
+        #   - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6)
+        # the target_word.lod is
+        # [[0, 1, 6]
+        #  [0, 2, 4, 7, 9, 12]]
+        # which means there are 2 sentences to translate; the first has 1 prefix and the second has 5
+        # the first prefix has 2 candidates
+        # the following has 2, 3, 2, 3 candidates
+        # the encoder_ctx_expanded's content will be
+        # [a1 a1 a2 a2 a3 a3 a3 a4 a4 a5 a5 a5]
+        encoder_ctx_expanded = pd.lod_expand(encoder_ctx, target_word)
+        decoder_input = pd.fc(
+            act=pd.activation.Linear(),
+            input=[target_word, encoder_ctx_expanded],
+            size=3 * decoder_dim)
+        gru_out, cur_mem = pd.gru_step(
+            decoder_input, mem=decoder_mem, size=decoder_dim)
+        scores = pd.fc(
+            gru_out,
+            size=trg_dic_size,
+            bias=None,
+            act=pd.activation.Softmax())
+        # K is a config value (the number of candidates kept per prefix)
+        topk_scores, topk_ids = pd.top_k(scores, K)
+        topk_generated_scores = pd.add_scalar(topk_scores, generated_scores)
+
+        selected_ids, selected_generation_scores = decoder.beam_search(
+            topk_ids, topk_generated_scores)
+
+        # update the states
+        decoder_mem.update(cur_mem)  # tells how to update state
+        generated_ids.update(selected_ids)
+        generated_scores.update(selected_generation_scores)
+
+        decoder.output(selected_ids)
+        decoder.output(selected_generation_scores)
+
+translation_ids, translation_scores = decoder()
+```
+`decoder.beam_search` is an operator that, given the candidates and the scores of the translations that include those candidates,
+returns the result of the beam search algorithm.
+
+In this way, users can customize anything on the input or output of beam search, for example:
+
+1. Set the corresponding elements in `topk_generated_scores` to zero or some small values, so that beam_search will discard those candidates.
+2. Remove some specific candidates in `selected_ids`.
+3. Get the final `translation_ids` and remove unwanted translation sequences from it.
+
+The implementation of the sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30),
+so the Python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop).
+
+Both of them are two-level `LoDTensors`:
+
+- The first level represents `batch_size` of (source) sentences.
+- The second level represents the candidate ID set for each translation prefix.
+
+For example, there might be 3 source sentences to translate, with 2, 3, and 1 candidates respectively.
+
+Unlike an RNN, in a sequence decoder the previous state and the current state have different LoDs and shapes, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state.
+
+For example, the previous state:
+
+* LoD is `[0, 1, 3][0, 2, 5, 6]`
+* content of tensor is `a1 a2 b1 b2 b3 c1`
+
+the current state is stored in `encoder_ctx_expanded`:
+
+* LoD is `[0, 2, 7][0, 3, 5, 8, 9, 11, 11]`
+* the content is
+  - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times, once for each candidate)
+  - a2 a2
+  - b1 b1 b1
+  - b2
+  - b3 b3
+  - None (c1 has 0 candidates, so c1 is dropped)
+
+The benefit of the relative-offset LoD is that empty candidate sets can be represented naturally.
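+
+A Python sketch of what `lod_expand` does in this example (purely illustrative; the real operator works on LoDTensors):
+
+```python
+def lod_expand(states, candidate_counts):
+    # copy each prefix state once per candidate; a prefix with zero
+    # candidates contributes nothing, so its state is dropped
+    expanded = []
+    for state, n in zip(states, candidate_counts):
+        expanded.extend([state] * n)
+    return expanded
+
+prev = ["a1", "a2", "b1", "b2", "b3", "c1"]
+print(lod_expand(prev, [3, 2, 3, 1, 2, 0]))
+# ['a1', 'a1', 'a1', 'a2', 'a2', 'b1', 'b1', 'b1', 'b2', 'b3', 'b3']
+```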
+
+The states in each time step can be stored in a `TensorArray`, and `Pack`ed into a final LoDTensor. The corresponding syntax is:
+
+```python
+decoder.output(selected_ids)
+decoder.output(selected_generation_scores)
+```
+
+The `selected_ids` are the candidate ids for the prefixes; they will be `Pack`ed by `TensorArray` into a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents the generated sequences.
+
+Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate.
+
+Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation.
+
+## LoD and shape changes during decoding
+<p align="center">
+  <img src="./images/LOD-and-shape-changes-during-decoding.jpg"/>
+</p>
+
+According to the image above, the only phase that changes the LoD is beam search.
+
+## Beam search design
+The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs:
+
+1. `topk_ids`, the top K candidate ids for each prefix.
+2. `topk_scores`, the corresponding scores for `topk_ids`.
+3. `generated_scores`, the scores of the prefixes.
+
+All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix; one such step is sketched after the list below.
+
+It will return three variables:
+
+1. `selected_ids`, the candidates that the beam search function selected for the next step.
+2. `selected_scores`, the scores for the candidates.
+3. `generated_scores`, the updated scores for each prefix (with the new candidates appended).
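+
+A minimal Python sketch of one such step for a single source sentence (plain lists stand in for the LoDTensors; all names are hypothetical):
+
+```python
+def beam_search_step(topk_ids, topk_scores, prefix_scores, beam_size):
+    # topk_ids/topk_scores: per-prefix candidate lists
+    # prefix_scores: one accumulated score per prefix
+    selected_ids, selected_scores, updated_scores = [], [], []
+    for ids, scores, base in zip(topk_ids, topk_scores, prefix_scores):
+        merged = sorted(zip(scores, ids), reverse=True)[:beam_size]
+        selected_ids.append([wid for _, wid in merged])
+        selected_scores.append([s for s, _ in merged])
+        # the prefix score grows by the chosen candidate's score
+        updated_scores.append([base + s for s, _ in merged])
+    return selected_ids, selected_scores, updated_scores
+```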
+
+## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray`
+The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step,
+so it is natural to store them in arrays.
+
+Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. It is better to store the results of beam search in a `TensorArray`.
+
+The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array into a `LoDTensor` or to split a `LoDTensor` into an array of tensors.
+Some extensions are needed to support packing or unpacking an array of `LoDTensors`.
diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
new file mode 100644
index 0000000000..691081c268
--- /dev/null
+++ b/doc/design/optimizer.md
@@ -0,0 +1,91 @@
+## Optimizer Design
+
+### The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+
+These works rely on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they only needed to describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+
+
+### High-level Python API to describe the training process
+
+1. Users write code to describe the network:
+
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some argument.
+
+	```python
+	optimizer = AdagradOptimizer(learning_rate=0.001)
+	```
+
+3. Users use the optimizer to `minimize` a certain `cost` by updating parameters in a parameter list.
+
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in the Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+
+4. Users use the Session/Executor to run this opt_op_list as the target to do training.
+
+	```python
+	sess.run(target=opt_op_list, ...)
+	```
+
+#### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+
+    """
+
+    def __init__(self):
+        pass
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          optimization_op_list: a list of optimization operators that will update parameters using gradients.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines `create_backward_pass()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
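+
+For instance, a plain SGD optimizer might be sketched as follows (the `sgd_op` helper that creates the actual update operator is hypothetical):
+
+```python
+class SGDOptimizer(Optimizer):
+    def __init__(self, learning_rate):
+        super(SGDOptimizer, self).__init__()
+        self.learning_rate = learning_rate
+
+    def create_optimization_pass(self, parameters_and_grads):
+        # one update operator per (parameter, gradient) pair
+        return [
+            sgd_op(param, grad, self.learning_rate)
+            for param, grad in parameters_and_grads
+        ]
+```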
diff --git a/doc/design/paddle_nccl.md b/doc/design/paddle_nccl.md
new file mode 100644
index 0000000000..c7dac70998
--- /dev/null
+++ b/doc/design/paddle_nccl.md
@@ -0,0 +1,65 @@
+# Design Doc: NCCL support in Paddle Fluid
+
+## Abstract
+
+This design doc describes the NCCL support in PaddlePaddle. We propose an approach to support the NCCL library on both a single machine and multiple machines. We wrap the NCCL primitives `Broadcast`, `Allreduce` and `Reduce` as operators to utilize multi-GPU power in one script.
+
+
+## Motivation
+
+[NCCL](https://developer.nvidia.com/nccl) is an NVIDIA library that supports multi-GPU communication and is optimized for NVIDIA GPUs. It provides routines such as all-gather, all-reduce, broadcast, reduce and reduce-scatter that can achieve high bandwidth over PCIe and the NVLink high-speed interconnect. With the NCCL library, we can easily accelerate training in parallel.
+
+- Pros
+1. easy to plug in the [NCCL2](https://developer.nvidia.com/nccl) library.
+1. high performance on NVIDIA GPUs.
+1. MPI-like primitives, which have a low learning cost for users.
+
+- Cons
+1. Designed only for NVIDIA GPUs, not a general multi-device solution.
+1. Although NCCL1 is open-sourced under the BSD license, NCCL2 is no longer open source.
+
+At the beginning of training, the framework needs to distribute the same parameters to every GPU, and merge the gradients whenever the user requires.
+
+As a result, during training we need operations for peer-to-peer copies between different GPUs, for aggregating gradients/parameters from GPUs, and for broadcasting parameters to GPUs. Every GPU only needs to run the operator with the correct place information.
+
+Besides, we need interfaces to synchronize model updates across the different GPU cards.
+
+## Implementation
+
+As mentioned above, we wrap the NCCL routines as several kinds of operators. Note that NCCL needs to create a communicator among the GPUs at the beginning, so a NCCLInit operator is created.
+
+### Transpiler
+
+To be compatible with the [parameter server design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md), the transpiler compiles the user-defined operation graph into sub-graphs to be executed on different devices.
+
+1. The user-defined model will be a single-device program.
+
+2. Broadcast/Reduce operators between GPUs will be inserted into the program; for the multi-node case, `Send` and `Recv` operators may be inserted as well.
+
+   *Broadcast, AllReduce in a single machine. And Broadcast, AllReduce, [Send, Recv](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/ops/dist_train.md#graph-converter) in multiple machines*
+
+   <img src="images/multigpu_before_convert.png" width="300"/>
+
+After compiling, the graph is as shown below.
+
+<img src="images/multigpu_allreduce.png" width="1000"/>
+
+Operators are added to the sub-graphs. Every GPU is assigned a role such as `rank0`, `rank1`, etc.
+
+- **Broadcast**. The Broadcast operator distributes initialized parameters to all the GPUs from the GPU that owns them, e.g., from the `rank0` GPU.
+- **AllReduce**. The AllReduce operator synchronizes parameters/gradients between GPUs. AllReduce is implemented with the ring-based communication method, avoiding the bottleneck of a single GPU.
+
+Note that the AllReduce operator forces the GPUs to synchronize at that point. Whether the whole training process is asynchronous or synchronous depends on where the AllReduce points are in the graph.
+
+As shown in the picture, each GPU computes the gradient of `W`; the following `AllReduce` operator accumulates `dW` over the full batch of data; then each GPU runs the optimization process individually and applies the gradient to its `W`.
+
+- **AllReduce**
+  Note that our AllReduce operator is a ring-based AllReduce implementation. If we used the NCCL2 AllReduce primitive directly, every GPU would optimize the full batch of data, wasting (n-1) GPUs' compute resources. In addition, the NCCL2 built-in AllReduce only utilizes the communication resources during synchronization, and updating the gradients would be a subsequent phase. In fact, we can amortize the gradient update time into the communication phase. The process is:
+1. Every parameter has its root card. That card is responsible for aggregating the gradients from the other GPUs.
+2. The model's parameters are hashed to different root cards to ensure load balance between GPUs.
+3. Logically neighboring cards send their partial gradients to the next card. After one round, each root card has aggregated the full gradient of its parameter.
+4. Then the root card optimizes the parameter.
+5. The root card sends its optimized result to its neighbor, and each neighbor forwards the parameter to the next card.
+6. The synchronization round finishes.
+
+The total time cost will be 2 * (n-1) * per-parameter-send-time, so we reach the goal of amortizing the update time into the communication phase. For example, with 4 GPUs a parameter takes 2 * 3 = 6 sends: 3 to aggregate the gradient at the root card and 3 to propagate the optimized value back.
diff --git a/doc/design/parameter_average.md b/doc/design/parameter_average.md
new file mode 100644
index 0000000000..2c4edee9fe
--- /dev/null
+++ b/doc/design/parameter_average.md
@@ -0,0 +1,72 @@
+# Averaging Parameter in PaddlePaddle
+
+## Why Averaging
+In a large-scale machine learning setup where the size of the training data is huge, it could take a large number of iterations over the training data before we achieve the optimal parameter values of our model. Looking at the problem setup, it is desirable to obtain the optimal values of parameters by going through the data in as few passes as possible.
+
+Polyak and Juditsky (1992) showed that the test performance of simple average of parameters obtained by Stochastic Gradient Descent (SGD) is as good as that of parameter values that are obtained by training the model over and over again, over the training dataset.
+
+Hence, to accelerate the speed of Stochastic Gradient Descent, Averaged Stochastic Gradient Descent (ASGD) was proposed in Polyak and Juditsky (1992). For ASGD, the running average of parameters obtained by SGD is used as the estimator for <img src="./images/theta_star.gif"/><br/>. The averaging is done as follows:
+
+<img src="./images/asgd.gif" align="center"/><br/>
+
+We propose averaging for any optimizer similar to how ASGD performs it, as mentioned above.
+
+### How to perform Parameter Averaging in PaddlePaddle
+
+Parameter Averaging in PaddlePaddle works in the following way during training:
+1. It will take in an instance of a normal optimizer as an input, e.g. RMSPropOptimizer
+2. The optimizer itself is responsible for updating the parameters.
+3. The ParameterAverageOptimizer maintains a separate copy of the parameters for itself:
+    1. In concept, the values of this copy are the average of the values of the parameters in the most recent N batches.
+    2. However, saving all the N instances of the parameters in memory is not feasible.
+    3. Therefore, an approximation algorithm is used.
+
+Hence, overall we have two copies of the parameters: one for the optimizer itself, and one for the ParameterAverageOptimizer. The former should be used in back propagation, while the latter should be used during testing and should be saved.
+
+During the testing/model-saving phase, we perform the following steps:
+1. Perform the delayed operations.
+2. Save current values of the parameters to a temporary variable.
+3. Replace the values of the parameters with the averaged values.
+4. Perform testing and/or save the parameters.
+5. Restore the values of the parameters once done.
+
+### How to implement Averaging of Parameter in PaddlePaddle
+
+We can add the ParameterAverageOptimizer op to the graph through Python API. Using this approach, we manually add this op to the graph and direct the output of the optimizer op to this op during training.
+
+**Advantages**:
+- Allows for greater flexibility to the users of PaddlePaddle. Using this approach, the users can plug different optimizers into ParameterAverageOptimizer by passing in the optimizer to the op.
+- Makes it easy for the users to customize and extend the framework.
+
+**Disadvantages**:
+- Implementation requires re-writing the averaging methodology in Python.
+
+### Low-Level implementation
+
+In the new design, we propose to create a new operation for averaging parameter updates (ParameterAverageOptimizer). For now, we can add an op that takes in the following as input:
+- the optimizer
+- the window_size to keep the updates
+
+The ParameterAverageOptimizer op can be like any other operator with its own CPU/GPU implementation, either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement the kernel using Eigen, following the abstraction pattern implemented for [Operators](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.h). We also want to support the case where the Trainer/Optimizer runs on the GPU while the ParameterAverageOptimizer runs on a CPU.
+
+The idea of building an op for averaging is in sync with the refactored PaddlePaddle philosophy of using operators to represent any computation unit. The way the op will be added to the computation graph will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API.
+
+### Python API implementation for ParameterAverageOptimizer
+
+Based on Polyak and Juditsky (1992), we can generalize the averaging of updates to any optimizer. The input to the op would be the following:
+- Any optimizer (RMSProp , AdaGrad etc.)
+- A window size. The op keeps accumulating updated parameter values over a window of N batches and takes their average. When the window is full, the averaged value is moved to a buffer to avoid loss of precision; see the sketch below.
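+
+A scalar-valued Python sketch of this window-plus-buffer accumulation (the real op works on parameter tensors; all names are hypothetical):
+
+```python
+class WindowedAverager(object):
+    """Sketch of the accumulate-then-fold idea described above."""
+
+    def __init__(self, window_size):
+        self.window_size = window_size
+        self.window = []       # parameter values of the current window
+        self.folded_sum = 0.0  # sums of previously filled windows
+        self.count = 0         # total number of values accumulated
+
+    def update(self, param_value):
+        self.window.append(param_value)
+        self.count += 1
+        if len(self.window) == self.window_size:
+            # fold the full window into one number so that we never
+            # keep N copies of the parameters in memory
+            self.folded_sum += sum(self.window)
+            self.window = []
+
+    def average(self):
+        return (self.folded_sum + sum(self.window)) / self.count
+```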
+
+Using the ParameterAverageOptimizer op, any user can add the operation to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support averaging. As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since ParameterAverageOptimizer will be an operator, it makes sense to create it in the layer functions.
+We will have a wrapper written in Python that will support the functionality and implement the actual core computation in the C++ core, as we have done for other [Optimizers](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/rmsprop_op.cc).
+
+#### Creation of the ParameterAverageOptimizer operator
+There are two ways for creating the ParameterAverageOptimizer op:
+1. We create the op immediately while building the computation graph.
+2. We add the op in a lazy manner, just before the backward pass, similar to the way the optimization ops are added.
+
+The proposal is to add the op immediately while building the computation graph.
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide parameter average functionality in layer functions.
diff --git a/doc/design/parameters_in_cpp.md b/doc/design/parameters_in_cpp.md
index b6f99bc7d9..a7ac3f17c4 100644
--- a/doc/design/parameters_in_cpp.md
+++ b/doc/design/parameters_in_cpp.md
@@ -1,19 +1,19 @@
 # Design Doc: The C++ Class `Parameters`
 
-`Parameters` is a concept we designed in Paddle V2 API. `Parameters` is a container of parameters, and make Paddle can shared parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
+`Parameters` is a concept we designed in PaddlePaddle V2 API. `Parameters` is a container of parameters, which makes PaddlePaddle capable of  sharing parameter between topologies. We described usages of `Parameter` in [api.md](./api.md).
 
-We used Python to implement Parameters when designing V2 API before. There are several defects for current implementation:
+We used Python to implement Parameters when designing V2 API before. There are several defects for the current implementation:
 * We just use `memcpy` to share Parameters between topologies, but this is very inefficient. 
-* We did not implement share Parameters while training. We just trigger `memcpy` when start training.
+* We did not support sharing Parameters while training. We just trigger `memcpy` when training starts.
 
-It is necessary that we implement Parameters in CPP side. However, it could be a code refactoring for Paddle, because Paddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In current Paddle implementation, there are three concepts associated with `Parameters`:
+It is necessary that we implement Parameters on the C++ side. However, it could result in a code refactoring for PaddlePaddle, because PaddlePaddle was designed for training only one topology before, i.e., each GradientMachine contains its Parameter as a data member. In the current PaddlePaddle implementation, there are three concepts associated with `Parameters`:
 
 1. `paddle::Parameter`. A `Parameters` is a container for `paddle::Parameter`.
 It is evident that we should use `paddle::Parameter` when developing `Parameters`.
 However, the `Parameter` class contains many functions and does not have a clear interface.
 It contains `create/store Parameter`, `serialize/deserialize`, `optimize(i.e SGD)`, `randomize/zero`.
 When we developing `Parameters`, we only use `create/store Parameter` functionality.
-We should extract functionalities of Parameter into many classes to clean Paddle CPP implementation.
+We should extract functionalities of Parameter into many classes to clean PaddlePaddle CPP implementation.
 
 2. `paddle::GradientMachine` and its sub-classes, e.g., `paddle::MultiGradientMachine`, `paddle::NeuralNetwork`.
 We should pass `Parameters` to `paddle::GradientMachine` when `forward/backward` to avoid `memcpy` between topologies.
@@ -24,7 +24,7 @@ Also, we should handle multi-GPU/CPU training, because `forward` and `backward`
 So `Parameters` should be used by `paddle::ParameterUpdater`, and `paddle::ParameterUpdater` should optimize `Parameters` (by SGD).
 
 
-The step by step approach for implementation Parameters in Paddle C++ core is listed below. Each step should be a PR and could be merged into Paddle one by one.
+The step by step approach for implementation Parameters in PaddlePaddle C++ core is listed below. Each step should be a PR and could be merged into PaddlePaddle one by one.
 
 1. Clean `paddle::Parameter` interface. Extract the functionalities of `paddle::Parameter` to prepare for the implementation of Parameters.
 
diff --git a/doc/design/profiler.md b/doc/design/profiler.md
new file mode 100644
index 0000000000..b20b5efdc1
--- /dev/null
+++ b/doc/design/profiler.md
@@ -0,0 +1,97 @@
+## Introduction
+
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Like most of them, PaddlePaddle uses C++, CUDA and Python as the basic programming languages, and runs on both CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyze CUDA programs. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) on profiling CPU and Python programs with [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof), but these profile only the CPU and Python parts. For [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit, and developers usually want to collect the time of each operator to locate bottlenecks. `nvprof` collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory sets and CUDA API calls, as well as events or metrics for CUDA kernels, while `yep` and `Google's perftools` can't collect the timeline of a CUDA program. None of these tools can collect time at the operator level, so we designed this profiling tool.
+
+## Architecture
+
+The workflow for most tasks is as follows: each operator runs many times across all iterations, so the profiler must collect the total time of each operator over the iterations. Moreover, developers may sometimes want to collect more detailed time spans inside an operator, or record time spans elsewhere, so the profiler must support nested time spans. In order to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline for each thread. In addition, the profiler itself occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
+
+```python
+for i in xrange(M):  # M is  the iteration number
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
+    op.run();
+```
+
+In summary, the profiler should have the following features:
+
+- records time spans in loops.
+- supports nested time spans.
+- supports multiple threads/multiple GPUs.
+- can be enabled and disabled by users.
+
+But how do we record the time for a mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program if there is no synchronization after them. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we design and apply events to record the timeline, then summarize and present statistics based on these events.
+
+The overall flow is shown as the following figure.
+
+<img src="./images/profiler.png" align="center"/><br/>
+
+### Event
+
+In the above workflow, a pair of events is needed before and after a piece of code to collect its time. So an event has a flag to mark whether it is a starting or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, to indicate where profiling starts or ends. So there are three kinds of events:
+
+```c++
+enum EventKind {
+  kMark,
+  kPushRange,
+  kPopRange
+};
+```
+- kMark: only a marker without a time range.
+- kPushRange: marks the starting event of a time range.
+- kPopRange: marks the ending event of a time range.
+
+For CPU code, the events only need to record the current time. For CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. Event lists are used to record the many pieces of code being profiled.
+
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get the current CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when the profiler is disabled. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled, 
+  kCPU,
+  kCUDA
+};
+ProfilerState g_state;
+```
+- kDisabled: the disabled state.
+- kCPU: CPU profiling state.
+- kCUDA: GPU profiling state.
+
+A pair of starting and ending events is pushed to the event lists in the constructor and destructor of `RecordEvent`. So the timeline is recorded for the code within the lifecycle of a `RecordEvent` object.
+
+```c++
+struct RecordEvent {
+  explicit RecordEvent(const std::string name,
+                       platform::DeviceContext* dev_ctx = nullptr) {
+    if (g_state == ProfilerState::kDisabled) return;
+    // push the starting event to the event lists.
+  }
+  ~RecordEvent() {
+    if (g_state == ProfilerState::kDisabled) return;
+    // push the ending event to the event lists.
+  }
+};
+```
diff --git a/doc/design/program.md b/doc/design/program.md
new file mode 100644
index 0000000000..bd2456787c
--- /dev/null
+++ b/doc/design/program.md
@@ -0,0 +1,139 @@
+# Design Doc: PaddlePaddle Programs
+
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
+
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
+
+```python
+x = layer.data("images")
+l = layer.data("label")
+y = layer.fc(x)
+cost = layer.mse(y, l)
+optimize(cost)
+train(cost, reader=mnist.train())
+```
+
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
+
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, like a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from ordinary programming languages.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
+}
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  # block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straightforward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
+message BlockDesc {
+  required int32 parent = 1;
+  repeated VarDesc vars = 2;
+  repeated OpDesc ops = 3;
+}
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
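+
+With this flattened array, resolving a variable name from inside a block walks the parent chain. A Python sketch (assuming each block object exposes its `parent` ID and a `vars` map, and that the global block's parent ID is -1):
+
+```python
+def find_var(blocks, block_id, name):
+    # search the current block first, then its ancestors
+    while block_id >= 0:
+        block = blocks[block_id]
+        if name in block.vars:
+            return block.vars[name]
+        block_id = block.parent
+    raise KeyError(name)
+```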
+
+
+### Global Block
+
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
+message OpDesc {
+  AttrDesc attrs = 1;
+  ...
+}
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
+
+```
+message AttrDesc {
+  required string name = 1;
+
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+
+  optional int32 block = 10; // when type == BLOCK
+  ...
+}
+```
+
+## InferShape
+
+With this design, the InferShape function should take the following parameters:
+
+```c++
+void InferShape(int current_block,
+                int current_operator,
+                ProgramDesc* program // might change VarDesc values.
+                ) {
+  ...
+}
+```
+
+where
+
+- `current_block` indexes into `ProgramDesc::blocks`,
+- `current_operator` indexes into `BlockDesc::ops`.
diff --git a/doc/design/prune.md b/doc/design/prune.md
new file mode 100644
index 0000000000..4a5cf10c79
--- /dev/null
+++ b/doc/design/prune.md
@@ -0,0 +1,63 @@
+# Prune
+
+## Motivation
+
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement a
+`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
+and generates a pruned `ProgramDesc`.
+
+## Challenge
+
+Pruning needs to support both variables and operators as evaluation targets. Consider the following
+different situations.
+
+```python
+# Case 1: run forward pass.
+cost_np = session.run(target=cost)
+# Case 2: run backward pass.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+
+## Solution
+
+To support evaluation of operators, we add an `is_target` field to the `OpDesc`.
+
+```c++
+message OpDesc {
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
+```
+
+To support evaluation of variables, we add a [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with the `variable` being
+the `fetch_op`'s input. Then we also mark the `fetch_op` as a target.
+
+### Algorithm
+
+If an operator needs to be run, it must fall into one of the following cases:
+
+1. It is the target.
+2. Some other op depends on it, i.e., its output is some other op's input.
+
+The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
+
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+```
+
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
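+
+A Python sketch of this pass over one block's operators, walking them in reverse so that the dependencies of kept operators are discovered before the operators that produce them are visited (illustrative only; see the linked C++ code for the real implementation):
+
+```python
+def prune(ops):
+    dependent_vars = set()
+    kept = []
+    for op in reversed(ops):
+        if op.is_target or any(out in dependent_vars for out in op.outputs):
+            # keep the op and mark its inputs as needed
+            dependent_vars.update(op.inputs)
+            kept.append(op)
+    kept.reverse()  # restore the original execution order
+    return kept
+```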
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
new file mode 100644
index 0000000000..73f6d7b90c
--- /dev/null
+++ b/doc/design/python_api.md
@@ -0,0 +1,304 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(object):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block_idx = 0      # initialized to the global block
+
+    def global_block(self):
+        return self.blocks[0]
+
+    def current_block(self):
+        return self.blocks[self.current_block_idx]
+
+    def rollback(self):
+        self.current_block_idx = self.current_block().parent_idx
+
+    def create_block(self):
+        new_block_idx = len(self.blocks)
+        self.blocks.append(Block(self, self.current_block_idx))
+        self.current_block_idx = new_block_idx
+        return self.current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space: the `InferShape` function is in C++ and manipulates `VarDesc` messages, which are members of `BlockDesc`, which is in turn a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(object):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        return self.program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created from within a sub-block, for example, by an FC layer in the step block of an RNN operator.
+
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<string, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+
+    def type(self):
+        return self.desc.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as their inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape=None,      # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space each write to a variable is represented by its own `Variable` instance.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape=None,      # tuple
+                 dtype="float32", # string
+                 lod_level=None,  # int
+                 trainable=True,  # bool
+                 initialize_op_attrs=None,
+                 optimize_op_attrs=None):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend_operator(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs))
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    "type": "uniform_random",
+    "min": -1.0,
+    "max": 1.0,
+  })
+```
+
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator:
+
+```python
+init_attr={
+  "type": "load",
+  "filename": "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Function
+
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+
+Layer functions take `Variable`s and configuration parameters as their input and return the output variable(s).
+
+For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+
+
+### Necessity for reusing code between layer functions
+
+There is a lot of code that can be reused across layer functions, for example:
+
+* Giving default configuration values, e.g., the default initialization strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialization strategy for a bias is to fill it with zeros.
+* Appending the activation operator.
+* Creating a temporary variable.
+* Creating a parameter.
+* Generating a unique name.
+* Adding a bias.
+* ...
+
+A mechanism to reuse code between layer functions is necessary. It would take around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) to write a `FullyConnected` layer without any helper functions.
+
+
+
+### Comparison between global functions and a helper class
+
+The `FullyConnected` layer would look as follows if we provided global helper functions:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+    mul_results.append(tmp)
+
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+
+We could provide many helper functions for layer developers. However, global helper functions have several disadvantages:
+
+1. We need a namespace for these methods so that layer developers can quickly figure out which methods they can use.
+2. Global functions force layer developers to pass the same parameters over and over again.
+
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` layer will look as follows:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(**locals())  # pass all parameters to LayerHelper
+
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+
+Not only do we use fewer lines of code to write `fc_layer`, but the code also becomes clearer and easier to understand. At the same time, layer developers can discover which functions they can invoke by typing `helper.` in a Python editor.
+
+
+### Implementation of layer helper
+
+We keep all parameters of a layer function in a dictionary, stored as a private data member of the layer helper. Every method of the layer helper looks up this dictionary when it is invoked. This way, we can implement one layer helper for all layer functions, even though some layers do not contain some operators. For example, the `activation` operator is used by the `FullyConnected` and convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is:
+
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+
+    tmp = self.create_tmp_variable()
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+
+### Return value of layer functions
+
+A layer will return a Variable, which is also the output of an operator.  However, the outputs of a layer function carry more attributes than those of an operator: there are parameter variables, and their gradient variables need to be returned as well. Returning them is useful because, for example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter. For example, setting `param.stop_gradient=True` makes a parameter stop generating its gradient, which lets us fix the parameter value during training.
+
+Still, it is good for a layer to return a single Variable, since all layers and operators use Variables as their parameters. We can simply attach a `param` field and a `grad` field to the returned variable, since Python is dynamically typed.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
+
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
index f21f7af520..2cd4b6225b 100644
--- a/doc/design/reader/README.md
+++ b/doc/design/reader/README.md
@@ -1,25 +1,25 @@
 # Python Data Reader Design Doc
 
-At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+During the training and testing phases, PaddlePaddle programs need to read data. To help users write data-reading code, we define the following:
 
-- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
-- A *reader creator* is a function that returns a reader function.
-- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
-- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
+- A *reader creator*: A function that returns a reader function.
+- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
+- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
 
-and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
+and also provide a function which can convert a reader to a batch reader, as well as frequently used reader creators and reader decorators.
 
 ## Data Reader Interface
 
-Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
+*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
 
 ```
 iterable = data_reader()
 ```
 
-Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
+The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
 
-An example implementation for single item data reader creator:
+An example implementation for single item data reader creator is as follows:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
     return reader
 ```
 
-An example implementation for multiple item data reader creator:
+An example implementation for multiple item data reader creator is as follows:
 ```python
 def reader_creator_random_image_and_label(width, height, label):
     def reader():
@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
 
 ## Batch Reader Interface
 
-*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
+
+Here are some valid outputs:
 
-Here are valid outputs:
 ```python
 # a mini batch of three data items. Each data item consists of three columns of data, each of which is 1.
 [(1, 1, 1),
@@ -52,26 +53,28 @@ Here are valid outputs:
 # a mini batch of three data items, each data item is a list (single column).
 [([1,1,1],),
 ([2,2,2],),
-([3,3,3],),
+([3,3,3],)]
 ```
 
 Please note that each item inside the list must be a tuple, below is an invalid output:
 ```python
  # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
- # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
- # or three column of datas, each of which is 1.
+ # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
 [[1,1,1],
 [2,2,2],
 [3,3,3]]
 ```
 
-It's easy to convert from reader to batch reader:
+It is easy to convert from a reader to a batch reader:
+
 ```python
 mnist_train = paddle.dataset.mnist.train()
 mnist_train_batch_reader = paddle.batch(mnist_train, 128)
 ```
 
-Also easy to create custom batch reader:
+It is also straightforward to create a custom batch reader:
+
 ```python
 def custom_batch_reader():
     while True:
@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
 
 ## Usage
 
-batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
+The batch reader, a mapping from item(s) to data layer, the batch size and the total number of passes are passed into `paddle.train` as follows:
 
 ```python
 # two data layers are created:
@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
 
 ## Data Reader Decorator
 
-*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
 
-Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
+Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
 
 ### Prefetch Data
 
-Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
 
 Use `paddle.reader.buffered` to prefetch data:
 
@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
 
 ### Compose Multiple Data Readers
 
-For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+For example, we may want to use a source of real images (say, reusing the mnist dataset) and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
 
-We can do:
+We can do the following:
 
 ```python
 def reader_creator_random_image(width, height):
@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
 
 reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
 # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
-# And we don't care second item at this time.
+# And we don't care about the second item at this time.
 paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
 ```
 
 ### Shuffle
 
-Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
 
 Example:
 ```python
@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
 
 ## Q & A
 
-### Why reader return only a single entry, but not a mini batch?
+### Why does a reader return only a single entry, and not a mini batch?
 
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
+Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead of a single entry, the training code will be more complicated because it needs to handle cases like a batch size of 2).
 
-We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.
 
-### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+### Why do we need a batch reader? Isn't it sufficient to give the reader and batch_size as arguments during training?
 
-In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
+In most cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases, using a batch reader is very efficient and helpful.
 
-### Why use a dictionary but not a list to provide mapping?
+### Why use a dictionary instead of a list to provide mapping?
 
-We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
 
-### How to create custom data reader creator
+### How to create a custom data reader creator?
 
 ```python
 def image_reader_creator(image_path, label_path, n):
@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
 
 ### How is `paddle.train` implemented
 
-An example implementation of paddle.train could be:
+An example implementation of paddle.train is:
 
 ```python
 def train(batch_reader, mapping, batch_size, total_pass):
diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md
new file mode 100644
index 0000000000..f93d6155e1
--- /dev/null
+++ b/doc/design/refactorization.md
@@ -0,0 +1,249 @@
+# Design Doc: Refactorization Overview
+
+The goals of refactoring include:
+
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and automatic fault recovery for distributed computing, with the help of computation graphs.
+
+## Computation Graphs
+
+1. PaddlePaddle represents the computation (training and inference) of deep learning models by computation graphs.
+
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
+
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
+
+1. A graph is composed of *variables* and *operators*.
+
+1. The description of graphs must be serializable/deserializable, so that:
+
+   1. It can be sent to the cloud for distributed execution, and
+   1. It can be sent to clients for mobile or enterprise deployment.
+
+1. The Python program does two things:
+
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
+      1. the C++ library `libpaddle.so` for local execution,
+      1. the master process of a distributed training job for training, or
+      1. the server process of a Kubernetes serving job for distributed serving.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
+
+## Description and Realization of Computation Graph
+
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
+
+At runtime, the C++ program realizes the graph and runs it.
+
+| | Representation (protobuf messages) | Realization (C++ class objects) |
+|---|---|---|
+|Data|[VarDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L107)|[Variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24)|
+|Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
+|Block|BlockDesc|Block|
+
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables, similar to a C++/Java program block or a pair of braces (`{` and `}`).
+
+## Compilation and Execution
+
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
+
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
+
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
+
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+      1. realize local variables defined in the BlockDesc message in the new scope,
+      1. a scope is similar to the stack frame in programming languages,
+
+   1. Create an instance of class `Block`, in which,
+      1. realize operators in the BlockDesc message,
+
+   1. Run the Block by calling
+      1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
+      1. `Block::Eval(vector<Operator>* targets)` for optimization.
+
+
+## Intermediate Representation (IR)
+
+```text
+Compile Time -> IR -> Runtime
+```
+
+### Benefits of IR
+
+- Optimization
+  ```text
+  Compile Time -> IR -> Optimized IR -> Runtime
+  ```
+- Automatically send partitioned IR to different nodes.
+  - Automatic Data Parallelism
+    ```text
+    Compile Time
+    |-> Single GPU IR
+        |-> [trainer-IR-0, trainer-IR-1, pserver-IR]
+            |-> Node-0 (runs trainer-IR-0)
+            |-> Node-1 (runs trainer-IR-1)
+            |-> Node-2 (runs pserver-IR)
+    ```
+  - Automatic Model Parallelism (planned for future)
+
+---
+
+# Operator/OpWithKernel/OpKernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
+
+---
+
+# Operator
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
+
+* `Operator` is the fundamental building block of the user interface.
+    * Operator stores input/output variable names and attributes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+    * Use `Run` to compute the `output` variables from the `input` variables.
+
+---
+
+# OpWithKernel/Kernel
+
+![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
+
+* `OpWithKernel` inherits `Operator`.
+* `OpWithKernel` contains a Kernel map.
+    * `OpWithKernel::Run` gets the device's kernel and invokes `OpKernel::Compute`.
+    * `OpKernelKey` is the map key. It currently contains only the device place, but may include the data type later.
+
+---
+
+# Why separate Kernel and Operator
+
+* Separate GPU and CPU code.
+    * Make Paddle capable of running without GPU.
+* Make one operator (which is a user interface) and create many implementations.
+    * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
+---
+
+# Libraries for Kernel development
+
+* `Eigen::Tensor` contains basic math and element-wise functions.
+    * Note that `Eigen::Tensor` has broadcast implementation.
+    * Limit the number of `tensor.device(dev) = ` assignments in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
+* Hand-written GPU kernels and CPU code
+    * Do not write them in header (`.h`) files. CPU kernels should be in C++ source (`.cc`) files and GPU kernels should be in CUDA (`.cu`) files, since GCC cannot compile GPU code.
+---
+# Operator Registration
+
+## Why is registration necessary?
+We need a method to build mappings between Op type names and Op classes.
+
+## How is registration implemented?
+We maintain a map whose key is the operator type name and whose value is the corresponding Op constructor, as sketched in the next section.
+
+---
+# The Registry Map
+
+### `OpInfoMap`
+
+`op_type(string)` -> `OpInfo`
+
+`OpInfo`:
+
+- **`creator`**: The Op constructor.
+- **`grad_op_type`**: The type of the gradient Op.
+- **`proto`**: The Op's Protobuf, including inputs, outputs and required attributes.
+- **`checker`**: Used to check attributes.
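+
+A minimal sketch of the registry, with simplified types (the real `OpInfo` carries more fields):
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(/*...*/)> creator;  // the Op constructor
+  std::string grad_op_type;                       // type of the gradient Op
+  OpProto* proto;                                 // inputs/outputs/attrs
+  OpAttrChecker* checker;                         // validates attributes
+};
+
+class OpInfoMap {
+ public:
+  static OpInfoMap& Instance() {
+    static OpInfoMap m;  // a process-wide singleton
+    return m;
+  }
+  void Insert(const std::string& type, OpInfo info) {
+    map_[type] = std::move(info);
+  }
+  const OpInfo& Get(const std::string& type) const { return map_.at(type); }
+
+ private:
+  std::unordered_map<std::string, OpInfo> map_;
+};
+```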
+
+---
+# Related Concepts
+
+### Op_Maker
+Its constructor takes `proto` and `checker`, and they are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+
+### Register Macros
+```cpp
+REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
+REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
+```
+
+---
+# Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+	1. call the maker class to complete `proto` and `checker`, and
+	2. use the completed `proto` and `checker` to add a new key-value pair to the `OpInfoMap`.
+
+---
+# Backward Module (1/2)
+### Create Backward Operator
+- Mapping from forward Op to backward Op
+![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
+
+---
+# Backward Module (2/2)
+### Build Backward Network
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+	- Shared Variables => insert an `Add` operator to combine gradients
+	- No Gradient => insert a `fill_zero_grad` operator
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
+
+
+---
+# Scope, Variable, Tensor
+
+* `Tensor` is an n-dimensional array with a data type.
+	* Only dims and data pointers are stored in `Tensor`.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* For the variable-length Tensor design, see [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md).
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+	* `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string `var name`, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
+
+---
+# Block (in design)
+## The difference between the original RNNOp and Block
+- A block as an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/runtime separation design paradigm.
+  - During compilation, the `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them into a `BlockDesc`.
+  - When the graph executes, a Block with a `BlockDesc` is passed; it then creates `Op` and `Var` instances and invokes `Run`.
+
+---
+# Milestone
+- Take Paddle/books as the main line; the requirements of the models motivate the framework refactoring,
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
+    - the MNIST demo needs a Python interface,
+    - the RNN models require the framework to support `LoDTensor`.
+  - Determine some timelines,
+  - Frequently used Ops need to be migrated first,
+  - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
+
+---
+# Control the migration quality
+- Compare the performance of migrated models with old ones.
+- Follow the google C++ style guide.
+- Build the automatic workflow for generating Python/C++ documentation.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentation, and read and improve it from a user's perspective.
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
new file mode 100644
index 0000000000..8d973eb531
--- /dev/null
+++ b/doc/design/register_grad_op.md
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type` indicates the associated gradient operator type. An operator can create its gradient operator by invoking the gradient operator's `OpInfo::creator_`. The pseudo code is as follows:
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for the protobuf message `OpDesc`, used for rapid manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_` is a `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forward operator.
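+
+As an illustration, a maker for the *minus* example from the beginning of this doc (dX = identity(dOut), dY = scale(dOut, -1)) might look like the sketch below; the helper names (`Input`, `OutputGrad`, `InputGrad`) and the `OpDescBind` setters are assumptions for illustration, not the exact API:
+
+```cpp
+class MinusGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+    std::vector<std::unique_ptr<OpDescBind>> ops;
+
+    auto dx = std::unique_ptr<OpDescBind>(new OpDescBind());
+    dx->SetType("identity");
+    dx->SetInput("X", OutputGrad("Out"));  // reads dOut
+    dx->SetOutput("Out", InputGrad("X"));  // writes dX = dOut
+    ops.push_back(std::move(dx));
+
+    auto dy = std::unique_ptr<OpDescBind>(new OpDescBind());
+    dy->SetType("scale");
+    dy->SetInput("X", OutputGrad("Out"));  // reads dOut
+    dy->SetOutput("Out", InputGrad("Y"));  // writes dY = -dOut
+    dy->SetAttr("scale", -1.0f);
+    ops.push_back(std::move(dy));
+
+    return ops;
+  }
+};
+```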
+
+We should change the registration macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` needs to take both an `OpProtoAndCheckerMaker` and a `GradOpDescMaker`, we can simply list them in the same macro. This can be done with a macro that uses `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of the current `REGISTER_OP` macro does not need to change. Internally, `REGISTER_OP` invokes `REGISTER_OPERATOR` twice and generates the `GradOpDescMaker` inside.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/design/regularization.md b/doc/design/regularization.md
new file mode 100644
index 0000000000..21280ac898
--- /dev/null
+++ b/doc/design/regularization.md
@@ -0,0 +1,72 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. A frequently faced problem is **overfitting**, where the model does not make reliable predictions on new unseen data. **Regularization** is the process of introducing additional information in order to prevent overfitting. This is usually done by adding extra penalties to the loss function that restrict the parameter spaces that an optimization algorithm can explore.
+
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+
+<img src="./images/loss_equation.png" align="center"/><br/>
+
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization
+<img src="./images/l2_regularization.png" align="center"/><br/>
+
+##### L1 Regularization
+<img src="./images/l1_regularization.png" align="center"/><br/>
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+## Regularization Survey
+
+A detailed survey of regularization in various deep learning frameworks can be found [here](https://github.com/PaddlePaddle/Paddle/wiki/Regularization-Survey). 
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties. 
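+
+To make the computation concrete, the following standalone sketch shows what an `L2_regularization_op` contributes to a parameter's gradient (simplified, framework-free code; the actual op would follow the kernel abstraction mentioned above):
+
+```cpp
+#include <vector>
+
+// The derivative of (lambda / 2) * ||w||^2 with respect to w is lambda * w,
+// so the op's output is simply the parameter scaled by lambda.
+std::vector<float> L2PenaltyGrad(const std::vector<float>& w, float lambda) {
+  std::vector<float> grad(w.size());
+  for (size_t i = 0; i < w.size(); ++i) {
+    grad[i] = lambda * w[i];
+  }
+  return grad;
+}
+```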
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+
+### Computation Graph
+
+Below is an example of a really simple feed forward neural network.
+
+<img src="./images/feed_forward.png" align="center"/><br/>
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+   
+### Python API implementation for Regularization
+
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph. 
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+
+The proposal is to add these ops in a lazy manner just before the backward pass. 
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for later right now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
+
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 3692a5248a..b978726109 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -1,31 +1,63 @@
-# Paddle Release Guidelines
+# PaddlePaddle Release Guidelines
 
-Paddle uses the git-flow branching model for branch management and the [Semantic Versioning](http://semver.org/) standard for Paddle version numbers.
+PaddlePaddle uses the git-flow branching model for branch management and the [Semantic Versioning](http://semver.org/) standard for PaddlePaddle version numbers.
 
-Each new Paddle release follows this process:
+Each new PaddlePaddle release follows this process:
 
 1. Create a new branch from the `develop` branch, named `release/<version>`. For example, `release/0.10.0`.
-2. Tag the version on the new branch, with the tag `<version>rc.<patch>`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
-3. For this version, perform the following operations:
-	* Build this version's Docker release image and publish it to DockerHub. If this fails, fix the Docker image build, increment the patch number, and return to step 2.
-	* Build this version's Ubuntu deb package. If this fails, fix the deb package build, increment the patch number, and return to step 2.
-	* Use the Regression Test List as a checklist to verify the functional correctness of the Docker image and the Ubuntu package.
-		* If tests fail, record all failing cases, fix all bugs on the `release/<version>` branch, increment the patch number, and return to step 2.
-4. After step 3 is done, merge the `release/<version>` branch into the master branch and delete the `release/<version>` branch. Tag the merge commit on master with `<version>`. Also merge the `master` branch into the `develop` branch. Finally, delete the `release/<version>` branch.
-5. Build the Docker release image from the master branch and publish it to DockerHub. Build the Ubuntu deb package and publish it on the GitHub release page.
-6. Collaborate on writing the Release Notes.
+1. Tag the version on the new branch, with the tag `<version>rc.<patch>`. The first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
+1. For this version, perform the following operations:
+  * Use the Regression Test List as a checklist to verify the correctness of this release.
+	  * If tests fail, record all failing cases, fix all bugs on the `release/<version>` branch, increment the patch number, and go back to step 2.
+	* Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
+	* Build this version's Python wheel packages and publish them to PyPI.
+		* Since pypi.python.org currently follows the [strict naming convention PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of the wheel package needs to be renamed before uploading with twine, e.g., change `linux_x86_64` to `manylinux1_x86_64`.
+		* The package names on PyPI are paddlepaddle and paddlepaddle_gpu. To upload a GPU build, change name to "paddlepaddle_gpu" in build/python/setup.py and rebuild the wheel package: `python setup.py bdist_wheel`.
+		* How to upload:
+			```
+			cd build/python
+			pip install twine
+			twine upload dist/[package to upload]
+			```
+		* Build this version's Docker release image and publish it to DockerHub. If this fails, fix the Docker image build, increment the patch number, and return to step 2.
+1. After step 3 is done, merge the `release/<version>` branch into the master branch and delete the `release/<version>` branch. Tag the merge commit on master with `<version>`. Also merge the `master` branch into the `develop` branch. Finally, delete the `release/<version>` branch.
+1. Collaborate on writing the Release Notes.
 
 
 Note that:
 
-* Once a `release/<version>` branch is created, merging from `develop` into it is generally not allowed. This keeps the `release/<version>` branch feature-frozen, making it easier for testers to test Paddle's behavior.
+* Once a `release/<version>` branch is created, merging from `develop` into it is generally not allowed. This keeps the `release/<version>` branch feature-frozen, making it easier for testers to test PaddlePaddle's behavior.
 * While a `release/<version>` branch exists, any bugfix branch must be merged into all three branches: `master`, `develop`, and `release/<version>`.
 
-# Paddle Branching Guidelines
+## Publishing wheel packages to PyPI
 
-Paddle development follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching convention, with some adaptations to GitHub's features.
+Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+to build the binaries automatically. As shown in the figure below, select the version to release (usually one CPU build and one GPU build) and click the "..." button to the right of "run";
+in the dialog that pops up, choose the branch to release in the second tab (Changes), in this case 0.11.0, and then click the "Run Build" button. After the build finishes,
+the three generated binaries can be found in the "Artifacts" dropdown on this page, corresponding to the CAPI, `cp27m` and `cp27mu` builds. Then upload them with the
+`twine` tool as described above.
 
-* Paddle's main repository follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching convention, where:
+<img src="ci_build_whl.png">
+
+* Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment in order to support more Linux
+  distributions. If you need to build manually, you can use these images as well. They can also be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ .
+* PyPI does not allow overwriting an upload, so once a wheel package of a given version is published it cannot be changed. The next wheel package must bump the version number before it can be uploaded.
+
+## Publishing Docker images
+
+After the PaddlePaddle CI above finishes building the wheels, it automatically pushes the Docker images to DockerHub. Publishing a Docker image therefore only requires tagging the automatically
+pushed image with the corresponding version number:
+
+1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check whether the latest tag was updated after the wheel build above finished.
+1. Run `docker pull paddlepaddle/paddle:[latest tag]`; the latest tag can be latest, latest-gpu, etc.
+1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
+1. Run `docker push paddlepaddle/paddle:[version]`
+
+## PaddlePaddle Branching Guidelines
+
+PaddlePaddle development follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching convention, with some adaptations to GitHub's features.
+
+* PaddlePaddle's main repository follows the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching convention, where:
 	* The `master` branch is the stable branch. Every version on the `master` branch has passed unit tests and regression tests.
 	* The `develop` branch is the development branch. Every version on the `develop` branch has passed unit tests, but not regression tests.
 	* The `release/<version>` branch is a temporary branch created for each release. Code at this stage is undergoing regression testing.
@@ -33,18 +65,18 @@ Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branch
 * Other users' forks do not need to strictly follow the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) convention, but every branch in a forked repository is effectively a feature branch.
 	* It is recommended that a developer's fork use its `develop` branch to sync with the main repository's `develop` branch.
 	* It is recommended that, in a developer's fork, feature branches be created from the `develop` branch.
-	* When a feature branch is finished, submit a `Pull Request` to Paddle's main repository for code review.
+	* When a feature branch is finished, submit a `Pull Request` to PaddlePaddle's main repository for code review.
 		* During review, developers can keep committing code to their own feature branch.
 
 * Bugfix branches are also maintained in the developer's own fork. Unlike feature branches, a bugfix branch must raise `Pull Request`s against the main repository's `master`, `develop`, and, if present, `release/<version>` branches at the same time.
 
-# Paddle Regression Test List
+## PaddlePaddle Regression Test List
 
-This list describes the features that must be tested before a Paddle release.
+This list describes the features that must be tested before a PaddlePaddle release.
 
-## All chapters of the Paddle Book
+### All chapters of the PaddlePaddle Book
 
-Every Paddle release must first guarantee the correctness of all chapters of the Paddle Book. Correctness includes verifying both Paddle's current `paddle_trainer` training and pure-`Python` model training.
+Every PaddlePaddle release must first guarantee the correctness of all chapters of the PaddlePaddle Book. Correctness includes verifying both PaddlePaddle's current `paddle_trainer` training and pure-`Python` model training.
 
 | | Getting Started | Recognize Digits | Image Classification | Word Vectors | Sentiment Analysis | Semantic Role Labeling | Machine Translation | Personalized Recommendation |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
diff --git a/doc/design/scope.md b/doc/design/scope.md
new file mode 100644
index 0000000000..4da76eebb7
--- /dev/null
+++ b/doc/design/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. Within a specific scope, a valid name is uniquely associated with an entity, such as a variable. In another scope, the same name may refer to a different entity, or to nothing at all. Scope clearly restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But different from the original abstract concept, Scope now becomes an object with two important attributes:
+
+- Scope is an association of a name to a variable.
+- Variables in a parent scope can be retrieved from a local scope.
+
+A detailed explanation of these two attributes follows.
+
+
+## Scope is an association of a name to a variable
+
+Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
+
+
+1. Scope only contains a map of a name to variable.
+
+   All parameters, data, and states in a Net should be variables, stored inside a scope. Each op should get its inputs and outputs for computation from a scope, such as data buffers and states (e.g., momentum).
+
+1. A variable can only be created by a Scope, and a variable can only be obtained from a Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework that keeps the framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information needed to run them.
+    `Net` is designed to drive the computation, while Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
+    - `Create` is used to create a Variable by its name and add the mapping relation.
+    - `Get` is used to find a Variable by name.
+
+1. Every variable only belongs to one certain Scope.
+
+   A variable cannot belong to many scopes. If you want to use variables from a parent scope, you can access them through the parent scope.
+
+1. A Scope should destroy all Variables inside it when it is itself destroyed. Users can never store a `Variable` pointer somewhere else.
+
+   Because a Variable can only be obtained from a Scope, when a Scope is destroyed we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or in some global variable, that pointer becomes invalid once the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* Var(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
+
+ private:
+    std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, `Scope` in the neural network can also be a local scope. There are two attributes about local scope.
+
+1.  We can create local variables in a local scope. When that local scope is destroyed, all local variables should also be destroyed.
+2.  Variables in a parent scope can be retrieved from the local scopes of that parent scope, i.e., when a user gets a variable from a scope, the variable is first searched for in the current scope. If there is no such variable in the local scope, `scope` keeps searching its parents until the variable is found or there is no parent left.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* FindVar(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->FindVar(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+```
+
+In the `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When a user `Get`s a variable by its `name`, the `name` is searched inside the current scope. If the variable cannot be found locally and the parent scope is not a `nullptr`, the variable is searched inside the parent scope. The default value of the `parent_` pointer is `nullptr`, which means the scope is a global scope.
+
+A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of a timestep (`StepNet` for short) should use an independent local scope, just as the variables in a while loop are inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
+
+## Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // return nullptr if not found.
+  Variable* FindVar(const std::string& name) const;
+
+  // Return the existing variable if one with the same name already exists.
+  Variable* Var(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+### Only scope can create a variable
+
+To ensure that `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function and make `Scope` a friend class of `Variable`. Then only `Var` can construct a `Variable`.
+
+### When a scope is destroyed, all variables inside it should be destroyed together
+
+The scope holds unique pointers to all variables. Users can `FindVar` from a scope, but they should not hold that pointer as a member variable, because when the scope is destroyed, all variables inside the scope are destroyed together.
+
+### Sharing a parent scope
+
+A local scope contains a `parent_` pointer, so scopes form a linked list. We use a `shared_ptr` because while a local scope is in use, its parents must not be destroyed.
+
+Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shared pointer. We cannot construct a scope variable, because it cannot be passed to other scope as `parent` pointer.
+
+## Orthogonal interface
+
+`FindVar` returns `nullptr` when `name` is not found, so it doubles as a `Contains` method. `Var` returns the existing variable when there is a name conflict locally, rather than creating a duplicate. Combining the local lookup with variable construction, `Var` is easy to implement, as sketched below.
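+
+A minimal sketch (assumed, not necessarily the actual implementation):
+
+```cpp
+// Var reuses the local lookup so that name conflicts are handled in one place.
+Variable* Scope::Var(const std::string& name) {
+  auto it = vars_.find(name);   // local map only; parent scopes are not searched
+  if (it != vars_.end()) {
+    return it->second.get();    // name conflict: return the existing variable
+  }
+  auto* var = new Variable();   // legal here because Scope is a friend of Variable
+  vars_[name].reset(var);
+  return var;
+}
+```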
diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md
new file mode 100644
index 0000000000..1a98839a95
--- /dev/null
+++ b/doc/design/selected_rows.md
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a sparse tensor data type designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few rows of that tensor hold non-zero values. It is straightforward to represent such a sparse tensor with the following data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of the `SelectedRows`. `rows_` holds the indices of its non-zero rows. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies the values of those rows. The full shape of the `SelectedRows` is `[height_] + value_.shape[1:]`.
+
+Suppose that a `SelectedRows`-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`. The `SelectedRows` representation would be:
+
+```
+x = SelectedRows {
+  rows = [73, 84],
+  value = [[1, 2], [3, 4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a type of `Variable`, so `VarDesc` in protobuf should describe it. Only the tensor dimensions of a `SelectedRows` are described at compile time, because `rows_` and `value_` depend on the training data.
+So we use `TensorDesc` to unify `data_type` and `dims`. A `LodTensorDesc` contains a `TensorDesc` plus a `lod_level`, while the description of a `SelectedRows` is just a tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like `LoD` information, the `InferShape` method will infer the output tensor type as well. Each operator should decide whether its output is a `SelectedRows` or a dense tensor.
+
+For example, the gradient operator of `TableLookup` will always generate a `SelectedRows`. Its `InferShape` method should look like the following:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+There are several operators that need to be written to support `SelectedRows`. These are:
+
+1. Operators which generate `SelectedRows` gradients, e.g. the gradient of `TableLookupOp`.
+2. Optimization operators which support `SelectedRows` gradients, e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator; `OpWithKernel::Run` should select the suitable kernel whether the gradient is a dense tensor or a `SelectedRows` (see the sketch after this list).
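+
+To make the second point concrete, here is a hypothetical sketch of a sparse SGD update; the function name and the public access to `rows_`/`value_` are assumptions for illustration, not Fluid's actual kernel API:
+
+```cpp
+// Apply a SelectedRows gradient to a dense parameter. Only the rows listed
+// in grad.rows_ are touched, which is exactly what the sparse format buys us.
+void SparseSGDUpdate(const SelectedRows& grad, float lr, Tensor* param) {
+  const size_t num_rows = grad.rows_.size();
+  const int64_t row_width = grad.value_.numel() / num_rows;
+  const float* g = grad.value_.data<float>();
+  float* p = param->data<float>();
+  for (size_t i = 0; i < num_rows; ++i) {
+    const int64_t row = grad.rows_[i];
+    for (int64_t j = 0; j < row_width; ++j) {
+      p[row * row_width + j] -= lr * g[i * row_width + j];
+    }
+  }
+}
+```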
diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md
new file mode 100644
index 0000000000..c7aeed7f9b
--- /dev/null
+++ b/doc/design/simple_op_design.md
@@ -0,0 +1,202 @@
+## Interaction between C++ and Python
+
+Users employ the Python API to describe their own networks; however, the network construction actually happens in C++, so Protobuf is introduced to pass messages between Python and C++.
+
+The interaction between Python and C++ can be simplified into two steps:
+
+1. C++ tells Python how many Ops there are and what parameters users need to offer to initialize a new Op. Python then builds an API for each Op at compile time.
+
+2. Users invoke the APIs built by Python and provide the necessary parameters. These parameters are sent to C++, which finishes the Op construction task.
+
+### Message from C++ to Python
+
+We define a Protobuf message class `OpProto` to hold the message needed in the first step. What should an `OpProto` contain? This question is equivalent to "What information do we need to offer to build a Python API that is legal, user-oriented, and able to describe a whole Op?"
+
+The following information is necessary:
+
+1. The Op's name and a short comment.
+2. The number of input and output variables, and each variable's name, type, and comment.
+3. The Op's attributes; each attribute includes a name, type, comment, **default value** and **value range**.
+
+So `OpProto` can be defined as follows:
+
+```proto
+enum AttrType {
+	INT = 1;
+	FLOAT = 2;
+	STRING = 3;
+	INTS = 4;
+	FLOATS = 5;
+	STRINGS = 6;
+};
+
+message AttrValue {
+	required AttrType type = 1;
+	optional int32 iv = 2;
+	optional float fv = 3;
+	optional string sv = 4;
+	repeated int32 ivs = 5;
+	repeated float fvs = 6;
+	repeated string svs = 7;
+};
+
+message AttrProto {
+	required string name = 1;
+	required string comment = 2;
+	required AttrType type = 3;
+};
+
+message VarProto {
+	required string name = 1;
+	required string comment = 2;
+	required bool is_tensor = 3;
+};
+
+message OpProto {
+	repeated VarProto inputs = 1;
+	repeated VarProto outputs = 2;
+	repeated AttrProto attrs = 3;
+	required string type = 4;
+	required string comment = 5;
+};
+```
+
+To generate Python code automatically:
+
+```python 
+def create_python_ops_creation_functions():
+	op_protos = paddle.framework.OpRegistry.get_all_op_proto()
+	for type_name in op_protos:
+		op_proto = op_protos[type_name]
+		# bind the loop variables as default args so each closure keeps its own proto
+		def __impl__(op_proto=op_proto, type_name=type_name, **kwargs):  # keyword args only
+			inputs = [kwargs.get(ipt.name, "") for ipt in op_proto.inputs]
+			outputs = [kwargs.get(opt.name, "") for opt in op_proto.outputs]
+			attrs = [cast_to_op_attr(attr, kwargs.get(attr.name, None)) for attr in op_proto.attrs]
+			opdesc = (inputs, outputs, type_name, attrs)
+			return paddle.framework.OpRegistry.CreateOp(opdesc)
+		__impl__.__doc__ = create_doc_string(op_proto)
+		globals()[type_name] = __impl__
+
+create_python_ops_creation_functions()
+```
+
+### Message from Python to C++
+
+To hold the message needed in the second step above, we define the Protobuf message class `OpDesc`. It holds the user-specified parameters that describe an Op.
+
+```proto
+message OpDesc {
+	required string type = 1;	
+	repeated string inputs = 2;
+	repeated string outputs = 3;
+	map<string, AttrValue> attrs = 4;
+};
+```
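+
+For illustration, a hypothetical sketch of how the C++ side could fill in an `OpDesc` using protoc-generated accessors; the op, variable names, and attribute value are made up:
+
+```cpp
+OpDesc desc;
+desc.set_type("cos");
+desc.add_inputs("x");
+desc.add_outputs("y");
+
+AttrValue scale;
+scale.set_type(FLOAT);  // AttrType enum value from the message above
+scale.set_fv(0.5f);     // the float slot of the AttrValue union
+(*desc.mutable_attrs())["scale"] = scale;
+```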
+
+## OpProto Register
+
+Every Op has its own `OpProto`. For convenience, we need to register them and record all their messages. For each `Op` class, we define a corresponding `OpMaker` class whose constructor implements the building of the `OpProto`. The `OpMaker`'s constructor will be invoked by another function, `OpRegistry::RegisterOp()`.
+
+```cpp
+class OpProtoMaker {
+public:
+	OpProtoMaker(OpProto* proto): proto_(proto) {}
+protected:
+	OpProto* proto_;
+	void AddInput(const std::string& name, const std::string& desc) {...}
+	void AddAttr(const std::string& name, const std::string& desc, TypeId type) {...}
+	void AddComment(const std::string& comment) { ... }
+};
+
+class OpRegistry {
+public:
+	using OpCreator = std::function<OperatorBase* (OpDesc& desc)>;
+	
+	template <typename OpType, typename OpMaker>
+	static void RegisterOp(const std::string& name) {
+		gCreators_[name] = [](const OpDesc& desc) {
+			return new OpType(desc);
+		};
+		OpProto& opProto = gProtos_[name];
+		OpMaker maker(&opProto);  // the maker's constructor fills in opProto
+	}
+
+	static map<string, OpCreator> gCreators_;
+	static map<string, OpProto> gProtos_;
+};
+
+template <typename OpType, typename OpMaker>
+class OpRegister {
+  public:
+    OpRegister(std::string type) {
+        OpRegistry::RegisterOp<OpType, OpMaker>(type);
+    }
+};
+
+#define REGISTER_OP(op_class, op_maker_class, type_name)              \
+    class op_class##Register {                                        \
+      private:                                                        \
+        const static OpRegister<op_class, op_maker_class> reg;        \
+    };                                                                \
+    const OpRegister<op_class, op_maker_class> op_class##Register::reg(#type_name);
+
+class CosineOp {
+// ...
+};
+
+struct CosineOpProtoMaker : public OpProtoMaker {
+	CosineOpProtoMaker(OpProto* proto) : OpProtoMaker(proto) {
+		AddInput("input", "input of cosine op");
+		AddAttr("scale", "scale of cosine op", float).Default(1.0).GreaterThan(0.0);
+		AddType("cos");
+		AddComment("This is cos op");
+	}
+};
+
+REGISTER_OP(CosineOp, CosineOpProtoMaker, cos);
+```
+
+In `REGISTER_OP(CosineOp, CosineOpProtoMaker, cos)`, we register not only the `CosineOp` but also its `OpProto`. The default value and value range of the `scale` attribute are registered as fields of that proto here.
+
+## Python API
+
+Python APIs are divided into two types: the high-level API and the low-level API.
+
+### High-Level API
+
+The high-level API is called by users directly, so it should keep its style consistent with the existing V2 APIs.
+
+Here is a sample of how to define a fc layer:
+
+```python
+hd = fc_layer(input=data, size=56, with_bias=True, activation="sigmoid")
+```
+
+`hd` is the output of `fc_layer` and is a `variable`. It can be sent into other layers as input.
+
+The definition of `fc_layer()`:
+
+```python
+def fc_layer(input, size, with_bias, activation):
+	attr_map = {"size": size}
+	check_attrs(attr_map)
+	w = make_variable('w')
+	if with_bias:
+		b = make_variable('b')
+	else:
+		b = None
+	fc_output = make_variable('fc_output')
+	fc_op(input, w, b, fc_output, attr_map)
+	act_output = make_variable('sigmoid_output')
+	if activation == "sigmoid":
+		sigmoid_op(fc_output, act_output)
+	else:
+		pass  # ... handle other activations
+	return act_output
+```
+
+### Low-Level API
+
+In the sample above, `fc_op` and `sigmoid_op` are low-level APIs. They build an `OpDesc` and invoke the corresponding C++ code.
+
+*TODO*
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
new file mode 100644
index 0000000000..8983df9004
--- /dev/null
+++ b/doc/design/support_new_device.md
@@ -0,0 +1,240 @@
+# Design Doc: Supporting new Device/Library
+
+## Background
+
+Deep learning has a high demand for computing resources. New high-performance devices and computing libraries are appearing very frequently. Deep learning frameworks have to integrate these high-performance devices and computing libraries in a flexible and efficient manner.
+
+On one hand, hardware and computing libraries usually do not have a one-to-one correspondence. For example, Intel CPUs support Eigen and MKL computing libraries while Nvidia GPUs support Eigen and cuDNN computing libraries. We have to implement operator specific kernels for each computing library.
+
+On the other hand, users usually do not want to care about the low-level hardware and computing libraries when writing a neural network configuration. In Fluid, `Layer` is exposed in `Python`, and `Operator` is exposed in `C++`. Both `Layer` and `Operator` are hardware independent.
+
+So, how to support a new Device/Library in Fluid becomes a challenge.
+
+
+## Basic: Integrate A New Device/Library
+
+For a general overview of Fluid, please refer to the [overview doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/read_source.md).
+
+There are mainly three parts that we have to consider while integrating a new device/library:
+
+- Place and DeviceContext: indicate the device id and manage hardware resources
+
+- Memory and Tensor: malloc/free data on certain device
+
+- Math Functor and OpKernel: implement computing unit on certain devices/libraries
+
+### Place and DeviceContext
+
+Please note that devices and computing libraries do not have a one-to-one correspondence. A device can have many computing libraries, and a computing library can also support several devices.
+
+#### Place
+Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add the corresponding `DevicePlace`.
+
+```
+        |   CPUPlace
+Place --|   CUDAPlace
+        |   FPGAPlace
+```
+
+And `Place` is defined as follows:
+
+```
+typedef boost::variant<CUDAPlace, CPUPlace, FPGAPlace> Place;
+```
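+
+Since `Place` is a `boost::variant`, code that must branch on the concrete place can use a visitor. A minimal sketch (the helper name `is_cpu_place` is illustrative):
+
+```cpp
+struct IsCPUVisitor : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace&) const { return true; }
+  template <typename OtherPlace>
+  bool operator()(const OtherPlace&) const { return false; }
+};
+
+bool is_cpu_place(const Place& p) {
+  return boost::apply_visitor(IsCPUVisitor(), p);
+}
+```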
+
+#### DeviceContext
+
+Fluid uses the class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources of different libraries, such as the CUDA stream in `CUDADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`.
+
+
+```
+                /->  CPUDeviceContext   
+DeviceContext ---->  CUDADeviceContext  
+                \->  FPGADeviceContext
+```
+
+An example of Nvidia GPU is as follows:
+
+- DeviceContext
+
+
+```
+class DeviceContext {
+  virtual Place GetPlace() const = 0;
+};  
+```
+
+
+- CUDADeviceContext
+
+
+```
+class CUDADeviceContext : public DeviceContext {
+  Place GetPlace() const override { return place_; }
+private:
+  CUDAPlace place_;
+  cudaStream_t stream_; 
+  cublasHandle_t cublas_handle_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;  // binds with stream_
+};
+```
+
+### Memory and Tensor
+
+
+#### memory module
+
+Fluid provides the following [memory interfaces](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/memory/memory.h#L36):
+
+```
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+template <typename Place>
+void Free(Place place, void* ptr);
+
+template <typename Place>
+size_t Used(Place place);
+```
+
+To implement these interfaces, we have to implement MemoryAllocator for different Devices.
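+
+A minimal usage sketch, assuming the interfaces above (error handling elided):
+
+```cpp
+// Allocate, use, and free a buffer on the CPU through the templated interface.
+paddle::platform::CPUPlace cpu;
+auto* buf = static_cast<float*>(paddle::memory::Alloc(cpu, 1024 * sizeof(float)));
+// ... use buf ...
+paddle::memory::Free(cpu, buf);
+```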
+
+
+#### Tensor
+
+[Tensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h#L36) holds data with some shape in a specific Place.
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data<float>(place);  // the element type is supplied via the template parameter
+```
+
+
+
+### Math Functor and OpKernel
+
+Fluid implements computing units based on different DeviceContexts. Some computing units are shared between operators, and this common part is put in the `operators/math` directory as basic Functors.
+
+Let's take [MaxOutFunctor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/math/maxouting.h#L27) as an example:
+
+The interface is defined in the header file.
+
+```
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+```
+
+The CPU implementation is in the .cc file:
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+The CUDA implementation is in the .cu file:
+
+```
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+                  ...
+                  }
+};
+```
+
+
+We first obtain the computing handle from a concrete DeviceContext and then compute on tensors.
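+
+A minimal usage sketch (tensor setup elided; the names follow the functor above):
+
+```cpp
+platform::CPUDeviceContext ctx;
+framework::Tensor input, output;
+// ... fill input and resize output ...
+math::MaxOutFunctor<platform::CPUDeviceContext, float> maxout;
+maxout(ctx, input, &output, /*groups=*/2);
+```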
+
+The implementation of an `OpKernel` is similar to that of the math functors; the extra thing we need to do is register the `OpKernel` in a global map.
+
+Fluid provides different registration interfaces in `op_registry.h`.
+
+
+Let's take the [Crop](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/crop_op.cc#L134) operator as an example:
+
+In .cc file:
+
+```
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
+```
+
+In .cu file:
+
+```
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
+```
+
+
+## Advanced topics: How to switch between different Device/Library
+
+Generally, we implement an OpKernel for every Device/Library combination of an operator, so we can easily train a convolutional neural network on a GPU. However, some OpKernels are not suitable for a specific device. For example, the CRF operator can only run on the CPU, whereas most other operators can run on the GPU. To achieve high performance in such circumstances, we have to switch between different Devices/Libraries.
+
+
+For more details, please refer to the following docs:
+
+- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md)
+- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md)
diff --git a/doc/design/switch_kernel.md b/doc/design/switch_kernel.md
new file mode 100644
index 0000000000..9719e031c7
--- /dev/null
+++ b/doc/design/switch_kernel.md
@@ -0,0 +1,99 @@
+## Background
+Every operator has many kernels because Fluid supports multiple data types, places, data layouts, and library types. We use `OpKernelType` to describe the kernel types that an operator can hold.
+
+`OpKernelType` is defined as follows:
+
+```cpp
+struct OpKernelType {
+  Place place_;
+  DataType data_type_;
+  DataLayout data_layout_;
+  LibraryType library_type_;
+};
+```
+
+- The `place_` is a descriptor of the device, e.g., CPUPlace, CUDAPlace.
+
+- The `data_type_` is the data type that this kernel performs on, e.g., `FP32`, `INT64`. Note that one kernel may have inputs with different data types; however, one of them will be the major `data_type`. For example, `cross_entropy` takes `int64` as its label, and `double`/`float` as its input logit and output cost; the major `data_type` of `cross_entropy` is `float` or `double`.
+
+- The `data_layout_` is useful for some computational libraries. One example is that MKLDNN uses many kinds of layouts, such as `nChw8c`. Each kind of layout invokes a different kernel.
+
+- The `library_type_` describes the computational library, e.g., `MKLDNN`, `CUDNN`.
+
+## Problem
+
+Ideally, we would register a kernel for every operator and every kernel type. However, this is impractical in the following situations:
+
+1. Some operators, like CRF, are complicated and inefficient to implement on a GPU. The CRF operator will only have a CPU kernel.
+2. Some operators take too much memory. It is better to force them onto the CPU, while the rest of the operators in the neural network run on the GPU, i.e., the model parallelism problem.
+3. Some layouts and places are particular. One example is that MKLDNN uses `nChw8c`, and no other library uses `nChw8c`.
+
+To explain one situation in detail: suppose we have two operators, OP1 and OP2. OP1 has one output, `op1_2_op2`, and `op1_2_op2` is the input of OP2.
+
+If OP1 and OP2 run on the same place (for example, CPUPlace), then `op1_2_op2` can be used directly by OP2.
+
+```
+OP1(CPUPlace)
+     |
+ op1_2_op2
+     |
+OP2(CPUPlace)
+```
+
+If OP1 and OP2 run on different places, then OP2 cannot use `op1_2_op2` directly.
+
+The problems in these situations are similar. We can formalize the problem as follows.
+
+For one operator, we register kernels with types $KT = \{kt_1, kt_2, kt_3, \dots\}$. An input of this operator resides on kernel type $kt_{?}$, where $kt_{?} \notin KT$. How do we cast the input of this operator from $kt_{?}$ to some kernel type in $KT$?
+
+## Solution: data transform
+
+It is clear that transforming the inputs of an operator to adapt to another kernel type is not related to any particular operator, so we should register these transformation methods as global methods, as in the sketch below.
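+
+A hypothetical sketch of such a registry: transform functions are keyed by the (actual, expected) kernel-type pair. It assumes `OpKernelType` defines `operator<` so it can serve as a map key; all names are illustrative:
+
+```cpp
+using TransformFn = std::function<void(const Tensor& in, Tensor* out)>;
+
+std::map<std::pair<OpKernelType, OpKernelType>, TransformFn>&
+TransformRegistry() {
+  static std::map<std::pair<OpKernelType, OpKernelType>, TransformFn> registry;
+  return registry;
+}
+```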
+
+We can infer a kernel type for each input of an operator. We call it the `actual kernel type for var`, meaning the kernel type that can process this input variable as it currently exists.
+
+We can also get a kernel type from 1) the operator's configuration in its description (users may want to force `MKL` for the `conv` operator), and 2) the place of the current executor (e.g., the executor is running on a GPU). This is the kernel type we expect the operator to be performed with, so we call it the `expected kernel type`.
+
+We transform the input data from the `actual` kernel type to the `expected` one whenever the two differ.
+
+The algorithm is described as follows:
+
+```cpp
+void OperatorWithKernel::Run(
+        const Scope& scope,
+        const platform::Place& place) const {
+  ExecutionContext ctx(...);
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name : this->Inputs()) {
+    auto* tensor_in = GetTensor(var_name);
+    auto kernel_type_for_var = this->GetKernelTypeForVar(...);
+    if (kernel_type_for_var.place_ != expected_kernel_key.place_) {
+      auto* trans_var = new_scope.Var(var_name);
+      auto* out = DataTransform(expected_kernel_key,
+                                kernel_type_for_var,
+                                *tensor_in);
+      CopyVariableWithTensor(...);
+    }
+  }
+
+  auto kernel = kernels.find(expected_kernel_key);
+  kernel->Compute(ExecutionContext(...));
+}
+```
+
+Then the actual process for the multi-place example above becomes:
+
+```
+OP1(CPUPlace)
+     |
+op1_2_op2(on CPU)
+     |
+[transform](from CPU to GPU)
+     |
+op1_2_op2(on GPU)
+     |
+OP2(CUDAPlace)
+```
diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md
new file mode 100644
index 0000000000..37e4f7b90f
--- /dev/null
+++ b/doc/design/tensor_array.md
@@ -0,0 +1,271 @@
+# Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+
+ private:
+  vector<LoDTensor> values_;
+};
+```
+
+We also need to expose it to PaddlePaddle's Python API,
+because users will want to use it together with our very flexible operators, such as `WhileLoop`.
+An example of an RNN built on dynamic operators is
+
+```python
+input = pd.data(...)
+num_steps = Var(12)
+
+states = TensorArray(size=num_steps)
+step_inputs = TensorArray(unstack_from=input)
+step_outputs = TensorArray(size=num_steps)
+
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+
+step = Var(1)
+
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step - 1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state)  # take the state as output
+    step.update(step + 1)
+
+output = step_outputs.stack()
+```
+
+## Background
+Steps are one of the core concepts of RNNs. In each time step of an RNN, there are several input segments, states, and output segments; all these components act like arrays. For example, calling `states[step_id]` gets the state of the `step_id`-th time step.
+
+An RNN can be implemented with the following pseudocode
+
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+
+step = 1;
+seq_len = 12;
+while_loop {
+    if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step];  // take the state as output
+    step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+
+
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements a tensor with level-of-detail information (`LoDTensor` for short).
+Segmenting a `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor `recurrent_op` to support `LoDTensor` segmenting.
+
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+
+
+Though the new design can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+
+## Why `TensorArray`
+The logic behind splitting the inputs into segments, states and outputs is similar and can be shared in a separate module.
+
+The arrays of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the pseudocode above.
+
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+
+**This container can store an array of tensors and provide several methods to split a tensor or a LoD tensor**.
+This is where the notion of `TensorArray` comes from.
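+
+A minimal sketch of such a container's core accessors (assumed semantics, not the final API):
+
+```c++
+class TensorArray {
+ public:
+  // Read returns the tensor stored at index.
+  const LoDTensor& Read(size_t index) const { return values_.at(index); }
+  // Write grows values_ on demand so steps can be written in any order.
+  void Write(size_t index, const LoDTensor& value) {
+    if (index >= values_.size()) values_.resize(index + 1);
+    values_[index] = value;
+  }
+  size_t size() const { return values_.size(); }
+
+ private:
+  std::vector<LoDTensor> values_;
+};
+```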
+
+## Introducing TensorArray to unify all three RNNs
+`TensorArray` is a new concept borrowed from TensorFlow;
+it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
+
+This concept can be used to support our new design of dynamic operations, and it helps to refactor some existing layers that handle variable-length sequences,
+such as `recurrent_op` and `RecurrentGradientMachine`.
+
+In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
+`TensorArray` is used to segment inputs and store states in all time steps.
+By providing some methods similar to a C++ array,
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+
+## Dynamic-operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so the operators listed below should be implemented:
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, compute its size, and return it as the scalar `tensor`.
+    '''
+    pass
+```
+
+It is tedious for users to compose so many low-level operators, so some helper methods should be provided in the Python wrapper to make `TensorArray` easier to use,
+for example:
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to merge all the time steps back into one tensor after an RNN or while-loop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split a tensor into time steps for an RNN or while-loop.
+
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in the TensorArray will
+        be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of a output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
+
+## LoDTensor-related Supports
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input and outputs sequences too.
+
+Since each step of an RNN can only take a tensor-represented batch of data as input,
+some preprocessing must be applied to the inputs, such as sorting the sentences by length in descending order, then cutting the words at each position and packing them into new batches.
+
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`;
+these two operations are similar to `stack` and `unstack`, except that they operate on variable-length sequences formatted as a LoD tensor rather than a plain tensor.
+
+Their definitions are sketched below:
+
+```python
+def unpack(level, sort_by_length):
+    '''
+    Split a LoDTensor at some `level` and generate batches; if `sort_by_length`
+    is set, sort the sequences by length first.
+
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor from the values in a `TensorArray`,
+    using `level` and `indices_map`.
+    '''
+    pass
+```
+
+With these two methods, an RNN that supports variable-length sentences can be implemented as follows:
+
+```c++
+// input is the variable-length data
+LoDTensor sentence_input(xxx);
+TensorArray ta;
+Tensor indice_map;
+Tensor boot_state = xxx;  // to initialize the RNN's first state
+TensorArray::unpack(sentence_input, 1/*level*/, true/*sort_by_length*/, &ta, &indice_map);
+TensorArray step_outputs;
+TensorArray states;
+
+for (int step = 0; step < ta.size(); step++) {
+  // read the previous state; boot_state is returned for the first step
+  auto state = states.read(step - 1, boot_state);
+  // rnnstep is a function which acts like a step of RNN
+  auto step_input = ta.read(step);
+  auto step_output = rnnstep(step_input, state);
+  states.write(step, step_output);  // the new state doubles as the output
+  step_outputs.write(step, step_output, true/*data_shared*/);
+}
+
+// rnn_output is the final output of the RNN
+LoDTensor rnn_output = step_outputs.pack(1/*level*/, indice_map);
+```
+The code above shows that, by embedding the LoDTensor-related preprocessing operations into `TensorArray`,
+the implementation of an RNN that supports variable-length sentences is far more concise than `RecurrentGradientMachine`, because the latter mixes all the code together, which is hard to read and extend.
diff --git a/doc/design/test.dot b/doc/design/test.dot
new file mode 100644
index 0000000000..62c69b8fc8
--- /dev/null
+++ b/doc/design/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png
new file mode 100644
index 0000000000..4e121a40b9
Binary files /dev/null and b/doc/design/test.dot.png differ
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
new file mode 100644
index 0000000000..89fa95326c
--- /dev/null
+++ b/doc/design/var_desc.md
@@ -0,0 +1,69 @@
+## Background
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc`, whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
+
+PaddlePaddle uses proto messages to describe the compile-time program because:
+
+1. The computation program description must be serializable so it can be saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on a different worker.
+
+The computation `Program` consists of nested `Blocks`. Each `Block` consists of data (i.e., `Variable`s) and `Operations`. The concepts representing them at each stage are listed in the table below.
+
+| |compile time|runtime|
+|---|---|---|
+|Data|VarDesc(proto)|Variable(cpp)|
+|Operation|OpDesc(proto)|Operator(cpp)|
+
+
+## Definition of VarDesc
+
+A `VarDesc` should have a name and a description of its type. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
+
+```proto
+message VarDesc {
+  required string name = 1;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LoDTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## Definition of TensorDesc
+
+```proto
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+```
+
+A `TensorDesc` describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
+
+## Definition of LodTensorDesc
+
+```proto
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
+```
+
+A `LoDTensorDesc` contains a `TensorDesc` and a `lod_level`.
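+
+For illustration, a hypothetical sketch of filling in a `VarDesc` with protoc-generated C++ accessors; the variable name and shape are made up:
+
+```cpp
+// Describe an FP32 LoD tensor named "image" with shape [batch, 784].
+VarDesc var;
+var.set_name("image");
+var.set_type(VarDesc::LOD_TENSOR);
+auto* tensor = var.mutable_lod_desc()->mutable_tensor();
+tensor->set_data_type(FP32);
+tensor->add_dims(-1);  // the unknown batch size is saved as -1
+tensor->add_dims(784);
+var.mutable_lod_desc()->set_lod_level(0);
+```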
+
+## Definition of Variable in Python
+
+For `Variable` in Python, please refer to the [`Python API`](./python_api.md).
diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst
new file mode 100644
index 0000000000..ed8a0c7e87
--- /dev/null
+++ b/doc/faq/build_and_install/index_cn.rst
@@ -0,0 +1,139 @@
+###################
+编译安装与单元测试
+###################
+
+..  contents::
+
+1. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
+----------------------------------------------------------------
+
+用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
+具体的解决方法是:
+
+..  code-block:: bash
+
+    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    $ docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
+
+
+2. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
+----------------------------------------------------------------
+
+这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是,
+用户强制指定特定的Python版本,具体操作如下:
+
+    ..  code-block:: bash
+
+        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
+
+用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
+
+3. CMake源码编译,Paddle版本号为0.0.0
+--------------------------------------
+
+如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+
+那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。
+
+4. paddlepaddle\*.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
+
+出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。最新的paddlepaddle python安装包支持Linux x86_64和MacOS 10.12操作系统,并安装了python 2.7和pip 9.0.1。
+
+更新 :code:`pip` 包的方法是\:
+
+..  code-block:: bash
+
+    pip install --upgrade pip
+
+如果还不行,可以执行 :code:`python -c "import pip; print(pip.pep425tags.get_supported())"` 获取当前系统支持的python包的后缀,
+并对比是否和正在安装的后缀一致。
+
+如果系统支持的是 :code:`linux_x86_64` 而安装包是 :code:`manylinux1_x86_64` ,需要升级pip版本到最新;
+如果系统支持 :code:`manylinux1_x86_64` 而安装包(本地)是 :code:`linux_x86_64` ,可以重命名这个whl包为 :code:`manylinux1_x86_64` 再安装。
+
+5. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
+------------------------------------------------------------------------------------------
+先查看一下是否曾经安装过paddle v1版本,有的话需要先卸载:
+
+..  code-block:: bash
+
+    pip uninstall py_paddle paddle
+
+然后安装paddle的python环境, 在build目录下执行
+
+..  code-block:: bash
+
+    pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
+
+6. 遇到“非法指令”或者是“illegal instruction”
+--------------------------------------------
+
+PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。
+
+7.  python相关的单元测试都过不了
+--------------------------------
+
+如果出现以下python相关的单元测试都过不了的情况:
+
+..  code-block:: bash
+
+    24 - test_PyDataProvider (Failed)
+    26 - test_RecurrentGradientMachine (Failed)
+    27 - test_NetworkCompare (Failed)
+    28 - test_PyDataProvider2 (Failed)
+    32 - test_Prediction (Failed)
+    33 - test_Compare (Failed)
+    34 - test_Trainer (Failed)
+    35 - test_TrainerOnePass (Failed)
+    36 - test_CompareTwoNets (Failed)
+    37 - test_CompareTwoOpts (Failed)
+    38 - test_CompareSparse (Failed)
+    39 - test_recurrent_machine_generation (Failed)
+    40 - test_PyDataProviderWrapper (Failed)
+    41 - test_config_parser (Failed)
+    42 - test_swig_api (Failed)
+    43 - layers_test (Failed)
+
+并且查询PaddlePaddle单元测试的日志,提示:
+
+..  code-block:: bash
+
+    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
+    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
+
+解决办法是:
+
+* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
+
+8. 下载MKLML库失败
+------------------
+
+..  code-block:: bash
+
+    make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4
+    make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2
+    make[1]: *** 正在等待未完成的任务....
+
+原因:网速或SSL链接原因,导致MKLML库下载不成功。
+
+解决办法是:手动下载并安装,具体步骤如下。
+
+..  code-block:: bash
+
+    # 1. 进入对应的目录
+    cd build/third_party/mklml/src/extern_mklml
+
+    # 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败:
+    du -sh mklml_lnx_2018.0.1.20171007.tgz
+
+    # 3. 手动下载且解压缩,并手动生成download成功标签:
+    wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz
+    tar zxf mklml_lnx_2018.0.1.20171007.tgz
+    touch ../extern_mklml-stamp/extern_mklml-download
+
+    # 4. 接着编译即可
diff --git a/doc/faq/cluster/index_cn.rst b/doc/faq/cluster/index_cn.rst
new file mode 100644
index 0000000000..e59c1e1a54
--- /dev/null
+++ b/doc/faq/cluster/index_cn.rst
@@ -0,0 +1,17 @@
+###############
+集群训练与预测
+###############
+
+..  contents::
+
+1. 集群多节点训练,日志中保存均为网络通信类错误
+------------------------------------------------
+
+集群多节点训练,日志报错为网络通信类错误,比如 :code:`Connection reset by peer` 等。
+此类报错通常是由于某一个节点的错误导致这个节点的训练进程退出,从而引发其他节点无法连接导致,可以参考下面的步骤排查:
+
+* 从 :code:`train.log` , :code:`server.log` 找到最早报错的地方,查看是否是其他错误引发的报错(比如FPE,内存不足,磁盘空间不足等)。
+
+* 如果发现最早的报错就是网络通信的问题,很有可能是非独占方式执行导致的端口冲突,可以联系OP,看当前MPI集群是否支持resource=full参数提交,如果支持增加此参数提交,并更换job 端口。
+
+* 如果当前MPI集群并不支持任务独占模式,可以联系OP是否可以更换集群或升级当前集群。
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index c14160d55e..9929767cac 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -1,313 +1,11 @@
-####################
 FAQ
-####################
+====
 
-..  contents::
+..  toctree::
+  :maxdepth: 1
 
-1. 如何减少内存占用
----------------------------------
-
-神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。
-PaddlePaddle的内存占用主要分为如下几个方面\:
-
-* DataProvider缓冲池内存(只针对内存)
-* 神经元激活内存(针对内存和显存)
-* 参数内存 (针对内存和显存)
-* 其他内存杂项
-
-其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。
-
-减少DataProvider缓冲池内存
-++++++++++++++++++++++++++
-
-PyDataProvider使用的是异步加载,同时在内存里直接随即选取数据来做Shuffle。即
-
-..  graphviz::
-
-    digraph {
-        rankdir=LR;
-        数据文件 -> 内存池 -> PaddlePaddle训练
-    }
-
-所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
-个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
-那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
-
-..  literalinclude:: src/reduce_min_pool_size.py
-
-这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。
-
-神经元激活内存
-++++++++++++++
-
-神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。
-在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
-一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
-的时间步信息成正比。
-
-所以做法可以有两种:
-
-* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
-* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
-  但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。
-
-参数内存
-++++++++
-
-PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
-例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录
-文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
-
-可以考虑使用一些优化算法,例如 :code:`momentum`。
-
-2. 如何加速PaddlePaddle的训练速度
----------------------------------
-
-加速PaddlePaddle训练可以考虑从以下几个方面\:
-
-* 减少数据载入的耗时
-* 加速训练速度
-* 利用分布式训练驾驭更多的计算资源
-
-减少数据载入的耗时
-++++++++++++++++++
-
-使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
-:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。
-
-..  literalinclude:: src/reduce_min_pool_size.py
-
-同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
-
-
-加速训练速度
-++++++++++++
-
-PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`
-
-这里使用简单的 :code:`word2vec` 训练语言模型距离,具体使用方法为\:
-
-使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
-
-..  literalinclude:: src/word2vec_dataprovider.py
-
-这个任务的配置为\:
-
-..  literalinclude:: src/word2vec_config.py
-
-
-利用更多的计算资源
-++++++++++++++++++
-
-利用更多的计算资源可以分为一下几个方式来进行\:
-
-* 单机CPU训练
-
-  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
-
-* 单机GPU训练
-
-  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
-  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
-
-* 多机训练
-
-  * 请参考 :ref:`cluster_train` 。
-
-
-3. 遇到“非法指令”或者是“illegal instruction”
---------------------------------------------
-
-PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二进制发行版可能会导致这种错误,请选择正确的版本。
-
-4. 如何选择SGD算法的学习率
---------------------------
-
-在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
-
-通常做法是从一个比较大的learning_rate开始试,如果不收敛,那减少学习率10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以估计出如果模型采用不变的输出最小的cost0是多少。
-
-如果训练过程的的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
-
-
-5. 如何初始化参数
------------------
-
-默认情况下,PaddlePaddle使用均值0,标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
-
-* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
-* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
-
-比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
-
-..  code-block:: python
-
-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
-                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
-
-上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
-
-6. 如何共享参数
----------------
-
-PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
-
-简单的全连接网络,参数共享的配置示例为\:
-
-..  literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
-
-这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
-
-7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
-------------------------------------------------------------------------
-
-出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的,
-而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。
-更新 :code:`pip` 包的方法是\:
-
-..  code-block:: bash
-
-    pip install --upgrade pip
-
-8.  python相关的单元测试都过不了
---------------------------------
-
-如果出现以下python相关的单元测试都过不了的情况:
-
-..  code-block:: bash
-
-    24 - test_PyDataProvider (Failed)
-    26 - test_RecurrentGradientMachine (Failed)
-    27 - test_NetworkCompare (Failed)
-    28 - test_PyDataProvider2 (Failed)
-    32 - test_Prediction (Failed)
-    33 - test_Compare (Failed)
-    34 - test_Trainer (Failed)
-    35 - test_TrainerOnePass (Failed)
-    36 - test_CompareTwoNets (Failed)
-    37 - test_CompareTwoOpts (Failed)
-    38 - test_CompareSparse (Failed)
-    39 - test_recurrent_machine_generation (Failed)
-    40 - test_PyDataProviderWrapper (Failed)
-    41 - test_config_parser (Failed)
-    42 - test_swig_api (Failed)
-    43 - layers_test (Failed)
-
-并且查询PaddlePaddle单元测试的日志,提示:
-
-..  code-block:: bash
-
-    paddle package is already in your PYTHONPATH. But unittest need a clean environment.
-    Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-
-解决办法是:
-
-* 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。
-
-
-9. 运行Docker GPU镜像出现 "CUDA driver version is insufficient"
-----------------------------------------------------------------
-
-用户在使用PaddlePaddle GPU的Docker镜像的时候,常常出现 `Cuda Error: CUDA driver version is insufficient for CUDA runtime version`, 原因在于没有把机器上CUDA相关的驱动和库映射到容器内部。
-具体的解决方法是:
-
-..  code-block:: bash
-
-    $ export CUDA_SO="$(\ls usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddlepaddle:latest-gpu
-
-更多关于Docker的安装与使用, 请参考 `PaddlePaddle Docker 文档 <http://www.paddlepaddle.org/doc_cn/build_and_install/install/docker_install.html>`_ 。
-
-
-10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
-----------------------------------------------------------------
-
-这是目前CMake寻找Python的逻辑存在缺陷,如果系统安装了多个Python版本,CMake找到的Python库和Python解释器版本可能有不一致现象,导致编译PaddlePaddle失败。正确的解决方法是,
-用户强制指定特定的Python版本,具体操作如下:
-
-    ..  code-block:: bash
-
-        cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
-
-用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
-
-11. CMake源码编译,Paddle版本号为0.0.0
---------------------------------------
-
-如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`;或者运行 :code:`cmake ..`,出现
-
-..  code-block:: bash
-
-    CMake Warning at cmake/version.cmake:20 (message):
-      Cannot add paddle version from git tag
-          
-那么用户需要拉取所有的远程分支到本机,命令为 :code:`git fetch upstream`,然后重新cmake即可。
-
-12. A protocol message was rejected because it was too big
-----------------------------------------------------------
-
-如果在训练NLP相关模型时,出现以下错误:
-
-..  code-block:: bash
-
-    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
-    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
-
-可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:
-
-..  code-block:: python
-
-     src_dict = dict()
-     for line_count, line in enumerate(open(src_dict_path, "r")):
-        src_dict[line.strip()] = line_count
-
-     define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={"src_dict": src_dict})
-
-解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:
-
-..  code-block:: python
-
-     define_py_data_sources2(
-        train_list,
-        test_list,
-        module="dataprovider",
-        obj="process",
-        args={"src_dict_path": src_dict_path})
-
-完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
-
-13. 如何指定GPU设备
--------------------
-
-例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU:
-
-* 方式1:通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
-
-..      code-block:: bash
-
-        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
-
-* 方式2:通过命令行参数 ``--gpu_id`` 指定。
-
-..      code-block:: bash
-
-        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-
-
-14. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
-------------------------------------------------------------------------
-
-Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
-主要原因包括两个方面:
-
-* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。
-* 模型一直不收敛,发散到了一个数值特别大的地方。
-* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
-
-主要的解决办法是减小学习律或者对数据进行归一化处理。
+  build_and_install/index_cn.rst
+  model/index_cn.rst
+  parameter/index_cn.rst
+  local/index_cn.rst
+  cluster/index_cn.rst
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
new file mode 100644
index 0000000000..0306b1e5dd
--- /dev/null
+++ b/doc/faq/local/index_cn.rst
@@ -0,0 +1,259 @@
+###############
+本地训练与预测
+###############
+
+..  contents::
+
+1. 如何减少内存占用
+-------------------
+
+神经网络的训练本身是一个非常消耗内存和显存的工作,经常会消耗数10GB的内存和数GB的显存。
+PaddlePaddle的内存占用主要分为如下几个方面\:
+
+* DataProvider缓冲池内存(只针对内存)
+* 神经元激活内存(针对内存和显存)
+* 参数内存 (针对内存和显存)
+* 其他内存杂项
+
+其中,其他内存杂项是指PaddlePaddle本身所用的一些内存,包括字符串分配,临时变量等等,暂不考虑在内。
+
+减少DataProvider缓冲池内存
+++++++++++++++++++++++++++
+
+PyDataProvider使用的是异步加载,同时在内存里直接随机选取数据来做Shuffle。即
+
+..  graphviz::
+
+    digraph {
+        rankdir=LR;
+        数据文件 -> 内存池 -> PaddlePaddle训练
+    }
+
+所以,减小这个内存池即可减小内存占用,同时也可以加速开始训练前数据载入的过程。但是,这
+个内存池实际上决定了shuffle的粒度。所以,如果将这个内存池减小,又要保证数据是随机的,
+那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+这样做可以极大的减少内存占用,并且可能会加速训练过程,详细文档参考 :ref:`api_pydataprovider2` 。
+
+神经元激活内存
+++++++++++++++
+
+神经网络在训练的时候,会对每一个激活暂存一些数据,如神经元激活值等。
+在反向传递的时候,这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系,
+一是batch size,另一个是每条序列(Sequence)长度。所以,其实也是和每个mini-batch中包含
+的时间步信息成正比。
+
+所以做法可以有两种:
+
+* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数,减小batch size可能会对训练结果产生影响。
+* 减小序列的长度,或者直接扔掉非常长的序列。比如,一个数据集大部分序列长度是100-200,
+  但是突然有一个10000长的序列,就很容易导致内存超限,特别是在LSTM等RNN中。
+
+参数内存
+++++++++
+
+PaddlePaddle支持非常多的优化算法(Optimizer),不同的优化算法需要使用不同大小的内存。
+例如使用 :code:`adadelta` 算法,则需要使用等于权重参数规模大约5倍的内存。举例,如果参数保存下来的模型目录
+文件为 :code:`100M`, 那么该优化算法至少需要 :code:`500M` 的内存。
+
+可以考虑使用一些优化算法,例如 :code:`momentum`。
+
+2. 如何加速训练速度
+-------------------
+
+加速PaddlePaddle训练可以考虑从以下几个方面\:
+
+* 减少数据载入的耗时
+* 加速训练速度
+* 利用分布式训练驾驭更多的计算资源
+
+减少数据载入的耗时
+++++++++++++++++++
+
+使用\ :code:`pydataprovider`\ 时,可以减少缓存池的大小,同时设置内存缓存功能,即可以极大的加速数据载入流程。
+:code:`DataProvider` 缓存池的减小,和之前减小通过减小缓存池来减小内存占用的原理一致。
+
+..  literalinclude:: src/reduce_min_pool_size.py
+
+同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法,将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话,会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里,在之后的 :code:`pass` 中,不会再从 :code:`python` 端读取数据,而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
+
+
+加速训练速度
+++++++++++++
+
+PaddlePaddle支持Sparse的训练,sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时,与这个训练数据交互的Layer,需要将其Parameter设置成 sparse 更新模式,即设置 :code:`sparse_update=True`
+
+这里以简单的 :code:`word2vec` 训练语言模型为例,具体使用方法为\:
+
+使用一个词前两个词和后两个词,来预测这个中间的词。这个任务的DataProvider为\:
+
+..  literalinclude:: src/word2vec_dataprovider.py
+
+这个任务的配置为\:
+
+..  literalinclude:: src/word2vec_config.py
+
+
+利用更多的计算资源
+++++++++++++++++++
+
+利用更多的计算资源可以分为以下几个方式来进行\:
+
+* 单机CPU训练
+
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`。
+
+* 单机GPU训练
+
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count` 。
+
+* 多机训练
+
+  * 请参考 :ref:`cluster_train` 。
+
+3. 如何指定GPU设备
+------------------
+
+例如机器上有4块GPU,编号从0开始,指定使用2、3号GPU:
+
+* 方式1:通过 `CUDA_VISIBLE_DEVICES <http://www.acceleware.com/blog/cudavisibledevices-masking-gpus>`_ 环境变量来指定特定的GPU。
+
+..      code-block:: bash
+
+        env CUDA_VISIBLE_DEVICES=2,3 paddle train --use_gpu=true --trainer_count=2
+
+* 方式2:通过命令行参数 ``--gpu_id`` 指定。
+
+..      code-block:: bash
+
+        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+4. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------
+
+Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
+主要原因包括两个方面:
+
+* 训练过程中参数或者训练过程中的梯度尺度过大,导致参数累加,乘除等时候,导致了浮点数溢出。
+* 模型一直不收敛,发散到了一个数值特别大的地方。
+* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
+
+这里有两种有效的解决方法:
+
+1. 设置 :code:`gradient_clipping_threshold` 参数,示例代码如下:
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.RMSProp(
+        learning_rate=1e-3,
+        gradient_clipping_threshold=10.0,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+具体可以参考 `nmt_without_attention <https://github.com/PaddlePaddle/models/blob/develop/nmt_without_attention/train.py#L35>`_ 示例。
+
+2. 设置 :code:`error_clipping_threshold` 参数,示例代码如下:
+
+..  code-block:: python
+
+    decoder_inputs = paddle.layer.fc(
+        act=paddle.activation.Linear(),
+        size=decoder_size * 3,
+        bias_attr=False,
+        input=[context, current_word],
+        layer_attr=paddle.attr.ExtraLayerAttribute(
+            error_clipping_threshold=100.0))
+
+完整代码可以参考示例 `machine translation <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py#L66>`_ 。
+
+两种方法的区别:
+
+1. 两者都是对梯度的截断,但截断时机不同,前者在 :code:`optimizer` 更新网络参数时应用;后者在激活函数反向计算时被调用;
+2. 截断对象不同:前者截断可学习参数的梯度,后者截断回传给前层的梯度;
+
+除此之外,还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。
+
+5.  如何调用 infer 接口输出多个layer的预测结果
+-----------------------------------------------
+
+* 将需要输出的层作为 :code:`paddle.inference.Inference()` 接口的 :code:`output_layer` 参数输入,代码如下:
+
+..  code-block:: python
+
+    inferer = paddle.inference.Inference(output_layer=[layer1, layer2], parameters=parameters)
+
+* 指定要输出的字段进行输出。以输出 :code:`value` 字段为例,代码如下:
+
+..  code-block:: python
+
+    out = inferer.infer(input=data_batch, field=["value"])
+
+需要注意的是:
+
+* 如果指定了2个layer作为输出层,实际上得到的输出结果是两个矩阵;
+* 假设第一个layer的输出A是一个 N1 * M1 的矩阵,第二个 Layer 的输出B是一个 N2 * M2 的矩阵;
+* paddle.v2 默认会将A和B 横向拼接,当N1 和 N2 大小不一样时,会报如下的错误:
+
+..      code-block:: python
+
+    ValueError: all the input array dimensions except for the concatenation axis must match exactly
+
+多个层的输出矩阵的高度不一致导致拼接失败,这种情况常常发生在:
+
+* 同时输出序列层和非序列层;
+* 多个输出层处理多个不同长度的序列;
+
+此时可以在调用infer接口时通过设置 :code:`flatten_result=False` 跳过“拼接”步骤来解决上面的问题(调用示例见下)。这时,infer接口的返回值是一个python list:
+
+* list 中元素的个数等于网络中输出层的个数;
+* list 中每个元素是一个layer的输出结果矩阵,类型是numpy的ndarray;
+* 每一个layer输出矩阵的高度:在非序列输入时等于样本数,在序列输入时等于输入序列中元素的总数;宽度等于配置中layer的size;
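+
+一个最小的调用示意(沿用上文的 :code:`inferer` 与 :code:`data_batch` ):
+
+..  code-block:: python
+
+    outs = inferer.infer(input=data_batch, flatten_result=False, field=["value"])
+    # outs 是一个list,每个元素是对应输出layer的numpy.ndarray结果
+    for i, mat in enumerate(outs):
+        print("layer %d output shape: %s" % (i, mat.shape))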
+
+6.  如何在训练过程中获得某一个layer的output
+-----------------------------------------------
+
+可以在event_handler中,通过 :code:`event.gm.getLayerOutputs("layer_name")` ,获得模型配置中name为 :code:`layer_name` 的layer在当前
+mini-batch前向计算后的输出值。获得的值类型均为 :code:`numpy.ndarray` ,可以通过这个输出来完成自定义的评估指标计算等功能。例如下面代码:
+
+..      code-block:: python
+
+        def score_diff(right_score, left_score):
+            return np.average(np.abs(right_score - left_score))
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 25 == 0:
+                    diff = score_diff(
+                        event.gm.getLayerOutputs("right_score")["right_score"][
+                            "value"],
+                        event.gm.getLayerOutputs("left_score")["left_score"][
+                            "value"])
+                    logger.info(("Pass %d Batch %d : Cost %.6f, "
+                                "average absolute diff scores: %.6f") %
+                                (event.pass_id, event.batch_id, event.cost, diff))
+
+注意:此方法不能获取 :code:`paddle.layer.recurrent_group` 里step的内容,但可以获取 :code:`paddle.layer.recurrent_group` 的输出。
+
+7.  如何在训练过程中获得参数的权重和梯度
+-----------------------------------------------
+
+在某些情况下,获得当前mini-batch的权重(或称作weights, parameters)有助于在训练时观察具体数值,方便排查以及快速定位问题。
+可以通过在 :code:`event_handler` 中打印其值(注意,需要使用 :code:`paddle.event.EndForwardBackward` 保证使用GPU训练时也可以获得),
+示例代码如下:
+
+..      code-block:: python
+
+        ...
+        parameters = paddle.parameters.create(cost)
+        ...
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndForwardBackward):
+                if event.batch_id % 25 == 0:
+                    for p in parameters.keys():
+                        logger.info("Param %s, Grad %s",
+                            parameters.get(p), parameters.get_grad(p))
+
+注意:“在训练过程中获得某一个layer的output”和“在训练过程中获得参数的权重和梯度”都会造成训练中的数据从C++端拷贝到numpy端,会对训练性能造成影响。请不要在注重性能的训练场景下使用。
\ No newline at end of file
diff --git a/doc/faq/local/src/reduce_min_pool_size.py b/doc/faq/local/src/reduce_min_pool_size.py
new file mode 100644
index 0000000000..9efdb5707a
--- /dev/null
+++ b/doc/faq/local/src/reduce_min_pool_size.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+@provider(min_pool_size=0, ...)
+def process(settings, filename):
+    os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle before.
+    with open('%s.shuf' % filename, 'r') as f:
+        for line in f:
+            yield get_sample_from_line(line)
diff --git a/paddle/setup.py.in b/doc/faq/local/src/word2vec_config.py
similarity index 50%
rename from paddle/setup.py.in
rename to doc/faq/local/src/word2vec_config.py
index 06d55d3abc..b4fcf0960e 100644
--- a/paddle/setup.py.in
+++ b/doc/faq/local/src/word2vec_config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+...  # the settings and the data provider definition are omitted.
+DICT_DIM = 3000  # dictionary dimension.
+word_ids = data_layer('word_ids', size=DICT_DIM)
 
-from setuptools import setup, Extension
-
-setup(name="py_paddle",
-      version="${PADDLE_VERSION}",
-      packages=['py_paddle'],
-      include_package_data=True,
-      package_data={'py_paddle':['*.py','_swig_paddle.so']},
-      install_requires = [
-        'nltk>=3.2.2',
-        'numpy>=1.8.0',      # The numpy is required.
-        'protobuf==${PROTOBUF_VERSION}'    # The paddle protobuf version
-      ],
-      url='http://www.paddlepaddle.org/',
-      license='Apache 2.0',
-)
+emb = embedding_layer(
+    input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
+emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
+predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            'label', size=DICT_DIM)))
diff --git a/v1_api_demo/model_zoo/resnet/get_model.sh b/doc/faq/local/src/word2vec_dataprovider.py
old mode 100755
new mode 100644
similarity index 58%
rename from v1_api_demo/model_zoo/resnet/get_model.sh
rename to doc/faq/local/src/word2vec_dataprovider.py
index b33d8178ab..3b6273b057
--- a/v1_api_demo/model_zoo/resnet/get_model.sh
+++ b/doc/faq/local/src/word2vec_dataprovider.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,21 +11,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
+DICT_DIM = 3000
 
-mkdir model
-cd model
 
-echo "Downloading ResNet models..."
-
-for file in resnet_50.tar.gz resnet_101.tar.gz resnet_152.tar.gz mean_meta_224.tar.gz 
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/imagenet/$file
-  tar -xvf $file 
-  rm $file
-done
-
-echo "Done."
+@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
+def process(settings, filename):
+    with open(filename) as f:
+        # yield word ids to predict inner word id
+        # such as [28, 29, 10, 4], 4
+        # It means the sentence is 28, 29, 4, 10, 4.
+        yield read_next_from_file(f)
diff --git a/doc/faq/model/index_cn.rst b/doc/faq/model/index_cn.rst
new file mode 100644
index 0000000000..6947948bc7
--- /dev/null
+++ b/doc/faq/model/index_cn.rst
@@ -0,0 +1,80 @@
+#########
+模型配置
+#########
+
+..  contents::
+
+1. 出现 :code:`Duplicated layer name` 错误怎么办
+--------------------------------------------------
+
+出现该错误的原因一般是用户对不同layer的参数 :code:`name` 设置了相同的取值。遇到该错误时,先找出参数 :code:`name` 取值相同的layer,然后将这些layer的参数 :code:`name` 设置为不同的值。
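+
+例如下面的示意代码中(假设 :code:`data` 已定义),两个fc层必须使用不同的 :code:`name` 取值:
+
+..  code-block:: python
+
+    fc1 = paddle.layer.fc(input=data, size=128, name="fc_layer_1")
+    fc2 = paddle.layer.fc(input=fc1, size=128, name="fc_layer_2")  # 不可与fc1同名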
+
+2. :code:`paddle.layer.memory` 的参数 :code:`name` 如何使用
+-------------------------------------------------------------
+
+* :code:`paddle.layer.memory` 用于获取特定layer上一时间步的输出,该layer是通过参数 :code:`name` 指定,即,:code:`paddle.layer.memory` 会关联参数 :code:`name` 取值相同的layer,并将该layer上一时间步的输出作为自身当前时间步的输出。
+
+* PaddlePaddle的所有layer都有唯一的name,用户通过参数 :code:`name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。而 :code:`paddle.layer.memory` 不是真正的layer,其name由参数 :code:`memory_name` 设定,当用户没有显式设定时,PaddlePaddle会自动设定。:code:`paddle.layer.memory` 的参数 :code:`name` 用于指定其要关联的layer,需要用户显式设定。
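+
+下面是一个最小示意(假设 :code:`hidden_dim` 已定义):在recurrent_group的step函数中,:code:`paddle.layer.memory` 通过 :code:`name="rnn_state"` 关联同名的fc层,从而取得其上一时间步的输出:
+
+..  code-block:: python
+
+    def step(input):
+        # memory关联name为"rnn_state"的layer的上一时间步输出
+        mem = paddle.layer.memory(name="rnn_state", size=hidden_dim)
+        return paddle.layer.fc(input=[input, mem],
+                               size=hidden_dim,
+                               act=paddle.activation.Tanh(),
+                               name="rnn_state")  # 与memory的name取值相同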
+
+3. 两种使用 drop_out 的方法有何区别
+------------------------------------
+
+* 在PaddlePaddle中使用dropout有两种方式
+
+  * 在相应layer的 :code:`layer_attr` 中设置 :code:`drop_rate`,以 :code:`paddle.layer.fc` 为例,代码如下:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))
+
+  * 使用 :code:`paddle.layer.dropout`,以 :code:`paddle.layer.fc` 为例,代码如下:
+
+  ..  code-block:: python
+
+      fc = paddle.layer.fc(input=input)
+      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)
+
+* :code:`paddle.layer.dropout` 实际上使用了 :code:`paddle.layer.add_to`,并在该layer里采用第一种方式设置 :code:`drop_rate` 来使用dropout。这种方式对内存消耗较大。
+
+* PaddlePaddle在激活函数里实现dropout,而不是在layer里实现。
+
+* :code:`paddle.layer.lstmemory`、:code:`paddle.layer.grumemory`、:code:`paddle.layer.recurrent` 不是通过一般的方式来实现对输出的激活,所以不能采用第一种方式在这几个layer里设置 :code:`drop_rate` 来使用dropout。若要对这几个layer使用dropout,可采用第二种方式,即使用 :code:`paddle.layer.dropout`。
+
+4. 不同的 recurrent layer 的区别
+----------------------------------
+以LSTM为例,在PaddlePaddle中包含以下 recurrent layer:
+
+* :code:`paddle.layer.lstmemory`
+* :code:`paddle.networks.simple_lstm`
+* :code:`paddle.networks.lstmemory_group`
+* :code:`paddle.networks.bidirectional_lstm`
+
+按照具体实现方式可以归纳为两类:
+
+1. 由 recurrent_group 实现的 recurrent layer:
+
+  * 用户在使用这一类recurrent layer时,可以访问由recurrent unit在一个时间步内计算得到的中间值(例如:hidden states, memory cells等);
+  * 上述的 :code:`paddle.networks.lstmemory_group` 是这一类的 recurrent layer ;
+
+2. 将recurrent layer作为一个整体来实现:
+
+  * 用户在使用这一类recurrent layer时,只能访问它们的输出值;
+  * 上述的 :code:`paddle.layer.lstmemory` 、 :code:`paddle.networks.simple_lstm` 和 :code:`paddle.networks.bidirectional_lstm` 属于这一类的实现;
+
+将recurrent layer作为一个整体来实现,能够针对CPU和GPU的计算做更多优化,所以相比于recurrent group的实现方式,第二类recurrent layer的计算效率更高。在实际应用中,如果用户不需要访问LSTM的中间变量,而只需要获得recurrent layer计算的输出,我们建议使用第二类实现。
+
+此外,关于LSTM, PaddlePaddle中还包含 :code:`paddle.networks.lstmemory_unit` 这一计算单元:
+
+  * 不同于上述介绍的recurrent layer , :code:`paddle.networks.lstmemory_unit` 定义了LSTM单元在一个时间步内的计算过程,它并不是一个完整的recurrent layer,也不能接收序列数据作为输入;
+  * :code:`paddle.networks.lstmemory_unit` 只能在recurrent_group中作为step function使用;
+
+5. PaddlePaddle的softmax能否指定计算的维度
+-----------------------------------------
+
+PaddlePaddle的softmax不能指定计算维度,只能按行计算。
+在图像任务中,对于NCHW,如果需要在C维度计算softmax,可以先使用 :code:`paddle.layer.switch_order` 改变维度顺序,即将NCHW转换成NHWC,再做一定的reshape,最后计算softmax。
+
+6. PaddlePaddle是否支持维数可变的数据输入
+------------------------------------------
+
+PaddlePaddle提供的 :code:`paddle.data_type.dense_array` 支持维数可变的数据输入。在使用时,将对应数据层的维数设置成一个大于输入数据维数的值用于占位即可。
diff --git a/doc/faq/parameter/index_cn.rst b/doc/faq/parameter/index_cn.rst
new file mode 100644
index 0000000000..6fa0c64413
--- /dev/null
+++ b/doc/faq/parameter/index_cn.rst
@@ -0,0 +1,201 @@
+#########
+参数设置
+#########
+
+..  contents::
+
+1. 如何选择SGD算法的学习率
+--------------------------
+
+在采用sgd/async_sgd进行训练时,一个重要的问题是选择正确的learning_rate。如果learning_rate太大,那么训练有可能不收敛,如果learning_rate太小,那么收敛可能很慢,导致训练时间过长。
+
+通常做法是从一个比较大的learning_rate开始试,如果不收敛,则将学习率缩小10倍继续试验,直到训练收敛为止。那么如何判断训练不收敛呢?可以先估计出:如果模型始终输出同一个常数,所能达到的最小cost(记为cost0)是多少。
+
+如果训练过程的cost明显高于这个常数输出的cost,那么我们可以判断为训练不收敛。举一个例子,假如我们是三分类问题,采用multi-class-cross-entropy作为cost,数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass(或者更早)后,cost还大于这个数,那么可以认为训练不收敛,应该降低学习率。
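+
+这个最小cost可以用numpy直接验证(仅为示意):
+
+..  code-block:: python
+
+    import numpy as np
+
+    p = np.array([0.2, 0.5, 0.3])   # 三个类别的比例
+    cost0 = -np.sum(p * np.log(p))  # 常数输出所能达到的最小cost
+    print(cost0)                    # 约为1.03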
+
+2. 如何设置学习率退火(learning rate annealing)
+------------------------------------------------
+
+在相应的优化算法里设置learning_rate_schedule及相关参数,以使用Adam算法为例,代码如下:
+
+..  code-block:: python
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3,
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=0.75,
+        learning_rate_schedule="poly",)
+
+PaddlePaddle目前支持8种learning_rate_schedule,这8种learning_rate_schedule及其对应学习率计算方式如下:
+
+* "constant"
+  
+  lr = learning_rate
+
+* "poly"
+
+  lr = learning_rate * pow(1 + learning_rate_decay_a * num_samples_processed, -learning_rate_decay_b)
+
+  其中,num_samples_processed为已训练样本数,下同。
+
+* "caffe_poly"
+
+  lr = learning_rate * pow(1.0 - num_samples_processed / learning_rate_decay_a, learning_rate_decay_b)
+
+* "exp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, num_samples_processed / learning_rate_decay_b)
+
+* "discexp"
+
+  lr = learning_rate * pow(learning_rate_decay_a, floor(num_samples_processed / learning_rate_decay_b))
+
+* "linear"
+
+  lr = max(learning_rate - learning_rate_decay_a * num_samples_processed, learning_rate_decay_b)
+
+* "manual"
+
+  这是一种按已训练样本数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下:
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="manual",
+          learning_rate_args="1000:1.0,2000:0.9,3000:0.8",)
+
+  在该示例中,当已训练样本数小于等于1000时,学习率为 :code:`1e-3 * 1.0`;当已训练样本数大于1000小于等于2000时,学习率为 :code:`1e-3 * 0.9`;当已训练样本数大于2000时,学习率为 :code:`1e-3 * 0.8`。
+
+* "pass_manual"
+
+  这是一种按已训练pass数分段取值的学习率退火方法。使用该learning_rate_schedule时,用户通过参数 :code:`learning_rate_args` 设置学习率衰减因子分段函数,当前的学习率为所设置 :code:`learning_rate` 与当前的衰减因子的乘积。以使用Adam算法为例,代码如下:
+
+  ..  code-block:: python
+
+      optimizer = paddle.optimizer.Adam(
+          learning_rate=1e-3,
+          learning_rate_schedule="pass_manual",
+          learning_rate_args="1:1.0,2:0.9,3:0.8",)
+
+  在该示例中,当已训练pass数小于等于1时,学习率为 :code:`1e-3 * 1.0`;当已训练pass数大于1小于等于2时,学习率为 :code:`1e-3 * 0.9`;当已训练pass数大于2时,学习率为 :code:`1e-3 * 0.8`。
+
+3. 如何初始化参数
+-----------------
+
+默认情况下,PaddlePaddle使用均值为0、标准差为 :math:`\frac{1}{\sqrt{d}}` 的分布来初始化参数,其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式,PaddlePaddle目前提供两种参数初始化的方式\:
+
+* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+
+比如设置一个全连接层的参数初始化方式和bias初始化方式,可以使用如下代码。
+
+..  code-block:: python
+
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+
+上述代码将bias全部初始化为1.0, 同时将参数初始化为区间 :code:`[-1.0, 1.0]` 上的均匀分布。
+
+4. 如何共享参数
+---------------
+
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字的参数,会共享参数。设置参数的名字,可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式,是使得要共享的参数使用同样的 :code:`ParamAttr` 对象。
+
+简单的全连接网络,参数共享的配置示例为\:
+
+..  literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+
+这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
+
+5. 如何加载预训练参数
+------------------------
+
+* 对加载预训练参数的层,设置其参数属性 :code:`is_static=True`,使该层的参数在训练过程中保持不变。以embedding层为例,代码如下:
+
+..  code-block:: python
+
+    emb_para = paddle.attr.Param(name='emb', is_static=True)
+    paddle.layer.embedding(size=word_dim, input=x, param_attr=emb_para)
+
+
+* 从模型文件将预训练参数载入 :code:`numpy.array`,在创建parameters后,使用 :code:`parameters.set()` 加载预训练参数。PaddlePaddle保存的模型参数文件前16字节为头信息,用户将参数载入 :code:`numpy.array` 时须从第17字节开始。以embedding层为例,代码如下:
+
+..  code-block:: python
+
+    import numpy as np
+
+    def load_parameter(file_name, h, w):
+        with open(file_name, 'rb') as f:
+            f.read(16)  # skip header.
+            return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+    parameters = paddle.parameters.create(my_cost)
+    parameters.set('emb', load_parameter(emb_param_file, 30000, 256))
+
+6. 存储的参数格式是什么,如何和明文进行相互转化
+--------------------------------------------------
+
+PaddlePaddle保存的模型参数文件内容由16字节头信息和网络参数两部分组成。头信息中,1~4字节表示PaddlePaddle版本信息,请直接填充0;5~8字节表示每个参数占用的字节数,当保存的网络参数为float类型时为4,double类型时为8;9~16字节表示保存的参数总个数。
+
+将PaddlePaddle保存的模型参数还原回明文时,可以使用相应数据类型的 :code:`numpy.array` 加载具体网络参数,此时可以跳过PaddlePaddle模型参数文件的头信息。若在PaddlePaddle编译时,未指定按照double精度编译,默认情况下按照float精度计算,保存的参数也是float类型。这时在使用 :code:`numpy.array` 时,一般设置 :code:`dtype=float32` 。示例如下:
+
+..  code-block:: python
+
+    import numpy as np
+
+    def read_parameter(fname, width):
+        s = open(fname, 'rb').read()
+        # skip the 16-byte header
+        vec = np.fromstring(s[16:], dtype=np.float32)
+        # width is the size of the corresponding layer
+        np.savetxt(fname + ".csv", vec.reshape(width, -1),
+                   fmt="%.6f", delimiter=",")
+
+
+将明文参数转化为PaddlePaddle可加载的模型参数时,首先构造头信息,再写入网络参数。下面的代码将随机生成的矩阵转化为可以被PaddlePaddle加载的模型参数。
+
+..  code-block:: python
+
+    import struct
+
+    import numpy as np
+
+    def gen_rand_param(param_file, width, height, need_trans):
+        np.random.seed()
+        # 头信息:版本号填0、每个参数占4字节(float)、参数总个数
+        header = struct.pack("iil", 0, 4, height * width)
+        param = np.float32(np.random.rand(height, width))
+        with open(param_file, "wb") as fparam:
+            fparam.write(header + param.tostring())
+
+7. A protocol message was rejected because it was too big
+------------------------------------------------------------
+
+如果在训练NLP相关模型时,出现以下错误:
+
+..  code-block:: bash
+
+    [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
+
+可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递了大字典导致的。出错的 :code:`define_py_data_sources2` 调用类似:
+
+..  code-block:: python
+
+     src_dict = dict()
+     for line_count, line in enumerate(open(src_dict_path, "r")):
+        src_dict[line.strip()] = line_count
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict": src_dict})
+
+解决方案是:将字典文件的路径作为args传给dataprovider,然后在dataprovider里面根据该路径加载字典。即 :code:`define_py_data_sources2` 应改为:
+
+..  code-block:: python
+
+     define_py_data_sources2(
+        train_list,
+        test_list,
+        module="dataprovider",
+        obj="process",
+        args={"src_dict_path": src_dict_path})
+
+完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
+
+
diff --git a/doc/faq/src/reduce_min_pool_size.py b/doc/faq/src/reduce_min_pool_size.py
deleted file mode 100644
index 5715397cc1..0000000000
--- a/doc/faq/src/reduce_min_pool_size.py
+++ /dev/null
@@ -1,6 +0,0 @@
-@provider(min_pool_size=0, ...)
-def process(settings, filename):
-    os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle before.
-    with open('%s.shuf' % filename, 'r') as f:
-        for line in f:
-            yield get_sample_from_line(line)
diff --git a/doc/faq/src/word2vec_config.py b/doc/faq/src/word2vec_config.py
deleted file mode 100644
index 866b40c3d4..0000000000
--- a/doc/faq/src/word2vec_config.py
+++ /dev/null
@@ -1,12 +0,0 @@
-...  # the settings and define data provider is omitted.
-DICT_DIM = 3000  # dictionary dimension.
-word_ids = data_layer('word_ids', size=DICT_DIM)
-
-emb = embedding_layer(
-    input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
-emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
-predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            'label', size=DICT_DIM)))
diff --git a/doc/faq/src/word2vec_dataprovider.py b/doc/faq/src/word2vec_dataprovider.py
deleted file mode 100644
index ec2753a7d0..0000000000
--- a/doc/faq/src/word2vec_dataprovider.py
+++ /dev/null
@@ -1,10 +0,0 @@
-DICT_DIM = 3000
-
-
-@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
-def process(settings, filename):
-    with open(filename) as f:
-        # yield word ids to predict inner word id
-        # such as [28, 29, 10, 4], 4
-        # It means the sentance is  28, 29, 4, 10, 4.
-        yield read_next_from_file(f)
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
deleted file mode 100644
index 428f58830e..0000000000
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-经典的线性回归任务
-==================
-
-PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
-
-任务简介
---------
-
-我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
-
-一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。
-
-准备数据
------------
-
-假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
-
-.. code-block:: python
-
-    # dataprovider.py
-    from paddle.trainer.PyDataProvider2 import *
-    import random
-
-    # 定义输入数据的类型: 2个浮点数
-    @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-    def process(settings, input_file):
-        for i in xrange(2000):
-            x = random.random()
-            yield [x], [2*x+0.3]
-
-训练模型
------------
-
-为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。
-
-在PaddlePaddle里,该模型的网络配置如下。
-
-.. code-block:: python
-
-    # trainer_config.py
-    from paddle.trainer_config_helpers import *
-
-    # 1. 定义数据来源,调用上面的process函数获得观测数据
-    data_file = 'empty.list'
-    with open(data_file, 'w') as f: f.writelines(' ')
-    define_py_data_sources2(train_list=data_file, test_list=None, 
-                            module='dataprovider', obj='process',args={})
-
-    # 2. 学习算法。控制如何改变模型参数 w 和 b
-    settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-    # 3. 神经网络配置
-    x = data_layer(name='x', size=1)
-    y = data_layer(name='y', size=1)
-    # 线性计算网络层: ȳ = wx + b
-    ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    # 计算误差函数,即  ȳ 和真实 y 之间的距离
-    cost = mse_cost(input= ȳ, label=y)
-    outputs(cost)
-
-
-这段简短的配置展示了PaddlePaddle的基本用法:
-
-- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。
-
-- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
-
-- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元:
-    
-    - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。
-    - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
-
-定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令:
-
-.. code-block:: bash
-
-    paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
-
-PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 `
-
-模型检验
------------
-
-训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。
-
-PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。
-
-.. code-block:: python
-
-    import numpy as np
-    import os
-
-    def load(file_name):
-        with open(file_name, 'rb') as f:
-            f.read(16) # skip header for float type.
-            return np.fromfile(f, dtype=np.float32)
-        
-    print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-    # w=1.999743, b=0.300137
-
-.. image:: ./parameters.png
-     :align: center
-     :scale: 80 %
-
-从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。
-
-这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
deleted file mode 100644
index 6775da20c2..0000000000
--- a/doc/getstarted/basic_usage/index_en.rst
+++ /dev/null
@@ -1,101 +0,0 @@
-Simple Linear Regression
-========================
-
-PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
-
-Problem Background
-------------------
-
-Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
-
-Prepare the Data
------------------
-
-Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
-
-    .. code-block:: python
-
-        # dataprovider.py
-        from paddle.trainer.PyDataProvider2 import *
-        import random
-
-        # define data types of input: 2 real numbers
-        @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
-        def process(settings, input_file):
-            for i in xrange(2000):
-                x = random.random()
-                yield [x], [2*x+0.3]
-
-Train a NeuralNetwork
-----------------------
-
-To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
-
-    .. code-block:: python
-
-        # trainer_config.py
-        from paddle.trainer_config_helpers import *
-
-        # 1. read data. Suppose you saved above python code as dataprovider.py
-        data_file = 'empty.list'
-        with open(data_file, 'w') as f: f.writelines(' ')
-        define_py_data_sources2(train_list=data_file, test_list=None, 
-                module='dataprovider', obj='process',args={})
-
-        # 2. learning algorithm
-        settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
-
-        # 3. Network configuration
-        x = data_layer(name='x', size=1)
-        y = data_layer(name='y', size=1)
-        y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = mse_cost(input=y_predict, label=y)
-        outputs(cost)
-
-Some of the most fundamental usages of PaddlePaddle are demonstrated:
-
--  The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
-
--  The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time.
-
--  Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
-	-  **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``.
-	-  **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model.
-	-  **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters.
-
-Now that everything is ready, you can train the network with a simple command line call:
-
-    .. code-block:: bash
- 
-        paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
- 
-
-This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
-
-
-Evaluate the Model
--------------------
-
-Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
-
-In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass.
-
-    .. code-block:: python
-
-        import numpy as np
-        import os
-
-        def load(file_name):
-            with open(file_name, 'rb') as f:
-                f.read(16) # skip header for float type.
-                return np.fromfile(f, dtype=np.float32)
-                
-        print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
-        # w=1.999743, b=0.300137
-
-    .. image:: parameters.png
-        :align: center
-
-Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
-
-There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png
deleted file mode 100644
index 2ec6748095..0000000000
Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
new file mode 100644
index 0000000000..71904dc41e
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -0,0 +1,151 @@
+从源码编译
+======================
+
+.. _build_step:
+
+编译方法
+----------------
+
+PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像
+可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+
+如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ ,之后才能开始编译。
+
+编译PaddlePaddle,需要执行:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # 如果不使用Docker编译环境,执行下面的命令
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+编译完成后会在build/python/dist目录下生成输出的whl包,可以选择在当前机器安装,也可以拷贝到目标机器安装:
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+如果机器中已经安装过PaddlePaddle,有两种方法:
+
+.. code-block:: bash
+
+   # 方法一:先卸载之前的版本,再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 方法二:直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+执行单元测试
+----------------
+
+如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法:
+
+使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+
+如果不使用Docker,执行ctest命令即可:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+
+.. _compile_deps:
+
+编译依赖
+----------------
+
+PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。
+
+.. csv-table:: PaddlePaddle编译依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "推荐使用CentOS的devtools2"
+   "Python", "2.7.x", "依赖libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "可选"
+
+
+.. _build_options:
+
+编译选项
+----------------
+
+PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。
+用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考
+`官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: 编译选项说明
+    :header: "选项", "说明", "默认值"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "是否支持GPU", "ON"
+    "WITH_C_API", "是否仅编译CAPI", "OFF"
+    "WITH_DOUBLE", "是否使用双精度浮点数", "OFF"
+    "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON"
+    "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON"
+    "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON"
+    "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON"
+    "WITH_TESTING", "是否开启单元测试", "ON"
+    "WITH_DOC", "是否编译中英文文档", "OFF"
+    "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto"
+    "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON"
+    "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON"
+
+BLAS
++++++
+
+PaddlePaddle支持 `MKL <https://software.intel.com/en-us/intel-mkl>`_ 和
+`OpenBLAS <http://www.openblas.net/>`_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集,
+还会下载MKL-DNN数学库,详细参考 `这里 <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ 。
+
+如果关闭MKL,则会使用OpenBLAS作为BLAS库。
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。
+使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。
+
+PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。
+我们推荐使用最新版本的cuDNN。
+
+编译选项的设置
+++++++++++++++
+
+PaddlePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。通过使用 ``-D`` 命令可以设置,例如:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` **)后,再指定。**
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
deleted file mode 100644
index 69f4501f37..0000000000
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ /dev/null
@@ -1,222 +0,0 @@
-Installing from Sources
-==========================
-
-* [1. Download and Setup](#download)
-* [2. Requirements](#requirements)
-* [3. Build on Ubuntu](#ubuntu)
-* [4. Build on Centos](#centos)
-
-
-## <span id="download">Download and Setup</span> 
-You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
-
-```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-```
-## <span id="requirements">Requirements</span>
-
-To compile the source code, your computer must be equipped with the following dependencies.
-
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
-- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
-- **BLAS**: MKL, OpenBlas or ATLAS
-- **Python**: only support Python 2.7
-
-**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
-For CUDA 8.0, GCC versions later than 5.3 are not supported!
-
-### Options
-
-PaddlePaddle supports some build options. 
-
-<html>
-<table> 
-<thead>
-<tr>
-<th scope="col" class="left">Optional</th>
-<th scope="col" class="left">Description</th>
-</tr>
-</thead>
-<tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
-<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
-<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
-<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
-<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
-<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
-<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
-</tbody>
-</table>
-</html>
-
-**Note:**
-  - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5.
-  - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported.
-  - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
-
-As a simple example, consider the following:  
-
-1. **BLAS Dependencies(optional)**
-  
-    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
-    To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
-
-    ```bash
-    # specify MKL
-    cmake .. -DMKL_ROOT=<mkl_path>
-    # or specify OpenBLAS
-    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    ```
-
-2. **Doc Dependencies(optional)**
-
-    To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:
-
-    ```bash
-    pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme recommonmark
-
-    # install doxygen on Ubuntu
-    sudo apt-get install doxygen 
-    # install doxygen on Mac OS X
-    brew install doxygen
-
-    # active docs in cmake
-    cmake .. -DWITH_DOC=ON`
-    ```
-
-## <span id="ubuntu">Build on Ubuntu 14.04</span>
-
-### Install Dependencies
-
-- **Paddle Dependencies**
-
-    ```bash
-    # necessary
-    sudo apt-get update
-    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo pip install 'protobuf==3.1.0.post1'
-
-    # install cmake 3.4
-    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-        cd .. && rm -rf cmake-3.4.1
-    ```
-
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build option here, such as:    
-cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
-## <span id="centos">Build on Centos 7</span>
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
-  
-- **GPU Dependencies (optional)**
-
-    To build GPU version, you will need the following installed:
-
-        1. a CUDA-capable GPU
-        2. A supported version of Linux with a gcc compiler and toolchain
-        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-        
-    After downloading cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create build folder under paddle project directory.
-
-```bash
-mkdir build && cd build
-``` 
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build option here, such as:    
-cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=<path to install>/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000..27f73b2e2c
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -0,0 +1,169 @@
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as its build
+tools. We recommend using our pre-built Docker image to run the build,
+which avoids installing the build dependencies yourself. The available build environment
+Docker images can be found `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+
+If you choose not to use the Docker image for your build, you need to install the
+`Compile Dependencies`_ listed below before running the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build CPU-only binaries if you are using Docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+   # otherwise, run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the build finishes, the output whl package can be found under
+build/python/dist. You can install it on the local machine or copy it
+to the target machine.
+
+.. code-block:: bash
+
+   pip install build/python/dist/*.whl
+
+If PaddlePaddle has been installed on the machine before, there are two options:
+
+.. code-block:: bash
+
+   # option 1: uninstall and reinstall
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # option 2: upgrade directly
+   pip install build/python/dist/*.whl -U
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, follow the steps below:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will also run the tests on GPU.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+
+If you don't use Docker, simply running ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+--------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.2", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here <https://cmake.org/cmake-tutorial>`_ .
+
+.. _build_options_bool:
+
+Bool Type Options
+-----------------
+
+You can add :code:`-D` argument to pass such options, like:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool Type Options
+    :header: "Option", "Description", "Default"
+    :widths: 1, 7, 2
+
+    "WITH_GPU", "Build with GPU support", "ON"
+    "WITH_C_API", "Build only CAPI", "OFF"
+    "WITH_DOUBLE", "Build with double precision", "OFF"
+    "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+    "WITH_AVX", "Build with AVX support", "ON"
+    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+    "WITH_STYLE_CHECK", "Check code style when building", "ON"
+    "WITH_TESTING", "Build unit tests", "ON"
+    "WITH_DOC", "Build documentations", "OFF"
+    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL <https://software.intel.com/en-us/intel-mkl>`_ and
+`OpenBLAS <http://www.openblas.net/>`_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used; see `details <https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn#cmake>`_ for more information.
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can be built with any cuDNN version later than v5.1, and we intend to
+keep up with the latest cuDNN versions. Be sure to run with the same version of cuDNN
+that you built with.
+
+Pass Compile Options
+++++++++++++++++++++
+
+You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will search system paths like
+:code:`/usr/lib:/usr/local/lib` and then search the paths that you
+passed to cmake, e.g.:
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time, you need to clean the cmake cache or clean the build directory (** :code:`rm -rf` **) if you want to change it.**
diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png
deleted file mode 100644
index a58cd09ad9..0000000000
Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
deleted file mode 100644
index be0c1ffa45..0000000000
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库:`MKL <https://software.intel.com/en-us/intel-mkl>`_ ,`ATLAS <http://math-atlas.sourceforge.net/>`_ ,`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
-注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。
diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
deleted file mode 100644
index a6356baf16..0000000000
--- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv
+++ /dev/null
@@ -1,5 +0,0 @@
-编译选项,描述,注意
-MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。
-ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。
-OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。
-REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
deleted file mode 100644
index 463b825470..0000000000
--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ /dev/null
@@ -1,12 +0,0 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
-WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 87c286a1af..79d214635a 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,183 +1,145 @@
-PaddlePaddle的Docker容器使用方式
+使用Docker安装运行
 ================================
 
-PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
+使用Docker安装和运行PaddlePaddle,无需单独安装各种依赖环境,并且也可以在Windows的Docker中运行。
+您可以在 `Docker官网 <https://docs.docker.com/get-started/>`_ 获得基本的Docker安装和使用方法。
 
+如果您在使用Windows,可以参考
+`这篇 <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+教程,完成在Windows上安装和使用Docker。
 
-PaddlePaddle发布的Docker镜像使用说明
-------------------------------
-
-我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了
-PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
-像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次
-PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运
-行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
-`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ 提供最新
-的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国
-内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您
-在国内,请把文档里命令中的paddlepaddle/paddle替换成
-docker.paddlepaddle.org/paddle。
-
-1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev`
-
-   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布,
-   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。
-   开发镜像包含了以下工具:
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-   很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作,
-   也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发:
-
-   以交互容器方式运行开发镜像:
-
-   .. code-block:: bash
-
-      docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash
-
-   或者,可以以后台进程方式运行容器:
-
-   .. code-block:: bash
+在了解Docker的基本使用方法之后,即可开始下面的步骤:
 
-      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev
+.. _docker_pull:
 
-   然后用密码 :code:`root` SSH进入容器:
-
-   .. code-block:: bash
+获取PaddlePaddle的Docker镜像
+------------------------------
 
-      ssh -p 2202 root@localhost
+执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:
 
-   SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。
+  .. code-block:: bash
 
-2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像:
+     docker pull paddlepaddle/paddle
 
-   - GPU/AVX::code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX::code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX::code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX::code:`paddlepaddle/paddle:<version>-noavx`
+For users in China, we provide a mirror registry for faster access:
 
-   Both the CPU-only and the GPU images use the AVX instruction set, but older computers built before 2008 do not support AVX. The following command checks whether a Linux machine supports AVX:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     docker pull docker.paddlepaddlehub.com/paddle
 
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+Pull the GPU version (cuda8.0_cudnn5_avx_mkl) of the Docker image:
 
-   If the output is No, choose a no-AVX image.
+  .. code-block:: bash
 
-   The method above also works for the GPU image; just remember to install the latest GPU driver on the physical machine beforehand.
-   To make sure the GPU driver works properly inside the image, we recommend running it with [nvidia-docker](https://github.com/NVIDIA/nvidia-docker).
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 
-   .. code-block:: bash
+Choose an image built against a particular BLAS library:
 
-      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash
+  .. code-block:: bash
 
-   Note: if you run into problems with nvidia-docker, you can try the older method below, although we do not recommend it:
+     # the default image uses MKL
+     docker pull paddlepaddle/paddle
+     # the image built with OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
 
-   .. code-block:: bash
+To pull a specific version, pick a tag from the `DockerHub page <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_ and run:
 
-      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu
+  .. code-block:: bash
 
-3. Run and publish your AI program
+     docker pull paddlepaddle/paddle:[tag]
+     # for example:
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 
-   Suppose you have finished an AI-training Python program :code:`a.py`, developed on your workstation using the development image. You can then test-run it on the workstation with:
+.. _docker_run:
 
-   .. code-block:: bash
+Run a PaddlePaddle Training Program in Docker
+---------------------------------------------
 
-      docker run -it -v $PWD:/work paddle /work/a.py
+Suppose you have written a PaddlePaddle program :code:`train.py` in the current directory (say /home/work; see the
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
+for reference). You can then start training with the following command:
 
-   如果要使用GPU,请运行:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
+
+In the command above, :code:`-it` runs the container interactively; :code:`-v $PWD:/work`
+mounts the current directory (in Linux, $PWD expands to the absolute path of the current directory) at the :code:`/work`
+directory inside the container; :code:`paddlepaddle/paddle` specifies the image to use; and finally :code:`/work/train.py`
+is the command executed inside the container, i.e. the training program.
 
-      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
+Of course, you can also enter the Docker container and run or debug your code interactively:
 
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-   All of the dependencies of `a.py` are assumed to be available in Paddle's runtime container. To include more dependencies, or to publish an image of your application, you can write a `Dockerfile` based on `FROM paddledev/paddle:0.10.0`
-   to build and publish your own AI program image.
+**Note: to keep the image small, the PaddlePaddle Docker image does not ship with vim. You can run** :code:`apt-get install -y vim` **inside the container to install it and then edit code in the container.**
 
-Run the PaddlePaddle Book
--------------------------
+.. _docker_run_book:
 
-Jupyter Notebook is an open-source web application for creating and sharing interactive documents containing code, equations, charts, and text, viewable in a browser.
+Launch the PaddlePaddle Book with Docker
+-----------------------------------------
 
-PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
+With Docker you can quickly launch a local Jupyter Notebook that contains the official PaddlePaddle Book tutorials, viewable in a web browser.
+PaddlePaddle Book is an interactive Jupyter Notebook made for users and developers.
 If you want a deeper understanding of deep learning, PaddlePaddle Book is definitely your best choice.
+You can use it to read the tutorials, or to create and share interactive documents with code, equations, charts, and text.
 
 We provide a Docker image that runs the PaddlePaddle Book directly; simply run:
 
-.. code-block:: bash
-
-    docker run -p 8888:8888 paddlepaddle/book
+  .. code-block:: bash
 
-Then open the following address in your browser:
+     docker run -p 8888:8888 paddlepaddle/book
 
-.. code-block:: text
+Users in China can use the following mirror source for faster access:
 
-    http://localhost:8888/
+  .. code-block:: bash
 
-That's all. Enjoy your journey!
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
-Develop PaddlePaddle in Docker Containers
------------------------------------------
+Then open the following address in your browser:
 
-Developers can develop PaddlePaddle inside the Docker development image, which gives a consistent workflow across platforms: Linux, Mac OS X, and Windows.
+  .. code-block:: text
 
-1. Build the PaddlePaddle development image
+     http://localhost:8888/
 
-   PaddlePaddle releases a development image alongside each new version for developers to use directly. Here is how this development image is built.
-   There are two ways to create a Docker image: convert a running container directly into an image, or write a Dockerfile and run docker build on it. The first is quick and convenient, good for personal experiments and fast iteration. The second describes the whole build process in the Dockerfile, so others can easily understand how the image is produced and a continuous integration system can reproduce it. We use the second method; the Dockerfile lives at the root of the PaddlePaddle repo. To build the development image, just run:
+That's all. Enjoy your journey!
 
-   .. code-block:: bash
-      
-      git clone https://github.com/PaddlePaddle/Paddle.git
-      cd Paddle
-      docker build -t paddle:dev .
+.. _docker_run_gpu:
 
-   The -t flag of docker build names the resulting image; here we use paddle:dev. With that, the PaddlePaddle development image is built.
+Train on GPU with Docker
+------------------------------
 
-2. Build the PaddlePaddle production image
+To make sure the GPU driver works properly inside the image, we recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ to run the image.
+Remember to install the latest GPU driver on the physical machine beforehand.
 
-   Building the production image takes two steps. The first step is to run:
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
-   The command above compiles PaddlePaddle, produces the runnable program, and generates a Dockerfile for building the production image. All generated files are under the build directory. "WITH_GPU" controls whether the production image supports GPU, "WITH_AVX" whether it supports AVX, and "WITH_TEST" whether unit tests are generated.
+**Note: if nvidia-docker is not installed, you can try the following method to mount the CUDA libraries and Linux devices into the Docker container:**
 
-   The second step is to run:
+  .. code-block:: bash
 
-   .. code-block:: bash
-      
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
-   This command follows the generated Dockerfile to copy the built program into the production image and configure it, producing the production image paddle:prod.
+**About AVX:**
 
-3. Run the unit tests
+AVX is a CPU instruction set extension that accelerates PaddlePaddle's computation. The latest PaddlePaddle Docker images
+are built with AVX enabled by default, so if your machine does not support AVX, you need to
+`build <./build_from_source_cn.html>`_ a no-avx version of PaddlePaddle yourself.
 
-   Run the following command:
+The following command checks whether a Linux machine supports AVX:
 
    .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-
-Documentation
--------------
 
-The Paddle Docker development image ships with an HTML version of the C++ source code generated by the `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_, making it easy to browse the C++ sources.
-
-As long as you give the PaddlePaddle container a name when starting it, you can run another Nginx Docker container to serve the HTML code:
-
-.. code-block:: bash
-
-   docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-Then we can open a browser and read the code at http://localhost:8088/paddle/ .
+If the output is No, you need to use a no-avx build, as sketched below.
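+
+As a minimal sketch of that no-avx build (assuming a checkout of the PaddlePaddle source tree; WITH_AVX is the documented CMake option):
+
+  .. code-block:: bash
+
+     # configure and build a no-AVX, CPU-only PaddlePaddle from source
+     mkdir build && cd build
+     cmake .. -DWITH_AVX=OFF -DWITH_GPU=OFF
+     make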
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index b6fd3329b2..e0e0559fb8 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -1,270 +1,152 @@
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 
-Docker containers are currently the only officially supported way to
-run PaddlePaddle.  This is reasonable as Docker now runs on all
-major operating systems including Linux, Mac OS X, and Windows.
-Please be aware that you will need to change the `Docker settings
-<https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
-of your hardware resources on Mac OS X and Windows.
+Running PaddlePaddle in a Docker container frees you from worrying about
+runtime dependencies, and it also works on Windows. You can find a basic
+Docker tutorial `here <https://docs.docker.com/get-started/>`_ .
 
-Working With Docker
--------------------
+If you are using Windows, please refer to
+`this <https://docs.docker.com/toolbox/toolbox_install_windows/>`_
+tutorial to get Docker running on Windows.
 
-Docker is simple as long as we understand a few basic concepts:
+After reading the tutorials above, you may proceed with the following steps.
 
-- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle Docker image includes pre-built PaddlePaddle and Python along with many Python packages. We can run a Docker image directly, rather than installing all this software ourselves. We can type
+.. _docker_pull:
 
-  .. code-block:: bash
-
-     docker images
+Pull PaddlePaddle Docker Image
+------------------------------
 
-  to list all images in the system. We can also run
+Run the following command to download the latest Docker image; the version is cpu_avx_mkl:
 
   .. code-block:: bash
-		  
-     docker pull paddlepaddle/paddle:0.10.0
 
-  to download a Docker image, paddlepaddle/paddle in this example,
-  from Dockerhub.com.
+     docker pull paddlepaddle/paddle
 
-- *container*: considering a Docker image as a program, a container is a
-  "process" that runs the image. Indeed, a container is exactly an
-  operating system process, but with a virtualized filesystem, network
-  port space, and other virtualized environment. We can type
+For users in China, we provide a faster mirror:
 
   .. code-block:: bash
 
-     docker run paddlepaddle/paddle:0.10.0
-
-  to start a container to run a Docker image, paddlepaddle/paddle in this example.
+     docker pull docker.paddlepaddlehub.com/paddle
 
-- By default a docker container has an isolated file system namespace,
-  so we cannot see files in the host file system. By using a *volume*,
-  files mounted from the host become visible inside the docker container.
-  The following command mounts the current directory into /data inside a
-  docker container started from the debian image, and runs the container
-  with the command :code:`ls /data`.
+Download the GPU version (cuda8.0_cudnn5_avx_mkl) image:
 
   .. code-block:: bash
 
-     docker run --rm -v $(pwd):/data debian ls /data
-
-Usage of CPU-only and GPU Images
-----------------------------------
-
-We package PaddlePaddle's compile environment into a Docker image,
-called the develop image, which contains all the compiling tools that
-PaddlePaddle needs. We also package the compiled PaddlePaddle program into a
-Docker image, called the production image, which contains the complete
-runtime environment that running PaddlePaddle needs. For each version
-of PaddlePaddle, we release both of them. Production images include a
-CPU-only version, a CUDA GPU version, and their no-AVX variants.
-
-We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
-latest versions under the "tags" tab at dockerhub.com. If you are in
-China, you can use our Docker image registry mirror to speed up the
-download process. To use it, please replace all paddlepaddle/paddle in
-the commands with docker.paddlepaddle.org/paddle.
-
-1. Production images; there are multiple variants:
-
-   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
-   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
-   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
-   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
-
-   Please be aware that the CPU-only and the GPU images both use the
-   AVX instruction set, but old computers produced before 2008 do not
-   support AVX.  The following command checks if your Linux computer
-   supports AVX:
-
-   .. code-block:: bash
-
-      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
-
-   
-   To run the CPU-only image as an interactive container:
-
-   .. code-block:: bash
-
-      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash
+     docker pull paddlepaddle/paddle:latest-gpu
+     docker pull docker.paddlepaddlehub.com/paddle:latest-gpu
 
-   The method above works with the GPU image too -- the recommended way is
-   using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
+Choose between images built with different BLAS libraries:
 
-   Please install nvidia-docker first following this `tutorial
-   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
-
-   Now you can run a GPU image:
-
-   .. code-block:: bash
-
-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash
-
-2. development image :code:`paddlepaddle/paddle:<version>-dev`
-
-   This image packs the related development tools and the runtime
-   environment. Users and developers can use this image instead of
-   their own local computer for development, building,
-   releasing, documentation writing, etc. Since different versions of paddle
-   may depend on different versions of libraries and tools, if you
-   want to set up a local environment, you must pay attention to the
-   versions.  The development image contains:
-   
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
-     
-   Many developers use servers with GPUs; they can ssh into
-   the server and run :code:`docker exec` to enter the docker
-   container and start working.  They can also start a development
-   docker image with the SSHD service enabled, then log into the
-   container to work.
-
-
-Train Model Using Python API
-----------------------------
+  .. code-block:: bash
 
-Our official docker image provides a runtime for PaddlePaddle
-programs. The typical workflow will be as follows:
+     # image using MKL by default
+     docker pull paddlepaddle/paddle
+     # image using OpenBLAS
+     docker pull paddlepaddle/paddle:latest-openblas
 
-Create a directory as workspace:
 
-.. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_
+and run:
 
-   mkdir ~/workspace
+  .. code-block:: bash
 
-Edit a PaddlePaddle python program using your favourite editor
+     docker pull paddlepaddle/paddle:[tag]
+     # i.e.
+     docker pull docker.paddlepaddlehub.com/paddle:0.11.0-gpu
 
-.. code-block:: bash
+.. _docker_run:
 
-   emacs ~/workspace/example.py
+Launch your training program in Docker
+--------------------------------------
 
-Run the program using docker:
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to 
+`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_ 
+for more samples), then run the following command:
 
-.. code-block:: bash
+  .. code-block:: bash
 
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
+     cd /home/work
+     docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
 
-Or if you are using GPU for training:
+In the above command, :code:`-it` runs the container interactively;
+:code:`-v $PWD:/work` mounts the current directory ($PWD expands
+to the current absolute path in Linux) at :code:`/work` in the container;
+:code:`paddlepaddle/paddle` specifies the image to use; finally
+:code:`/work/train.py` is the command to run inside the container.
 
-.. code-block:: bash
+Alternatively, you can enter the container shell and run or debug your code
+interactively:
 
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
+  .. code-block:: bash
+
+     docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+     cd /work
+     python train.py
 
-The above commands start a docker container that runs :code:`python
-/workspace/example.py`. The container stops once :code:`python
-/workspace/example.py` finishes.
+**NOTE: We did not install vim in the default docker image to reduce the image size; you can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
 
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
+.. _docker_run_book:
 
-.. code-block:: bash
+PaddlePaddle Book
+------------------
 
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
+You can create a container serving the PaddlePaddle Book as a Jupyter Notebook
+in one minute with Docker. PaddlePaddle Book is an interactive Jupyter Notebook
+for users and developers. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 
-Running with GPU is identical:
+We provide a packaged book image; simply issue the command:
 
-.. code-block:: bash
+  .. code-block:: bash
 
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
+     docker run -p 8888:8888 paddlepaddle/book
 
+For users in China, we provide a faster mirror:
 
-Develop PaddlePaddle or Train Model Using C++ API
----------------------------------------------------
+  .. code-block:: bash
 
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
+    docker run -p 8888:8888 docker.paddlepaddlehub.com/book
 
-1. Build PaddlePaddle develop image
+Then open the following address in your local browser:
 
-   Use following command to build PaddlePaddle develop image:
+  .. code-block:: text
 
-   .. code-block:: bash
+     http://localhost:8888/
 
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-      docker build -t paddle:dev .
+That's all. Enjoy your journey!
 
-2. Build PaddlePaddle production image
+.. _docker_run_gpu:
 
-   There are two steps for building production image, the first step is to run:
+Train on GPU with Docker
+------------------------------
 
-   .. code-block:: bash
+We recommend using
+`nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_
+to run GPU training jobs. Please make sure the latest
+GPU driver is installed before moving on.
 
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+  .. code-block:: bash
 
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+     nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
 
-   The second step is to run:
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
 
-   .. code-block:: bash
+  .. code-block:: bash
 
-      docker build -t paddle:prod -f build/Dockerfile ./build
+     export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+     export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+     docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
 
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+**About AVX:**
 
-3. Run unit test
+AVX is a CPU instruction set extension that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker images are built with AVX enabled by default, so if your
+computer doesn't support AVX, you'll need to
+`build <./build_from_source_en.html>`_ PaddlePaddle with :code:`WITH_AVX=OFF`.
 
-   The following command runs the unit tests:
+The following command will tell you whether your computer supports AVX:
 
    .. code-block:: bash
-      
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-
-PaddlePaddle Book
-------------------
-
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
-dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
-
-We provide a packaged book image; simply issue the command:
 
-.. code-block:: bash
-
-    docker run -p 8888:8888 paddlepaddle/book
-
-Then open the address in your local browser:
-
-.. code-block:: text
-
-    http://localhost:8888/
-
-That's all. Enjoy your journey!
-
-
-Documentation
--------------
-
-Paddle Docker images include an HTML version of C++ source code
-generated using `woboq code browser
-<https://github.com/woboq/woboq_codebrowser>`_.  This makes it easy
-for users to browse and understand the C++ source code.
-
-As long as we give the Paddle Docker container a name, we can run an
-additional Nginx Docker container to serve the volume from the Paddle
-container:
-
-.. code-block:: bash
-
-   docker run -d --name paddle-cpu-doc paddle:<version>
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
-
-
-Then we can direct our Web browser to the HTML version of source code
-at http://localhost:8088/paddle/
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
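+
+As a hedged sketch of such a from-source build (assuming a checkout of the Paddle source tree; WITH_AVX is the documented CMake switch):
+
+   .. code-block:: bash
+
+      # configure and build a no-AVX, CPU-only PaddlePaddle
+      mkdir build && cd build
+      cmake .. -DWITH_AVX=OFF -DWITH_GPU=OFF
+      make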
diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst
index a24df6c518..c9ba84c842 100644
--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -6,24 +6,28 @@
 Installation
 ++++++++++++
 
-PaddlePaddle provides several pre-built binaries for installation, including Docker images and Ubuntu deb packages. We recommend deploying with Docker images, and contributions of more packages are welcome.
+PaddlePaddle can be installed via pip or Docker:
 
 .. toctree::
    :maxdepth: 1
-   
-   docker_install_cn.rst 
-   ubuntu_install_cn.rst
-
 
+   pip_install_cn.rst
+   docker_install_cn.rst
+   ../../howto/dev/build_cn.md
 
 Build from Source
 +++++++++++++++++
 
 ..  warning::
 
-    Building from source is mainly for advanced users; regular users should follow the installation steps instead.
+    We recommend the installation steps above for a quick setup. Building from source is only needed when you require a custom binary.
 
 ..  toctree::
     :maxdepth: 1
 
-    cmake/build_from_source_cn.rst
+    build_from_source_cn.rst
+
+FAQ
+++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_cn.html>`_
diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst
index 1bfd4f75c0..32d66d63dd 100644
--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -1,23 +1,34 @@
 Install and Build
 =================
 
-Install PaddlePaddle
-----------------------
+.. _install_steps:
 
-..  toctree::
-    :maxdepth: 1
+Install Steps
++++++++++++++
+
+You can choose either pip or Docker to complete your install:
+
+.. toctree::
+   :maxdepth: 1
+
+   pip_install_en.rst
+   docker_install_en.rst
+   ../../howto/dev/build_en.md
 
-    docker_install_en.rst
-    ubuntu_install_en.rst
 
 Build from Source
 -----------------
 
 ..  warning::
 
-    Please use :code:`deb` package or :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code.
+    We recommend installing via the steps above; you only need to build PaddlePaddle from source when you need a modified binary.
 
 ..  toctree::
     :maxdepth: 1
 
     build_from_source_en.md
+
+FAQ
+++++++++++
+
+`FAQ <http://www.paddlepaddle.org/docs/develop/documentation/zh/faq/build_and_install/index_en.html>`_
diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png
new file mode 100644
index 0000000000..16087ce059
Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ
diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst
new file mode 100644
index 0000000000..8e4165da6b
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
@@ -0,0 +1,87 @@
+Install Using pip
+================================
+
+PaddlePaddle can be installed with the widely used Python package manager
+`pip <https://pip.pypa.io/en/stable/installing/>`_,
+and runs on most mainstream Linux distributions as well as MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+
+Run the following command to install the PaddlePaddle runtime on the current machine; dependencies are downloaded and installed automatically. The version is cpu_avx_openblas.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+To install the GPU-enabled version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+To get and install the latest (develop branch) PaddlePaddle, you can download the newest whl package and C-API development package from our CI system;
+you can find the version you need in the table below:
+
+If clicking one of the links below brings up the login form shown here, click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: Latest whl packages for each version
+    :header: "Version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
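+Once you have downloaded a whl file from the table, installing it is a plain pip invocation; for example (the file name below is the cpu_avx_mkl entry from the table):
+
+  .. code-block:: bash
+
+     # install a locally downloaded wheel
+     pip install paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl
+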
+.. _pip_dependency:
+
+Runtime Dependencies
+------------------------------
+
+Because the PaddlePaddle package contains not only .py files but also components written in C++, we make sure the released binary packages run on mainstream Linux distributions such as CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12 or later.
+
+Released packages try to comply with the `manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_ standard, which normally uses CentOS 5 as the build environment. However, the CUDA libraries usually require at least CentOS 6, and CentOS 5 is approaching end of life, so we use CentOS 6 as the standard build environment by default.
+
+.. csv-table:: PaddlePaddle runtime dependencies
+   :header: "Dependency", "Version", "Notes"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later; Ubuntu 14.04 or later; MacOS 10.12 or later"
+   "Python", "2.7.x", "Python 3 is not supported yet"
+   "libc.so", "GLIBC_2.7", "glibc must provide at least the GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "must provide at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "must provide at least the GCC_3.3 symbols"
+
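+A quick, non-authoritative way to check these symbol requirements on your own machine (the libstdc++ path below is typical for 64-bit Debian/Ubuntu and may differ on other distributions):
+
+  .. code-block:: bash
+
+     # show the glibc version in use
+     ldd --version | head -n 1
+     # look for the required GLIBCXX/CXXABI symbol versions in libstdc++
+     strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep -E 'GLIBCXX_3\.4\.11|CXXABI_1\.3\.3'
+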
+.. _pip_faq:
+
+Common Problems and Solutions
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause is that no paddlepaddle package matching the current system was found. Check that your Python version is in the 2.7 series. Also, packages on the official pip index now follow the manylinux1 standard by default, which requires the latest pip (>9.0.0) to install. You can upgrade pip with:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem persists, run:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  to list the package formats supported by the current system, and check whether they match the package you want to install. The pypi packages can be found at `this <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ link.
+
+  If the system supports linux_x86_64 but the package is manylinux1_x86_64, upgrade pip to the latest version; if the system supports manylinux1_x86_64 but the (local) package is linux_x86_64, rename the whl file to manylinux1_x86_64 and install it again, as sketched below.
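+
+  A minimal sketch of that rename-and-install workaround (file names follow the cp27mu wheel from the table above):
+
+    .. code-block:: bash
+
+       # change the platform tag in the file name, then install the renamed wheel
+       mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
+          paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
+       pip install paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl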
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000..c1e806c0fe
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -0,0 +1,105 @@
+Install Using pip
+================================
+
+You can use the widely used Python package management
+tool `pip <https://pip.pypa.io/en/stable/installing/>`_
+to install PaddlePaddle. This method works on
+most mainstream Linux distributions and on MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine; it will also download the required dependencies. The version is cpu_avx_openblas.
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+
+If you wish to install the GPU version (cuda7.5_cudnn5_avx_openblas), just run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+If you wish to install the latest develop-branch PaddlePaddle,
+you can download the latest whl package from our CI system. Access
+the links below, log in as guest, then click the "Artifact"
+tab; you'll find the download links of the whl packages there.
+
+If a link below brings up the login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+..  csv-table:: whl package of each version
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+    :widths: 1, 3, 3, 3
+
+    "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
+    "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+    "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
+
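+Once downloaded, such a whl file installs like any local wheel; for example (using the cpu_avx_mkl file name from the table):
+
+  .. code-block:: bash
+
+     # install the downloaded wheel with pip
+     pip install paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl
+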
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) contain not only .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainstream Linux distributions, like CentOS 6, Ubuntu 14.04,
+and MacOS 10.12.
+
+PaddlePaddle whl packages try to satisfy the
+`manylinux1 <https://www.python.org/dev/peps/pep-0513/#the-manylinux1-policy>`_
+standard, which uses CentOS 5 as the default build environment. But the CUDA
+libraries require at least CentOS 6, and CentOS 5 is about to reach end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later; Ubuntu 14.04 or later; MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python 3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc must provide at least the GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "must provide at least the GLIBCXX_3.4.11 and CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "must provide at least the GCC_3.3 symbols"
+
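+A quick sketch for checking these symbols locally (the libstdc++ path is typical for 64-bit Debian/Ubuntu and may vary across distributions):
+
+  .. code-block:: bash
+
+     # show the glibc version
+     ldd --version | head -n 1
+     # check libstdc++ for the required symbol versions
+     strings /usr/lib/x86_64-linux-gnu/libstdc++.so.6 | grep -E 'GLIBCXX_3\.4\.11|CXXABI_1\.3\.3'
+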
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+  
+  The main cause of this issue is that no package matching your current
+  platform was found. Please check that you are using a Python 2.7 series interpreter.
+  Besides, pypi now defaults to the manylinux1 standard, so you'll need to
+  upgrade your pip to >9.0.0 with the command below:
+
+    .. code-block:: bash
+
+       pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+      .. code-block:: bash
+
+         python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  This prints the package tags supported by the current system; check whether
+  they match the file name of the whl package. The default whl packages can be found
+  `here <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_.
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; if your system supports
+  manylinux1_x86_64 but the whl package is linux_x86_64, you can rename the
+  file to the manylinux1_x86_64 suffix and then install it, as sketched below.
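+
+  A minimal sketch of the rename-and-install workaround (file names follow the cp27mu wheel listed above):
+
+    .. code-block:: bash
+
+       # change the platform tag in the file name, then install the renamed wheel
+       mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
+          paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
+       pip install paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl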
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
deleted file mode 100644
index 9e39ccb00f..0000000000
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-Deploy PaddlePaddle on Ubuntu
-===================================
-
-PaddlePaddle provides deb packages for Ubuntu 14.04.
-
-Installation
-------------
-
-The packages can be downloaded from\: https://github.com/PaddlePaddle/Paddle/releases
-
-It comes in four variants\:
-
-* cpu: supports mainstream x86 processors, built with the AVX instruction set.
-
-* cpu-noavx: supports mainstream x86 processors, built without the AVX instruction set.
-
-* gpu: supports mainstream x86 processors and the NVIDIA CUDA platform, built with AVX.
-
-* gpu-noavx: supports mainstream x86 processors and the NVIDIA CUDA platform, built without AVX.
-
-After downloading the package, run:
-
-..  code-block:: shell
-
-    sudo apt-get install gdebi
-    gdebi paddle-*-cpu.deb
-
-Or:
-
-..  code-block:: shell
-
-    dpkg -i paddle-*-cpu.deb
-    apt-get install -f
-
-
-It is normal for :code:`dpkg -i` to report missing dependencies;
-the following :code:`apt-get install -f` will finish installing PaddlePaddle.
-
-After installation, you can check the installed paddle version with :code:`paddle version`:
-
-..  code-block:: shell
-
-    PaddlePaddle 0.8.0b1, compiled with
-        with_avx: ON
-        with_gpu: OFF
-        with_double: OFF
-        with_python: ON
-        with_rdma: OFF
-        with_timer: OFF
-        with_predict_sdk:
-
-
-Possible Problems
------------------
-
-libcudart.so/libcudnn.so not found
-++++++++++++++++++++++++++++++++++
-
-After installation, running :code:`paddle train` reports\:
-
-..  code-block:: shell
-
-      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
-
-The cause is that the CUDA runtime environment variables are not set. If you use the GPU version of PaddlePaddle, install CUDA 7.5 and cuDNN 5 locally, and set:
-
-..  code-block:: shell
-
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/lib:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-
diff --git a/doc/getstarted/build_and_install/ubuntu_install_en.rst b/doc/getstarted/build_and_install/ubuntu_install_en.rst
deleted file mode 100644
index ea8042085b..0000000000
--- a/doc/getstarted/build_and_install/ubuntu_install_en.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-Debian Package installation guide
-=================================
-
-PaddlePaddle supports :code:`deb` packages. The installation of this :code:`deb` package is tested on Ubuntu 14.04, but it should support other Debian-based Linux distributions, too.
-
-There are four versions of the Debian package: :code:`cpu`, :code:`gpu`, :code:`cpu-noavx`, and :code:`gpu-noavx`. The :code:`noavx` versions support CPUs without :code:`AVX` instructions. The :code:`deb` packages can be downloaded from\: https://github.com/baidu/Paddle/releases/
-
-
-After downloading the PaddlePaddle deb packages, you can install them with :code:`gdebi`.
-
-..	code-block:: bash
-
-	gdebi paddle-*.deb
-
-If :code:`gdebi` is not installed, you can use :code:`sudo apt-get install gdebi` to install it.
-
-Or you can use the following commands to install PaddlePaddle.
-
-..	code-block:: bash
-
-	dpkg -i paddle-*.deb
-	apt-get install -f
-
-If you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set the related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report errors; `apt-get install -f` will continue installing paddle along with its dependencies.
-
diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py
new file mode 100644
index 0000000000..a1b60388c4
--- /dev/null
+++ b/doc/getstarted/concepts/src/infer.py
@@ -0,0 +1,32 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import numpy as np
+
+paddle.init(use_gpu=False)
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+# load the model parameters generated by training
+with open('params_pass_90.tar', 'r') as f:
+    parameters = paddle.parameters.Parameters.from_tar(f)
+
+# Feed multiple input samples; the inference results are returned as an array.
+i = [[[1, 2]], [[3, 4]], [[5, 6]]]
+print paddle.infer(output_layer=y_predict, parameters=parameters, input=i)
+# Will print:
+# [[ -3.24491572]
+#  [ -6.94668722]
+#  [-10.64845848]]
diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py
index 679d0a931a..0e5bdb57bc 100644
--- a/doc/getstarted/concepts/src/train.py
+++ b/doc/getstarted/concepts/src/train.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle.v2 as paddle
 import numpy as np
 
@@ -8,7 +22,7 @@ paddle.init(use_gpu=False)
 x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
 y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
 y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
-cost = paddle.layer.mse_cost(input=y_predict, label=y)
+cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 
 # create parameters
 parameters = paddle.parameters.create(cost)
@@ -26,12 +40,17 @@ def event_handler(event):
         if event.batch_id % 1 == 0:
             print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
                                                   event.cost)
+    # save the model parameters every 10 passes
+    if isinstance(event, paddle.event.EndPass):
+        if event.pass_id % 10 == 0:
+            with open('params_pass_%d.tar' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
 
 
 # define training dataset reader
 def train_reader():
     train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
-    train_y = np.array([-2, -3, -7, -7])
+    train_y = np.array([[-2], [-3], [-7], [-7]])
 
     def reader():
         for i in xrange(train_y.shape[0]):
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
index e63ca11102..e695ff283e 100644
--- a/doc/getstarted/concepts/use_concepts_cn.rst
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -81,9 +81,9 @@ PaddlePaddle supports different types of input data, mainly four types, and
 ..	code-block:: bash
 
     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
-    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+    cost = paddle.layer.square_error_cost(input=y_predict, label=y)
 
-Here, x and y are the input layers described earlier; y_predict takes x as input and attaches a fully connected layer; cost takes y_predict and y as input and attaches a mean-squared-error layer.
+Here, x and y are the input layers described earlier; y_predict takes x as input and attaches a fully connected layer; cost takes y_predict and y as input and attaches a square-error layer.
 
 The final layer, cost, records the complete topology of the neural network; by combining different layers we can finish building the network.
 
@@ -111,7 +111,7 @@ PaddlePaddle supports different types of input data, mainly four types, and
     # define training dataset reader
     def train_reader():
         train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
-        train_y = np.array([-2, -3, -7, -7])
+        train_y = np.array([[-2], [-3], [-7], [-7]])
         def reader():
             for i in xrange(train_y.shape[0]):
                 yield train_x[i], train_y[i]
@@ -147,4 +147,9 @@ PaddlePaddle supports different types of input data, mainly four types, and
 ..  literalinclude:: src/train.py
     :linenos:
 
-For a practical application of linear regression, see `chapter one <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
\ No newline at end of file
+To run inference with the model trained above, take one of the saved models, params_pass_90.tar, feed in the vectors to be predicted, and print the output:
+
+..  literalinclude:: src/infer.py
+    :linenos:
+
+For a practical application of linear regression, see `chapter one <http://book.paddlepaddle.org/index.html>`_ of the PaddlePaddle book.
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index aa418c657a..9f6ee25987 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,10 +1,61 @@
 Getting Started
 ===============
 
+.. _quick_install:
+
+Quick Install
++++++++++++++
+
+PaddlePaddle supports quick installation with pip on CentOS 6 or later, Ubuntu 14.04, and MacOS 10.12, with Python 2.7 installed.
+Run the following command to finish the quick install; the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+To install the GPU-enabled version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more detailed installation and build instructions, see:
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.cn.html>`_
+.. _quick_start:
+
+Quick Start
++++++++++++
+
+Create a file called housing.py and paste in this Python code:
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions for the test housing data.
+
+..  toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index be3253e3d4..063d9d880c 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,9 +1,61 @@
 GET STARTED
 ============
 
+.. _quick_install:
+
+Quick Install
+----------------------
+
+You can use pip to install PaddlePaddle with a single command. This works on
+CentOS 6 or later, Ubuntu 14.04 or later, and MacOS 10.12, with Python 2.7 installed.
+Simply run the following command to install; the version is cpu_avx_openblas:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build:
+
 ..  toctree::
   :maxdepth: 1
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 <http://book.paddlepaddle.org/index.html>`_
+
+.. _quick_start:
+
+Quick Start
++++++++++++
+
+Create a new file called housing.py, and paste this Python
+code:
+
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
deleted file mode 100644
index 90dc84718c..0000000000
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Build the PaddlePaddle Library for Android
-
-Users can cross-compile a PaddlePaddle library suitable for Android on a development platform they are familiar with (Linux, Mac OS X, or Windows).
-This document uses the Linux x86-64 platform as an example to describe the method and steps for cross-compiling the PaddlePaddle library for Android.
-
-## Prepare the Cross-Compilation Environment
-
-To cross-compile PaddlePaddle from source, prepare the cross-compilation environment in advance. The C/C++ cross toolchain for Android is the [Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn); you can download a prebuilt version yourself, or fetch it with:
-
-```bash
-wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
-unzip -q android-ndk-r14b-linux-x86_64.zip
-```
-
-The Android NDK contains the compilers and system libraries needed for every Android API level and every architecture (arm/arm64/x86/mips). You can build a [standalone toolchain](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn) for your target architecture and the minimum Android API level you need to support.
-For example:
-
-```bash
-your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain
-```
-
-This command generates a toolchain under your/path/to/my_standalone_toolchain, targeting the 32-bit ARM architecture with a minimum Android API level of 21, using arm-linux-androideabi-gcc (GCC) 4.9 as the compiler.
-
-Note: **PaddlePaddle requires the toolchain to support Android API level 21 or higher**.
-
-## Configure Cross-Compilation Parameters
-
-CMake supports cross-compilation via [cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). To simplify the cmake configuration, PaddlePaddle provides a toolchain file, [cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), with default compiler and build-flag settings. Note that starting with CMake 3.7, CMake officially supports Android cross-compilation; if PaddlePaddle detects CMake 3.7 or newer, it passes the user-supplied configuration straight through to CMake itself. See [cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling) for detailed parameter documentation.
-
-When cross-compiling the PaddlePaddle library for Android, the following parameters must be configured:
-- `CMAKE_SYSTEM_NAME`, the CMake target platform, must be set to `Android`. Only after `CMAKE_SYSTEM_NAME=Android` is set does PaddlePaddle's CMake system treat the build as a cross-compilation for Android; it then automatically builds the host protoc executable, the target protobuf library, and the target OpenBLAS library from the `arm_soft_fp_abi` branch required by Android. It also forces the values of several PaddlePaddle options (`WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`).
-- `WITH_C_API` must be set to `ON`, since only C-API inference is supported on Android.
-- `WITH_SWIG_PY` must be set to `OFF`, since training or inference through SWIG is not supported on Android.
-
-Optional parameters for the Android platform:
-
-- `ANDROID_STANDALONE_TOOLCHAIN`, the absolute path of the standalone toolchain, or its path relative to the build directory. PaddlePaddle's CMake system automatically derives the cross compiler, sysroot, and Android API level from this value; otherwise these must be set manually at cmake time. No default.
-- `ANDROID_ABI`, the target ABI. Currently only `armeabi-v7a` is supported; the default is `armeabi-v7a`.
-- `ANDROID_NATIVE_API_LEVEL`, the Android API level of the toolchain. If not set explicitly, PaddlePaddle derives it from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
-- `ANROID_ARM_MODE`, whether to use ARM mode. May be `ON/OFF`; the default is `ON`.
-- `ANDROID_ARM_NEON`, whether to use NEON instructions. Currently it must be `ON`; the default is `ON`.
-
-Other parameters:
-
-- `HOST_C/CXX_COMPILER`, the host C/C++ compiler, used when building the host protoc executable and the target OpenBLAS library. Defaults to the value of the `CC` environment variable, or to the `cc` compiler if `CC` is unset.
-
-A typical cmake configuration looks like this:
-
-```bash
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-You can set other build options to suit your needs. To minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution, set it to `Release`. You can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE` manually.
-
-## Build and Install
-
-Once the CMake configuration is done, run the following commands; PaddlePaddle will automatically download and build all third-party dependencies, then build and install the PaddlePaddle inference library.
-
-```bash
-make
-make install
-```
-
-Note: if you have previously built PaddlePaddle for another platform in this source directory, first delete the `third_party` and `build` directories with `rm -rf` to make sure all third-party dependencies and PaddlePaddle code are rebuilt against the new CMake configuration.
-
-After the install command finishes, `your/path/to/install` will contain `include` and `lib` directories, where `include` holds the C-API headers and `lib` holds the Android build of the library. PaddlePaddle is now installed; you can use the generated files under `your/path/to/install` in deep-learning Android apps, invoked as described in the C-API documentation.
diff --git a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md b/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
deleted file mode 100644
index 085b5dda16..0000000000
--- a/doc/howto/cross_compiling/cross_compiling_for_raspberry_cn.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Build the PaddlePaddle Library for Raspberry Pi
-
-On a Raspberry Pi system, you can log in via ssh and build a PaddlePaddle library for Raspberry Pi directly, following the [build from source](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html) documentation.
-
-You can also cross-compile on a development platform you are familiar with. This document uses the Linux x86-64 platform as an example to describe the method and steps for cross-compiling PaddlePaddle for Raspberry Pi.
-
-## Prepare the Cross-Compilation Environment
-
-To cross-compile PaddlePaddle from source, prepare the cross-compilation environment in advance. You can download the C/C++ cross toolchain for the Raspberry Pi platform from [github](https://github.com/raspberrypi/tools), or fetch it with:
-
-```bash
-git clone https://github.com/raspberrypi/tools.git
-```
-
-The github repository contains several prebuilt toolchains for different platforms. On a Linux x86-64 host, use the one under `arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`; the compiler is arm-linux-gnueabihf-gcc 4.8.3.
-
-Note that this toolchain requires system glibc 2.14 or newer.
-
-## Configure Cross-Compilation Parameters
-
-CMake supports cross-compilation via [cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). To simplify the cmake configuration, PaddlePaddle provides a toolchain file, [cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake), with default compiler and build-flag settings.
-
-When cross-compiling the Raspberry Pi version of the PaddlePaddle library, the following parameter must be configured:
-
-- `CMAKE_SYSTEM_NAME`, the CMake target platform, must be set to `RPi`. Only after `CMAKE_SYSTEM_NAME=RPi` is set does PaddlePaddle's CMake system treat the build as a cross-compilation for Raspberry Pi; it then automatically builds the host protoc executable, the target protobuf library, and the target OpenBLAS library.
-
-Optional parameters for the Raspberry Pi platform:
-
-- `RPI_TOOLCHAIN`, the absolute path of the toolchain, or its path relative to the build directory. PaddlePaddle's CMake system automatically derives the cross compiler from this value; otherwise it must be set manually at cmake time. No default.
-- `RPI_ARM_NEON`, whether to use NEON instructions. Currently it must be `ON`; the default is `ON`.
-
-Other parameters:
-
-- `HOST_C/CXX_COMPILER`, the host C/C++ compiler, used when building the host protoc executable and the target OpenBLAS library. Defaults to the value of the `CC` environment variable, or to the `cc` compiler if `CC` is unset.
-
-The cmake invocation is as follows:
-
-```
-cmake -DCMAKE_SYSTEM_NAME=RPi \
-      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
-      -DRPI_ARM_NEON=ON \
-      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
-      -DWITH_GPU=OFF \
-      -DWITH_C_API=ON \
-      -DWITH_PYTHON=OFF \
-      -DWITH_SWIG_PY=OFF \
-      ..
-```
-
-You can set other build options to suit your needs. To minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution, set it to `Release`. You can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE` manually.
-
-## Build and Install
-
-Once the CMake configuration is done, run the following commands; PaddlePaddle will automatically download and build all third-party dependencies, then build and install PaddlePaddle.
-
-```bash
-make
-make install
-```
-
-Note: if you have previously built PaddlePaddle for another platform in this source directory, first delete the `third_party` and `build` directories with `rm -rf` to make sure all third-party dependencies and PaddlePaddle code are rebuilt against the new CMake configuration.
-
-After the install command finishes, because `WITH_C_API` was set to `ON` in the cmake step above, `your/path/to/install` will contain `include` and `lib` directories, where `include` holds the C-API headers and `lib` holds the Raspberry Pi build of the library.
-
-See the [build from source](http://www.paddlepaddle.org/doc_cn/getstarted/build_and_install/cmake/build_from_source_cn.html) documentation for more build options.
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca851..9ecab5594c 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN Models
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
index 13a153b05c..7adc79873d 100644
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -1,2 +1,7 @@
 RNN Models
 ==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
index ac2bd0775f..63fa161faf 100644
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -5,36 +5,13 @@ RNN Configuration
 configure recurrent neural networks (RNNs) in PaddlePaddle. PaddlePaddle
 provides strong support for flexible and efficient RNN configurations. In this tutorial, you will learn how to:
 
--  Prepare sequence data for learning recurrent neural networks.
 -  Configure the recurrent neural network architecture.
 -  Generate sequences with a trained recurrent neural network model.
 
 We will use a vanilla recurrent neural network and a sequence to sequence
 model to guide you through these steps. The code of the sequence to sequence
-model can be found in \ ``demo / seqToseq``\ .
-
-Prepare Sequence Data
---------------------
-
-PaddlePaddle
-requires no preprocessing of sequence data, such as padding. The only thing to do is to set the corresponding types for the inputs. For example, the following snippet defines three inputs.
-They are all sequences, and their sizes are \ ``src_dict``\ , \ ``trg_dict``\ , and \ ``trg_dict``\ :
-
-.. code:: python
-
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-
-In the \ ``process``\  function, each \ ``yield``\  returns three integer lists, and each list is treated as a sequence of integers:
-
-.. code:: python
-
-    yield src_ids, trg_ids, trg_ids_next
-
-有关如何编写数据提供程序的更多细节描述,请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
-``demo/seqToseq/dataprovider.py``\ 。
+模型的代码可以在 `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ 找到。
+wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ 。
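+
+下面是一个最小示意(假设使用 ``paddle.v2.dataset.wmt14`` 中的 ``train`` 接口,具体签名请以该文件为准),展示如何读取一条训练样本:
+
+.. code:: python
+
+    import paddle.v2.dataset.wmt14 as wmt14
+
+    # 每条样本为(源语言词ID序列, 目标语言词ID序列, 下一时刻目标词ID序列)
+    reader = wmt14.train(dict_size=30000)
+    src_ids, trg_ids, trg_ids_next = next(reader())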
 
 配置循环神经网络架构
 --------------------
@@ -44,7 +21,7 @@ PaddlePaddle
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
       :align: center
 
 一般来说,循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
@@ -85,19 +62,19 @@ vanilla
                    act=None,
                    rnn_layer_attr=None):
         def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                          full_matrix_projection(out_mem)],
-                                 name = name,
-                                 bias_attr = rnn_bias_attr,
-                                 act = act,
-                                 layer_attr = rnn_layer_attr,
-                                 size = size)
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
            return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
-                               step=__rnn_step__,
-                               reverse=reverse,
-                               input=input)
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
 
 PaddlePaddle
 使用“Memory”(记忆模块)实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
@@ -119,7 +96,7 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
       :align: center
 
 在这个模型中,源序列 :math:`S = \{s_1, \dots, s_T\}` 
@@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention
 .. code:: python
 
     # 定义源语句的数据层
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
     # 计算每个词的词向量
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
         input=src_word_id,
         size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
     # 应用前向循环神经网络
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
     # 应用反向递归神经网络(reverse=True表示反向循环神经网络)
-    src_backward = grumemory(input=src_embedding,
-                              size=encoder_size,
-                              reverse=True)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
     # 将循环神经网络的前向和反向部分混合在一起
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
 
     # 投射编码向量到 decoder_size
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                               size = decoder_size)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
 
     # 计算反向RNN的第一个实例
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
 
     # 投射反向RNN的第一个实例到 decoder size
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义:
 
 .. code:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
-    trg_embedding = embedding_layer(
-        input=data_layer(name='target_language_word',
-                         size=target_dict_dim),
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
     group_inputs.append(trg_embedding)
 
     # 对于配备有注意力机制的解码器,在训练中,
@@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention
     # StaticInput 意味着不同时间步的输入都是相同的值,
     # 否则它以一个序列输入,不同时间步的输入是不同的。
     # 所有输入序列应该有相同的长度。
-    decoder = recurrent_group(name=decoder_group_name,
-                              step=gru_decoder_with_attention,
-                              input=group_inputs)
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
 
 单步函数的实现如下所示。首先,它定义解码网络的\ **Memory**\ 。然后定义
 attention,门控循环单元单步函数和输出函数:
@@ -198,27 +185,32 @@ attention,门控循环单元单步函数和输出函数:
         # 定义解码器的Memory
         # Memory的输出定义在 gru_step 内
         # 注意 gru_step 应该与它的Memory名字相同
-        decoder_mem = memory(name='gru_decoder',
-                             size=decoder_size,
-                             boot_layer=decoder_boot)
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
         # 计算 attention 加权编码向量
-        context = simple_attention(encoded_sequence=enc_vec,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_mem)
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
         # 混合当前词向量和attention加权编码向量
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                               full_matrix_projection(current_word)],
-                                     size = decoder_size * 3)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
         # 定义门控循环单元循环神经网络单步函数
-        gru_step = gru_step_layer(name='gru_decoder',
-                                  input=decoder_inputs,
-                                  output_mem=decoder_mem,
-                                  size=decoder_size)
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
         # 定义输出函数
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                          size=target_dict_dim,
-                          bias_attr=True,
-                          act=SoftmaxActivation())
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
 
 生成序列
@@ -238,41 +230,32 @@ attention,门控循环单元单步函数和输出函数:
    -  ``beam_size``: beam search 算法中的beam大小。
    -  ``max_length``: 生成序列的最大长度。
 
--  使用 ``seqtext_printer_evaluator``
-   根据索引矩阵和字典打印文本。这个函数需要设置:
-
-   -  ``id_input``: 数据的整数ID,用于标识生成的文件中的相应输出。
-   -  ``dict_file``: 用于将词ID转换为词的字典文件。
-   -  ``result_file``: 生成结果文件的路径。
-
 代码如下:
 
 .. code:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
     # 在生成时,解码器基于编码源序列和最后生成的目标词预测下一目标词。
     # 编码源序列(编码器输出)必须由只读Memory的 StaticInput 指定。
     # 这里, GeneratedInputs 自动获取上一个生成的词,并在最开始初始化为起始词,如 <s>。
-    trg_embedding = GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
-                           step=gru_decoder_with_attention,
-                           input=group_inputs,
-                           bos_id=0, # Beginnning token.
-                           eos_id=1, # End of sentence token.
-                           beam_size=beam_size,
-                           max_length=max_length)
-
-    seqtext_printer_evaluator(input=beam_gen,
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
-
-注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
-
-完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
+
+    return beam_gen
+
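+网络定义完成后,可以用 v2 的推断接口驱动生成。下面是一个示意片段(假设已有训练得到的 ``parameters``、源语言序列列表 ``test_batch``,并已 ``import paddle.v2 as paddle``;``field`` 参数指定返回的内容):
+
+.. code:: python
+
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=test_batch,
+        field=['prob', 'id'])
+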
+注意,这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务,请参阅 `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ 了解更多详细信息。
+
+完整的配置文件在 `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ 。
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
index 73f5d5371f..f92edd108f 100644
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -3,34 +3,11 @@ RNN Configuration
 
 This tutorial will guide you through configuring recurrent neural networks in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
 
-- prepare sequence data for learning recurrent neural networks.
 - configure recurrent neural network architecture.
 - generate sequence with learned recurrent neural network models.
 
-We will use vanilla recurrent neural network, and sequence to sequence model to guide you through these steps. The code of sequence to sequence model can be found at :code:`demo/seqToseq`.
-
-=====================
-Prepare Sequence Data
-=====================
-
-PaddlePaddle does not need any preprocessing to sequence data, such as padding. The only thing that needs to be done is to set the type of the corresponding type to input. For example, the following code snippets defines three input. All of them are sequences, and the size of them are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`:
-
-.. code-block:: python
-
-    settings.input_types = [
-      integer_value_sequence(len(settings.src_dict)),
-      integer_value_sequence(len(settings.trg_dict)),
-      integer_value_sequence(len(settings.trg_dict))]
-
-
-Then at the :code:`process` function, each :code:`yield` function will return three integer lists. Each integer list is treated as a sequence of integers:
-
-.. code-block:: python
-
-    yield src_ids, trg_ids, trg_ids_next
-
-
-For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
+We will use a vanilla recurrent neural network and a sequence to sequence model to guide you through these steps. The code of the sequence to sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
+The data preparation for this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ .
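+
+As a minimal sketch (assuming the ``train`` reader interface in ``paddle.v2.dataset.wmt14``; please check that file for the exact signature), one training sample can be read as follows:
+
+.. code-block:: python
+
+    import paddle.v2.dataset.wmt14 as wmt14
+
+    # Each sample is (source word ids, target word ids, next target word ids).
+    reader = wmt14.train(dict_size=30000)
+    src_ids, trg_ids, trg_ids_next = next(reader())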
 
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -42,7 +19,7 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural networks process a sequence one time step at a time. An example of the LSTM architecture is shown below.
 
-.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
      :align: center
 
 Generally speaking, a recurrent network performs the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
@@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output.
                    act=None,
                    rnn_layer_attr=None):
         def __rnn_step__(ipt):
-           out_mem = memory(name=name, size=size)
-           rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                          full_matrix_projection(out_mem)],
-                                 name = name,
-                                 bias_attr = rnn_bias_attr,
-                                 act = act,
-                                 layer_attr = rnn_layer_attr,
-                                 size = size)
+           out_mem = paddle.layer.memory(name=name, size=size)
+           rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
+                                                 paddle.layer.full_matrix_projection(input=out_mem)],
+                                        name = name,
+                                        bias_attr = rnn_bias_attr,
+                                        act = act,
+                                        layer_attr = rnn_layer_attr,
+                                        size = size)
            return rnn_out
-        return recurrent_group(name='%s_recurrent_group' % name,
-                               step=__rnn_step__,
-                               reverse=reverse,
-                               input=input)
+        return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
+                                            step=__rnn_step__,
+                                            reverse=reverse,
+                                            input=input)
 
 
 PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is used as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is used as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is used as the **output** of the :code:`out_mem` memory.
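+
+For example, a memory whose initial state comes from a boot layer can be declared as follows (a sketch; ``decoder_boot`` stands for any layer whose output matches the memory size, as in the attention decoder later in this document):
+
+.. code-block:: python
+
+    decoder_mem = paddle.layer.memory(
+        name='gru_decoder',        # must equal the name of the layer that produces the next state
+        size=decoder_size,
+        boot_layer=decoder_boot)   # the boot layer's output is the state at the first time step
+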
@@ -101,7 +78,7 @@ Sequence to Sequence Model with Attention
 -----------------------------------------
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
       :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural network. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` are called the *encoder vector*. The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
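+
+Writing :math:`c_t` for this weighted sum (the attention context at step :math:`t`), the relation described above is:
+
+.. math::
+
+   c_t = \sum_{i=1}^{T} W_i^t H_i
+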
@@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
 .. code-block:: python
 
     # Define the data layer of the source sentence.
-    src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
     # Calculate the word embedding of each word.
-    src_embedding = embedding_layer(
+    src_embedding = paddle.layer.embedding(
         input=src_word_id,
         size=word_vector_dim,
-        param_attr=ParamAttr(name='_source_language_embedding'))
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
     # Apply forward recurrent neural network.
-    src_forward = grumemory(input=src_embedding, size=encoder_size)
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
     # Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
-    src_backward = grumemory(input=src_embedding,
-                              size=encoder_size,
-                              reverse=True)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
     # Mix the forward and backward parts of the recurrent neural network together.
-    encoded_vector = concat_layer(input=[src_forward, src_backward])
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
 
     # Project encoding vector to decoder_size.
-    encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                               size = decoder_size)
+    encoded_proj = paddle.layer.mixed(
+        size=decoder_size,
+        input=paddle.layer.full_matrix_projection(encoded_vector))
 
     # Compute the first instance of the backward RNN.
-    backward_first = first_seq(input=src_backward)
+    backward_first = paddle.layer.first_seq(input=src_backward)
 
     # Project the first instance of backward RNN to decoder size.
-    decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
+    decoder_boot = paddle.layer.mixed(
+       size=decoder_size,
+       act=paddle.activation.Tanh(),
+       input=paddle.layer.full_matrix_projection(backward_first))
 
 
 The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
 
 .. code-block:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
-    trg_embedding = embedding_layer(
-        input=data_layer(name='target_language_word',
-                         size=target_dict_dim),
-        size=word_vector_dim,
-        param_attr=ParamAttr(name='_target_language_embedding'))
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+    trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
     group_inputs.append(trg_embedding)
 
     # For decoder equipped with attention mechanism, in training,
@@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
     # StaticInput means the same value is utilized at different time steps.
     # Otherwise, it is a sequence input. Inputs at different time steps are different.
     # All sequence inputs should have the same length.
-    decoder = recurrent_group(name=decoder_group_name,
-                              step=gru_decoder_with_attention,
-                              input=group_inputs)
+    decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
 
 
 The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines attention, the gated recurrent unit step function, and the output function:
@@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th
         # Defines the memory of the decoder.
         # The output of this memory is defined in gru_step.
         # Notice that the name of gru_step should be the same as the name of this memory.
-        decoder_mem = memory(name='gru_decoder',
-                             size=decoder_size,
-                             boot_layer=decoder_boot)
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
         # Compute attention weighted encoder vector.
-        context = simple_attention(encoded_sequence=enc_vec,
-                                   encoded_proj=enc_proj,
-                                   decoder_state=decoder_mem)
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
         # Mix the current word embedding and the attention weighted encoder vector.
-        decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                               full_matrix_projection(current_word)],
-                                     size = decoder_size * 3)
+        decoder_inputs = paddle.layer.mixed(
+            size=decoder_size * 3,
+            input=[
+                paddle.layer.full_matrix_projection(input=context),
+                paddle.layer.full_matrix_projection(input=current_word)
+            ])
         # Define Gated recurrent unit recurrent neural network step function.
-        gru_step = gru_step_layer(name='gru_decoder',
-                                  input=decoder_inputs,
-                                  output_mem=decoder_mem,
-                                  size=decoder_size)
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
         # Defines the output function.
-        out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                          size=target_dict_dim,
-                          bias_attr=True,
-                          act=SoftmaxActivation())
+        out = paddle.layer.mixed(
+            size=target_dict_dim,
+            bias_attr=True,
+            act=paddle.activation.Softmax(),
+            input=paddle.layer.full_matrix_projection(input=gru_step))
         return out
 
 
@@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice
   - :code:`eos_id`: the end token. Every sentence ends with the end token.
   - :code:`beam_size`: the beam size used in beam search.
   - :code:`max_length`: the maximum length of the generated sentences.
-
-* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
-
-  - :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
-  - :code:`dict_file`: the dictionary file for converting word id to word.
-  - :code:`result_file`: the path of the generation result file.
     
 The code is listed below:
 
 .. code-block:: python
 
-    group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-                  StaticInput(input=encoded_proj,is_seq=True)]
+    group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
     # In generation, decoder predicts a next target word based on
     # the encoded source sequence and the last generated target word.
     # The encoded source sequence (encoder's output) must be specified by
     # StaticInput which is a read-only memory.
     # Here, GeneratedInput automatically fetches the last generated word,
     # which is initialized by a start mark, such as <s>.
-    trg_embedding = GeneratedInput(
-        size=target_dict_dim,
-        embedding_name='_target_language_embedding',
-        embedding_size=word_vector_dim)
+    trg_embedding = paddle.layer.GeneratedInput(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
     group_inputs.append(trg_embedding)
-    beam_gen = beam_search(name=decoder_group_name,
-                           step=gru_decoder_with_attention,
-                           input=group_inputs,
-                           bos_id=0, # Beginnning token.
-                           eos_id=1, # End of sentence token.
-                           beam_size=beam_size,
-                           max_length=max_length)
+    beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0, # Beginning token.
+            eos_id=1, # End of sentence token.
+            beam_size=beam_size,
+            max_length=max_length)
 
-    seqtext_printer_evaluator(input=beam_gen,
-                              id_input=data_layer(name="sent_id", size=1),
-                              dict_file=trg_dict_path,
-                              result_file=gen_trans_file)
-    outputs(beam_gen)
+    return beam_gen
 
 
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
+Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
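+
+Once the network is defined, generation can be driven by the v2 inference API. The following is a sketch (assuming trained ``parameters``, a list of source sequences ``test_batch``, and ``import paddle.v2 as paddle``; the ``field`` argument selects which arrays are returned):
+
+.. code-block:: python
+
+    beam_result = paddle.infer(
+        output_layer=beam_gen,
+        parameters=parameters,
+        input=test_batch,
+        field=['prob', 'id'])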
 
-The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
+The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
diff --git a/doc/tutorials/sentiment_analysis/bi_lstm.jpg b/doc/howto/deep_model/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/tutorials/sentiment_analysis/bi_lstm.jpg
rename to doc/howto/deep_model/rnn/src/bi_lstm.jpg
diff --git a/doc/tutorials/text_generation/encoder-decoder-attention-model.png b/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/tutorials/text_generation/encoder-decoder-attention-model.png
rename to doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md
new file mode 100644
index 0000000000..4a80a52451
--- /dev/null
+++ b/doc/howto/dev/build_cn.md
@@ -0,0 +1,124 @@
+# 用Docker编译和测试PaddlePaddle
+
+## 需要的软硬件
+
+为了开发PaddlePaddle,我们需要
+
+1. 一台电脑,可以装的是 Linux, BSD, Windows 或者 MacOS 操作系统,以及
+1. Docker。
+
+不需要依赖其他任何软件了。即便是 Python 和 GCC 都不需要,因为我们会把所有编译工具都安装进一个 Docker image 里。
+
+## 总体流程
+
+1. 获取源码
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. 安装开发工具到 Docker image 里
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   请注意这个命令结尾处的 `.`;它表示 `docker build` 应该读取当前目录下的 [`Dockerfile`文件](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile),按照其内容创建一个名为 `paddle:dev` 的 Docker image,并且把各种开发工具安装进去。
+
+3. 编译
+
+   以下命令启动一个 Docker container 来执行 `paddle:dev` 这个 Docker image,同时把当前目录(源码树根目录)映射为 container 里的 `/paddle` 目录,并且运行 `Dockerfile` 描述的默认入口程序 [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `cmake` 和 `make` 来编译 `/paddle` 里的源码,结果输出到 `/paddle/build`,也就是本地的源码树根目录里的 `build` 子目录。
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev
+   ```
+
+   上述命令编译出一个 CUDA-enabled 版本。如果我们只需要编译一个只支持 CPU 的版本,可以用
+
+   ```bash
+   docker run --rm -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. 运行单元测试
+
+   用本机的第一个 GPU 来运行包括 GPU 单元测试在内的所有单元测试:
+
+   ```bash
+   NV_GPU=0 nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   如果编译的时候我们用了 `WITH_GPU=OFF` 选项,那么编译过程只会产生 CPU-based 单元测试,我们也就不需要 nvidia-docker 来运行单元测试了。我们只需要:
+
+   ```bash
+   docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   有时候我们只想运行一个特定的单元测试,比如 `memory_test`,我们可以
+
+   ```bash
+   nvidia-docker run --rm -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. 清理
+
+   有时候我们会希望清理掉已经下载的第三方依赖以及已经编译的二进制文件。此时只需要:
+
+   ```bash
+   rm -rf build
+   ```
+
+## 为什么要 Docker 呀?
+
+- 什么是 Docker?
+
+  如果您没有听说过 Docker,可以把它想象为一个类似 virtualenv 的系统,但是虚拟的不仅仅是 Python 的运行环境。
+
+- Docker 还是虚拟机?
+
+  有人用虚拟机来类比 Docker。需要强调的是:Docker 不会虚拟任何硬件,Docker container 里运行的编译工具实际上都是在本机的 CPU 和操作系统上直接运行的,性能和把编译工具安装在本机运行一样。
+
+- 为什么用 Docker?
+
+  把工具和配置都安装在一个 Docker image 里可以标准化编译环境。这样如果遇到问题,其他人可以更容易地复现问题并提供帮助。
+
+  另外,对于习惯使用Windows和MacOS的开发者来说,使用Docker就不用配置交叉编译环境了。
+
+- 我可以选择不用Docker吗?
+
+  当然可以。大家可以用与把开发工具安装进 Docker image 相同的方式,把这些工具安装到本机。这篇文档介绍基于 Docker 的开发流程,是因为这个流程比其他方法都更简便。
+
+- 学习 Docker 有多难?
+
+  理解 Docker 并不难,大概花十分钟看一下[这篇文章](https://zhuanlan.zhihu.com/p/19902938)即可。这可以帮您省去花一小时安装和配置各种开发工具、以及切换机器时重新安装的辛苦。别忘了 PaddlePaddle 的更新可能需要新的开发工具。更别提简化问题复现带来的好处了。
+
+- 我可以用 IDE 吗?
+
+  当然可以,因为源码就在本机上。IDE 默认调用 make 之类的程序来编译源码,我们只需要配置 IDE 来调用 Docker 命令编译源码即可。
+
+  很多 PaddlePaddle 开发者使用 Emacs。他们在自己的 `~/.emacs` 配置文件里加两行
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  就可以按 `Ctrl-C` 和 `c` 键来启动编译了。
+
+- 可以并行编译吗?
+
+  是的。我们的 Docker image 运行一个 [Bash 脚本](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh)。这个脚本调用 `make -j$(nproc)` 来启动和 CPU 核一样多的进程来并行编译。
+
+## 可能碰到的问题
+
+- Docker 需要 sudo
+
+  如果用自己的电脑开发,自然也就有管理员权限(sudo)了。如果用公用的电脑开发,需要请管理员安装和配置好 Docker。此外,PaddlePaddle 项目正在努力支持其他不需要 sudo 的容器技术,比如 rkt。
+
+- 在 Windows/MacOS 上编译很慢
+
+  Docker 在 Windows 和 MacOS 都可以运行。不过实际上是运行在一个 Linux 虚拟机上。可能需要注意给这个虚拟机多分配一些 CPU 和内存,以保证编译高效。具体做法请参考[这个issue](https://github.com/PaddlePaddle/Paddle/issues/627)。
+
+- 磁盘不够
+
+  本文中的例子里,`docker run` 命令里都用了 `--rm` 参数,这样保证运行结束之后的 containers 不会保留在磁盘上。可以用 `docker ps -a` 命令看到停止后但是没有删除的 containers。`docker build` 命令有时候会产生一些中间结果,是没有名字的 images,也会占用磁盘。可以参考[这篇文章](https://zaiste.net/posts/removing_docker_containers/)来清理这些内容。
diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md
new file mode 100644
index 0000000000..91c41ef8ce
--- /dev/null
+++ b/doc/howto/dev/build_en.md
@@ -0,0 +1,124 @@
+# Build using Docker
+
+## What Developers Need
+
+To contribute to PaddlePaddle, you need
+
+1. A computer -- Linux, BSD, Windows, MacOS, and
+1. Docker.
+
+Nothing else.  Not even Python and GCC, because you can install all build tools into a Docker image.  We run all the tools by running this image.
+
+## General Process
+
+1. Retrieve source code.
+
+   ```bash
+   git clone https://github.com/paddlepaddle/paddle
+   ```
+
+2. Install build tools into a Docker image.
+
+   ```bash
+   cd paddle; docker build -t paddle:dev .
+   ```
+
+   Please be aware of the `.` at the end of the command, which specifies the current directory as the build context.  `docker build` follows the instructions in the [`Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile) there to create a Docker image named `paddle:dev`, and installs build tools into it.
+
+3. Build from source.
+
+   The following command starts a Docker container that runs the Docker image `paddle:dev`, mapping the current directory to `/paddle/` in the container, and runs the default entry-point [`build.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh) as specified in the Dockerfile.  `build.sh` invokes `cmake` and `make` to build the PaddlePaddle source mapped to `/paddle`, and writes outputs to `/paddle/build`, which maps to the `build` subdirectory of the source tree on the host.
+
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev
+   ```
+
+   The above command builds a CUDA-enabled version.  If we want to build a CPU-only version, we can type
+
+   ```bash
+   docker run -e WITH_GPU=OFF -v $PWD:/paddle paddle:dev
+   ```
+
+4. Run unit tests.
+
+   To run all unit tests using the first GPU of a node:
+
+   ```bash
+   NV_GPU=0 nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   If we used `WITH_GPU=OFF` at build time, only CPU-based unit tests are generated, and we don't need nvidia-docker to run them.  We can just run
+
+   ```bash
+   docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest"
+   ```
+
+   To run a specific unit test, say `memory_test`, we can run
+
+   ```bash
+   nvidia-docker run -v $PWD:/paddle paddle:dev bash -c "cd /paddle/build; ctest -V -R memory_test"
+   ```
+
+5. Clean Build.
+
+   Sometimes, we might want to clean up all third-party dependencies and built binaries.  To do so, just run
+
+   ```bash
+   rm -rf build
+   ```
+
+## Docker, Or Not?
+
+- What is Docker?
+
+  If you haven't heard of it, consider it something like Python's virtualenv.
+
+- Docker or virtual machine?
+
+  Some people compare Docker with VMs, but Docker doesn't virtualize any hardware or run a guest OS, which means there is no compromise on performance.
+
+- Why Docker?
+
+  Using a Docker image of build tools standardizes the building environment, which makes it easier for others to reproduce your problems and to help.
+
+  Also, some build tools don't run on Windows or Mac or BSD, but Docker runs almost everywhere, so developers can use whatever computer they want.
+
+- Can I choose not to use Docker?
+
+  Sure, you don't have to install build tools into a Docker image; instead, you can install them on your local computer.  This document describes the Docker-based workflow because it makes development much easier.
+
+- How difficult is it to learn Docker?
+
+  It takes about ten minutes to read [an introductory article](https://docs.docker.com/get-started) and saves you more than an hour of installing and configuring build tools, especially when new versions of PaddlePaddle require new ones.  Not to mention the time saved when other people try to reproduce your issues.
+
+- Can I use my favorite IDE?
+
+  Yes, of course.  The source code resides on your local computer, and you can edit it using whatever editor you like.
+
+  Many PaddlePaddle developers are using Emacs.  They add the following few lines into their `~/.emacs` configure file:
+
+  ```emacs
+  (global-set-key "\C-cc" 'compile)
+  (setq compile-command
+   "docker run --rm -it -v $(git rev-parse --show-toplevel):/paddle paddle:dev")
+  ```
+
+  so they could type `Ctrl-C` and `c` to build PaddlePaddle from source.
+
+- Does Docker do parallel building?
+
+  Our build Docker image runs a [Bash script](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build.sh), which calls `make -j$(nproc)` to start as many build processes as there are CPU cores.
+
+## Some Gotchas
+
+- Docker requires sudo
+
+  The owner of a computer has the administrative privilege, a.k.a. sudo, and Docker requires this privilege to work properly.  If you use a shared computer for development, please ask the administrator to install and configure Docker.  We will do our best to support rkt, another container technology that doesn't require sudo.
+
+- Docker on Windows/MacOS builds slowly
+
+  On Windows and MacOS, Docker containers run in a Linux VM.  You might want to give this VM more memory and CPUs to make building efficient.  Please refer to [this issue](https://github.com/PaddlePaddle/Paddle/issues/627) for details.
+
+- Not enough disk space
+
+  Examples in this article use the `--rm` option with the `docker run` command.  This option ensures that stopped containers are not left on disk.  We can use `docker ps -a` to list all containers, including stopped ones.  Sometimes `docker build` generates intermediate dangling images, which also take disk space.  To clean them up, please refer to [this article](https://zaiste.net/posts/removing_docker_containers/).
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index 6993901452..3e0bf7b397 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a")
 
 ## 构建和测试
 
-编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。
 
 如要build这个开发镜像,在源码目录树的根目录中运行:
 
 ```bash
-➜  docker build -t paddle:dev .
+➜  docker build -t paddle:latest-dev .
 ```
 
 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以:
 
 ```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev
 ```
 
 这个过程除了把PaddlePaddle编译为 `./build/libpaddle.so` 并输出一个 `./build/paddle.deb` 文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`):
@@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a")
 如果要运行所有的单元测试,可以用如下命令:
 
 ```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+➜  docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest"
 ```
 
 关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
deleted file mode 100644
index 40d1eb62d7..0000000000
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ /dev/null
@@ -1,219 +0,0 @@
-# Contribute Code
-
-We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code.
-
-## Code Requirements
-- Your code comments must be fully documented by
-  [Doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
-- Make sure the compiler option `WITH_STYLE_CHECK` is on and the compiler
-  passes the code style check.
-- All code must have unit test.
-- Pass all unit tests.
-
-The following tutorial guides you into submitting your contibution.
-
-## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
-
-Just head over to the GitHub page and click the "Fork" button.
-It's just that simple.
-
-## Clone
-
-Clone remote repository.
-
-```bash
-➜  git clone https://github.com/USERNAME/Paddle
-➜  cd Paddle
-```
-
-## Create a local branch
-
-Paddle is currently using [Git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
-
-All feature and bug fix development work should be done on a new branch, generally create new branch from `develop` branch .
-
-```bash
-➜  git checkout -b my-cool-stuff
-```
-
-Before the checkout, you need to keep the current branch directory clean, otherwise the untracked file will be brought to the new branch, which can be inspected by `git status`.
-
-## Using `pre-commit` hook
-
-Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
-pre-commit hooks. It can help us format source codes (cpp, python), check some
-basic thing before commit (only one EOL for each file, do not add a huge file
-in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every
-PR doesn't fit hook can not be merged into Paddle.
-
-To use [pre-commit](http://pre-commit.com/), you should install it by
-`pip install pre-commit`, and currently, Paddle uses `clang-format` to format
-c/cpp sources. Please make sure clang-format 3.8+ installed.
-
-Install and run it as follow:
-
-```bash
-➜  pip install pre-commit
-➜  pre-commit install
-```
-
-When you commit your code, the pre-commit hook will check the local code if there is
-anything not suitable to commit, and so on.
-
-## Start to develop
-
-In this tutorial, I delete a line in README.md and created a new file.
-
-We can use `git status` to inspect the changes of current directory, `git diff` to see difference.
-
-```bash
-➜  git status
-On branch test
-Changes not staged for commit:
-  (use "git add <file>..." to update what will be committed)
-  (use "git checkout -- <file>..." to discard changes in working directory)
-
-	modified:   README.md
-
-Untracked files:
-  (use "git add <file>..." to include in what will be committed)
-
-	test
-
-no changes added to commit (use "git add" and/or "git commit -a")
-```
-## Build and Test
-
-We package PaddlePaddle's compile environment into a Docker image, called the develop image named `paddle:dev`, it contains all compiling tools that PaddlePaddle needs. 
-
-If you want to build the develop image, just run:
-
-```bash
-➜  docker build -t paddle:dev .
-```
-
-Then we can use the develop image to build PaddlePaddle source. For example:
-
-```bash
-➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
-```
-
-The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
-
-Then we can generate the production image by copying the compiled PaddlePaddle program into the image by
-
-```bash
-➜  docker build -t paddle:prod -f build/Dockerfile .
-```
-
-Run unit test finally:
-
-```bash
-➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
-```
-
-For more details, you can read [this doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
-
-## Commit
-
-Next we cancel the changes to the README.md file and then commit our changes by following command lines:
-
-```bash
-➜  git checkout -- README.md
-➜  git status
-On branch test
-Untracked files:
-  (use "git add <file>..." to include in what will be committed)
-
-	test
-
-nothing added to commit but untracked files present (use "git add" to track)
-➜  git add test
-```
-
-We should write a description of each commit by `git commit` to allow others to know
-the changes in these files.
-
-```bash
-➜  git commit
-CRLF end-lines remover...............................(no files to check)Skipped
-yapf.................................................(no files to check)Skipped
-Check for added large files..............................................Passed
-Check for merge conflicts................................................Passed
-Check for broken symlinks................................................Passed
-Detect Private Key...................................(no files to check)Skipped
-Fix End of Files.....................................(no files to check)Skipped
-clang-formater.......................................(no files to check)Skipped
-[my-cool-stuff c703c041] add test file
- 1 file changed, 0 insertions(+), 0 deletions(-)
- create mode 100644 233
-```
-
-## Keeping Fork Up to Date
-
-Before pull your request, you should sync your code from the latest PaddlePaddle.
-To do this, you'll need to add a remote at first:
-
-```bash
-➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
-➜  git remote
-origin
-upstream
-```
-
-Update your fork with the latest upstream changes:
-
-```bash
-➜  git fetch upstream
-➜  git pull upstream develop
-```
-
-Now, your local master branch is up-to-date with everything modified upstream.
-
-## Push to GitHub
-
-```bash
-# push to your repository in Github
-➜  git push origin my-cool-stuff
-```
-
-## Create an issue and a Pull Request
-
-Create an Issue to describe the problem and record its number.
-
-Go to the page for your fork on GitHub, select your development branch,
-and click the `New pull request`.
-
-<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
-
-Then select the target branch:
-
-<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
-
-We can add `resolve #Issue number` in PR description to close the issue automatically after the PR is merge. More details in <https://help.github.com/articles/closing-issues-via-commit-messages/>.
-
-Then wait for review, if there need to modify, refer to the above steps to update the corresponding origin branch.
-
-## Delete origin branch
-
-After the PR is merge into the main repository, we can delete the remote branch on the PR page.
-
-<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
-
-Or just run:
-
-```bash
-➜  git push origin :my-cool-stuff
-```
-
-## Delete local branch
-
-Finally, we delete local branch:
-
-```bash
-➜  git checkout develop 
-
-# delete my-cool-stuff branch
-➜  git branch -D my-cool-stuff
-```
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
new file mode 120000
index 0000000000..c97564d93a
--- /dev/null
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -0,0 +1 @@
+../../../CONTRIBUTING.md
\ No newline at end of file
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
index 9489a921c7..75037e693b 100644
--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -37,7 +37,7 @@
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-假设 :math:`z = f(W^T x + b)` ,那么
+假设 :math:`z = W^T x + b` ,那么
 
 .. math::
 
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 46481f5ead..110a9fb38f 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It
 
 where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.
 
-The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. 
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
 
 Suppose our loss function is :math:`c(y)`, then
 
@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then
 
 .. math::
 
@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
 Then, for fully connected layer, we need to compute:
 
 .. math::
-  
+
    \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
 
 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
                       /* weight */ true);
       }
     }
-    
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
 
 .. code-block:: bash
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
new file mode 100644
index 0000000000..9299658567
--- /dev/null
+++ b/doc/howto/dev/new_op_cn.md
@@ -0,0 +1,318 @@
+# 如何写新的Operator
+
+ - [概念简介](#概念简介)
+ - [实现C++类](#实现c类)
+   - [定义ProtoMaker类](#定义protomaker类)
+   - [定义Operator类](#定义operator类)
+   - [定义OpKernel类](#定义opkernel类)
+   - [注册Operator](#注册operator)
+   - [编译](#编译)
+ - [绑定Python](#绑定python)
+ - [实现单元测试](#实现单元测试)
+   - [前向Operator单测](#前向operator单测)
+   - [反向Operator单测](#反向operator单测)
+   - [编译和执行](#编译和执行)
+ - [注意事项](#注意事项)
+
+
+## 概念简介
+
+简单介绍需要用到的基类,详细介绍请参考设计文档。
+
+- `framework::OperatorBase`: Operator(简写,Op)基类。
+- `framework::OpKernel`: Op计算函数的基类,称作Kernel。
+- `framework::OperatorWithKernel`:继承自OperatorBase,Op有计算函数,称作有Kernel。
+- `class OpProtoAndCheckerMaker`:描述该Op的输入、输出、属性、注释,主要用于Python API接口生成。
+
+依据是否包含Kernel,可以将Op分为两种:包含Kernel的Op和不包含Kernel的Op。前者的定义继承自`OperatorWithKernel`,后者继承自`OperatorBase`。本教程主要介绍带Kernel的Op如何编写,简单总结Op需要包含的内容如下:
+
+
+ 内容            | 定义位置
+--------------  | :----------------------
+OpProtoMaker定义 | `.cc`文件,Backward Op不需要定义OpProtoMaker
+Op定义           | `.cc`文件
+Kernel实现       | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。
+注册Op           | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中
+
+
+新增的Op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有)、`*_op.cc`、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。**
+
+
+下面以矩阵乘操作,即[MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc)为例来介绍如何写带Kernel的Operator。
+
+
+## 实现C++类
+
+
+### 定义ProtoMaker类
+
+矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。
+
+首先定义`ProtoMaker`来描述该Op的输入、输出,并添加注释:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43)继承自`framework::OpProtoAndCheckerMaker`,构造函数含有2个参数:
+
+   - `framework::OpProto` : 存储Op的输入输出和参数属性,将用于Python API接口的生成。
+   - `framework::OpAttrChecker` : 用于检查参数属性的合法性。
+
+构造函数里通过`AddInput`添加输入参数,通过`AddOutput`添加输出参数,通过`AddComment`添加Op的注释。这些函数会将对应内容添加到`OpProto`中。
+
+上面的代码在`MulOp`中添加两个输入`X`和`Y`,添加了一个输出`Out`,并解释了各自含义,命名请遵守[命名规范](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md)。
+
+
+再以[`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)为例:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+
+这个例子有两处不同:
+
+- `AddInput("X","...").NotInGradient()` : 表示`X`这个输入不参与`ScaleOp`对应的梯度Op计算之中,如果Op的某个输入不参与反向梯度的计算,请显式地调用`.NotInGradient()`进行设置。
+
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。
+
+
+### 定义Operator类
+
+下面的代码实现了MulOp的定义:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22)继承自`OperatorWithKernel`。`public`成员:
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+这句表示使用基类`OperatorWithKernel`的构造函数,也可写成:
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+还需要重写`InferShape`接口。`InferShape`为const函数,不能修改Op的成员变量,参数为`const framework::InferShapeContext &ctx`,通过该参数可获取到输入输出以及属性。它的功能是:
+
+  1. 做检查,尽早报错:检查输入数据的维度、类型等是否合法。
+  2. 设置输出Tensor的形状。
+
+通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`文件中。
+
+### Defining the OpKernel Class
+
+`MulKernel` inherits from `framework::OpKernel` and takes the following two template parameters:
+
+- `typename DeviceContext`: the device type. Add this parameter when different devices (CPU, CUDA) share the same kernel; omit it when they do not. A non-sharing example is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+- `typename T`: the data type, such as `float` or `double`.
+
+`MulKernel` must override the `Compute` interface.
+- `Compute` takes a single argument: `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` additionally carries the device type; inputs, outputs, and attributes can be retrieved from it in the same way.
+- The body of `Compute` implements the kernel's actual computation.
+
+`MulKernel`'s implementation of `Compute` is shown below:
+
+  ```cpp
+  template <typename DeviceContext, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+Note that **different devices (CPU, CUDA) share a single Op definition; whether they can share the same `OpKernel` depends on whether the functions called by `Compute` support both devices.**
+
+`MulOp`'s CPU and CUDA implementations share the same `Kernel`. For an example where the `OpKernel` is not shared, see [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+To keep `OpKernel` implementations simple and to reuse code across CPU and CUDA, we usually implement `Compute` with the Eigen unsupported Tensor module. For how to use Eigen in PaddlePaddle, please refer to the [usage guide](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md).
+
+
+This completes the forward operator. Next, the operator and its kernels must be registered in the `.cc` file.
+Defining the backward operator class and the backward OpKernel is similar to the forward case and is not repeated here. **Note, however, that a backward operator has no `ProtoMaker`**.
+
+### Registering the Operator
+
+- Register the forward and backward operator classes, as well as the CPU kernels, in the `.cc` file.
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   In the code above:
+
+    - `REGISTER_OP` registers the `ops::MulOp` class under the type name `mul`, with `ops::MulOpMaker` as its `ProtoMaker`, and registers `ops::MulOpGrad` under the type name `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator that has no backward counterpart (see the sketch below).
+    - `REGISTER_OP_CPU_KERNEL` registers the `ops::MulKernel` class, specializing its template parameters to `paddle::platform::CPUDeviceContext` and `float`, and likewise registers `ops::MulGradKernel`.
+
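+   For reference, registering an operator without a gradient might look like the following minimal sketch (`my_print`, `MyPrintOp`, and `MyPrintOpMaker` are hypothetical names used for illustration only):
+
+    ```cpp
+    // A sketch only: the operator has no backward pass, so no gradient Op
+    // and no gradient kernel are registered for it.
+    namespace ops = paddle::operators;
+    REGISTER_OP_WITHOUT_GRADIENT(my_print, ops::MyPrintOp, ops::MyPrintOpMaker);
+    ```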
+
+- Register the CUDA kernels in the `.cu` file.
+    - Note that if the CUDA kernel is implemented with the Eigen unsupported module, the macro `#define EIGEN_USE_GPU` must appear at the top of the `.cu` file, for example:
+
+    ```cpp
+    // if the Eigen unsupported module is used, define this before including any header files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ```
+
+### Compilation
+
+Run the following command to compile the operator:
+
+```
+make mul_op
+```
+
+## Python Binding
+
+The framework automatically generates Python bindings for newly added operators and links them into the generated library.
+
+## Writing Unit Tests
+
+The unit tests cover comparing the forward operator's implementations across devices (CPU, CUDA), comparing the backward operator's implementations across devices (CPU, CUDA), and gradient checks for the backward operator. The following introduces the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+
+### Testing the Forward Operator
+
+An operator unit test inherits from `OpTest`. The concrete checks are carried out in `TestMulOp`. Testing an operator requires:
+
+1. Defining the inputs, outputs, and relevant attributes in the `setUp` method.
+2. Generating random input data.
+3. Implementing the same computation logic as the forward operator in the Python script, and comparing its result with the operator's forward output.
+4. Calling the gradient-check interfaces; backward computation is already integrated into the test framework.
+
+
+  ```python
+  import unittest
+  import numpy as np
+  from op_test import OpTest
+
+
+  class TestMulOp(OpTest):
+      def setUp(self):
+          self.op_type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
+
+The code above first imports the required packages. The key variables set in `setUp` are:
+
+- `self.op_type = "mul" `: the operator type, identical to the type used when the operator was registered.
+- `self.inputs`: the inputs, of type `numpy.array`, together with their initial values.
+- `self.outputs`: the expected outputs; the same computation as the operator is implemented in the Python script and its result is returned.
+
+### Testing the Backward Operator
+
+In the backward tests:
+- `test_check_grad_normal` calls `check_grad` to verify the correctness and stability of the gradients numerically.
+  - The first argument, `["X", "Y"]`: gradients are checked with respect to the inputs `X` and `Y`.
+  - The second argument, `"Out"`: the final output variable of the forward network.
+  - The third argument, `max_relative_error`: the maximum relative error tolerated during the gradient check.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y` cover the cases where the gradient of only one input needs to be computed.
+
+
+### Compiling and Running
+
+New `test_*.py` unit tests added under `python/paddle/v2/framework/tests` are automatically picked up by the build.
+
+Note that **unlike compiling a single operator, running the unit tests requires compiling the whole project**, with `WITH_TESTING` turned on, i.e. `cmake paddle_dir -DWITH_TESTING=ON`. After a successful build, run the unit tests with:
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+Or:
+
+```bash
+ctest -R test_mul_op
+```
+
+## Remarks
+
+- Create a separate `*_op.h` (if needed), `*_op.cc`, and `*_op.cu` (if needed) for each operator. Defining multiple operators in a single file causes compilation errors.
+- The type name used at registration must match the operator's name; registering, say, `REGISTER_OP(B, ...)` inside `A_op.cc` makes the unit tests fail.
+- If an operator has no CUDA kernel, do not create an empty `*_op.cu`; it makes the unit tests fail.
+- If several operators depend on shared helper functions, put those in a file whose name does not match `*_op.*`, e.g. `gather.h`.
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
new file mode 100644
index 0000000000..da8b1bdd10
--- /dev/null
+++ b/doc/howto/dev/new_op_en.md
@@ -0,0 +1,336 @@
+# How to write a new operator
+
+ - [Background](#background)
+ - [Implementing C++ Types](#implementing-c-types)
+   - [Defining ProtoMaker](#defining-protomaker)
+   - [Defining Operator](#defining-operator)
+   - [Defining OpKernel](#defining-opkernel)
+   - [Registering Operator and OpKernel](#registering-operator-and-opkernel)
+   - [Compilation](#compilation)
+ - [Python Binding](#python-binding)
+ - [Unit Tests](#unit-tests)
+   - [Testing Forward Operators](#testing-forward-operators)
+   - [Testing Backward Operators](#testing-backward-operators)
+   - [Compiling and Running](#compiling-and-running)
+ - [Remarks](#remarks)
+
+## Background
+
+Here are the base types needed. For details, please refer to the design docs.
+
+- `class OpProtoAndCheckerMaker`: Describes an Operator's input, output, attributes and description, mainly used to interface with Python API.
+- `framework::OperatorBase`: Operator (Op) base class.
+- `framework::OpKernel`: Base class for Op computation kernel.
+- `framework::OperatorWithKernel`: Inherited from OperatorBase, describing an operator with computation kernels.
+
+
+Operators can be categorized into two groups: operator with kernel(s) and operator without kernel(s). An operator with kernel(s) inherits from `OperatorWithKernel` while the one without kernel(s) inherits from `OperatorBase`. This tutorial focuses on implementing operators with kernels. In short, an operator includes the following information:
+
+
+ Information           | Where is it defined
+--------------  | :----------------------
+OpProtoMaker definition  | `.cc` files; backward Ops do not need an OpProtoMaker.
+Op definition           | `.cc` files
+Kernel implementation       | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu` files.
+Registering the Op           | Ops are registered in `.cc` files; for kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation.
+
+
+New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable). **The system uses this naming scheme to automatically build operators and their corresponding Python extensions.**
+
+
+Let's take the matrix multiplication operator, [MulOp](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc), as an example to introduce how to write an operator with kernels.
+
+
+## Implementing C++ Types
+
+
+### Defining ProtoMaker
+
+Matrix multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and one output.
+
+First, define `ProtoMaker` to describe the Operator's input, output, and additional comments:
+
+```cpp
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), 2D tensor of size (M x K)");
+    AddInput("Y", "(Tensor), 2D tensor of size (K x N)");
+    AddOutput("Out", "(Tensor), 2D tensor of size (M x N)");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+```
+
+[`MulOpMaker`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L43) inherits from `framework::OpProtoAndCheckerMaker`. Its constructor takes two arguments:
+
+   - `framework::OpProto` stores the operator's inputs, outputs, and attribute descriptions, used for generating the Python API interfaces.
+   - `framework::OpAttrChecker` is used to validate the attributes.
+
+The constructor utilizes `AddInput`, `AddOutput`, and `AddComment`, so that the corresponding information is added to `OpProto`.
+
+The code above adds two inputs `X` and `Y` to `MulOp`, an output `Out`, and their corresponding descriptions. Please name them in accordance with Paddle's [naming convention](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md).
+
+
+An additional example [`ScaleOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37) is implemented as follows:
+
+```cpp
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of scale operator.").NotInGradient();
+    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddComment(R"DOC(Scale operator
+The equation is: Out = scale*X
+)DOC");
+    AddAttr<AttrType>("scale", "scale of scale operator.").SetDefault(1.0);
+  }
+};
+```
+
+This example differs in two ways:
+
+- `AddInput("X","...").NotInGradient()` expresses that input `X` is not involved in the computation of `ScaleOp`'s gradient operator. If an input to an operator is not needed for back-propagation, please mark it explicitly with `.NotInGradient()`.
+
+- `AddAttr<AttrType>("scale", "...").SetDefault(1.0);` adds a `scale` constant as an attribute, and sets its default value to 1.0.
+
+
+### Defining Operator
+
+The following code defines the interface for MulOp:
+
+```cpp
+class MulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    auto dim0 = ctx.Input<Tensor>("X")->dims();
+    auto dim1 = ctx.Input<Tensor>("Y")->dims();
+    PADDLE_ENFORCE_EQ(dim0.size(), 2,
+                      "input X(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("X"));
+    PADDLE_ENFORCE_EQ(dim1.size(), 2,
+                      "input Y(%s) should be a tensor with 2 dims, a matrix",
+                      ctx.op_.Input("Y"));
+    PADDLE_ENFORCE_EQ(
+        dim0[1], dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    ctx.Output<Tensor>("Out")->Resize({dim0[0], dim1[1]});
+  }
+};
+```
+
+[`MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/mul_op.cc#L22) is inherited from `OperatorWithKernel`. Its `public` member
+
+```cpp
+using framework::OperatorWithKernel::OperatorWithKernel;
+```
+
+reuses the constructors of the base class `OperatorWithKernel`; it could alternatively be written as
+
+```cpp
+MulOp(const std::string &type, const framework::VariableNameMap &inputs,
+      const framework::VariableNameMap &outputs,
+      const framework::AttributeMap &attrs)
+  : OperatorWithKernel(type, inputs, outputs, attrs) {}
+```
+
+The `InferShape` interface must also be overridden. `InferShape` is a constant method, so it cannot modify the Op's member variables; its parameter, `const framework::InferShapeContext &ctx`, can be used to extract the inputs, outputs, and attributes. Its job is to:
+
+  - 1). validate and fail early: check that the input data dimensions, types, and so on are legal;
+  - 2). configure the shapes of the output tensors.
+
+Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later.
+
+### Defining OpKernel
+
+`MulKernel` inherits `framework::OpKernel` and takes the following two template parameters:
+
+- `typename DeviceContext` denotes the device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template parameter needs to be added; if they don't share kernels, it must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+- `typename T` denotes the data type, such as `float` or `double`.
+
+`MulKernel` must override the `Compute` interface.
+
+- `Compute` takes one input parameter: `const framework::ExecutionContext& context`.
+- Compared with `InferShapeContext`, `ExecutionContext` includes the device type, and can similarly extract input, output, and attribute variables.
+- `Compute` implements the computation logic of an `OpKernel`.
+
+`MulKernel`'s implementation of `Compute` is as follows:
+
+  ```cpp
+  template <typename DeviceContext, typename T>
+  class MulKernel : public framework::OpKernel {
+  public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Input<Tensor>("Y");
+    auto* Z = context.Output<Tensor>("Out");
+    Z->mutable_data<T>(context.GetPlace());
+    auto& device_context = context.template device_context<DeviceContext>();
+    math::matmul<DeviceContext, T>(*X, false, *Y, false, 1, Z, 0, device_context);
+  }
+  };
+  ```
+
+Note that **different devices (CPU, CUDA) share one Op definition; whether or not they share the same `OpKernel` depends on whether the functions called by `Compute` support both devices.**
+
+`MulOp`'s CPU and CUDA implementations share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43).
+
+To ease the writing of `OpKernel` implementations, and to reuse code across devices, the [`Eigen unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement the `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see the [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_en.md).
+
+
+This concludes the forward implementation of an operator. Next, its operator class and kernels need to be registered in a `.cc` file.
+
+The definition of its corresponding backward operator, if applicable, is similar to that of a forward operator. **Note that a backward operator does not include a `ProtoMaker`**.
+
+### Registering Operator and OpKernel
+
+- In `.cc` files, register forward and backward operator classes and the CPU kernel.
+
+    ```cpp
+    namespace ops = paddle::operators;
+    REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad);
+
+    REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+    REGISTER_OP_CPU_KERNEL(mul_grad,
+                  ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
+    ```
+
+   In that code block,
+
+    - `REGISTER_OP` registers the `ops::MulOp` class under the type name `mul`, with `ops::MulOpMaker` as its `ProtoMaker`, and registers `ops::MulOpGrad` under the type name `mul_grad`.
+    - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without a gradient (see the sketch after this list).
+
+    - `REGISTER_OP_CPU_KERNEL` registers the `ops::MulKernel` class, specializing its template parameters to `paddle::platform::CPUDeviceContext` and `float`, and likewise registers `ops::MulGradKernel`.
+
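+   For reference, registering an operator without a gradient might look like the following minimal sketch (`my_print`, `MyPrintOp`, and `MyPrintOpMaker` are hypothetical names used for illustration only):
+
+    ```cpp
+    // A sketch only: the operator has no backward pass, so no gradient Op
+    // and no gradient kernel are registered for it.
+    namespace ops = paddle::operators;
+    REGISTER_OP_WITHOUT_GRADIENT(my_print, ops::MyPrintOp, ops::MyPrintOpMaker);
+    ```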
+
+- Register the CUDA kernels in `.cu` files.
+    - Note that if the CUDA kernel is implemented using the `Eigen unsupported` module, then the macro definition `#define EIGEN_USE_GPU` is needed at the top of the `.cu` file, such as
+
+    ```cpp
+    // if the Eigen unsupported module is used, define this before including any header files
+    #define EIGEN_USE_GPU
+
+    namespace ops = paddle::operators;
+    REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+    REGISTER_OP_CUDA_KERNEL(mul_grad,
+                           ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
+    ```
+
+### Compilation
+
+Run the following commands to compile.
+
+```
+# maybe you need to rerun cmake
+make mul_op
+```
+
+## Python Binding
+
+The system automatically generates Python bindings for the new operator and links them into the generated library.
+
+## Unit Tests
+
+Unit tests for an operator include
+
+1. comparing a forward operator's implementations on different devices,
+
+2. comparing a backward operator's implementation on different devices, and
+
+3. a gradient check for the backward operator.
+
+Here, we introduce the [unit tests for `MulOp`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py).
+
+### Testing Forward Operators
+
+A forward operator unit test inherits from `OpTest`. More concrete checks are performed in `TestMulOp`. Testing a forward operator requires the following:
+
+1. Defining input, output and relevant attributes in `setUp` method.
+
+2. Generating random input data.
+
+3. Implementing the same computation logic in a Python script, getting its output, and comparing it with the forward operator's output.
+
+4. Calling the gradient-check interfaces; backward computation is already integrated into the test framework.
+
+  ```python
+  import unittest
+  import numpy as np
+  from op_test import OpTest
+
+
+  class TestMulOp(OpTest):
+      def setUp(self):
+          self.op_type = "mul"
+          self.inputs = {
+              'X': np.random.random((32, 84)).astype("float32"),
+              'Y': np.random.random((84, 100)).astype("float32")
+          }
+          self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+      def test_check_output(self):
+          self.check_output()
+
+      def test_check_grad_normal(self):
+          self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+      def test_check_grad_ingore_x(self):
+          self.check_grad(
+              ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+      def test_check_grad_ingore_y(self):
+          self.check_grad(
+              ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+  ```
+
+The code above first loads the required packages. In addition, we have
+
+- `self.op_type = "mul" ` defines the operator type, identical to the type used when the operator was registered.
+- `self.inputs` defines the inputs, with type `numpy.array`, and initializes them.
+- `self.outputs` defines the expected outputs; the same computation as the operator is completed in the Python script, and its result is returned.
+
+### Testing Backward Operators
+
+Some key points in the gradient checks above include:
+
+- `test_check_grad_normal` calls `check_grad` to validate the correctness and stability of the gradients through numeric methods.
+  - The first argument, `["X", "Y"]`, specifies that gradients are checked with respect to the inputs `X` and `Y`.
+  - The second argument, `"Out"`, points to the network's final output target `Out`.
+  - The third argument, `max_relative_error`, specifies the maximum relative error tolerated during the gradient check.
+- The `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where the gradient of only one input is required.
+
+### Compiling and Running
+
+
+Any new unit testing file of the format `test_*.py` added to the directory `python/paddle/v2/framework/tests` is automatically added to the project to compile.
+
+Note that **unlike the compile test for Ops, running unit tests requires compiling the entire project** and requires compiling with the flag `WITH_TESTING` on, i.e. `cmake paddle_dir -DWITH_TESTING=ON`.
+
+After successfully compiling the project, run the following command to run unit tests:
+
+```bash
+make test ARGS="-R test_mul_op -V"
+```
+
+Or,
+
+```bash
+ctest -R test_mul_op
+```
+
+## Remarks
+
+- Create a separate `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) for each Op. Compiling will fail if multiple operators are defined in one file.
+- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures.
+- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail.
+- If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`.
diff --git a/doc/howto/dev/new_op_kernel_en.md b/doc/howto/dev/new_op_kernel_en.md
new file mode 100644
index 0000000000..123df0a7ee
--- /dev/null
+++ b/doc/howto/dev/new_op_kernel_en.md
@@ -0,0 +1,121 @@
+## Add Kernels for a New Device
+
+### Background
+
+PaddlePaddle Fluid has hundreds of operators.  Each operator could have one or more kernels.  A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
+
+[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels.  The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md).  An operator chooses the right kernel at runtime.  This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
+
+### Write Kernels for a New Device
+
+#### Add A New Device
+
+For some historical reasons, we misuse the word *library* for *device*.  For example, we call the device type the *library type*.  An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24).  We will correct this ASAP.
+
+To register a new device, we need to add an enum value to `LibraryType`:
+
+```cpp
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+```
+
+
+#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
+
+If you have a new kind of device, first you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example, `CUDAPlace`:
+
+```cpp
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+```
+
+#### Add a [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
+After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
+
+```cpp
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  virtual void Wait() const {}
+};
+```
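+
+For illustration, a device context for a hypothetical new device might look like the sketch below (`FPGADeviceContext` and `FPGAPlace` are made-up names, not part of Paddle; the sketch assumes `FPGAPlace` has been added to the `Place` variant as shown earlier):
+
+```cpp
+// A sketch only: a DeviceContext subclass for a hypothetical FPGA device.
+class FPGADeviceContext : public DeviceContext {
+ public:
+  explicit FPGADeviceContext(FPGAPlace place) : place_(place) {}
+
+  // Report which device this context manages.
+  Place GetPlace() const override { return place_; }
+
+  // Block until all work queued on the device has finished.
+  void Wait() const override { /* device-specific synchronization */ }
+
+ private:
+  FPGAPlace place_;
+};
+```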
+
+#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device.
+
+Detailed documentation can be found in [`new_op_en.md`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md).
+
+```cpp
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. Users should construct it before running the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
+```
+
+
+#### Register the OpKernel to the framework
+
+After writing the components described above, we should register the kernel to the framework.
+
+We use `REGISTER_OP_KERNEL` to do the registration.
+
+```cpp
+REGISTER_OP_KERNEL(
+    op_type,
+    library_type,
+    place_type,
+    kernel0, kernel1, ...)
+```
+
+`kernel0`, `kernel1`, ... are kernels that have the same `op_type`, `library_type`, and `place_type` but different `data_type`s.
+
+Take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/conv_cudnn_op.cu.cc#L318) as an example:
+
+```cpp
+REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
+                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<float>,
+                   paddle::operators::CUDNNConvOpKernel<double>);
+```
+
+In the code above:
+
+ - `conv2d` is the type/name of the operator
+ - `CUDNN/CPU` is the `library_type`
+ - `paddle::platform::CUDAPlace/CPUPlace` is the `place_type`
+ - the template parameter `float/double` on `CUDNNConvOpKernel<T>` is the `data_type`.
diff --git a/doc/howto/dev/use_eigen_cn.md b/doc/howto/dev/use_eigen_cn.md
new file mode 100644
index 0000000000..1367323b71
--- /dev/null
+++ b/doc/howto/dev/use_eigen_cn.md
@@ -0,0 +1,146 @@
+## How to Use Eigen in Paddle
+
+A neural network is essentially a compute graph. The data needed for the computation is stored in `Tensor`s, and the computation itself is described by `Operator`s. At execution time, an `Operator` calls the `Compute` interface of its corresponding `OpKernel` to operate on the `Tensor`s.
+
+
+### The Eigen Tensor Module
+
+The Eigen Tensor module provides strong support for element-wise computation, and code written with it once can run on both the CPU and the GPU. However, Eigen Tensor is a module still under development, so its test coverage may be incomplete and its documentation sparse.
+
+For details on the Eigen Tensor module, please refer to [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
+
+
+### paddle::framework::Tensor
+
+The Paddle Tensor is defined under the framework directory; its main interface is as follows:
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+  
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+  
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+  
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+  
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:  
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+  
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` defers memory allocation: we can first define a Tensor, then set its size with the `Resize` interface, and only later call the `mutable_data` interface to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+### paddle::framework::Tensor Usage Example
+The following uses `AddOp` to illustrate how `Tensor` is used:
+
+- InferShape
+
+When the compute graph of a neural network is run, the `InferShape` interface of each `Operator` is called first; it sets the sizes of the output Tensors from the sizes of the input Tensors, which is where the `Resize` interface gets called.
+
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+
+
+- Run
+
+The `Operator`'s `Run` interface eventually calls the `Compute` interface of the corresponding `OpKernel`; this is where the memory is actually allocated, via the `mutable_data` interface.
+
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+
+  output->mutable_data<T>(context.GetPlace());
+
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+
+  auto place = context.GetEigenDevice<Place>();
+
+  z.device(place) = x + y;
+}
+```
+
+
+### Converting paddle::framework::Tensor to EigenTensor
+
+As shown in the previous section, before the actual computation we need to convert the input and output Tensors into a format that Eigen supports. [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) provides a set of global functions to convert a paddle::framework::Tensor into an EigenTensor/EigenMatrix/EigenVector/EigenScalar.
+
+Taking EigenTensor as an example:
+
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+
+`From` is an interface provided by the EigenTensor template that converts a paddle::framework::Tensor into an EigenTensor. Since the Tensor's rank is a template parameter, it must be specified explicitly at conversion time.
+
+In Eigen, Tensors of different ranks are different types, and a Vector is a Tensor of rank 1. Note the distinction: `EigenVector<T>::From` converts a one-dimensional Paddle Tensor into a one-dimensional Eigen Tensor, represented here by EigenVector, whereas `EigenVector<T>::Flatten` reshapes a Paddle Tensor of any rank, flattening it into a one-dimensional Eigen Tensor; the resulting type is still EigenVector. A short sketch follows.
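+
+As a minimal sketch (reusing `t` from the example above and the helpers declared in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)):
+
+```cpp
+// t has dims {1, 2, 3} and holds the values 0..5; Flatten views all
+// 6 elements as a rank-1 Eigen tensor without changing t's own shape.
+auto flat = EigenVector<float>::Flatten(t);
+// flat(4) == 4.0f, while t still reports dims() == {1, 2, 3}.
+```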
+
+For more conversion methods, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in eigen_test.cc.
+
+
+
+### Implementing the Computation
+
+To carry out the computation, the `device` interface must be called on the EigenTensor on the left-hand side of the assignment. Note that the arithmetic between EigenTensors here only changes the data held by the original Tensors; it does not change their shape information.
+
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+
+In this code, input0/input1/output can be Tensors of arbitrary rank. We call EigenVector's Flatten interface to view a Tensor of any rank as a one-dimensional EigenVector; after the computation finishes, the original shape information of input0/input1/output is unchanged. To change the shape of an original Tensor, call the Resize interface.
+
+Since the Eigen Tensor module is sparsely documented, the computation code of the relevant `OpKernel`s under TensorFlow's [kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels) module is a useful reference.
diff --git a/doc/howto/dev/use_eigen_en.md b/doc/howto/dev/use_eigen_en.md
new file mode 100644
index 0000000000..e169106e12
--- /dev/null
+++ b/doc/howto/dev/use_eigen_en.md
@@ -0,0 +1,146 @@
+## How to use Eigen in Paddle
+
+Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s, and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`s.
+
+
+### Eigen Tensor Module
+
+The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
+
+Note that Eigen Tensor is still being actively developed, so its tests are not completely covered and its documentation may be sparse.
+
+For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
+
+
+### paddle::framework::Tensor
+
+The Paddle Tensor is defined in the framework directory with the following interface:
+
+```cpp
+class Tensor {
+ public:
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+ private:
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dim_;
+};
+```
+
+`Placeholder` is used to delay memory allocation; that is, we can first define a tensor, use `Resize` to configure its shape, and then call `mutable_data` to allocate the actual memory.
+
+```cpp
+paddle::framework::Tensor t;
+paddle::platform::CPUPlace place;
+// set size first
+t.Resize({2, 3});
+// allocate memory on CPU later
+t.mutable_data(place);
+```
+
+### paddle::framework::Tensor Usage
+`AddOp` demonstrates Tensor's usage.
+
+- InferShape
+
+When running a neural network's compute graph, every `Operator`'s `InferShape` method is called first; it uses `Resize` to set the sizes of the output tensors from the sizes of the input tensors.
+
+```cpp
+void InferShape(const framework::InferShapeContext &ctx) const override {
+  PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims(),
+                    ctx.Input<Tensor>("Y")->dims(),
+                    "Two input of Add Op's dimension must be same.");
+  ctx.Output<Tensor>("Out")->Resize(ctx.Input<Tensor>("X")->dims());
+}
+```
+
+
+- Run
+
+The `Operator`'s `Run` interface eventually calls the `Compute` method of the corresponding `OpKernel`; this is where `mutable_data` actually allocates the memory.
+
+```cpp
+void Compute(const framework::ExecutionContext& context) const override {
+  auto* input0 = context.Input<Tensor>("X");
+  auto* input1 = context.Input<Tensor>("Y");
+  auto* output = context.Output<Tensor>("Out");
+
+  output->mutable_data<T>(context.GetPlace());
+
+  auto x = EigenVector<T>::Flatten(*input0);
+  auto y = EigenVector<T>::Flatten(*input1);
+  auto z = EigenVector<T>::Flatten(*output);
+
+  auto place = context.GetEigenDevice<Place>();
+
+  z.device(place) = x + y;
+}
+```
+
+
+### Converting paddle::framework::Tensor to EigenTensor
+
+As shown above, in the actual computation we need to transform the input and output `Tensor`s into formats Eigen supports. [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) provides a set of functions to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+
+Using EigenTensor as an example:
+
+```cpp
+Tensor t;
+float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+for (int i = 0; i < 1 * 2 * 3; i++) {
+  p[i] = static_cast<float>(i);
+}
+
+EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+```
+
+`From` is an interfacing method provided by the EigenTensor template, which implements the transformation from a `paddle::framework::Tensor` object to an EigenTensor. Since `rank` is a template parameter, it needs to be explicitly specified at the time of the transformation.
+
+In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector<T>::From` transforms a 1-dimensional Paddle tensor into a 1-dimensional Eigen tensor, while `EigenVector<T>::Flatten` reshapes a Paddle tensor of any rank and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector. A short sketch follows.
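+
+As a minimal sketch (reusing `t` from the example above and the helpers declared in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)):
+
+```cpp
+// t has dims {1, 2, 3} and holds the values 0..5; Flatten views all
+// 6 elements as a rank-1 Eigen tensor without changing t's own shape.
+auto flat = EigenVector<float>::Flatten(t);
+// flat(4) == 4.0f, while t still reports dims() == {1, 2, 3}.
+```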
+
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
+
+
+
+### Implementing Computation
+
+While computing, the `device` interface must be called on the EigenTensor on the left-hand side of the assignment. Note that the computation between EigenTensors only changes the data originally in the Tensors and does not change the shape information associated with them.
+
+```cpp
+auto x = EigenVector<T>::Flatten(*input0);
+auto y = EigenVector<T>::Flatten(*input1);
+auto z = EigenVector<T>::Flatten(*output);
+auto place = context.GetEigenDevice<Place>();
+z.device(place) = x + y;
+```
+
+In this code segment, input0/input1/output can be Tensors of arbitrary dimension. We call Flatten from EigenVector, viewing a tensor of any dimension as a 1-dimensional EigenVector. After the computation completes, input0/input1/output retain their original shape information; to change a tensor's shape, use the `Resize` interface.
+
+Because the Eigen Tensor module is under-documented, the computation code of the relevant `OpKernel`s in TensorFlow's [kernels](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/core/kernels) module is a useful reference.
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index d536f53abc..1bc947c260 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,75 +3,108 @@
 ##################
 
 PaddlePaddle's documentation consists of an English part ``doc`` and a Chinese part ``doc_cn``. Both are generated by `sphinx`_ driven through `cmake`_, and the output is stored under the ``doc`` and ``doc_cn`` subdirectories of the build directory.
+The documentation can also be built with the PaddlePaddle.org tool, in which case all files are stored under a consolidated directory, .ppo_workspace/content.
 
+How to Build the Documentation
+==============================
 
-How to Build PaddlePaddle's Documentation
-=========================================
+There are three ways to build PaddlePaddle's documentation.
 
-PaddlePaddle's documentation can be built either directly or with Docker, and we provide a build script, build_docs.sh, for this.
-Since preparing the environment for building the documentation is relatively involved, we recommend building it with Docker.
 
+Using the PaddlePaddle.org Tool
+-------------------------------
+This is currently the recommended approach. Besides building the documentation automatically, it also lets you preview it directly in a browser.
 
-Building PaddlePaddle's Documentation with Docker
--------------------------------------------------
-
-To build PaddlePaddle's documentation with Docker, the Docker toolkit must be installed first; please refer to `Docker's official website <https://docs.docker.com/>`_ . Once Docker is installed, the documentation can be built with the script in the source tree:
+The tool runs in Docker, so the Docker toolkit must be installed on your system first; please refer to Docker's official website for installation instructions. Once Docker is installed, the tool can be started with the following commands
 
 ..  code-block:: bash
 
-    cd TO_YOUR_PADDLE_CLONE_PATH
-    cd paddle/scripts/tools/build_docs
-    bash build_docs.sh with_docker
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
 
-After the build finishes, two subdirectories are generated under the current directory:
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
 
-* doc, the English documentation
-* doc_cn, the Chinese documentation
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
 
-Open index.html under the corresponding directory in a browser to view the local documentation.
+Note: PaddlePaddle.org runs against the content repositories specified by the -v (volume) flag of the docker run command.
+Then point a browser at http://localhost:8000 to build the documentation you need from the web page.
+The built files are stored under <paddlepaddle working directory>/.ppo_workspace/content.
 
+If you do not want to use Docker, you can also start the tool's server directly through the Django framework, using the following commands.
 
+..  code-block:: bash
 
-Building PaddlePaddle's Documentation Directly
-----------------------------------------------
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
 
-Because generating PaddlePaddle's v2 API documentation depends on the py_paddle Python package, first make sure that py_paddle is installed.
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
 
-..  code-block:: bash
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
 
-    python -c "import py_paddle"
+The tool's server reads the environment variable CONTENT_DIR to locate the content repositories, so please point CONTENT_DIR at the PaddlePaddle working directory.
+Then point a browser at http://localhost:8000 to build the documentation you need from the web page.
+The built files are stored under <paddlepaddle working directory>/.ppo_workspace/content.
 
-If this reports an error, you need to build and install PaddlePaddle locally first; please refer to the `build-from-source guide <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ .
-Note that the WITH_DOC option should be turned off the first time you build and install PaddlePaddle. Once the build and installation succeed, confirm again that py_paddle is installed before proceeding.
+For more details about the PaddlePaddle.org tool, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ .
 
-If the check passes, run the following commands to build the documentation:
+Building with Docker
+--------------------
+
+To build PaddlePaddle's documentation with Docker, the Docker toolkit must be installed on your system first; please refer to `Docker's official website <https://docs.docker.com/>`_ . Once Docker is installed, the documentation can be built with the script in the source tree:
 
 ..  code-block:: bash
 
     cd TO_YOUR_PADDLE_CLONE_PATH
     cd paddle/scripts/tools/build_docs
-    bash build_docs.sh local
+    sh build_docs.sh
 
-After the build finishes, two subdirectories are generated under the current directory:
+After the build finishes, two subdirectories are generated under the current directory: doc (the English documentation) and doc_cn (the Chinese documentation).
+Open index.html under the corresponding directory in a browser to view the local documentation.
 
-* doc, the English documentation
-* doc_cn, the Chinese documentation
+Building Directly
+-----------------
 
+Run the following commands to build the documentation:
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    mkdir -p build
+    cd build
+    cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+    make gen_proto_py
+    make paddle_docs paddle_docs_cn
+
+After the build finishes, two subdirectories are generated under the current directory: doc (the English documentation) and doc_cn (the Chinese documentation).
 Open index.html under the corresponding directory in a browser to view the local documentation.
 
 
-How to Write PaddlePaddle's Documentation
-=========================================
+How to Write Documentation
+==========================
 
 PaddlePaddle's documentation is generated automatically with `sphinx`_ ; please refer to the sphinx tutorial when writing it.
 
-How to Update the www.paddlepaddle.org Documentation
-====================================================
-
-Comments that developers add to the PaddlePaddle code are submitted to GitHub as PRs; see the `contribution guide <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ for how to submit them.
-The documentation for PaddlePaddle's develop branch is updated automatically; you can view the latest `Chinese documentation <http://www.paddlepaddle.org/develop/doc_cn/>`_ and
-`English documentation <http://www.paddlepaddle.org/develop/doc/>`_ .
+How to Update www.paddlepaddle.org
+==================================
 
+Documentation updates are submitted to GitHub as PRs; see the `contribution guide <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ for how to submit them.
+The documentation for PaddlePaddle's develop branch is updated automatically; you can view the latest `Chinese documentation <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English documentation <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ .
 
 
 ..  _cmake: https://cmake.org/
diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst
new file mode 100644
index 0000000000..b3ef07eb1d
--- /dev/null
+++ b/doc/howto/dev/write_docs_en.rst
@@ -0,0 +1,80 @@
+########################
+Contribute Documentation
+########################
+
+PaddlePaddle has English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled with `cmake`_ and `sphinx`_ , and the compiled documentation is stored under the ``doc`` and ``doc_cn`` directories.
+When the PaddlePaddle.org tool is used to compile the documentation, the compiled files are stored under a consolidated directory: .ppo_workspace/content
+
+How to Build the Documentation
+==============================
+
+We recommend using the PaddlePaddle.org tool to build the documentation.
+
+
+Use the PaddlePaddle.org Tool
+-----------------------------
+This is the recommended way to build the documentation. It compiles the documentation and lets you preview it in a web browser.
+
+The tool runs in Docker, so please install Docker on your system first; check Docker's official website for how to install it. You can then start the tool with the following commands:
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+
+Note: PaddlePaddle.org reads the content repositories specified by the -v (volume) flag of the docker run command.
+Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation.
+The compiled documentation will be stored under <paddlepaddle working directory>/.ppo_workspace/content.
+
+
+If you don't wish to use Docker, you can also start the tool's server directly through Django. Use the following commands to set it up:
+
+..  code-block:: bash
+
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+
+Use a web browser to navigate to http://localhost:8000 and click the buttons to compile the documentation.
+The compiled documentation will be stored under <paddlepaddle working directory>/.ppo_workspace/content.
+
+If you want to learn more about PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ .
+
+How to Write Documentation
+==========================
+
+PaddlePaddle uses `sphinx`_ to compile its documentation; please check the sphinx official website for more details.
+
+
+How to Update www.paddlepaddle.org
+==================================
+
+Please create PRs and submit them to GitHub; see `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ .
+The documentation for PaddlePaddle's develop branch is updated once the PR is merged. You may check the latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ .
+
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 26449a6365..e0c69f7a6a 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,9 +9,7 @@
 
   usage/cmd_parameter/index_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/k8s/k8s_basis_cn.md
-  usage/k8s/k8s_cn.md
-  usage/k8s/k8s_distributed_cn.md
+  usage/capi/index_cn.rst
 
 Development Standards
 ---------------------
@@ -19,8 +17,8 @@
 ..  toctree::
   :maxdepth: 1
 
-  dev/write_docs_cn.rst
   dev/contribute_to_paddle_cn.md
+  dev/write_docs_cn.rst
 
 Model Configuration
 -------------------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1fbfcd260b..6d1bf7dfc0 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -9,8 +9,6 @@ Usage
 
   usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
-  usage/k8s/k8s_en.md
-  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
@@ -20,6 +18,7 @@ Development
 
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
+  dev/write_docs_en.rst
 
 Configuration
 -------------
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
new file mode 100644
index 0000000000..368af40cc7
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -0,0 +1,196 @@
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google's `perftools`.
+
+Profiling is the process that reveals performance bottlenecks,
+which can be very different from what developers expect.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+alternates between profiling and tuning.
+
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so`, written in C++.  In this tutorial, we focus on
+the profiling and tuning of
+
+1. the Python code and
+1. the mixture of Python and C++ code.
+
+## Profiling the Python Code
+
+### Generate the Performance Profiling File
+
+We can use the Python standard library
+module [`cProfile`](https://docs.python.org/2/library/profile.html)
+to generate a Python profiling file.  For example:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+where `main.py` is the program we are going to profile and `-o` specifies
+the output file.  Without `-o`, `cProfile` would write to standard
+output.
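+
+The profiling file can also be inspected without extra tools. As a
+minimal sketch using only the standard library (assuming `profile.out`
+was generated as above):
+
+```python
+# Print the 10 most expensive functions, sorted by tottime.
+import pstats
+
+stats = pstats.Stats("profile.out")
+stats.sort_stats("tottime").print_stats(10)
+```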
+
+### Look into the Profiling File
+
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
+
+Open a Web browser, point it to the local IP and the specified
+port, and we will see output like the following:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+where each line corresponds to a Python function, and the meaning of
+each column is as follows:
+
+| column | meaning |
+| --- | --- |
+| ncalls | the number of calls into a function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
+| percall | tottime divided by ncalls |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
+
+### Identify Performance Bottlenecks
+
+Usually, `tottime` and the related `percall` time are what we want to
+focus on. We can sort the above profiling file by tottime:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  At this
+moment, let's look into the function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
+
+```
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+The list of callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+
+## Profiling Python and C++ Code
+
+### Generate the Profiling File
+
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
+
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+Then we can run the following command
+
+```bash
+python -m yep -v main.py
+```
+
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By examining
+the printed result, we can tell whether debug information was
+stripped from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
+
+1. Use the GCC command line option `-g` when building `libpaddle.so` so as to
+   include the debug information.  The standard build system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
+
+1. Use the GCC command line option `-O2` or `-O3` to generate optimized
+   binary code. It doesn't make sense to profile `libpaddle.so`
+   without optimization, because it would run slowly anyway.
+
+1. Profile the single-threaded binary before the
+   multi-threaded version, because the latter often generates a tangled
+   profiling analysis result.  You might want to set the environment
+   variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
+   starting multiple threads.
+
+### Examining the Profiling File
+
+The tool we used to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
+
+```bash
+go get github.com/google/pprof
+```
+
+Then we can use it to profile `main.py.prof` generated in the previous
+section:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
+the following:
+
+![result](./pprof_1.png)
+
+### Identifying the Performance Bottlenecks
+
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
+
+![kernel_perf](./pprof_2.png)
+
+We can see that the execution time of multiplication and the computation
+of the gradient of multiplication take 2% to 4% of the total running
+time, while `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
+
+`pprof` marks the performance-critical parts of the program in
+red. It's a good idea to follow these hints.
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md
new file mode 100644
index 0000000000..14eba0e2f3
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling_cn.md
@@ -0,0 +1,155 @@
+This tutorial introduces how to profile and tune performance with Python's cProfile package, the Python library yep, and Google perftools.
+
+Profiling means locating performance bottlenecks, which can be very different from what developers imagine during development. Tuning means removing those bottlenecks. Performance optimization usually repeats profiling and tuning over and over.
+
+PaddlePaddle users generally write deep learning programs by calling the Python API. Most of the Python API calls go into libpaddle.so, which is written in C++. PaddlePaddle's profiling and tuning therefore consists of two parts:
+
+* profiling the Python code
+* profiling mixed Python and C++ code
+
+
+## Profiling the Python Code
+
+### Generating the Profiling File
+
+The Python standard library provides a profiling toolkit, [cProfile](https://docs.python.org/2/library/profile.html). The command to generate a Python profile is:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+Here `main.py` is the program to analyze, and `-o` specifies the output file that stores the profiling result.  If this file is not specified, `cProfile` prints the result to the standard output.
+
+### Examining the Profiling File
+
+`cProfile` writes `profile.out` after `main.py` finishes.  We can use [`cprofilev`](https://github.com/ymichael/cprofilev), a third-party Python library, to view the profiling result.  It starts an HTTP service that renders the result as a Web page:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+Here `-a` specifies the IP address the HTTP service binds to; `0.0.0.0` allows access from outside.  `-p` specifies the port of the HTTP service, `-f` the profiling result file, and `main.py` the profiled source file.
+
+Open the corresponding URL in a Web browser to see the profiling result:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+The meaning of each column:
+
+| Column | Meaning |
+| --- | --- |
+| ncalls | the number of calls to the function |
+| tottime | the total time spent in the function itself, excluding the time spent in the functions it calls |
+| percall | tottime divided by ncalls |
+| cumtime | the cumulative time of the function, including the time spent in the functions it calls |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | file name, line number, and function name |
+
+
+### Identifying the Performance Bottlenecks
+
+Usually, `tottime` and `cumtime` are the key metrics for locating bottlenecks, since they represent the real running time of a function.
+
+Sorting the profiling result by tottime gives:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+
+We can see that the most time-consuming function is the C++-side `run`, which must be tuned together with the mixed Python/C++ profiling described in the second section.  The `sync_with_cpp` function also has a long total time and a long per-call time, so we can click the details of `sync_with_cpp` to inspect its call relations.
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+By inspecting the call relations among the hot functions and the corresponding lines of code, we can usually figure out where the problematic code is.  After applying a fix, we profile again to check whether the fix actually improves the program's performance.
+
+
+
+## Profiling the Mixed Python and C++ Code
+
+### Generate the Profiling File
+
+There are many C++ profilers, including `gprof`, `valgrind`, and `google-perftools`.  However, profiling a dynamic library loaded from Python is much more complex than profiling a plain binary.  Fortunately, the third-party Python library `yep` provides a convenient way to work with `google-perftools`, so we use `yep` to profile the mixed Python and C++ code.
+
+Before using `yep`, install the `google-perftools` and `yep` packages.  On Ubuntu:
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+After the installation, we can run
+
+```bash
+python -m yep -v main.py
+```
+
+to generate the profiling file, which is named `main.py.prof`.
+
+The `-v` option prints the analysis result on the command line after the profiling file is generated, so we can take a quick look at the output.  Unlike Python, C++ code may be compiled with the debug information stripped, and multi-threaded execution may produce tangled, unreadable profiling results.  To get more readable results, take the following measures:
+
+1. Compile with `-g` to generate the debug information.  With CMake, set `CMAKE_BUILD_TYPE` to `RelWithDebInfo`.
+2. Always compile with optimization enabled.  A plain `Debug` build performs very differently from `-O2` or `-O3`, so profiling a `Debug` build is meaningless.
+3. Start profiling with a single thread before moving to multiple threads, and then to multiple machines, since single-threaded runs are much easier to debug.  You can set the environment variable `OMP_NUM_THREADS=1` to disable OpenMP parallelism.
+
+### Examining the Profiling File
+
+Profiling produces a result file that we can display with [`pprof`](https://github.com/google/pprof).  Note that we use the `pprof` rewritten in `Go`, because it provides a Web-based UI with better visualization.
+
+`pprof` is installed like any other `Go` program:
+
+```bash
+go get github.com/google/pprof
+```
+
+Then we can start an HTTP service with the following command:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+In this command, `-http` starts the HTTP service.  `which python` produces the full path of the current Python binary, thereby specifying the Python executable.  `./main.py.prof` is the profiling result taken as input.
+
+Open the corresponding URL to view the profiling result, as shown in the figure below:
+
+![result](./pprof_1.png)
+
+
+### Identifying the Performance Bottlenecks
+
+As with profiling pure Python code, finding the bottlenecks in mixed Python and C++ code comes down to `tottime` and `cumtime`.  The call graph displayed by `pprof` also helps us discover performance problems.
+
+For example, in the figure below,
+
+![kernel_perf](./pprof_2.png)
+
+in one training run, the multiplication and the computation of its gradient take about 2% to 4% of the computing time, while `MomentumOp` takes about 17%.  Obviously, `MomentumOp` has a performance problem.
+
+`pprof` marks the performance-critical paths in red.  Checking the critical paths first and the other parts later makes the optimization more orderly.
diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png
new file mode 100644
index 0000000000..8e9edbf377
Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ
diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png
new file mode 100644
index 0000000000..172ba20399
Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
new file mode 100644
index 0000000000..31987920f3
--- /dev/null
+++ b/doc/howto/read_source.md
@@ -0,0 +1,67 @@
+# PaddlePaddle Fluid Source Code Overview
+
+Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book
+
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+
+# Compile Time
+
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+
+```python
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+```
+
+- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
+  - Every Layer has one or more operators and variables/parameters
+    - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other files worth looking at:
+      - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
+      - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
+      - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+- Optimizer: `fluid.optimizer.SGD`. It does the following
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
+
+# Run Time
+
+The following **evaluates** the NN. It instantiates all the variables and operators.
+
+```python
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+# Allocate memory. Initialize Parameter.
+exe.run(fluid.default_startup_program())
+
+# Allocate memory. Do computation.
+exe.run(fluid.default_main_program(),
+        feed=feeder.feed(data),
+        fetch_list=[avg_cost])
+```
+
+- Place: `place`. One of CPU, GPU, or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
+  - The device handles are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
+- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
+  - Feeds the data: `feed=feeder.feed(data)`
+  - Evaluates all the operators
+  - Fetches the result: `fetch_list=[avg_cost]`
+- Other files worth looking at:
+  - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
+    - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
+      - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md
new file mode 100644
index 0000000000..ac5ecffe2e
--- /dev/null
+++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md
@@ -0,0 +1,122 @@
+## Building the PaddlePaddle Inference Library
+
+### Overview
+
+Inference with the C-API depends on building the PaddlePaddle core into a library; this only requires configuring the following build options:
+
+Required options:
+- `WITH_C_API`: must be set to `ON`.
+
+Recommended options:
+- `WITH_PYTHON`: recommended `OFF`
+- `WITH_SWIG_PY`: recommended `OFF`
+- `WITH_GOLANG`: recommended `OFF`
+
+Optional options:
+- `WITH_GPU`: `ON` or `OFF`
+- `WITH_MKL`: `ON` or `OFF`
+
+We suggest following the recommended settings above to avoid linking unnecessary libraries.  The other optional build options can be set as needed.
+
+The following snippet pulls the latest code from GitHub and configures the build options (replace `PADDLE_ROOT` with the installation path of the PaddlePaddle inference library):
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+After the above commands generate the Makefile, run `make && make install`.  On success, everything the C-API depends on, including (1) the built PaddlePaddle inference libraries and headers and (2) the third-party libraries and headers, is placed under the `PADDLE_ROOT` directory.
+
+After a successful build, `PADDLE_ROOT` contains the following directory structure (the built PaddlePaddle headers and libraries, plus the third-party libraries and headers required by the chosen linking method):
+
+```text
+├── include
+│   └── paddle
+│       ├── arguments.h
+│       ├── capi.h
+│       ├── capi_private.h
+│       ├── config.h
+│       ├── error.h
+│       ├── gradient_machine.h
+│       ├── main.h
+│       ├── matrix.h
+│       ├── paddle_capi.map
+│       └── vector.h
+├── lib
+│   ├── libpaddle_capi_engine.a
+│   ├── libpaddle_capi_layers.a
+│   ├── libpaddle_capi_shared.so
+│   └── libpaddle_capi_whole.a
+└── third_party
+    ├── gflags
+    │   ├── include
+    │   │   └── gflags
+    │   │       ├── gflags_completions.h
+    │   │       ├── gflags_declare.h
+    │   │       ...
+    │   └── lib
+    │       └── libgflags.a
+    ├── glog
+    │   ├── include
+    │   │   └── glog
+    │   │       ├── config.h
+    │   │       ...
+    │   └── lib
+    │       └── libglog.a
+    ├── openblas
+    │   ├── include
+    │   │   ├── cblas.h
+    │   │   ...
+    │   └── lib
+    │       ...
+    ├── protobuf
+    │   ├── include
+    │   │   └── google
+    │   │       └── protobuf
+    │   │           ...
+    │   └── lib
+    │       └── libprotobuf-lite.a
+    └── zlib
+        ├── include
+        │   ...
+        └── lib
+            ...
+
+```
+
+### Linking Notes
+
+Three linking methods are currently provided:
+
+1. Linking the shared library `libpaddle_capi_shared.so`
+    - When linking `libpaddle_capi_shared.so` into an inference program developed with the PaddlePaddle C-API, note:
+        1. If PaddlePaddle was built for CPU with the `OpenBLAS` math library, the inference program only needs to link this one library, `libpaddle_capi_shared.so`.
+        1. If PaddlePaddle was built for CPU with the `MKL` math library, MKL ships its own shared libraries, so the inference program must link the MKL libraries itself.
+        1. If PaddlePaddle was built for GPU, the CUDA libraries are loaded dynamically at run time; add the CUDA library paths to the `LD_LIBRARY_PATH` environment variable.
+    - This method is the simplest and the easiest to link.  **In the absence of special requirements, it is the recommended method.**
+
+2. Linking the static library `libpaddle_capi_whole.a`
+    - When linking `libpaddle_capi_whole.a` into an inference program developed with the PaddlePaddle C-API, note:
+        1. The `-Wl,--whole-archive` linker option is required.
+        1. Third-party libraries such as `gflags`, `glog`, `libz`, and `protobuf` must be linked explicitly; they can be found under `PADDLE_ROOT/third_party`.
+        1. If the C-API was built with the OpenBLAS math library, `libopenblas.a` must be linked explicitly.
+        1. If the C-API was built with the MKL math library, the MKL shared libraries must be linked explicitly.
+
+3. Linking the static libraries `libpaddle_capi_layers.a` and `libpaddle_capi_engine.a`
+    - When linking these two libraries into an inference program developed with the PaddlePaddle C-API, note:
+        1. This method is mainly intended for inference on mobile devices.
+        1. To reduce the size of the linked result, `libpaddle_capi_whole.a` is split into these two static libraries.
+        1. Link with `-Wl,--whole-archive -lpaddle_capi_layers` and `-Wl,--no-whole-archive -lpaddle_capi_engine`.
+        1. The third-party dependencies must be linked explicitly in the same way as in method 2.
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/usage/capi/images/csr.png
new file mode 100644
index 0000000000..3dc10b8de4
Binary files /dev/null and b/doc/howto/usage/capi/images/csr.png differ
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/usage/capi/images/sequence_data.png
new file mode 100644
index 0000000000..6e47a46b89
Binary files /dev/null and b/doc/howto/usage/capi/images/sequence_data.png differ
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/usage/capi/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000..a4399ade04
Binary files /dev/null and b/doc/howto/usage/capi/images/workflow_of_CAPI.png differ
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst
new file mode 100644
index 0000000000..fd774fbc74
--- /dev/null
+++ b/doc/howto/usage/capi/index_cn.rst
@@ -0,0 +1,9 @@
+PaddlePaddle C-API
+==================
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_cn.md
+  organization_of_the_inputs_cn.md
+  workflow_of_capi_cn.md
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
new file mode 100644
index 0000000000..a889ae4ffa
--- /dev/null
+++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
@@ -0,0 +1,285 @@
+## Organizing Input/Output Data
+
+This document describes how to organize the input data when using the PaddlePaddle C-API, and how to parse the output of the neural network's forward computation.
+
+### Input/Output Data Types
+In the C-API, following how the basic data types are defined and implemented inside PaddlePaddle, the input data falls into:
+1. one-dimensional integer arrays
+1. two-dimensional floating-point matrices
+    - dense matrices
+    - sparse matrices
+
+Notes:
+1. One-dimensional arrays hold **integer values only**;
+    - they are commonly used in natural language processing tasks, e.g., to represent the index of a word in a dictionary;
+    - or as the class labels in classification tasks.
+1. Data that is logically more than two-dimensional (e.g., images with multiple channels, videos) is converted to a two-dimensional matrix in the program.  The conversion methods are standard in each domain; users need to understand and perform the conversion themselves (see the sketch after this list).
+1. A two-dimensional matrix can represent both row vectors and column vectors.  Whenever a floating-point array (vector) is needed, use a C-API matrix rather than a C-API one-dimensional array.
+1. For both one-dimensional integer arrays and two-dimensional floating-point matrices, **attaching sequence information turns them into sequence inputs.  PaddlePaddle judges whether a vector/matrix is a sequence by whether it carries sequence information.**  For non-sequence inputs, there is no need to handle sequence information.  What "sequence information" means is explained in detail below.
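+
+For instance, the following is a minimal sketch of flattening a 3-channel image into one matrix row.  The buffer `image`, its channel-major (CHW) layout, and the sizes `img_height` and `img_width` are hypothetical names for illustration; `paddle_matrix_create`, `paddle_matrix_get_row`, and `paddle_arguments_set_value` are the C-API calls shown elsewhere in this document.
+
+```c
+// Flatten a 3 x img_height x img_width float image into the single row
+// of a 1 x (3 * img_height * img_width) matrix: the row stores all the
+// pixels of channel 0, then channel 1, then channel 2.
+uint64_t width = 3 * img_height * img_width;
+paddle_matrix mat = paddle_matrix_create(
+    /* height = batch size */ 1, /* width */ width, /* useGpu */ false);
+
+paddle_real* row;
+CHECK(paddle_matrix_get_row(mat, 0, &row));
+for (uint64_t i = 0; i < width; ++i) {
+  row[i] = image[i];  // `image` is assumed to be CHW-contiguous already.
+}
+CHECK(paddle_arguments_set_value(in_args, 0, mat));
+```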
+
+### Basic Concepts
+
+- Inside PaddlePaddle, the input/output of a computation layer in the neural network is organized as an `Argument` struct.  If the network has multiple inputs or outputs, each input/output has its own `Argument`.
+- An `Argument` does not actually "store" the data; rather, it organizes the input/output information coherently.
+- Inside an `Argument`, the data is actually stored by an `IVector` (the one-dimensional integer array mentioned above) and a `Matrix` (the two-dimensional floating-point matrix mentioned above), while the `Sequence Start Positions` (explained in detail below) describe the sequence information of the input/output.
+
+- **Notes:**
+    1. In the rest of this document, `argument` refers specifically to one input/output of a neural network computation layer in PaddlePaddle.
+    1. `paddle_ivector` refers specifically to PaddlePaddle's one-dimensional integer array.
+    1. `paddle_matrix` refers specifically to PaddlePaddle's two-dimensional floating-point matrix.
+
+### Organizing the Input Data
+- One-dimensional integer arrays
+
+    Conceptually, a `paddle_ivector` is a one-dimensional integer array, typically used to represent discrete class labels, or the indices of words in a dictionary in natural language processing tasks.  The snippet below creates a `paddle_ivector` containing the three elements `1`, `2`, and `3`.
+    ```c
+    int ids[] = {1, 2, 3};
+    paddle_ivector ids_array =
+        paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
+    CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));
+    ```
+
+- **Dense matrices**
+    - An `m×n` dense matrix is a rectangular array of `m` rows and `n` columns of floating-point elements.  For a neural network, the height `m` is the number of samples handled in one inference call, and the width `n` is the `size` of the `paddle.layer.data` in the network definition.
+    - The snippet below creates a dense matrix of height 1 and width `layer_size`, with randomly generated element values.
+
+    ```c
+    paddle_matrix mat = paddle_matrix_create(
+                            /* height = batch size */ 1,
+                            /* width = dimensionality of the data layer */ layer_size,
+                            /* whether to use GPU */ false);
+
+    paddle_real* array;
+    // Get the pointer pointing to the start address of the first row of the
+    // created matrix.
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+    // Fill the matrix with a randomly generated test sample.
+    srand(time(0));
+    for (int i = 0; i < layer_size; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    // Assign the matrix to the argument.
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+    ```
+
+- **Sparse matrices**
+
+  The PaddlePaddle C-API stores sparse matrices in the [CSR (Compressed Sparse Row) format](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)).  The figure below illustrates CSR storage.
+  <p align="center">
+  <img src="https://user-images.githubusercontent.com/5842774/34159369-009fd328-e504-11e7-9e08-36bc6dc5e505.png" width=700><br> Figure 1. CSR storage of a sparse matrix
+  </p>
+
+  The CSR format determines the content of a sparse matrix via: (1) the values of the non-zero elements (`values` in the figure); (2) the row offsets (`row offsets` in the figure), i.e., the starting offset of each row's elements in `values`, where the number of entries in `row offsets` always equals the number of rows plus 1; and (3) the column indices of the non-zero elements (`column indices` in the figure).
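+
+  As a concrete example (with invented element values), a matrix and its CSR arrays relate as follows:
+
+  ```c
+  /* The 2 x 4 matrix          CSR representation (illustrative values):
+   *   [ 0  5  0  2 ]          values         = {5, 2, 7}
+   *   [ 7  0  0  0 ]          column indices = {1, 3, 0}
+   *                           row offsets    = {0, 2, 3}  // #rows + 1 entries
+   * Row 0's non-zeros occupy values[0..1]; row 1's occupy values[2..2].
+   */
+  ```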
+
+  In the PaddlePaddle C-API, a sparse matrix is created with the following interface:
+
+  ```c
+  PD_API paddle_matrix paddle_matrix_create_sparse(
+      uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+  ```
+
+  1. When creating a sparse matrix, you must explicitly specify (1) the height (`height`, the number of samples handled in one inference call), (2) the width (`width`, the `size` of `paddle.layer.data`), and (3) the number of non-zero elements (`nnz`).
+  1. When the fourth parameter `isBinary` is `true`, **only the row offsets (`row_offset`) and the column indices (`column indices`) need to be set; the element values (`values`) are not provided**, and the elements specified by the row offsets and column indices default to the value 1.
+
+  The snippet below creates a binary sparse matrix on the CPU:
+
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 NULL /*values array is NULL.*/,
+                                 0 /*size of the values array is 0.*/));
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+  ```
+  The snippet below creates a sparse matrix with element values on the CPU:
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+  float values[] = {0.5, 0.5, 0.5};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 values,
+                                 sizeof(values) / sizeof(float)));
+  ```
+  Note:
+  1. Mobile inference does **not** support sparse matrices or the related interfaces.
+
+### Organizing the Sequence Information
+
+A sequence consists of elements (integers, floating-point numbers, floating-point vectors, etc.) arranged in a row; the order among the elements is important information carried by the sequence, and different sequences may contain different numbers of elements.  In PaddlePaddle, sequence input/output data is the **data input introduced above (one-dimensional integer arrays, two-dimensional floating-point matrices) plus sequence information**.  The following explains in detail what "sequence information" is.
+
+We call all the input samples accepted by one computation of the neural network a `batch` (which may contain one or more samples).  The offset of each sequence within the whole `batch` is what PaddlePaddle calls the **sequence information**, also referred to as the "sequence start positions".  PaddlePaddle supports two kinds of sequences:
+
+1. single-level sequences
+    - Each element of the sequence is a non-sequence; it is the basic unit of computation and cannot be split further.
+    - For example, a sentence in natural language is a sequence whose elements are words.
+1. nested (two-level) sequences
+    - Each element of the sequence is itself a sequence.
+    - For example, a paragraph in natural language is a nested sequence: a paragraph is a sequence of sentences, and a sentence is a sequence of words.
+    - Nested sequences are useful for tasks with long sequences or for building hierarchical models.
+
+In the rest of this document, `sequence_start_positions` refers specifically to the sequence information carried by the input/output of a neural network computation layer in PaddlePaddle.
+
+For a nested sequence, besides the offset of each outer sequence within the whole `batch`, each outer sequence contains several inner sequences, and the offset of each inner sequence within the whole `batch` must also be provided.  In other words, **a nested sequence requires setting `sequence_start_positions` separately for the outer sequences and for the inner sequences**.
+
+**Notes:**
+1. No matter how much memory a sequence element actually occupies, `sequence_start_positions` counts offsets in units of "one element of the sequence", not as byte offsets relative to the start address of the `batch`.
+1. Non-sequence inputs do not carry `sequence_start_positions`, so there is no need to construct `sequence_start_positions` for them.
+1. **The sequence information of both single-level and nested sequences is stored in a `paddle_ivector` (PaddlePaddle's one-dimensional integer array).**
+
+Figure 2 illustrates how single-level and nested sequences are stored in PaddlePaddle.
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34159714-1f81a9be-e505-11e7-8a8a-4902146ec899.png" width=800><br>Figure 2. Sequence inputs
+</p>
+
+- Single-level sequences
+
+    Figure 2 (a) shows a `batch` input containing 4 sequences:
+    1. the lengths of the 4 sequences are 5, 3, 2, and 4;
+    1. the `sequence_start_positions` is then `[0, 5, 8, 10, 14]`;
+    1. whether the data field is of type `paddle_ivector` or `paddle_matrix`, calling the interface below attaches sequence information to the existing data input and turns it into a single-level sequence input:
+
+    ```c
+    int seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector seq_pos = paddle_ivector_create(
+        seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+    // Suppose the network only has one input data layer.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+    ```
+
+- Nested sequences
+
+    Figure 2 (b) shows a `batch` input containing 4 sequences:
+    1. the lengths of the 4 sequences are 5, 3, 2, and 4, and they contain 3, 2, 1, and 2 subsequences, respectively;
+    1. both of the following must be provided:
+        - the start offsets of the outer sequences within the `batch`: `[0, 5, 8, 10, 14]`;
+        - the start offsets of the inner sequences within the `batch`: `[0, 2, 3, 5, 7, 8, 10, 13, 14]`;
+    1. whether the data field is of type `paddle_ivector` or `paddle_matrix`, the interfaces for creating sequence information and attaching it to an `argument` must be called **twice**, attaching the outer-sequence and the inner-sequence information respectively, which turns the input into a nested sequence input:
+    ```c
+    // Set the sequence start positions for the outer sequences.
+    int outer_seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector outer_seq_pos =
+        paddle_ivector_create(outer_seq_pos_array,
+                              sizeof(outer_seq_pos_array) / sizeof(int),
+                              false,
+                              false);
+    // The third parameter of this API indicates the sequence level.
+    // 0 for the outer sequence, 1 for the inner sequence.
+    // If the input is a plain (not nested) sequence, the third parameter is
+    // fixed to 0.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, outer_seq_pos));
+
+    // Set the sequence start positions for the inner sequences.
+    int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14};
+    paddle_ivector inner_seq_pos = paddle_ivector_create(
+        inner_seq_pos_array, sizeof(inner_seq_pos_array) / sizeof(int), false, false);
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, inner_seq_pos));
+    ```
+
+Note:
+1. When a `batch` contains multiple sequences, **sequences of length `0` (i.e., empty inputs) are not supported**.  Different computation layers may handle empty inputs differently and may cause undefined behavior or runtime errors, so please validate the inputs beforehand.  A simple check is sketched below.
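+
+The following sketch (plain C, not a C-API interface) checks that an offset array contains no empty sequence: the offsets must start at 0 and be strictly increasing, because a repeated offset would denote a zero-length sequence.
+
+```c
+#include <assert.h>
+
+/* seq_pos holds len offsets describing len - 1 sequences. */
+void check_seq_pos(const int* seq_pos, int len) {
+  assert(len >= 2);
+  assert(seq_pos[0] == 0);
+  for (int i = 1; i < len; ++i) {
+    /* seq_pos[i] == seq_pos[i - 1] would denote an empty sequence. */
+    assert(seq_pos[i] > seq_pos[i - 1]);
+  }
+}
+```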
+
+### Python-Side Data Types
+
+The table below lists the data types exposed by the Python training interface (the values of the `type` field of `paddle.layer.data`) and the corresponding data types to create when calling the C-API:
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th style="text-align:left">Python-side data type</th>
+<th style="text-align:left">C-API input data type</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value</td>
+<td style="text-align:left">integer array; no sequence information attached</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector</td>
+<td style="text-align:left">dense floating-point matrix; no sequence information attached</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values need not be provided and default to 1; no sequence information attached</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values must be provided; no sequence information attached</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sequence</td>
+<td style="text-align:left">integer array; sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sequence</td>
+<td style="text-align:left">dense floating-point matrix; sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sequence</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values need not be provided and default to 1; sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sequence</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values must be provided; sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sub_sequence</td>
+<td style="text-align:left">integer array; nested (two-level) sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sub_sequence</td>
+<td style="text-align:left">dense floating-point matrix; nested (two-level) sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sub_sequence</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values need not be provided and default to 1; nested (two-level) sequence information required</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sub_sequence</td>
+<td style="text-align:left">sparse floating-point matrix; the non-zero values must be provided; nested (two-level) sequence information required</td>
+</tr>
+</tbody>
+</table>
+</html>
+<br>
+
+
+### Output Data
+
+In PaddlePaddle, the output data of a computation layer is organized in exactly the same way as the input data.  An output is likewise organized as an `argument`; the `argument` stores its data in a `paddle_matrix` or a `paddle_ivector`, and if the output is a sequence it carries the `sequence_start_positions` information.  Call the relevant C-API interfaces to read the results you need, as sketched below.
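+
+As a minimal sketch, the snippet below reads a dense output.  It assumes that the getters `paddle_arguments_get_value`, `paddle_matrix_create_none`, and `paddle_matrix_get_shape` mirror the setters used earlier; please verify the exact signatures in [arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) and [matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h).
+
+```c
+// out_args is assumed to have been filled by a forward computation.
+paddle_matrix out_mat = paddle_matrix_create_none();
+CHECK(paddle_arguments_get_value(out_args, 0, out_mat));
+
+uint64_t height, width;
+CHECK(paddle_matrix_get_shape(out_mat, &height, &width));  // height = batch size
+
+paddle_real* row;
+CHECK(paddle_matrix_get_row(out_mat, 0, &row));  // outputs of the first sample
+for (uint64_t i = 0; i < width; ++i) {
+  printf("%f ", row[i]);
+}
+```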
+
+### Summary
+
+- Inside PaddlePaddle, the input/output of a computation layer in the neural network is organized as an `argument`.
+- An `argument` does not actually "store" the data; rather, it organizes the input/output information coherently.
+- Inside an `argument`, a `paddle_ivector` (one-dimensional integer array) or a `paddle_matrix` (two-dimensional floating-point matrix) actually stores the data; for sequence inputs/outputs, the `sequence start positions` record the sequence information.
+
+Therefore, when organizing the network inputs, the following needs to be done:
+1. Create an `argument` for each input/output.
+    - For the C-API interfaces that operate on `argument`, see [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h).
+1. Create a `paddle_matrix` or a `paddle_ivector` for each `argument` to store the data.
+    - For the C-API interfaces that operate on `paddle_ivector`, see [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h).
+    - For the C-API interfaces that operate on `paddle_matrix`, see [matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h).
+1. If the input is sequence data, create and fill in the `sequence_start_positions` information.
+    - Call [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) to attach sequence information to an `argument`.
+    - Call [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) to read back the sequence information of an `argument` (see the sketch after this list).
+    - For interface details, see [argument.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h).
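+
+The following sketch reads back the sequence information.  It assumes that `paddle_arguments_get_sequence_start_pos` mirrors the setter's parameter order (arguments, input ID, nesting level, ivector), and that `paddle_ivector_create_none` creates an empty destination vector; verify both in [arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) and [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h).
+
+```c
+paddle_ivector seq_pos = paddle_ivector_create_none();
+// Read the level-0 (outer) sequence information of input 0.
+CHECK(paddle_arguments_get_sequence_start_pos(in_args, 0, 0, seq_pos));
+// For Figure 2 (a), seq_pos would now hold {0, 5, 8, 10, 14}.
+```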
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md
new file mode 100644
index 0000000000..e0a42fff12
--- /dev/null
+++ b/doc/howto/usage/capi/workflow_of_capi_cn.md
@@ -0,0 +1,119 @@
+## C-API Workflow
+
+This document describes the overall workflow of using the PaddlePaddle C-API.
+
+### Workflow
+
+As shown in Figure 1, using the C-API consists of two major parts: (1) preparing the inference model and (2) developing the inference program.
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34658453-365f73ea-f46a-11e7-9b3f-0fd112b27bae.png" width=500><br> Figure 1. C-API workflow
+</p>
+
+- Preparing the inference model
+    1. Serialize only the network structure.
+        - With only the network structure serialized, loading the model requires specifying both the serialized network structure and the directory storing the model parameters.
+    1. Merge the network structure definition and the (multiple) trained parameter files saved at the end of training into a single file.
+        - The network structure and the trained parameters are serialized and merged into one file.
+        - Inference then only needs to load a single file, which is convenient for deployment.
+    - **Note**: choose exactly one of the two approaches above.
+- Developing an inference program with the C-API
+    1. Initialize the PaddlePaddle runtime environment.
+    1. Load the inference model.
+    1. Create the network inputs and organize the input data.
+    1. Run the forward computation and obtain the results.
+    1. Clean up and exit.
+
+### Preparing the Inference Model
+
+To introduce the model-preparation part, we use the handwritten digit recognition task as an example.  This task defines a simple fully connected network with [two hidden layers](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmax回归softmax-regression); the network takes an image as input and classifies it into one of the labels 0 ~ 9.  The complete code can be found in the scripts under [this directory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense).
+
+Developing an inference program with the C-API requires a trained model.  Run the [mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py) script in the [MNIST handwritten digit recognition directory](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) by executing `python mnist_v2.py` in a terminal; this trains on PaddlePaddle's built-in [MNIST dataset](http://yann.lecun.com/exdb/mnist/).  The trained model is saved by default in the `models` directory under the current working directory.
+
+Next, we convert the model saved at the end of training into an inference model.
+
+1. Serializing the network configuration
+
+    PaddlePaddle uses protobuf to carry the network structure and related parameters defined in the configuration file.  When using the C-API for inference, the network structure must be serialized with protobuf and written into a file.
+
+    The `dump_v2_config` function in [`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py) dumps a network structure defined with the PaddlePaddle V2 API into a file, for example:
+
+    ```python
+    from paddle.utils.dump_v2_config import dump_v2_config
+    from mnist_v2 import network
+
+    predict = network(is_infer=True)
+    dump_v2_config(predict, "trainer_config.bin", True)
+    ```
+
+    For the [handwritten digit recognition](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) example, the [`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py) script integrates this serialization step: running `python mnist_v2.py --task dump_config` serializes the network structure and writes the result into `trainer_config.bin` under the current working directory.
+
+    With this approach, **all the learnable parameters of the network must be placed in one directory at run time**, and the C-API loads the trained model by separately specifying the serialized network structure file and the parameter directory.
+
+2. Merging the model files (optional)
+
+    For ease of deployment, it is sometimes desirable to pack the serialized network structure and the trained parameters into a single file.  For this, the `merge_v2_model` interface in `paddle.utils.merge_model` serializes the network structure and the trained parameters, and writes the result into one file.
+
+    Example code:
+
+    ```python
+    from paddle.utils.merge_model import merge_v2_model
+    from mnist_v2 import network
+
+    net = network(is_infer=True)
+    param_file = "models/params_pass_4.tar"
+    output_file = "output.paddle.model"
+    merge_v2_model(net, param_file, output_file)
+    ```
+    For the [handwritten digit recognition](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) example, simply run `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py); the serialized result is written into `output.paddle.model` under the current working directory.  With this approach, the C-API loads the inference model at run time by specifying the path of the `output.paddle.model` file.
+
+#### Notes
+1. To use the C-API, the `binary` parameter must be set to `True` when calling `dump_v2_config` to serialize the network structure.
+1. **The network used for inference usually differs from the one used for training.**  Typically the (1) label layers, (2) loss-function layers, and (3) `evaluator`s are removed, leaving only the core computation layers; take care whether the network structure needs such modifications.
+1. At inference time, the forward results of any number (one or more) of the layers defined in the network can be fetched.  Put the layers whose outputs you need into a Python list and pass it as the first argument of `dump_v2_config`.
+
+### Writing the Inference Code
+
+For more detailed example code, see the examples under the [C-API examples](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) directory.  This section explains the five steps of writing inference code shown in Figure 1.
+
+#### Step 1. Initialize the PaddlePaddle runtime environment
+First, call [`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) to initialize the PaddlePaddle runtime environment.  The interface takes two arguments: the number of arguments and the argument list.
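+
+A minimal sketch, passing an argc/argv-style argument list; the `--use_gpu` flag is an illustrative choice, not a required one:
+
+```c
+char* argv[] = {"--use_gpu=False"};  // illustrative command line flag
+CHECK(paddle_init(1, (char**)argv));
+```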
+
+#### Step 2. Load the model
+
+Here we introduce an important concept in C-API usage: the Gradient Machine.
+
+Conceptually, inside PaddlePaddle an object of the GradientMachine class manages a set of computation layers (PaddlePaddle Layers) to perform forward and backward computation, and handles all the related details.  When calling the C-API for inference, only the forward computation is performed; the backward computation is never invoked.  The rest of this document uses `gradient machine` to refer to an object of the GradientMachine class created through the PaddlePaddle C-API.  Every `gradient machine` manages a trained model, and the C-API provides the following two commonly used ways to load one:
+
+1. Call [`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61) to load the inference model from disk; the `gradient machine` then owns an independent copy of the trained model.
+1. Call [`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88) to share an already loaded model with other `gradient machine`s.  This mostly arises in multi-threaded inference, where sharing one model across threads reduces memory usage; see [this example](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c).
+
+- Notes
+    1. When training with the PaddlePaddle V2 API, all the learnable parameters of a model are saved in one compressed archive.  They must be decompressed manually and placed in the same directory; the C-API does not directly load the compressed archive saved by the V2 API.
+    1. If the `merge model` approach was used to serialize the network structure and the trained parameters into one file, refer to this [example](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59).
+    1. By using the two interfaces above flexibly, models can be loaded in several other ways; for example, another model can be loaded while the program is running.
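+
+A sketch of the first method.  It assumes a creation call `paddle_gradient_machine_create_for_inference` that accepts the protobuf buffer produced by `dump_v2_config`, plus a hypothetical helper `read_config` that reads the file into memory; verify the exact creation API in [gradient_machine.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h).  The file names follow the ones used earlier in this document:
+
+```c
+paddle_gradient_machine machine;
+long size;
+void* buf = read_config("trainer_config.bin", &size);  // hypothetical helper
+CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+// Load the trained parameters saved by training into the machine.
+CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, "./models"));
+```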
+
+#### Step 3. Create the network inputs and organize the input data
+
+Basic concepts:
+- Inside PaddlePaddle, the input/output of a computation layer in the neural network is organized as an `Argument` struct.  If the network has multiple inputs or outputs, each input/output has its own `Argument`.
+- An `Argument` does not actually "store" the data; rather, it organizes the input/output data coherently.
+- Inside an `Argument`, the data is actually stored by (1) a `Matrix` (two-dimensional, storing floating-point inputs/outputs) and (2) an `IVector` (one-dimensional, **storing integer values only**, mostly used in natural language processing tasks).
+
+For all the input data types supported by the C-API and how to organize them, see the "Organizing Input/Output Data" document.
+
+The rest of this document uses `argument` to refer to one input/output of the neural network in the PaddlePaddle C-API, and `paddle_matrix` to refer **specifically** to the `Matrix` object inside an `argument` that stores the data.
+
+When organizing the network inputs and fetching the outputs, the following needs to be done:
+1. create an `argument` for each input/output;
+1. create a `paddle_matrix` for each `argument` to store its data.
+
+Unlike for the inputs, the C-API user does not need to allocate space for the `paddle_matrix` of an output `argument`: after the forward computation, PaddlePaddle has already allocated and manages the storage of each computation layer's output.  A sketch of these steps follows.
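+
+A minimal sketch of preparing the containers, reusing the dense-matrix example from the data-organization document; `paddle_arguments_create_none` and `paddle_arguments_resize` are assumed from [arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h):
+
+```c
+paddle_arguments in_args = paddle_arguments_create_none();
+CHECK(paddle_arguments_resize(in_args, 1));  // the network has one input
+
+paddle_matrix mat = paddle_matrix_create(1, layer_size, false);
+// ... fill `mat` as shown in the data-organization document ...
+CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+// The output container stays empty; the forward computation fills it.
+paddle_arguments out_args = paddle_arguments_create_none();
+```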
+
+#### Step 4. Forward computation
+
+With the above prepared, call [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) to perform the forward computation of the neural network, as sketched below.
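+
+A sketch of the forward call, assuming the parameter order (machine, input arguments, output arguments, isTrain); verify the exact signature in [gradient_machine.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h):
+
+```c
+CHECK(paddle_gradient_machine_forward(machine,
+                                      in_args,
+                                      out_args,
+                                      /* isTrain */ false));
+```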
+
+#### Step 5. Cleanup
+
+After inference finishes, clean up and release the intermediate variables and resources that were used.
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 274452fbf0..c2fc86687d 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,159 +1,188 @@
-```eval_rst
-.. _cluster_train:
-```
-
-# 运行分布式训练
-
-在本文中,我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
+# Distributed Training
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统(如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) )的用户参考。
 
-## 前提条件
+## Overview
 
-1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric:
+This article introduces how to run distributed training with PaddlePaddle on different cluster frameworks.  The distributed training architecture is shown below:
 
-   ```bash
-   pip install fabric
-   ```
+<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
 
-2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。
+- Data shard: the data used to train the neural network is split into multiple parts, and each part is fed to one trainer.
+- Trainer: after starting, each trainer reads its split of the data and begins the forward and backward computation of the neural network, communicating with the parameter servers.  After training on a certain amount of data, a trainer uploads the computed gradients and downloads the optimized, updated parameters.
+- Parameter server: each parameter server stores only a part of all the parameters of the neural network.  The parameter servers receive the gradients uploaded from the trainers, perform the parameter optimization and update, and distribute the updated parameters back to each trainer.
 
-3. 在 [`cluster_train/conf.py`] 中设置 `ROOT_DIR`, 该 ROOT_DIR 要在所有节点上存在。为了方便起见,我们通常在所有节点上创建一个 Unix 用户 `paddle`,并设置 `ROOT_DIR=/home/paddle`。这样,我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`,以便用户 `paddle` 可以 SSH 到所有节点而不用密码。
+In this way, through the distributed cooperation of the trainers and the parameter servers, the SGD training of the neural network is completed.  PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
 
-## 准备工作空间
+When training a neural network with synchronous SGD, PaddlePaddle uses a synchronization barrier so that gradients are submitted and parameters are updated in an ordered fashion.  Asynchronous SGD does not wait for all trainers to submit their gradients before updating the parameters, which greatly improves the parallelism of the computation: the parameter servers do not depend on each other, receiving gradients and updating parameters in parallel; a parameter server does not wait for all trainers to submit gradients before starting the next step; and the trainers do not depend on each other either, running the training in parallel.  Note that while asynchronous SGD improves the parallelism of parameter updates, it does not guarantee synchronous updates: at any moment, the parameters held by one parameter server may be more up to date than another's, so compared with synchronous SGD, the gradients are noisier.
 
-我们将放置依赖库、配置等文件的目录视为 *工作空间(workspace)*。
 
-这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在工作空间中不同目录的要求,PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据,所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件,并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。
+## Preparing the Environment
 
-通常,你可以使用本地训练中的相同模型文件进行集群训练。请记住,在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小,而不是使用同步 SGD 的总 batch 大小。
+1. Prepare your computer cluster.  A cluster usually consists of a group of Linux servers (from a few to thousands), connected by a local area network (LAN); each server has an IP address unique within the cluster (or a host name resolvable via DNS).  Each computer in the cluster is usually called a "node".
+1. Install PaddlePaddle on every node of the cluster.  If GPUs are to be used, the corresponding GPU drivers and CUDA must also be installed on the nodes.  For installing PaddlePaddle, see the installation methods in [build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html); we recommend the [Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html) installation for quickly installing PaddlePaddle.
 
-以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。
+After installation, running the command below shows the installed version (with the Docker installation, enter the container first: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
 
-你只需完成 demo/recommendation 教程文档到 `Train` 的部分,之后你会得到训练/测试数据和模型配置文件。最后,只需使用 demo/recommendation 作为集群训练的工作空间。
+In the following, we take the code under `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training with the PaddlePaddle v2 API.
 
-最后,你的工作空间应如下所示:
+## Command-Line Arguments
+### Starting the parameter server
+Run the command below to start a parameter server, which will wait for trainers to connect and exchange data:
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
 ```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
-```
-虽然这些文件并非都需要集群训练,但是也没有必要删除无用的文件。
 
-`trainer_config.py`
-表示模型配置文件。
+If you wish to run the pserver program in the background and save its output to a log file, you can run:
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-`train.list` 和 `test.list`
-文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。
+Parameter description
 
-`dataprovider.py`
-用于读取训练/测试样本。这与本地训练相同。
+- port: **required, default 7164**, the starting port the pserver listens on; together with ports_num it determines the total number of ports used for communication
+- ports_num: **required, default 1**, the number of ports to listen on
+- ports_num_for_sparse: **required, default 0**, the number of ports used for communicating sparse-type parameters
+- num_gradient_servers: **required, default 1**, the total number of pservers in the current training job
 
-`data`
-数据目录中的所有文件被 train.list/test.list 引用。
+### Starting the trainers
+Run the command below to start the trainer program written in Python (the file can have any name, e.g., train.py):
+```bash
+$ python train.py
+```
 
+A trainer needs network connectivity to the pservers to complete training.  When starting a trainer, the port, the pserver addresses, and other arguments must be passed so that the trainer can connect to the pservers correctly.  These arguments can be passed through [environment variables](https://zh.wikipedia.org/wiki/环境变量) or through the `paddle.init()` call in the program.  If both `paddle.init()` arguments and environment variables are used, the arguments passed to `paddle.init()` take precedence.
 
-## 准备集群作业配置
+Using environment variables:
 
-以下选项必须在 cluster_train/conf.py 中认真设置
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+```
 
-`HOSTS`  所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上,例如 root@192.168.100.17:9090。
+使用参数:
 
-`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
 
-`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称,例如以太网的 eth0,infiniband 的 ib0。
+Parameter description
 
-`PADDLE_PORT` 集群通信通道的端口号
+- use_gpu: **optional, default False**, whether to enable GPU training
+- trainer_count: **required, default 1**, the total number of trainers in the current training job
+- port: **required, default 7164**, the port used to connect to the pserver
+- ports_num: **required, default 1**, the number of ports used to connect to the pserver
+- ports_num_for_sparse: **required, default 0**, the number of ports used for communicating sparse-type parameters with the pserver
+- num_gradient_servers: **required, default 1**, the total number of pservers in the current training job
+- trainer_id: **required, default 0**, the unique ID of each trainer, an integer starting from 0
+- pservers: **required, default 127.0.0.1**, the list of IPs of the pservers started for the current training job, separated by ","
 
-`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少(少于5〜6个节点),建议将其设置为较大,如2〜8,以获得更好的网络性能。
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update,则可以像 `PADDLE_PORTS_NUM` 一样设置。
+### Preparing the dataset
 
-`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。
+Refer to the sample data preparation script [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py) to prepare the training and validation datasets.  We use the paddle.dataset.imikolov dataset and, according to the degree of parallelism of the distributed training (the number of trainer nodes), set `SPLIT_COUNT` at the beginning of `prepare.py` to split the data into multiple parts.
 
-默认配置如下:
+In production systems, the output of MapReduce jobs is usually used as the training data, so there will be many training files whose number is not fixed in advance.  In the trainer, the modulo trick below can be used to assign training files to each trainer:
 
 ```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-工作空间配置
-'''
-
-#工作空间根目录
-ROOT_DIR = "/home/paddle"
-
-'''
-网络配置
-'''
-#pserver NIC
-PADDLE_NIC = "eth0"
-#pserver 端口
-PADDLE_PORT = 7164
-#pserver 端口数
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#集群作业中所有进程的环境设置
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
+```
+
+The sample program `prepare.py` splits the training set and the test set into multiple files (three in the example, with the suffixes `-00000`, `-00001`, and `-00002`):
 ```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
+
+During distributed training, every trainer process needs to be able to read its own part of the data.  Some distributed systems provide a distributed storage service, so that data saved in it can be read by every node in the cluster.  Without such a storage service, the training data belonging to each trainer node must be copied to that node manually.
 
-### 启动集群作业
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+Different training jobs have very different training data formats and `reader()` implementations, so developers must split the training data and write the `reader()` according to the actual scenario of their own job.
 
-`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+### Preparing the training program
 
-`job_dispatch_package`  设为本地 `workspace` 目录,它将被分发到 conf.py 中设置的所有节点。  它有助于帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。
-`job_workspace`  设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+For each training job, we create a workspace on every node, containing the user's training program, the program's dependencies, and the mounted or downloaded training data shards.
 
-`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作,只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后:
+Finally, the workspace should look like:
 ```
-sh run.sh
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
 ```
 
-集群作业将会在几秒后启动。
+- `my_lib.py`: user-defined library functions called by `train.py`, such as PIL helpers.
+- `word_dict.pickle`: a dictionary data file used in `train.py`.
+- `train.py`: the training program; see [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py) for reference code.  ***Note:*** for this sample code, you may need to modify the beginning of `train.py` (shown below) when using different distributed computing platforms, in order to obtain the location of the training data and the environment variable configuration:
+
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
 
-### 终止集群作业
-`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+- `train_data_dir`: the directory containing the training data; it can be mounted from distributed storage or downloaded to the local machine before the job starts.
+- `test_data_dir`: the directory containing the test dataset.
 
-### 检查集群训练结果
-详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。
+## Using Distributed Computing Platforms or Tools
 
-`paddle_trainer.INFO`
-提供几乎所有训练的内部输出日志,与本地训练相同。这里检验运行时间模型的收敛。
+PaddlePaddle can build distributed computing jobs on a variety of distributed computing platforms, including:
+- [Kubernetes](http://kubernetes.io): Google's open-source container-cluster scheduling framework, a complete cluster solution supporting large-scale production clusters.
+- [OpenMPI](https://www.open-mpi.org): a mature high-performance parallel computing framework.
+- [Fabric](http://www.fabfile.org): a cluster management tool.  `Fabric` can be used to write scripts for submitting and managing cluster jobs.
 
-`paddle_pserver2.INFO`
-提供 pserver 运行日志,有助于诊断分布式错误。
+For each cluster platform, we describe how to start and stop cluster jobs separately.  All the examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
 
-`server.log`
-提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+When training on a distributed computing platform, as the job is scheduled onto the cluster, the platform usually provides the arguments the job needs through an API or environment variables, such as the node's ID, IP, and the number of task nodes.
 
-`train.log`
-提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+## Running on Different Clusters
 
-### 检查模型输出
-运行完成后,模型文件将被写入节点 0 的 `output` 目录中。
-工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
+  - [fabric cluster](fabric_cn.md)
+  - [openmpi cluster](openmpi_cn.md)
+  - [kubernetes, single machine](k8s_cn.md)
+  - [kubernetes, distributed](k8s_distributed_cn.md)
+  - [kubernetes cluster training on AWS](k8s_aws_cn.md)
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index c60876721c..28cd1fa790 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,156 +1,191 @@
-# Run Distributed Training
+# Distributed Training
 
-In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
+## Introduction
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
+In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
 
-## Prerequisite
+<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
 
-1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
+- Data shard: the training data is split into multiple partitions, and the trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads its data shard and trains the neural network. The trainer then uploads the calculated "gradients" to the parameter servers and waits for the parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. The servers run the optimization calculations when gradients are uploaded from trainers, and then send the updated parameters back to the trainers.
 
-   ```bash
-   pip install fabric
-   ```
+PaddlePaddle supports both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
 
-1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
+When training with synchronous SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradient upload and parameter download happen in strict order. With asynchronous SGD, on the other hand, the parameter servers do not wait for all trainers to finish uploading at each step, which increases the parallelism of distributed training: parameter servers do not depend on each other and optimize parameters concurrently. Parameter servers do not wait for trainers, so trainers also do their work concurrently. However, asynchronous SGD introduces more randomness and noise into the gradients.
 
-1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes.  For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`.  In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.
+## Preparations
+1. Prepare your computer cluster. It's normally a group of Linux servers connected by a LAN. Each server is assigned a unique IP address. The computers in the cluster are called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install the proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using the [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
 
-## Prepare Job Workspace
+After installation, you can check the version by typing the command below (run a docker container first if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
 
-We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
 
-These `train/test` data should be prepared before launching cluster job. To  satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files,  and all nodes in cluster job will handle files with same logical code in normal condition.
+We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
 
-Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used.
+## Command-line arguments
 
-Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory.
+### Starting parameter server
 
-You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training.
+Type the below command to start a parameter server which will wait for trainers to connect:
 
-At last your workspace should look like as follow:
-```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
 ```
-Not all of these files are needed for cluster training, but it's not necessary to remove useless files.
 
-`trainer_config.py`
-Indicates the model config file.
+If you wish to run the parameter servers in the background and save a log file, you can type:
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-`train.list` and `test.list`
-File index. It stores all relative or absolute file paths of all train/test data at current node.
+Parameter Description
 
-`dataprovider.py`
-used to read train/test samples. It's same as local training.
+- port: **required, default 7164**, the port the parameter server will listen on. If ports_num is greater than 1, the parameter server will listen on multiple ports for higher network throughput.
+- ports_num: **required, default 1**, the total number of ports to listen on.
+- ports_num_for_sparse: **required, default 0**, the number of ports which serve sparse parameter updates.
+- num_gradient_servers: **required, default 1**, the total number of gradient servers.
 
-`data`
-all files in data directory are refered by train.list/test.list which are refered by data provider.
+### Starting trainer
+Type the command below to start the trainer (you can name the file whatever you want, like "train.py"):
 
+```bash
+$ python train.py
+```
 
-## Prepare Cluster Job Configuration
+The trainers' network needs to be connected with the parameter servers' network to finish the job. Trainers need to know the ports and IPs to locate the parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass them to the `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables.
 
-The options below must be carefully set in cluster_train/conf.py
+Use environment variables:
 
-`HOSTS`  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+python train.py
+```
 
-`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory
+Pass arguments:
 
-`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband.
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
 
-`PADDLE_PORT` port number for cluster commnunication channel
+Parameter Description
 
-`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance.
+- use_gpu: **optional, default False**, set to "True" to enable GPU training.
+- trainer_count: **required, default 1**, the total count of trainers in the training job.
+- port: **required, default 7164**, the port to connect to the parameter server.
+- ports_num: **required, default 1**, the number of ports for communication.
+- ports_num_for_sparse: **required, default 0**, the number of ports for sparse type calculation.
+- num_gradient_servers: **required, default 1**, the total number of gradient servers.
+- trainer_id: **required, default 0**, the ID of every trainer, starting from 0.
+- pservers: **required, default 127.0.0.1**, the list of IPs of parameter servers, separated by ",".
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM`
+### Prepare Training Dataset
 
-`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.
+Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), which will download the public `imikolov` dataset and split it into multiple files according to the job parallelism (trainer count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the number of output files.
 
-Default Configuration as follow:
+In the real world, we often use the output of `MapReduce` jobs as training data, so there will be lots of files. You can use `mod` to assign training files to trainers:
 
 ```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-workspace configuration
-'''
-
-#root dir for workspace
-ROOT_DIR = "/home/paddle"
-
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
 ```
 
-### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+Example code `prepare.py` will split the training data and testing data into 3 files with numeric suffixes like `-00000`, `-00001`, and `-00002`:
 
-`paddle.py`provides two distinguished command option for easy job launching.
+```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
+
+When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service is provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node.
+
+Different training jobs may have different data formats and `reader()` functions, so developers may need to write different data preparation scripts and `reader()` functions for their jobs.
 
-`job_dispatch_package`  set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy.
-`job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
-dispatch latency.
+### Prepare Training program
 
-`cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
+We'll create a *workspace* directory on each node to store your training program, dependencies, and the mounted or downloaded dataset directories.
+
+
+Your workspace may look like this:
 ```
-sh run.sh
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
 ```
 
-The cluster Job will start in several seconds.
+- `my_lib.py`: user-defined libraries, such as PIL. This is optional.
+- `word_dict.pickle`: dictionary file for training the word embedding.
+- `train.py`: the training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using a different cluster platform, in order to retrieve configuration environment variables:
+
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")
+  ```
 
-### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed.
+- `train_data_dir`: contains the training data. Mount it from a storage service or copy the training data here.
+- `test_data_dir`: contains the testing data.
 
-### Check Cluster Training Result
-Check log in $workspace/log for details, each node owns same log structure.
+## Use cluster platforms or cluster management tools
 
-`paddle_trainer.INFO`
-It provides almost all interal output log for training,  same as local training. Check runtime model convergence here.
+PaddlePaddle supports running jobs on several platforms including:
+- [Kubernetes](http://kubernetes.io): an open-source system from Google for automating deployment, scaling, and management of containerized applications.
+- [OpenMPI](https://www.open-mpi.org): a mature high-performance parallel computing framework.
+- [Fabric](http://www.fabfile.org): a cluster management tool for writing scripts to submit jobs or manage the cluster.
 
-`paddle_pserver2.INFO`
-It provides pserver running log, which could help to diagnose distributed error.
+We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
 
-`server.log`
-It provides stderr and stdout of pserver process. Check error log if training crashs.
+When a job is dispatched to different nodes, these cluster platforms provide APIs or environment variables for the training processes, such as the node ID, IP address, or total number of nodes.
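+
+For example, under OpenMPI a training process can read its rank and the world size from the environment (a sketch; variable names differ across platforms):
+
+```python
+import os
+
+# OpenMPI exposes these variables to every launched process; other
+# platforms such as Kubernetes use their own naming conventions.
+trainer_id = int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))
+trainer_count = int(os.getenv("OMPI_COMM_WORLD_SIZE", "1"))
+```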
 
-`train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashs.
+## Use different clusters
 
-### Check Model Output
-After one pass finished, model files will be writed in `output` directory in node 0.
-`nodefile` in workspace indicates the node id of current cluster job.
+  - [fabric](fabric_en.md)
+  - [openmpi](openmpi_en.md)
+  - [kubernetes](k8s_en.md)
+  - [kubernetes on AWS](k8s_aws_en.md)
diff --git a/doc/howto/usage/cluster/fabric_cn.md b/doc/howto/usage/cluster/fabric_cn.md
new file mode 100644
index 0000000000..0385e401b3
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_cn.md
@@ -0,0 +1,42 @@
+# 使用fabric启动集群训练
+
+## 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下,执行`kubectl create -f ssh_servers.yaml`启动一个测试集群,并使用`kubectl get po -o wide`获得这些节点的IP地址。
+
+## 启动集群作业
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下,所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 为方便作业启动提供了两个独特的命令选项。
+
+-  `job_dispatch_package`  设为本地 `workspace` 目录,它将被分发到 `conf.py` 中设置的所有节点。它可以帮助频繁修改和访问工作区文件的用户减少负担,否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录,`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务,只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`,然后:
+```
+sh run.sh
+```
+
+集群作业将会在几秒后启动。
+
+## 终止集群作业
+`paddle.py`能获取`Ctrl + C` SIGINT 信号来自动终止它启动的所有进程。只需中断 `paddle.py` 任务来终止集群作业。如果程序崩溃你也可以手动终止。
+
+## 检查集群训练结果
+详细信息请检查 $workspace/log 里的日志,每一个节点都有相同的日志结构。
+
+`paddle_trainer.INFO`
+提供几乎所有训练的内部输出日志,与本地训练相同。可以在这里检查运行时模型的收敛情况。
+
+`paddle_pserver2.INFO`
+提供 pserver 运行日志,有助于诊断分布式错误。
+
+`server.log`
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+`train.log`
+提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
+
+## 检查模型输出
+运行完成后,模型文件将被写入节点 0 的 `output` 目录中。
+工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
diff --git a/doc/howto/usage/cluster/fabric_en.md b/doc/howto/usage/cluster/fabric_en.md
new file mode 100644
index 0000000000..bf270d89ab
--- /dev/null
+++ b/doc/howto/usage/cluster/fabric_en.md
@@ -0,0 +1,43 @@
+# Cluster Training Using Fabric
+
+## Prepare a Linux cluster
+
+Running `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` will launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes.
+
+## Launching Cluster Job
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes on different nodes. By default, all command line options can be set as `paddle.py` command options, and `paddle.py` will transparently and automatically pass these options to the lower-level PaddlePaddle processes.
+
+`paddle.py` provides two distinct command options for easy job launching.
+
+- `job_dispatch_package`: set it to a local `workspace` directory; it will be dispatched to all nodes configured in `conf.py`. This is helpful when frequently manipulating workspace files; otherwise, repeated multi-node workspace deployment can be very annoying.
+- `job_workspace`: set it to an already deployed workspace directory; `paddle.py` will skip the dispatch stage and directly launch the cluster job on all nodes. This helps reduce heavy dispatch latency.
+
+`cluster_train/run.sh` provides a command line sample that runs the `demo/recommendation` cluster job; just modify `job_dispatch_package` and `job_workspace` to your own directories, then:
+```
+sh run.sh
+```
+
+The cluster job will start in several seconds.
+
+## Kill Cluster Job
+`paddle.py` can capture the `Ctrl + C` SIGINT signal to automatically kill all processes launched by it, so just stop `paddle.py` to kill the cluster job. You should manually kill the job if the program crashes.
+
+## Check Cluster Training Result
+Check the logs in `$workspace/log` for details; each node has the same log structure.
+
+`paddle_trainer.INFO`
+It provides almost all the internal output logs for training, the same as local training. Check runtime model convergence here.
+
+`paddle_pserver2.INFO`
+It provides parameter server running log, which could help to diagnose distributed error.
+
+`server.log`
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
+
+`train.log`
+It provides stderr and stdout of trainer process. Check error log if training crashes.
+
+## Check Model Output
+After one pass finishes, model files will be written to the `output` directory on node 0.
+The `nodefile` in the workspace indicates the node ID of the current cluster job.
diff --git a/doc/howto/usage/cluster/fluid_cluster_train_en.md b/doc/howto/usage/cluster/fluid_cluster_train_en.md
new file mode 100644
index 0000000000..ae825d9a51
--- /dev/null
+++ b/doc/howto/usage/cluster/fluid_cluster_train_en.md
@@ -0,0 +1,153 @@
+# Fluid Distributed Training
+
+## Introduction
+
+In this article, we'll explain how to configure and run distributed training jobs with PaddlePaddle Fluid on a bare metal cluster.
+
+## Preparations
+
+### Getting the cluster ready
+
+Prepare the compute nodes in the cluster. The nodes can be of any specification that runs PaddlePaddle, each with a unique IP address assigned to it. Make sure they can communicate with each other.
+
+### Have PaddlePaddle installed
+
+PaddlePaddle must be installed on all nodes. If you have GPU cards on your nodes, be sure to properly install drivers and CUDA libraries.
+
+The PaddlePaddle build and installation guide can be found [here](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html).
+
+In addition to the above, the `cmake` command should be run with the option `WITH_DISTRIBUTE` set to `ON`. A bare-minimum example `cmake` command looks as follows:
+
+``` bash
+cmake .. -DWITH_DOC=OFF -DWITH_GPU=OFF -DWITH_DISTRIBUTE=ON -DWITH_SWIG_PY=ON -DWITH_PYTHON=ON
+```
+
+### Update the training script
+
+#### Non-cluster training script
+
+Let's take [Deep Learning 101](http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.html)'s first chapter: "fit a line" as an example.
+
+The non-cluster version of this demo with fluid API is as follows:
+
+``` python
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
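+    # save and immediately reload model parameters each pass
+    # (exercises the persistables save/load API)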
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
+```
+
+We created a simple fully-connected neural network training program and handed it to the fluid executor to run for 100 passes.
+
+Now let's try to convert it to a distributed version to run on a cluster.
+
+#### Introducing parameter server
+
+As we can see from the non-cluster version of the training script, there is only one role in the script: the trainer, which performs the computation as well as holding the parameters. In cluster training, since multiple trainers are working on the same task, they need one centralized place to hold and distribute parameters. This centralized place is called the Parameter Server in PaddlePaddle.
+
+![parameter server architecture](src/trainer.png)
+
+The Parameter Server in Fluid not only holds the parameters but is also assigned a part of the program. Trainers communicate with parameter servers via send/receive OPs. For more technical details, please refer to [this document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/dist_refactor/distributed_architecture.md).
+
+Now we need to create programs for both the trainers and the parameter servers. The question is: how?
+
+#### Slice the program
+
+Fluid provides a tool called the "Distributed Transpiler" that automatically converts the non-cluster program into a cluster program.
+
+The idea behind this tool is to find the optimize OPs and gradient parameters, slice the program into two pieces, and connect them with send/receive OPs.
+
+The optimize OPs and gradient parameters can be found in the return values of the optimizer's `minimize` function.
+
+To put them together:
+
+``` python
+... #define the program, cost, and create sgd optimizer
+
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost) #get optimize OPs and gradient parameters
+
+t = fluid.DistributeTranspiler() # create the transpiler instance
+# slice the program into 2 pieces with optimizer_ops and gradient parameters list, as well as pserver_endpoints, which is a comma separated list of [IP:PORT] and number of trainers
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+... #create executor
+
+# in pserver, run this
+# current_endpoint means the IP:PORT of the pserver you wish to run on
+pserver_prog = t.get_pserver_program(current_endpoint)
+pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+exe.run(pserver_startup)
+exe.run(pserver_prog)
+
+# in trainer, run this
+... # define data reader
+exe.run(fluid.default_startup_program())
+for pass_id in range(100):
+    for data in train_reader():
+        exe.run(t.get_trainer_program())
+```
+
+### E2E demo
+
+Please find the complete demo [here](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py).
+First `cd` into the folder that contains the `python` files. In this case:
+
+```bash
+cd /paddle/python/paddle/v2/fluid/tests/book_distribute
+```
+
+On the parameter server node, run the following in the command line:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py
+```
+
+*Please note that we assume your parameter server runs at 192.168.1.2:6174.*
+
+Wait until the prompt `Server listening on 192.168.1.2:6174` appears.
+
+Then, on 2 of your trainer nodes, run this:
+
+``` bash
+PSERVERS=192.168.1.2:6174 SERVER_ENDPOINT=192.168.1.2:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py
+```
+
+*The reason you need to run this command on 2 nodes is that the script sets the trainer count to 2. You can change this setting on line 50.*
+
+Now you have 2 trainers and 1 parameter server up and running.
diff --git a/doc/howto/usage/cluster/k8s_aws_cn.md b/doc/howto/usage/cluster/k8s_aws_cn.md
new file mode 120000
index 0000000000..c44cd9a731
--- /dev/null
+++ b/doc/howto/usage/cluster/k8s_aws_cn.md
@@ -0,0 +1 @@
+k8s_aws_en.md
\ No newline at end of file
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/cluster/k8s_aws_en.md
similarity index 98%
rename from doc/howto/usage/k8s/k8s_aws_en.md
rename to doc/howto/usage/cluster/k8s_aws_en.md
index ce72b08038..0dfa8237a3 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/cluster/k8s_aws_en.md
@@ -493,7 +493,7 @@ spec:
     spec:
       containers:
       - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
         imagePullPolicy: Always
         volumeMounts:
         - mountPath: "/efs"
@@ -522,7 +522,7 @@ NAME          DESIRED   SUCCESSFUL   AGE
 paddle-data   1         1            6m
 ```
 
+Data preparation is done by the docker image `paddlepaddle/paddle-tutorial:k8s_data`; see [here](src/k8s_data/README.md) for how to build this docker image and for its source code.
+Data preparation is done by docker image `paddlepaddle/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
 
 #### Start Training
 
@@ -545,7 +545,7 @@ spec:
           claimName: efsvol
       containers:
       - name: trainer
-        image: paddledev/paddle-tutorial:k8s_train
+        image: paddlepaddle/paddle-tutorial:k8s_train
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -617,7 +617,7 @@ kubectl --kubeconfig=kubeconfig log -f POD_NAME
 
 Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
 
+The details for starting `pserver` and `trainer` are hidden inside the docker image `paddlepaddle/paddle-tutorial:k8s_train`; see [here](src/k8s_train/README.md) for how to build the docker image and for its source code.
+The details for start `pserver` and `trainer` are hidden inside docker image `paddlepaddle/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
 
 #### Inspect Training Output
 
diff --git a/doc/howto/usage/k8s/k8s_cn.md b/doc/howto/usage/cluster/k8s_cn.md
similarity index 83%
rename from doc/howto/usage/k8s/k8s_cn.md
rename to doc/howto/usage/cluster/k8s_cn.md
index ab07cb9cd5..c1a11f7165 100644
--- a/doc/howto/usage/k8s/k8s_cn.md
+++ b/doc/howto/usage/cluster/k8s_cn.md
@@ -1,21 +1,22 @@
 # Kubernetes单机训练
 
-在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
+在这篇文档里,我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的PaddlePaddle训练作业。在下一篇中,我们将介绍如何启动分布式训练作业。
 
 ## 制作Docker镜像
 
-在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式Paddle训练任务中的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在
-Paddle的Docker image里。为此,我们需要制作一个包含训练数据的Paddle镜像。
+在一个功能齐全的Kubernetes机群里,通常我们会安装Ceph等分布式文件系统来存储训练数据。这样的话,一个分布式PaddlePaddle训练任务中
+的每个进程都可以从Ceph读取数据。在这个例子里,我们只演示一个单机作业,所以可以简化对环境的要求,把训练数据直接放在
+PaddlePaddle的Docker Image里。为此,我们需要制作一个包含训练数据的PaddlePaddle镜像。
+
+PaddlePaddle的 `paddlepaddle/paddle:cpu-demo-latest` 镜像里有PaddlePaddle的源码与demo,
+(请注意,默认的PaddlePaddle生产环境镜像 `paddlepaddle/paddle:latest` 是不包括源码的,PaddlePaddle的各版本镜像可以参考
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)),
+下面我们使用这个镜像来下载数据到Docker Container中,并把这个包含了训练数据的Container保存为一个新的镜像。
 
-Paddle 的 [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) 
-里介绍了用Paddle源码中的脚本下载训练数据的过程。
-而 `paddledev/paddle:cpu-demo-latest` 镜像里有 Paddle 源码与demo,( 请注意,默认的
-Paddle镜像 `paddledev/paddle:cpu-latest` 是不包括源码的, Paddle的各版本镜像可以参考 [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html) ),所以我们使用这个镜像来下载训练数据到Docker container中,然后把这个包含了训练数据的container保存为一个新的镜像。
-  
 ### 运行容器
 
 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```
 
 ### 下载数据
@@ -103,7 +104,7 @@ spec:
       restartPolicy: Never
 ```
 
-### 创建Paddle Job
+### 创建PaddlePaddle Job
 
 使用上文创建的yaml文件创建Kubernetes Job,命令为:
 
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/cluster/k8s_distributed_cn.md
similarity index 88%
rename from doc/howto/usage/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/cluster/k8s_distributed_cn.md
index 3121b3f59d..167089b807 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/cluster/k8s_distributed_cn.md
@@ -1,8 +1,6 @@
 # Kubernetes分布式训练
 
-前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。
-
-有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群,可以参考[k8s_basis](./k8s_basis_cn.md)。
+前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cluster/cluster_train_cn.html)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。
 
 ## 整体方案
 
@@ -28,7 +26,7 @@ PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行
 - 拷贝训练文件到容器内
 - 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。
+因为官方镜像 `paddlepaddle/paddle:latest` 内已经包含PaddlePaddle的执行程序但是还没有上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/src/k8s_train/Dockerfile)。
 
 ```bash
 $ cd doc/howto/usage/k8s/src/k8s_train
@@ -62,7 +60,7 @@ spec:
       hostNetwork: true
       containers:
       - name: paddle-data
-        image: paddledev/paddle-tutorial:k8s_data
+        image: paddlepaddle/paddle-tutorial:k8s_data
         imagePullPolicy: Always
         volumeMounts:
         - mountPath: "/mnt"
@@ -149,20 +147,19 @@ spec:
 
 文件中,`metadata`下的`name`表示这个job的名字。`parallelism,completions`字段表示这个job会同时开启3个PaddlePaddle节点,成功训练且退出的pod数目为3时,这个job才算成功结束。然后申明一个存储卷`jobpath`,代表宿主机目录`/home/work/mfs`,在对容器的描述`containers`字段中,将此目录挂载为容器的`/home/jobpath`目录,这样容器的`/home/jobpath`目录就成为了共享存储,放在这个目录里的文件其实是保存到了MFS上。
 
-`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。
+`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内:
+
 
-环境变量 | 说明
---- | ---
-JOB_PATH | 共享存储挂在的路径
-JOB_NAME | Job的名字
-TRAIN_CONFIG_DIR | 本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
-CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数,即网卡名
-CONF_PADDLE_PORT | `paddle paserver`的`--port`参数
-CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量,即`--ports_num`参数
-CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量,即`--ports_num_for_sparse`参数
-CONF_PADDLE_GRADIENT_NUM | 训练节点数量,即`--num_gradient_servers参数`
+- JOB_PATH:共享存储挂载的路径
+- JOB_NAME:Job的名字
+- TRAIN_CONFIG_DIR:本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+- CONF_PADDLE_NIC:`paddle pserver`进程需要的`--nics`参数,即网卡名
+- CONF_PADDLE_PORT:`paddle pserver`的`--port`参数
+- CONF_PADDLE_PORTS_NUM:稠密更新的端口数量,即`--ports_num`参数
+- CONF_PADDLE_PORTS_NUM_SPARSE:稀疏更新的端口数量,即`--ports_num_for_sparse`参数
+- CONF_PADDLE_GRADIENT_NUM:训练节点数量,即`--num_gradient_servers`参数
 
-这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
+这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/docs/develop/documentation/zh/howto/usage/cmd_parameter/detail_introduction_cn.html)。
 
 编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。
 
@@ -213,7 +210,7 @@ I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
 I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
 [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
 [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__square_error_cost_0__]
 I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
 I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
 I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/cluster/k8s_en.md
similarity index 79%
rename from doc/howto/usage/k8s/k8s_en.md
rename to doc/howto/usage/cluster/k8s_en.md
index 0c3ab05b70..c374f00a49 100644
--- a/doc/howto/usage/k8s/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s_en.md
@@ -1,18 +1,27 @@
-# Paddle On Kubernetes
+# PaddlePaddle On Kubernetes
 
->In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+In this article, we will introduce how to run a PaddlePaddle training job on a single-CPU machine using Kubernetes. In the next article, we will introduce how to run a PaddlePaddle training job on a distributed cluster.
 
 ## Build Docker Image
 
-In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+In a distributed Kubernetes cluster, we will use Ceph or another distributed
+storage system for storing training-related data so that all processes in
+PaddlePaddle training can retrieve data from Ceph. In this example, we will
+only demo a training job on a single machine. In order to simplify the
+environment requirements, we will put the training data directly into the
+PaddlePaddle Docker image, so we need to create a PaddlePaddle Docker image
+that includes the training data.
+
+The Docker image `paddlepaddle/paddle:cpu-demo-latest` has the PaddlePaddle
+source code and demos. (Caution: the default PaddlePaddle Docker image
+`paddlepaddle/paddle:latest` doesn't include the source code; PaddlePaddle's
+different Docker image versions can be found here:
+[Docker Installation Guide](http://paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).)
+So we run this Docker image, download the training data inside it, and then
+commit the whole container as a new Docker image.
 
-Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
-And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
-  
 ### Run Docker Container
 
 ```
-$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+$ docker run --name quick_start_data -it paddlepaddle/paddle:cpu-demo-latest
 ```
 
 ### Download Training Data
@@ -67,7 +76,7 @@ $ docker commit quick_start_data mypaddle/paddle:quickstart
 
 ## Use Kubernetes For Training
 
->We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+We will use a Kubernetes job for the training process; the following steps show how to do the training with Kubernetes.
 
 ### Create Yaml Files
 
@@ -99,7 +108,7 @@ spec:
       restartPolicy: Never
 ```
 
-### Start Paddle Job
+### Start PaddlePaddle Job
 
 Using the above yaml file to start the Kubernetes job.
 
diff --git a/doc/howto/usage/cluster/openmpi_cn.md b/doc/howto/usage/cluster/openmpi_cn.md
new file mode 100644
index 0000000000..831cafdc03
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_cn.md
@@ -0,0 +1,41 @@
+# 在OpenMPI集群中提交训练作业
+
+## 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点:
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+## 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务:
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/cluster/openmpi_en.md b/doc/howto/usage/cluster/openmpi_en.md
new file mode 100644
index 0000000000..09af46e25e
--- /dev/null
+++ b/doc/howto/usage/cluster/openmpi_en.md
@@ -0,0 +1,41 @@
+# Cluster Training Using OpenMPI
+
+## Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without entering any password.
+
+## Launching Cluster Job
+
+Follow these steps to launch a PaddlePaddle training job in the OpenMPI cluster:
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# create a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
diff --git a/doc/howto/usage/k8s/src/Dockerfile b/doc/howto/usage/cluster/src/Dockerfile
similarity index 54%
rename from doc/howto/usage/k8s/src/Dockerfile
rename to doc/howto/usage/cluster/src/Dockerfile
index 3a73606c61..e178bf4da0 100644
--- a/doc/howto/usage/k8s/src/Dockerfile
+++ b/doc/howto/usage/cluster/src/Dockerfile
@@ -1,4 +1,4 @@
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest
 
 MAINTAINER zjsxzong89@gmail.com
 
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/cluster/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/add_security_group.png
rename to doc/howto/usage/cluster/src/add_security_group.png
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/cluster/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/k8s/src/create_efs.png
rename to doc/howto/usage/cluster/src/create_efs.png
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/cluster/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/k8s/src/efs_mount.png
rename to doc/howto/usage/cluster/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/src/k8s-paddle-arch.png b/doc/howto/usage/cluster/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000..b3800c4fe8
Binary files /dev/null and b/doc/howto/usage/cluster/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/cluster/src/k8s_data/Dockerfile
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_data/Dockerfile
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/cluster/src/k8s_data/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/README.md
rename to doc/howto/usage/cluster/src/k8s_data/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/cluster/src/k8s_data/get_data.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_data/get_data.sh
rename to doc/howto/usage/cluster/src/k8s_data/get_data.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
similarity index 77%
rename from doc/howto/usage/k8s/src/k8s_train/Dockerfile
rename to doc/howto/usage/cluster/src/k8s_train/Dockerfile
index c0fca1f9a9..77f021a89a 100644
--- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/cluster/src/k8s_train/Dockerfile
@@ -1,4 +1,4 @@
-FROM paddledev/paddle:cpu-latest
+FROM paddlepaddle/paddle:latest
 
 COPY start.sh /root/
 COPY start_paddle.py /root/
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/cluster/src/k8s_train/README.md
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/README.md
rename to doc/howto/usage/cluster/src/k8s_train/README.md
diff --git a/doc/howto/usage/k8s/src/k8s_train/start.sh b/doc/howto/usage/cluster/src/k8s_train/start.sh
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start.sh
rename to doc/howto/usage/cluster/src/k8s_train/start.sh
diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/cluster/src/k8s_train/start_paddle.py
similarity index 100%
rename from doc/howto/usage/k8s/src/k8s_train/start_paddle.py
rename to doc/howto/usage/cluster/src/k8s_train/start_paddle.py
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/cluster/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/k8s/src/managed_policy.png
rename to doc/howto/usage/cluster/src/managed_policy.png
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/cluster/src/pserver_and_trainer.png
similarity index 100%
rename from doc/howto/usage/k8s/src/pserver_and_trainer.png
rename to doc/howto/usage/cluster/src/pserver_and_trainer.png
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/cluster/src/route53_create_recordset.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_recordset.png
rename to doc/howto/usage/cluster/src/route53_create_recordset.png
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/cluster/src/route53_create_zone.png
similarity index 100%
rename from doc/howto/usage/k8s/src/route53_create_zone.png
rename to doc/howto/usage/cluster/src/route53_create_zone.png
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/usage/cluster/src/trainer.png
new file mode 100644
index 0000000000..6537d3d565
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer.png differ
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/usage/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000..f9525739cc
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
new file mode 100644
index 0000000000..9a65f14628
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
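+# N-gram window size: the first 4 words are used to predict the 5th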
+N = 5
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def main():
+    # for local training
+    cluster_train = False
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
new file mode 100644
index 0000000000..2afce9a66e
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import os
+import paddle.v2 as paddle
+import pickle
+
+embsize = 32
+hiddensize = 256
+N = 5
+cluster_train_file = "./train_data_dir/train/train.txt"
+cluster_test_file = "./test_data_dir/test/test.txt"
+node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+if not node_id:
+    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.embedding(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0,
+            sparse_update=True))
+    return wordemb
+
+
+def cluster_reader_cluster(filename, node_id):
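+    # Each trainer reads only its own shard, e.g. train.txt-00001 on node 1.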
+    def cluster_reader():
+        with open("-".join([filename, "%05d" % int(node_id)]), "r") as f:
+            for l in f:
+                csv_data = [int(cell) for cell in l.split(",")]
+                yield tuple(csv_data)
+
+    return cluster_reader
+
+
+def main():
+    # get arguments from env
+
+    # for local training
+    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
+    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
+    # parse to bool: a non-empty string such as "False" would always be truthy
+    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False") in TRUTH
+
+    if not cluster_train:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
+    else:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
+            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
+            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
+            ports_num_for_sparse=int(
+                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
+            num_gradient_servers=int(
+                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
+            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
+            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
+    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")
+    word_dict = pickle.load(fn)
+    fn.close()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                result = trainer.test(
+                    paddle.batch(
+                        cluster_reader_cluster(cluster_test_file, node_id), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/usage/cluster/src/word2vec/prepare.py
new file mode 100644
index 0000000000..ade01c378e
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+
+SPLIT_COUNT = 3
+N = 5
+
+
+def file_len(fd):
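+    # count lines by exhausting the file iterator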
+    for i, l in enumerate(fd):
+        pass
+    return i + 1
+
+
+def split_from_reader_by_line(filename, reader, split_count):
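+    # Write every record from the reader as one comma-separated line, then
+    # shard the file with the POSIX `split` tool into split_count pieces
+    # named <filename>-00000, <filename>-00001, ...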
+    fn = open(filename, "w")
+    for batch_id, batch_data in enumerate(reader()):
+        batch_data_str = [str(d) for d in batch_data]
+        fn.write(",".join(batch_data_str))
+        fn.write("\n")
+    fn.close()
+
+    fn = open(filename, "r")
+    total_line_count = file_len(fn)
+    fn.close()
+    per_file_lines = total_line_count / split_count + 1
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+
+
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:
+    pickle.dump(word_dict, dict_f)
+
+split_from_reader_by_line("train.txt",
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/cluster/src/worker_security_group.png
similarity index 100%
rename from doc/howto/usage/k8s/src/worker_security_group.png
rename to doc/howto/usage/cluster/src/worker_security_group.png
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index f7aa525054..2dea231ca5 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -63,7 +63,7 @@
 </tr>
 
 <tr>
-<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
deleted file mode 100644
index 4c3dc81ed3..0000000000
--- a/doc/howto/usage/k8s/k8s_basis_cn.md
+++ /dev/null
@@ -1,75 +0,0 @@
-# Kubernetes 简介
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods,job启动后会创建这些pod并开始执行一个程序,等待这个程序执行成功并返回0则成功退出,如果执行失败,也可以配置不同的重试机制。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。
-
-- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
-
-- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合,将外部的存储服务在Kubernetes中描述成为统一的资源形式,便于存储资源管理和Pod引用。
-
-## 部署Kubernetes集群
-
-Kubernetes提供了多种集群部署的方案,本文档内不重复介绍。这里给出集中常见的部署方法:
-
-- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器,便于本地验证和测试。
-- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统,不同主机(Bare-Metal, AWS, GCE)条件下,快速部署集群。
-- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
-- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
-
-可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
-
-## 选择存储方案
-
-容器不会保留在运行时生成的数据,job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务,需要有一个外部的存储服务来保存训练所需数据和训练输出。
-常见的可选存储服务包括:
-
-- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单,可以用于小量数据的验证。不提供分布式存储,高可用,冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
-- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统,可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
-- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统,支持rbd,POSIX API接口(ceph fs)和对象存储API,参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
-- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
-
-## 配置kubectl
-
-### 安装kubectl
-```
-# OS X
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
-
-# Linux
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
-
-# Windows
-curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
-```
-
-### 配置kubectl访问你的kubernetes集群
-
-编辑`~/.kube/config`这个配置文件,修改`Master-IP`的地址。如果使用SSL认证,则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问(比如通过8080端口),也可以去掉这些证书的配置。
-```
-apiVersion: v1
-clusters:
-- cluster:
-    certificate-authority: /path/to/ca.crt
-    server: https://[Master-IP]:443
-  name: minikube
-contexts:
-- context:
-    cluster: minikube
-    user: minikube
-  name: minikube
-current-context: minikube
-kind: Config
-preferences: {}
-users:
-- name: minikube
-  user:
-    client-certificate: /path/to/apiserver.crt
-    client-key: /Users/wuyi/.minikube/apiserver.key
-```
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
deleted file mode 100644
index 2183a232ad..0000000000
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 9279bac7f4..ada51c2d73 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,3 +8,4 @@ PaddlePaddle 文档
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
+  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 168c7667c6..23b64b6cad 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,4 @@ PaddlePaddle Documentation
   getstarted/index_en.rst
   howto/index_en.rst
   api/index_en.rst
-  about/index_en.rst
+  mobile/index_en.rst
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
new file mode 100644
index 0000000000..ae24ced770
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -0,0 +1,177 @@
+# Android平台编译指南
+
+用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
+
+- [基于Docker容器的编译方式](#基于docker容器的编译方式)
+- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
+
+## 基于Docker容器的编译方式
+Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
+
+### 构建PaddlePaddle的Android开发镜像
+我们把PaddlePaddle的交叉编译环境打包成一个镜像,称为开发镜像,里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t username/paddle-android:dev . -f Dockerfile.android
+```
+
+用户也可以使用PaddlePaddle提供的官方开发镜像:
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+### 编译PaddlePaddle C-API库
+构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
+Android的Docker开发镜像向用户提供两个可配置的参数:
+
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 16</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
+
+- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库
+
+  ```bash
+  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+  ```
+
+- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
+
+  ```bash
+  $ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+  ```
+
+执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+
+## 基于Linux交叉编译环境的编译方式
+本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
+
+### 准备交叉编译环境
+
+从源码交叉编译PaddlePaddle,用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn),用户可自行前往下载预编译好的版本,也可通过以下命令获取:
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK中包含了所有Android API级别、所有架构(arm/arm64/x86/mips)需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别,构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。
+
+- 构建`armeabi-v7a`、 `Android API 21`的独立工具链:
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+```
+
+此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
+
+- 构建`arm64-v8a`、 `Android API 21`的独立工具链:
+
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
+
+此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
+
+### Configure Cross-Compiling Arguments
+
+CMake supports cross-compiling; see [cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). To simplify the cmake configuration, PaddlePaddle provides the toolchain file [cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake) with default compiler and build settings. Note that starting with CMake 3.7, CMake itself provides general support for cross-compiling to the Android platform. When PaddlePaddle detects a CMake version of 3.7 or newer, it passes the user-supplied configuration through to CMake and lets CMake handle it; see [cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling) for the detailed parameter documentation.
+
+When cross-compiling the Android version of the PaddlePaddle library, some arguments must be set:
+- `CMAKE_SYSTEM_NAME`, the target platform of the CMake build; it must be set to `Android`. Only after `CMAKE_SYSTEM_NAME=Android` is set does PaddlePaddle's CMake system treat the build as cross-compiling for Android and automatically compile all the third-party libraries PaddlePaddle needs. It also forces the values of some PaddlePaddle options (`WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF`, `WITH_GOLANG=OFF`).
+- `WITH_C_API` must be set to `ON`, since only C-API inference is supported on the Android platform.
+- `WITH_SWIG_PY` must be set to `OFF`; training or inference through SWIG is not supported on the Android platform.
+
+Optional arguments for the Android platform:
+
+- `ANDROID_STANDALONE_TOOLCHAIN`, the absolute path of the standalone toolchain, or its path relative to the build directory. PaddlePaddle's CMake system derives the cross-compiler, sysroot, and Android API level from this value; otherwise, you must set those values manually at cmake time. No default value.
+- `ANDROID_TOOLCHAIN`, the target toolchain. Can be `gcc` or `clang`; the default is `clang`.
+	- With CMake 3.7 and above, the `clang` toolchain is always used; below CMake 3.7, you can set `ANDROID_TOOLCHAIN=gcc` to use the `gcc` toolchain.
+	- The official Android `clang` compiler requires a system with `GLIBC 2.15` or above.
+- `ANDROID_ABI`, the target ABI. Currently `armeabi-v7a` and `arm64-v8a` are supported; the default is `armeabi-v7a`.
+- `ANDROID_NATIVE_API_LEVEL`, the Android API level of the toolchain. If not set explicitly, PaddlePaddle derives it automatically from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
+- `ANDROID_ARM_MODE`, whether to use ARM mode.
+	- When `ANDROID_ABI=armeabi-v7a`, it can be `ON/OFF`; the default is `ON`.
+	- When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
+- `ANDROID_ARM_NEON`, whether to use NEON instructions.
+	- When `ANDROID_ABI=armeabi-v7a`, it can be `ON/OFF`; the default is `ON`.
+	- When `ANDROID_ABI=arm64-v8a`, it does not need to be set.
+
+Other arguments:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use the Eigen library for matrix computation. Can be `ON/OFF`; the default is `OFF`.
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler, used when building the host-side protoc executable and the target-side OpenBLAS library. It defaults to the value of the environment variables `CC/CXX`; if `CC/CXX` are not set, the `cc/c++` compilers are used. A sketch of pinning it explicitly follows.
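+
+For example (a sketch; the compiler paths are illustrative), the host compiler can be pinned through the environment before invoking cmake:
+
+```bash
+# Host compilers used to build protoc and OpenBLAS; adjust the paths to your system.
+export CC=/usr/bin/gcc
+export CXX=/usr/bin/g++
+```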
+
+Commonly used cmake configurations are as follows:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can also set other build options to suit your needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. You can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS` manually, as in the sketch below.
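+
+A minimal sketch of a size-optimized configuration (the extra flag is illustrative, not required):
+
+```bash
+# Optimize for size and pass an extra compiler flag through CMAKE_CXX_FLAGS.
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+      -DCMAKE_BUILD_TYPE=MinSizeRel \
+      -DCMAKE_CXX_FLAGS="-Os" \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```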
+
+**Performance tips.** To achieve the fastest computation speed, we suggest the following CMake settings:
+
+- Set `CMAKE_BUILD_TYPE` to `Release`.
+- Use the `clang` toolchain.
+- For `armeabi-v7a`, set `USE_EIGEN_FOR_BLAS=ON` to use Eigen for matrix computation; for `arm64-v8a`, set `USE_EIGEN_FOR_BLAS=OFF` to use OpenBLAS.
+
+### Build and Install
+
+After the CMake configuration is done, run the following commands. PaddlePaddle will automatically download and build all third-party dependencies, then build and install the PaddlePaddle inference library.
+
+```bash
+make
+make install
+```
+
+Note: if you have previously built the PaddlePaddle library for another platform in the source directory, first remove the `third_party` and `build` directories with `rm -rf` to make sure all third-party dependencies and PaddlePaddle itself are rebuilt against the new CMake configuration, for example as sketched below.
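+
+A sketch (run from the PaddlePaddle source root):
+
+```bash
+# Remove stale third-party builds and the previous CMake build tree.
+rm -rf third_party/ build/
+```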
+
+After the install command completes, the `your/path/to/install` directory will contain the `include`, `lib`, and `third_party` directories: `include` holds the C-API header files, `lib` holds the PaddlePaddle libraries for the various Android ABIs, and `third_party` holds all the third-party dependencies. At this point PaddlePaddle is installed; you can use the files generated under `your/path/to/install` in deep-learning Android apps. See the C-API documentation for how to call the library.
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
new file mode 100644
index 0000000000..0cf50181df
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -0,0 +1,183 @@
+# Build PaddlePaddle for Android
+
+There are two approaches to build PaddlePaddle for Android: 
+
+- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
+- [Cross-Compiling on Linux](#cross-compiling-on-linux) 
+
+## Cross-Compiling Using Docker
+
+Docker-based cross-compiling is the recommended approach because Docker runs on all major operating systems, including Linux, Mac OS X, and Windows.
+
+### Build the Docker Image
+
+The following steps pack all the tools that we need to build PaddlePaddle into a Docker image.
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t paddle:dev-android . -f Dockerfile.android
+```
+
+Users can directly use the published Docker image.
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
+### Build the Inference Library
+
+We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
+
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" paddle:dev-android
+```
+
+The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
+
+<table class="docutils">
+<colgroup>
+  <col width="25%" />
+  <col width="50%" />
+  <col width="25%" />
+</colgroup>
+<thead valign="bottom">
+  <tr class="row-odd">
+  <th class="head">Argument</th>
+  <th class="head">Optional Values</th>
+  <th class="head">Default</th>
+</tr>
+</thead>
+<tbody valign="top">
+  <tr class="row-even">
+  <td>ANDROID_ABI</td>
+  <td>armeabi-v7a, arm64-v8a</td>
+  <td>armeabi-v7a</td>
+</tr>
+<tr class="row-odd">
+  <td>ANDROID_API</td>
+  <td>>= 16</td>
+  <td>21</td>
+</tr>
+</tbody>
+</table>
+
+The ARM-64 architecture (`arm64-v8a`) requires at least level 21 of Android API.
+
+The default entry-point of the Docker image, [`paddle/scripts/docker/build_android.sh`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh), generates the [Android cross-compiling standalone toolchain](https://developer.android.com/ndk/guides/standalone_toolchain.html) based on the arguments `ANDROID_ABI` and `ANDROID_API`.  For information about other configuration arguments, please continue reading.
+
+The above command generates and outputs the inference library in `$PWD/install_android` and puts third-party libraries in `$PWD/install_android/third_party`.
+
+## Cross-Compiling on Linux
+
+The Linux-based approach to cross-compiling is to run the steps in `Dockerfile.android` manually on a Linux x64 computer.
+
+### Setup the Environment
+
+To build for Android, we need the [Android NDK](https://developer.android.com/ndk/downloads/index.html):
+
+```bash
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+```
+
+Android NDK includes everything we need to build the [*standalone toolchain*](https://developer.android.com/ndk/guides/standalone_toolchain.html), which is then used to build PaddlePaddle for Android.  (We plan to remove the intermediate step of building the standalone toolchain in the near future.)
+
+- To build the standalone toolchain for `armeabi-v7a` and Android API level 21:
+
+  ```bash
+  your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+          --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
+  ```
+  
+  The generated standalone toolchain will be in `your/path/to/arm_standalone_toolchain`.
+
+- To build the standalone toolchain for `arm64-v8a` and Android API level 21:
+
+  ```bash
+  your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+          --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+  ```
+
+  The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
+
+### Cross-Compiling Arguments
+
+CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling).  PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake.  `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively.  PaddlePaddle detects the CMake version; for versions newer than 3.7, it uses [the official support](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
+
+Some other CMake arguments you need to know:
+
+- `CMAKE_SYSTEM_NAME` must be `Android`.  This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
+- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
+- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
+
+Some Android-specific arguments:
+
+- `ANDROID_STANDALONE_TOOLCHAIN`: the absolute path of the Android standalone toolchain, or the path relative to the CMake build directory.  PaddlePaddle's CMake extensions would derive the cross-compiler, sysroot and Android API level from this argument.
+- `ANDROID_TOOLCHAIN`: could be `gcc` or `clang`.  The default value is `clang`.
+  - For CMake >= 3.7, `clang` is always used.  For older versions, it could be `gcc`.
+  - Android's official `clang` requires `glibc` >= 2.15.
+- `ANDROID_ABI`: could be `armeabi-v7a` or `arm64-v8a`.  The default value is `armeabi-v7a`.
+- `ANDROID_NATIVE_API_LEVEL`: could be derived from the value of `ANDROID_STANDALONE_TOOLCHAIN`.
+- `ANDROID_ARM_MODE`:
+  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+  - no need to specify when `ANDROID_ABI=arm64-v8a`.
+- `ANDROID_ARM_NEON`: indicates if to use NEON instructions.
+  - could be `ON` or `OFF`, and defaults to `ON`, when `ANDROID_ABI=armeabi-v7a`;
+  - no need to specify when `ANDROID_ABI=arm64-v8a`.
+
+Other useful arguments:
+
+- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen.  Could be `ON` or `OFF`, defaults to `OFF`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and the target-specific OpenBLAS.  It defaults to the value of the environment variables `CC/CXX`, or to `cc/c++` if they are unset.  A sketch of setting it explicitly follows.
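+
+For example (a sketch; the compiler paths are illustrative), pin the host compiler through the environment before running cmake:
+
+```bash
+# Pin the host compilers used for protoc and OpenBLAS; adjust paths to your system.
+export CC=/usr/bin/gcc
+export CXX=/usr/bin/g++
+```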
+
+Some frequent configurations for your reference:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+There are some other arguments you might want to configure.
+
+- `CMAKE_BUILD_TYPE=MinSizeRel` minimizes the size of the library.
+- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
+
+Our own tips for performance optimization are to use clang, and Eigen or OpenBLAS:
+
+- `CMAKE_BUILD_TYPE=Release`
+- `ANDROID_TOOLCHAIN=clang`
+- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` (OpenBLAS) for `arm64-v8a`.
+
+### Build and Install
+
+After running `cmake`, we can run `make; make install` to build and install.
+
+Before building, you might want to remove the `third_party` and `build` directories, which may contain libraries pre-built for other architectures, for example as sketched below.
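+
+A sketch (run from the PaddlePaddle source root):
+
+```bash
+# Clear build caches left over from builds for other platforms.
+rm -rf third_party/ build/
+```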
+
+After building, the directory `CMAKE_INSTALL_PREFIX` will contain three sub-directories:
+
+- `include`: the header files of the inference library,
+- `lib`: the inference library built for various Android ABIs,
+- `third_party`: dependent third-party libraries built for Android.
diff --git a/doc/mobile/cross_compiling_for_ios_cn.md b/doc/mobile/cross_compiling_for_ios_cn.md
new file mode 100644
index 0000000000..d5196d9a4c
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_cn.md
@@ -0,0 +1,117 @@
+# Build PaddlePaddle for iOS
+Cross-compiling the PaddlePaddle library for the iOS platform must be done on macOS. This document describes how to cross-compile the PaddlePaddle library for iOS from source on macOS.
+
+## Prepare the Cross-Compiling Environment
+Apple provides a complete cross-compiling toolchain and IDE for iOS development; simply download and install [Xcode](https://developer.apple.com/cn/xcode/) from the App Store, or from the official website. After installation, run `xcodebuild -version` on the command line to check that the installation succeeded.
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Configure Cross-Compiling Arguments
+
+PaddlePaddle provides the toolchain file [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake) with default compiler and build settings for cross-compiling.
+
+When cross-compiling the iOS version of the PaddlePaddle library, some arguments must be set:
+
+- `CMAKE_SYSTEM_NAME`, the target platform of the CMake build; it must be set to `iOS`. After `CMAKE_SYSTEM_NAME=iOS` is set, PaddlePaddle's CMake system automatically compiles all third-party dependencies and forces the values of some PaddlePaddle options (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`).
+- `WITH_C_API`, whether to build the C-API inference library; it must be set to `ON`, since only C-API inference is supported on the iOS platform.
+- `WITH_SWIG_PY` must be set to `OFF`; training or inference through SWIG is not supported on the iOS platform.
+
+Optional arguments for the iOS platform:
+
+- `IOS_PLATFORM`, can be set to `OS` (the default) or `SIMULATOR`.
+  - `OS`, targets physical `arm`-based devices such as iPhone and iPad.
+  - `SIMULATOR`, targets the `x86`-based simulator platform.
+- `IOS_ARCH`, the target architecture. The valid architectures for each `IOS_PLATFORM` are listed in the table below; by default, all architectures are compiled:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, the minimum iOS deployment version; the default is `7.0`.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3); can be `ON/OFF`, the default is `ON`.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS matrix computation; can be `ON/OFF`, the default is `OFF`.
+- `IOS_DEVELOPMENT_ROOT`, the `Developer` directory; can be set explicitly to `/path/to/platform/Developer`. If not set explicitly, PaddlePaddle selects the `Developer` directory of the Xcode `platform` that corresponds to `IOS_PLATFORM`.
+- `IOS_SDK_ROOT`, the root directory of the `SDK` to use; can be set explicitly to `/path/to/platform/Developer/SDKs/SDK`. If not set explicitly, PaddlePaddle selects the newest `SDK` version under `IOS_DEVELOPMENT_ROOT`; a sketch for locating it follows this list.
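+
+To see which SDK roots your Xcode installation provides (a sketch using the standard `xcrun` tool; the printed paths vary with the Xcode version):
+
+```bash
+# Print the default SDK roots for device and simulator builds.
+xcrun --sdk iphoneos --show-sdk-path
+xcrun --sdk iphonesimulator --show-sdk-path
+```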
+
+Other arguments:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use the Eigen library for matrix computation; effective only when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Can be `ON/OFF`, the default is `OFF`.
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler. Defaults to the value of the environment variables `CC/CXX`; if `CC/CXX` are not set, the `cc/c++` compilers are used.
+
+Commonly used cmake configurations are as follows:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can also set other build options to suit your needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. You can also influence PaddlePaddle's build by setting `CMAKE_C/CXX_FLAGS` manually.
+
+**Performance tips.** To achieve the fastest computation speed, we suggest the following CMake settings:
+
+- Set `CMAKE_BUILD_TYPE` to `Release`.
+- Set `IOS_USE_VECLIB_FOR_BLAS=ON` to use the BLAS functions provided by the `vecLib` framework for matrix computation.
+
+## Build and Install
+
+After the CMake configuration is done, run the following commands. PaddlePaddle will automatically download and build all third-party dependencies, then build and install the PaddlePaddle inference library.
+
+```bash
+$ make
+$ make install
+```
+
+Note: if you have previously built the PaddlePaddle library for another platform in the source directory, first remove the `third_party` and `build` directories with `rm -rf` to make sure all third-party dependencies and PaddlePaddle itself are rebuilt against the new CMake configuration.
+
+After the install command completes, the `your/path/to/install` directory will contain:
+
+- an `include` directory with all the C-API header files
+- a `lib` directory with the PaddlePaddle C-API static library
+- a `third_party` directory with all the third-party dependencies
+
+Note that if the PaddlePaddle library needs to support both physical devices and the simulator, you must build the device and simulator versions separately and then merge them into a fat library with the `lipo` tool, as sketched below.
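+
+A minimal sketch (the library names are illustrative; substitute the static libraries produced by your two builds):
+
+```bash
+# Merge a device build and a simulator build into one fat static library.
+lipo -create ios_device/lib/libpaddle_capi.a ios_simulator/lib/libpaddle_capi.a \
+     -output libpaddle_capi_fat.a
+```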
+
+At this point the PaddlePaddle library is installed; you can use the merged fat library in deep-learning iOS apps. See the C-API documentation for how to call the library.
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
new file mode 100644
index 0000000000..19bfe86c51
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -0,0 +1,120 @@
+# Build PaddlePaddle for iOS
+
+This tutorial will walk you through cross-compiling the PaddlePaddle library for iOS from source on macOS.
+
+## Preparation
+
+Apple provides Xcode, which serves both for cross-compiling and as the IDE for iOS development. Download it from the App Store or [here](https://developer.apple.com/cn/xcode/). To verify your installation, run the following command:
+
+```bash
+$ xcodebuild -version
+Xcode 9.0
+Build version 9A235
+```
+
+## Cross-compiling configurations
+
+PaddlePaddle provides cross-compiling toolchain configuration documentation [cmake/cross_compiling/ios.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/ios.cmake), which has some default settings for frequently used compilers.
+
+There are some mandatory CMake arguments that need to be set before cross-compiling PaddlePaddle for iOS:
+
+- `CMAKE_SYSTEM_NAME`, the CMake target platform name, has to be `iOS`. PaddlePaddle's CMake will compile all the third-party dependencies and enforce some parameters (`WITH_C_API=ON`, `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`) when this variable is set to `iOS`.
+
+- `WITH_C_API`, whether to compile the C-API inference library; it has to be `ON`, since C-API is the only supported interface for inference on iOS.
+- `WITH_SWIG_PY`, has to be `OFF`. Training or inference via SWIG is not supported on iOS.
+
+Optional CMake arguments for iOS are:
+
+- `IOS_PLATFORM`, either `OS` (default) or `SIMULATOR`.
+  - `OS`, builds for ARM-based physical devices like iPhone or iPad.
+  - `SIMULATOR`, builds for x86-based simulators.
+- `IOS_ARCH`, the target architecture. By default, all architecture types will be compiled. If you need to specify a particular architecture, please find the valid values for each `IOS_PLATFORM` setting in the table below:
+
+    <table class="docutils">
+    <colgroup>
+      <col width="35%" />
+      <col width="65%" />
+    </colgroup>
+    <thead valign="bottom">
+      <tr class="row-odd">
+      <th class="head">IOS_PLATFORM</th>
+      <th class="head">IOS_ARCH</th>
+    </tr>
+    </thead>
+    <tbody valign="top">
+      <tr class="row-even">
+      <td>OS</td>
+      <td>armv7, armv7s, arm64 </td>
+    </tr>
+    <tr class="row-odd">
+      <td>SIMULATOR</td>
+      <td>i386, x86_64 </td>
+    </tr>
+    </tbody>
+    </table>
+
+- `IOS_DEPLOYMENT_TARGET`, the minimum iOS version to deploy to, `7.0` by default.
+- `IOS_ENABLE_BITCODE`, whether to enable [Bitcode](https://developer.apple.com/library/content/documentation/IDEs/Conceptual/AppDistributionGuide/AppThinning/AppThinning.html#//apple_ref/doc/uid/TP40012582-CH35-SW3). Values can be `ON/OFF`, `ON` by default.
+- `IOS_USE_VECLIB_FOR_BLAS`, whether to use the [vecLib](https://developer.apple.com/documentation/accelerate/veclib) framework for BLAS computation. Values can be `ON/OFF`, `OFF` by default.
+- `IOS_DEVELOPMENT_ROOT`, the path to the `Developer` directory; can be explicitly set to `/path/to/platform/Developer`. If left blank, PaddlePaddle will automatically pick the `Developer` directory of the Xcode `platform` matching your `IOS_PLATFORM` value.
+- `IOS_SDK_ROOT`, the path to the `SDK` root; can be explicitly set to `/path/to/platform/Developer/SDKs/SDK`. If left blank, PaddlePaddle will pick the latest SDK under `IOS_DEVELOPMENT_ROOT`.
+
+Other settings:
+
+- `USE_EIGEN_FOR_BLAS`, whether to use Eigen for matrix computation; effective when `IOS_USE_VECLIB_FOR_BLAS=OFF`. Values can be `ON/OFF`, `OFF` by default.
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler. Uses the value of the environment variables `CC/CXX` by default, or `cc/c++` if `CC/CXX` are not set.
+
+Some typical cmake configurations:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DIOS_ARCH="armv7;arm64" \
+      -DIOS_ENABLE_BITCODE=ON \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=SIMULATOR \
+      -DIOS_ARCH="x86_64" \
+      -DIOS_USE_VECLIB_FOR_BLAS=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+You can set other compiling parameters for your own needs, e.g. if you are trying to minimize the library size, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; or if performance is your concern, set `CMAKE_BUILD_TYPE` to `Release`. You can even influence the PaddlePaddle compiling procedure by manually setting the `CMAKE_C/CXX_FLAGS` values.
+
+**Tips for better performance**:
+
+- set `CMAKE_BUILD_TYPE` with `Release`
+- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
+
+## Build and install
+
+After CMake finishes, run the following commands; PaddlePaddle will download and compile the third-party dependencies, then compile and install the PaddlePaddle inference library.
+
+```bash
+$ make
+$ make install
+```
+
+Please note: if you have compiled PaddlePaddle for other platforms in the source directory before, remove the `third_party` and `build` directories within the source tree with `rm -rf` to ensure that all the third-party dependencies and PaddlePaddle itself are freshly compiled with the current CMake configuration.
+
+The `your/path/to/install` directory will contain the following sub-directories after `make install`:
+
+- `include`, contains all the C-API header files.
+- `lib`, contains the PaddlePaddle C-API static library.
+- `third_party`, contains all the 3rd-party libraries.
+
+Please note: if the PaddlePaddle library needs to support both physical devices and simulators, you will need to compile for each of them separately, then merge the results into a fat library with `lipo`. A sketch for checking the merged library follows.
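+
+A quick check (a sketch; the library name is illustrative) that a merged library really contains both architectures:
+
+```bash
+# List the architectures contained in the fat library.
+lipo -info libpaddle_capi_fat.a
+```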
+
+Now you have the PaddlePaddle library compiled and installed; the fat library can be used in deep-learning-related iOS apps. Please refer to the C-API documentation for usage guides.
diff --git a/doc/mobile/cross_compiling_for_raspberry_cn.md b/doc/mobile/cross_compiling_for_raspberry_cn.md
new file mode 100644
index 0000000000..f8ef9dc803
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_raspberry_cn.md
@@ -0,0 +1,62 @@
+# Build PaddlePaddle for Raspberry Pi
+
+There are generally two ways to build a Raspberry Pi version:
+
+1. Log in to a Raspberry Pi system, e.g. via ssh, and build there. The required development tools and third-party libraries are listed in [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
+
+1. The other way is cross-compiling. This document describes how to cross-compile PaddlePaddle for the Raspberry Pi platform on Linux/x64.
+
+## Install the Cross-Compiler
+
+Clone the following GitHub repo:
+
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+
+The cross-compiler arm-linux-gnueabihf-gcc 4.8.3 can then be found in the `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64` directory. Running this toolchain requires a Linux x64 machine with glibc 2.14 or newer; a quick check is sketched below.
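+
+A minimal sketch (the paths are illustrative) to confirm the toolchain runs on your machine:
+
+```bash
+# Put the cross-compiler on PATH and print its version.
+export PATH=$PWD/tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64/bin:$PATH
+arm-linux-gnueabihf-gcc --version
+```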
+
+## Configure Cross-Compiling Arguments
+
+CMake [supports cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). The Raspberry Pi configuration for PaddlePaddle lives in [cmake/cross_compiling/raspberry_pi.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
+
+When cross-compiling the Raspberry Pi version of the PaddlePaddle library, some arguments must be set:
+
+- `CMAKE_SYSTEM_NAME`: the target platform of the CMake build; it must be set to `RPi`. Only after `CMAKE_SYSTEM_NAME=RPi` is set does PaddlePaddle's CMake system treat the build as cross-compiling for the Raspberry Pi and automatically build the host-side protoc executable, the target-side protobuf library, and the target-side OpenBLAS library.
+
+- `RPI_TOOLCHAIN`: the absolute path of the toolchain, or its path relative to the build directory. PaddlePaddle's CMake system derives the cross-compiler from this value; otherwise, you must set it manually at cmake time. No default value.
+
+- `RPI_ARM_NEON`: whether to use NEON instructions. Currently it must be set to `ON`; the default is `ON`.
+
+- `HOST_C/CXX_COMPILER`, the host C/C++ compiler, used when building the host-side protoc executable and the target-side OpenBLAS library. Defaults to the value of the environment variable `CC`; if `CC` is not set, the `cc` compiler is used.
+
+A commonly used CMake configuration is as follows:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+Here `WITH_C_API=ON` means the inference library will be built.
+
+You can also set other build options to suit your needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`.
+
+## Build and Install
+
+After the CMake configuration is done, run the following commands. PaddlePaddle will automatically download and build all third-party dependencies, then build and install PaddlePaddle.
+
+```bash
+make
+make install
+```
+
+Note: if you have previously built the PaddlePaddle library for another platform in the source directory, first remove the `third_party` and `build` directories with `rm -rf` to make sure all third-party dependencies and PaddlePaddle itself are rebuilt against the new CMake configuration.
+
+After the install command completes, the `your/path/to/install` directory will contain the `include` and `lib` directories, where `include` holds the C-API header files and `lib` holds the Raspberry Pi build of the library.
diff --git a/doc/mobile/cross_compiling_for_raspberry_en.md b/doc/mobile/cross_compiling_for_raspberry_en.md
new file mode 100644
index 0000000000..3c1a5950ff
--- /dev/null
+++ b/doc/mobile/cross_compiling_for_raspberry_en.md
@@ -0,0 +1,62 @@
+# Build PaddlePaddle for Raspberry Pi
+
+You may use any of the following two approaches to build the inference library of PaddlePaddle for Raspberry Pi:
+
+1. Build using SSH: Log in to a Raspberry Pi using SSH and build the library. The required development tools and third-party dependencies are listed here: [`/Dockerfile`](https://github.com/PaddlePaddle/Paddle/blob/develop/Dockerfile).
+
+1. Cross-compile: This article covers, in more detail, how to cross-compile PaddlePaddle for Raspberry Pi on a Linux/x64 machine.
+
+## The Cross-Compiling Toolchain
+
+Step 1. Clone the Github repo by running the following command.
+
+```bash
+git clone https://github.com/raspberrypi/tools.git
+```
+
+Step 2. Use the pre-built cross-compiler found in `./tools/tree/master/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64`.  To run it on a Linux computer, glibc version >= 2.14 is needed.
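+
+A minimal sketch (the path is illustrative) to check that the toolchain runs on your machine:
+
+```bash
+# Print the cross-compiler version; failure here usually indicates a glibc mismatch.
+./tools/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64/bin/arm-linux-gnueabihf-gcc --version
+```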
+
+## CMake Arguments
+
+CMake supports [cross-compiling](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling).  All CMake configuration arguments required for the cross-compilation for Raspberry Pi can be found in [`cmake/cross_compiling/raspberry_pi.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/raspberry_pi.cmake).
+
+Some important arguments that need to be set:
+
+- `CMAKE_SYSTEM_NAME`: The target platform.  Must be `RPi`.
+
+- `RPI_TOOLCHAIN`: The absolute path of the cross-compiling toolchain.
+
+- `RPI_ARM_NEON`: Use ARM NEON Intrinsics. This is a required argument and defaults to `ON`.
+
+- `HOST_C/CXX_COMPILER`: The C/C++ compiler for the host.  It is used to build the build tools that run on the host, for example, protoc.
+
+A commonly-used CMake configuration is as follows:
+
+```bash
+cmake -DCMAKE_SYSTEM_NAME=RPi \
+      -DRPI_TOOLCHAIN=your/path/to/arm-bcm2708/gcc-linaro-arm-linux-gnueabihf-raspbian-x64 \
+      -DRPI_ARM_NEON=ON \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_GPU=OFF \
+      -DWITH_C_API=ON \
+      -DWITH_PYTHON=OFF \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
+To build the inference library, please set the argument WITH\_C\_API to ON: `WITH_C_API=ON`.
+
+You can add more arguments. For example, to minimize the size of the generated inference library, you may use `CMAKE_BUILD_TYPE=MinSizeRel`. For performance optimization, you may use `CMAKE_BUILD_TYPE=Release`.
+
+## Build and Install
+
+The following commands build the inference library of PaddlePaddle for Raspberry Pi and third-party dependencies.
+
+```bash
+make
+make install
+```
+
+The intermediate files will be stored in `build`. Third-party libraries will be located in `build/third_party`. If you have already built for other platforms like Android or iOS, you may want to clear these directories by running the command `rm -rf build`.
+
+The inference library will be in `your/path/to/install/lib`, with the related header files in `your/path/to/install/include`.
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
new file mode 100644
index 0000000000..1d99666e58
--- /dev/null
+++ b/doc/mobile/index_cn.rst
@@ -0,0 +1,9 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_cn.md
+  cross_compiling_for_ios_cn.md
+  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
new file mode 100644
index 0000000000..ef421dacad
--- /dev/null
+++ b/doc/mobile/index_en.rst
@@ -0,0 +1,9 @@
+MOBILE
+======
+
+..  toctree::
+  :maxdepth: 1
+
+  cross_compiling_for_android_en.md
+  cross_compiling_for_ios_en.md
+  cross_compiling_for_raspberry_en.md
diff --git a/doc/survey/cluster_bootstrapping_tools.md b/doc/survey/cluster_bootstrapping_tools.md
new file mode 100644
index 0000000000..1cd9962700
--- /dev/null
+++ b/doc/survey/cluster_bootstrapping_tools.md
@@ -0,0 +1,71 @@
+# Cluster bootstrapping tool survey
+## Abstract
+In order to bring a cluster of bare-metal machines up to a fully functional Kubernetes cluster for PaddlePaddle to run on, we need to utilize some tools. Here we are going to compare [Sextant](https://github.com/k8sp/sextant) and [Tectonic installer](https://github.com/coreos/tectonic-installer).
+
+## Basic assumptions
+Here are some basic assumptions before we move on to the details:
+1. You are an administrator of a bare-metal machine cluster, which means:
+  * you have full control over each of the machines.
+  * you have full control over the network the machines are connected to.
+2. Machines can be booted from the network with PXE or iPXE.
+3. You understand the [general procedure to bring up a cluster](#appendix-general-procedure-to-bring-up-a-cluster).
+
+If your cluster checks all of the boxes above, keep reading.
+
+## Comparing Sextant and Tectonic installer
+### Sextant
+Sextant is an end-to-end solution for bringing a bare-metal cluster up to a fully functional k8s cluster; it integrates DHCP, name service, PXE, cloud-config service, and docker registry services altogether.
+
+#### Pros
+1. End-to-end: basically all the admin needs to do is configure cluster.yaml and power on the cluster.
+2. Offline cluster configuration: Sextant has two phases, config time and deploy time. While configuring, the admin's machine needs internet connectivity to download some images, etc. But at deploy time it is completely OK to go offline, since all dependencies were fetched at config time.
+3. Docker registry integrated.
+4. GPU machines taken care of.
+
+#### Cons
+1. The k8s API server is not deployed with high availability by default.
+2. No grouping support.
+3. No API interface; a one-off service.
+
+
+### Tectonic installer
+First of all, Tectonic is not free. It requires a coreos.com account as a step of the installation, and free users can create fewer than 10 nodes.
+
+Tectonic is a suite of software that wraps around k8s and provides more utility regarding dev ops.
+Tectonic installer, as it is named, installs Tectonic onto a bare-metal cluster, which means it is not exactly an equivalent of Sextant. For the "booting a cluster" part, it mostly utilizes [Matchbox](https://github.com/coreos/matchbox), which is a general cluster bootstrapper.
+
+Matchbox's approach is similar to Sextant's.
+
+#### Pros
+1. Supports grouping machines.
+2. Supports running the provisioning service in rkt (not a big deal though).
+3. Supports an HTTP/gRPC API interface.
+4. Supports multiple templates.
+
+#### Cons
+1. Not an end-to-end solution for bringing up a cluster; it needs a lot of extra work and other software.
+2. [Does not fully support](https://github.com/coreos/matchbox/issues/550) CentOS deployment yet.
+
+## Conclusion
+Sextant is the better solution overall for deploying Paddle Cloud to a bare-metal cluster. It would be even better if Sextant could 1) deploy the k8s API server with high availability by default, and 2) not be designed as a one-off service.
+
+
+
+## Appendix: General procedure to bring up a cluster
+It is physically impossible for a cluster admin to manually install the OS and applications onto cluster nodes one by one, so here is what an admin does in the cloud industry:
+1. Set up a bootstrap machine with a static IP in the cluster, which runs the following services:
+  * DHCP: assigns IP addresses to the rest of the nodes.
+  * Name service: maps node names to IPs.
+  * PXE-related services: booting-related info is delivered to newly booted machines as their IPs are assigned via the DHCP service; the PXE service then provides further booting and installation info and images over the TFTP and HTTP protocols.
+  * Cluster config service: provides the cluster nodes with OS config via HTTP.
+  * Optional docker registry: a built-in docker registry makes the whole cluster independent of internet connectivity and speeds up software distribution.
+2. When a new node powers on, it will:
+  * broadcast a request for an IP address;
+  * the DHCP server assigns an IP address and delivers the PXE booting info to the node;
+  * the node requests the config files referenced in the DHCP booting info via the TFTP service; in most cases the config file points to an HTTP service for the boot image;
+  * since PXE is configured with an initrd, the node then uses the cloud-config service to do further installations, such as CoreOS or k8s;
+  * finally, the node restarts.
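+
+A minimal sketch of the DHCP/TFTP/PXE hand-off described above, using `dnsmasq` (the address range, TFTP root, and boot file are illustrative, and the setup is simplified to the PXE hand-off only):
+
+```bash
+# Serve DHCP leases, TFTP, and a PXE boot file from a single dnsmasq process.
+dnsmasq --no-daemon \
+    --dhcp-range=192.168.1.100,192.168.1.200 \
+    --enable-tftp \
+    --tftp-root=/var/lib/tftpboot \
+    --dhcp-boot=pxelinux.0
+```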
+
+For further understanding, the following two links from Matchbox are good reading:
+* [Machine lifecycle](https://github.com/coreos/matchbox/blob/master/Documentation/machine-lifecycle.md)
+* [PXE booting](https://github.com/coreos/matchbox/blob/master/Documentation/network-booting.md)
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 95cad835b1..41b35b5b23 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,22 +13,18 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index b477f0120c..5822c2481d 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2
 
 
 MarkdownParser = parser.CommonMarkParser
@@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
deleted file mode 100644
index 2b4a79fbbf..0000000000
--- a/doc/tutorials/embedding_model/index_cn.md
+++ /dev/null
@@ -1,139 +0,0 @@
-# Using the Chinese Word Embedding Model #
-----------
-This document describes how to use a pretrained word embedding model in the standard format on the PaddlePaddle platform.
-
-We thank @lipeng for proposing the code requirement and for defining the related model formats.
-
-## Introduction ###
-### Chinese Dictionary ###
-Our dictionary is produced by segmenting corpora from Baidu Zhidao and Baidu Baike with an in-house word segmentation tool. The segmentation style is as follows: "《红楼梦》" is segmented into "《", "红楼梦", "》", and "《红楼梦》". The dictionary is UTF-8 encoded, with two output columns: the word itself and its frequency. It contains 3,206,326 words and 4 special tokens:
-  - `<s>`: the start of a segmented sequence
-  - `<e>`: the end of a segmented sequence
-  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder with no actual meaning
-  - `<unk>`: an unknown word
-
-### Pretrained Chinese Word Embedding Model ###
-Following the method introduced in [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), the model is an n-gram language model with the following structure: a 6-word context as the input layer -> a fully connected layer -> a softmax layer. For this dictionary, we pretrained word embeddings in 4 different dimensions: 32, 64, 128, and 256.
-<center>![](./neural-n-gram-model.png)</center>
-<center>Figure 1. neural-n-gram-model</center>
-
-### Download and Data Extraction ###
-Run the following commands to download and extract our dictionary and pretrained model:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    ./pre_DictAndModel.sh
-
-## Chinese Paraphrasing Example ##
-The following demonstrates how to paraphrase phrases using the pretrained Chinese dictionary and word embeddings.
-
-### Data Preparation and Preprocessing ###
-First, run the following commands to download the dataset. The dataset (UTF-8 encoded) contains 20 training samples, 5 test samples, and 2 generation samples.
-
-    cd $PADDLE_ROOT/demo/seqToseq/data
-    ./paraphrase_data.sh
-
-Second, convert the data into the standard format and build the embedding dictionary on the training set (the data will be saved in `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`):
-
-    cd $PADDLE_ROOT/demo/seqToseq/
-    python preprocess.py -i data/paraphrase [--mergeDict]
-
-- Here, if the `--mergeDict` option is used, the dictionaries of the source and target phrases are merged (source and target share the same encoding dictionary). In this example, the source and target are the same language, so the option can be used.
-
-
-### Using a User-Specified Embedding Dictionary ###
-Use the following commands to extract, from the pretrained model, the embeddings that correspond to a user-specified dictionary and build a new embedding table:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL --usrDict USRDICT -d DIM
-
-- `--preModel PREMODEL`: the path of the pretrained embedding model
-- `--preDict PREDICT`: the path of the dictionary used by the pretrained model
-- `--usrModel USRMODEL`: the path where the extracted embedding table is saved
-- `--usrDict USRDICT`: the path of the user-specified dictionary used to build the new embedding table
-- `-d DIM`: the dimension of the (embedding) parameters
-
-Alternatively, you can simply run the following commands:
-
-    cd $PADDLE_ROOT/demo/seqToseq/data/
-    ./paraphrase_model.sh
-
-After it runs successfully, you will see the following model structure:
-
-    paraphrase_model
-    |--- _source_language_embedding
-    |--- _target_language_embedding
-
-### Training the Model on PaddlePaddle ###
-First, create the model config file; an example configuration follows (see the config saved at `demo/seqToseq/paraphrase/train.conf` for reference):
-
-    from seqToseq_net import *
-    is_generating = False
-
-    ################## Data Definition #####################
-    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                                 job_mode = job_mode)
-
-    ############## Algorithm Configuration ##################
-    settings(
-          learning_method = AdamOptimizer(),
-          batch_size = 50,
-          learning_rate = 5e-4)
-
-    ################# Network configure #####################
-    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
-
-This config is almost the same as `demo/seqToseq/translation/train.conf`.
-
-Then, train the model with the following commands:
-
-    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
-    ./train.sh
-
-Here `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, with only 2 different settings:
-
-- `--init_model_path`: the path of the initialization model, configured as `data/paraphrase_model`
-- `--load_missing_parameter_strategy`: if a parameter model file is missing, the parameters other than the embedding model are randomly initialized from a normal distribution
-
-If you want to understand the dataset format, model structure, and training procedure in detail, please see the [Text generation Tutorial](../text_generation/index_cn.md).
-
-## Optional Functions ##
-### Observing the Embedding Parameters
-For users who want to inspect the embeddings, PaddlePaddle provides a tool to convert a binary embedding model into a text model:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
-
-- `-i INPUT`: the name of the input (binary) embedding model
-- `-o OUTPUT`: the name of the output text model
-- `-d DIM`: the dimension of the (embedding) parameters
-
-After running the commands above, you will see the following in the output text model:
-
-    0,4,32156096
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
-
-- The first line is the format description of the `PaddlePaddle` output file, containing 3 attributes:
-  - the `PaddlePaddle` version number, 0 in this example
-  - the number of bytes per float, 4 in this example
-  - the total number of parameters, 32,156,096 in this example
-- The remaining lines are (embedding) parameter lines (assuming the embedding dimension is 32)
-  - each line prints 32 parameters separated by ','
-  - there are 32,156,096/32 = 1,004,877 lines in total; that is, the model contains 1,004,877 embedded words
-
-### Revising the Embedding Model
-For users who want to revise the embeddings, `PaddlePaddle` provides a command to convert a text embedding model back into a binary model:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
-
-- `-i INPUT`: the name of the input text embedding model
-- `-o OUTPUT`: the name of the output binary embedding model
-
-Please note that the input text format is as follows:
-
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
-- the input text has no header (format description) line
-- each line stores the parameters for one word, separated by commas ','
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md
deleted file mode 100644
index 9525f64f9b..0000000000
--- a/doc/tutorials/embedding_model/index_en.md
+++ /dev/null
@@ -1,140 +0,0 @@
-# Chinese Word Embedding Model Tutorial #
-----------
-This tutorial is to guide you through the process of using a Pretrained Chinese Word Embedding Model in the PaddlePaddle standard format.
-
-We thank @lipeng for the pull request that defined the model schemas and pretrained the models.
-
-## Introduction ###
-### Chinese Word Dictionary ###
-Our Chinese-word dictionary is created from Baidu ZhiDao and Baidu Baike using an in-house word segmenter. For example, the segmentation of "《红楼梦》" is "《","红楼梦","》", and "《红楼梦》". Our dictionary (in UTF-8 format) has two columns: the word and its frequency. The total word count is 3206326, including 4 special tokens:
-  - `<s>`: the start of a sequence
-  - `<e>`: the end of a sequence
-  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding
-  - `<unk>`: a word not included in dictionary
-
-### Pretrained Chinese Word Embedding Model ###
-Inspired by the paper [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf), our model architecture (**Embedding joint of six words->FullyConnect->SoftMax**) is shown in the following graph. For our dictionary, we pretrain four models with different word vector dimensions, i.e. 32, 64, 128, 256.
-<center>![](./neural-n-gram-model.png)</center>
-<center>Figure 1. neural-n-gram-model</center>
-
-### Download and Extract ###
-To download and extract our dictionary and pretrained model, run the following commands.
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    ./pre_DictAndModel.sh
-
-## Chinese Paraphrasing Example ##
-We provide a paraphrasing task to show the usage of pretrained Chinese Word Dictionary and Embedding Model.
-
-### Data Preparation and Preprocess ###
-
-First, run the following commands to download and extract the in-house dataset. The dataset (in UTF-8 format) has 20 training samples, 5 testing samples and 2 generating samples.
-
-    cd $PADDLE_ROOT/demo/seqToseq/data
-    ./paraphrase_data.sh
-
-Second, preprocess data and build dictionary on train data by running the following commands, and the preprocessed dataset is stored in `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`:
-
-    cd $PADDLE_ROOT/demo/seqToseq/
-    python preprocess.py -i data/paraphrase [--mergeDict]
-
-- `--mergeDict`: if this option is used, the source and target dictionaries are merged, i.e., the two dictionaries share the same context. Here, as the source and target data are all Chinese words, this option can be used.
-
-### User Specified Embedding Model ###
-The general command of extracting desired parameters from the pretrained embedding model based on user dictionary is:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL --usrDict USRDICT -d DIM
-
-- `--preModel PREMODEL`: the name of pretrained embedding model
-- `--preDict PREDICT`: the name of pretrained dictionary
-- `--usrModel USRMODEL`: the name of extracted embedding model
-- `--usrDict USRDICT`: the name of user specified dictionary
-- `-d DIM`: dimension of parameter
-
-Here, you can simply run the command:
-
-    cd $PADDLE_ROOT/demo/seqToseq/data/
-    ./paraphrase_model.sh
-
-And you will see following embedding model structure:
-
-    paraphrase_model
-    |--- _source_language_embedding
-    |--- _target_language_embedding
-
-### Training Model in PaddlePaddle ###
-First, create a model config file, see example `demo/seqToseq/paraphrase/train.conf`:
-
-    from seqToseq_net import *
-    is_generating = False
-
-    ################## Data Definition #####################
-    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
-                                 job_mode = job_mode)
-
-    ############## Algorithm Configuration ##################
-    settings(
-          learning_method = AdamOptimizer(),
-          batch_size = 50,
-          learning_rate = 5e-4)
-
-    ################# Network configure #####################
-    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
-
-This config is almost the same as `demo/seqToseq/translation/train.conf`.
-
-Then, train the model by running the command:
-
-    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
-    ./train.sh
-
-where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the only difference is following two command arguments:
-
-- `--init_model_path`: path of the initialization model, here is `data/paraphrase_model`
-- `--load_missing_parameter_strategy`: what to do when a model file is missing; here a normal distribution is used to initialize the parameters other than the embedding layer
-
-For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/index_en.md).
-
-## Optional Function ##
-###  Embedding Parameters Observation
-For users who want to observe the embedding parameters, this function can convert a PaddlePaddle binary embedding model to a text model by running the command:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
-
-- `-i INPUT`: the name of input binary embedding model
-- `-o OUTPUT`: the name of output text embedding model
-- `-d DIM`: the dimension of parameter
-
-You will see parameters like this in output text model:
-
-    0,4,32156096
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
-
-- The 1st line is the **PaddlePaddle format file head**, it has 3 attributes:
-  - the version of PaddlePaddle, here 0
-  - sizeof(float), here 4
-  - the total number of parameters, here 32156096
-- The other lines print the parameters (assume `<dim>` = 32)
-  - each line prints 32 parameters separated by ','
-  - there are 32156096/32 = 1004877 lines, meaning there are 1004877 embedded words
-
-### Embedding Parameters Revision
-For users who want to revise the embedding parameters, this function can convert a revised text embedding model to a PaddlePaddle binary model by running the command:
-
-    cd $PADDLE_ROOT/demo/model_zoo/embedding
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
-
-- `-i INPUT`: the name of input text embedding model.
-- `-o OUTPUT`: the name of output binary embedding model
-
-Note that the format of input text model is as follows:
-
-    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
-    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
-    ......
-- there is no file header in the 1st line
-- each line stores the parameters for one word, separated by commas ','
diff --git a/doc/tutorials/embedding_model/neural-n-gram-model.png b/doc/tutorials/embedding_model/neural-n-gram-model.png
deleted file mode 100644
index f70b765b3f..0000000000
Binary files a/doc/tutorials/embedding_model/neural-n-gram-model.png and /dev/null differ
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
deleted file mode 100644
index 0eafd7cb49..0000000000
Binary files a/doc/tutorials/gan/gan.png and /dev/null differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
deleted file mode 100644
index ac9ed37b22..0000000000
--- a/doc/tutorials/gan/index_en.md
+++ /dev/null
@@ -1,137 +0,0 @@
-# Generative Adversarial Networks (GAN) 
-
-This demo implements GAN training described in the original [GAN paper](https://arxiv.org/abs/1406.2661) and deep convolutional generative adversarial networks [DCGAN paper](https://arxiv.org/abs/1511.06434).
-
-The high-level structure of GAN is shown in Figure 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game, in which the generator tries to generate images that look as real as possible to fool the discriminator, while the discriminator tries to distinguish between real and fake images.
-
-<center>![](./gan.png)</center>
-<p align="center">
-    Figure 1. GAN-Model-Structure
-    <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
-</p>
-
-The generator and discriminator take turns being trained using SGD. The objective of the generator is to have its generated images classified as real by the discriminator, and the objective of the discriminator is to correctly classify real and fake images. When the GAN model is trained to converge to the equilibrium state, the generator will transform the given noise distribution into the distribution of real images, and the discriminator will not be able to distinguish between real and fake images at all.
-
-## Implementation of GAN Model Structure
-Since the GAN model involves multiple neural networks, it requires the use of the Paddle Python API. The code walk-through below can therefore also partially serve as an introduction to the usage of the Paddle Python API.
-
-There are three networks defined in gan_conf.py, namely **generator_training**, **discriminator_training** and **generator**. Their relationship to the model structure defined above is that **discriminator_training** is the discriminator, **generator** is the generator, and **generator_training** combines the generator and the discriminator, since training the generator requires the discriminator to provide the loss function. This relationship is described in the following code:
-```python
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim)
-
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
-```
-
-In order to train the networks defined in gan_conf.py, one first needs to initialize a Paddle environment, parse the config, create a GradientMachine from the config, and create a trainer from the GradientMachine, as done in the code chunk below:
-```python
-import py_paddle.swig_paddle as api
-# init paddle environment
-api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-               '--log_period=100', '--gpu_id=' + args.gpu_id,
-               '--save_dir=' + "./%s_params/" % data_source)
-
-# Parse config
-gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
-generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
-
-# Create GradientMachine
-dis_training_machine = api.GradientMachine.createFromConfigProto(
-dis_conf.model_config)
-gen_training_machine = api.GradientMachine.createFromConfigProto(
-gen_conf.model_config)
-generator_machine = api.GradientMachine.createFromConfigProto(
-generator_conf.model_config)
-
-# Create trainer
-dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
-gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
-```
-
-In order to balance the strength of the generator and the discriminator, we train whichever one is performing worse, determined by comparing their loss function values. The loss function value can be calculated by a forward pass through the GradientMachine.
-```python
-def get_training_loss(training_machine, inputs):
-    outputs = api.Arguments.createArguments(0)
-    training_machine.forward(inputs, outputs, api.PASS_TEST)
-    loss = outputs.getSlotValue(0).copyToNumpyMat()
-    return numpy.mean(loss)
-```
-
-After training one network, one needs to sync the new parameters to the other networks. The code below demonstrates one example of such a use case:
-```python
-# Train the gen_training
-gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
-
-# Copy the parameters from gen_training to dis_training and generator
-copy_shared_parameters(gen_training_machine,
-dis_training_machine)
-copy_shared_parameters(gen_training_machine, generator_machine)
-```
-
-
-## A Toy Example 
-With the infrastructure explained above, we can now walk you through a toy example: generating a two-dimensional uniform distribution from 10-dimensional Gaussian noise.
-
-The Gaussian noises are generated using the code below:
-```python
-def get_noise(batch_size, noise_dim):
-    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
-```
-
-The real samples (2-D uniform) are generated using the code below:
-```python
-# synthesize 2-D uniform data in gan_trainer.py:114
-def load_uniform_data():
-    data = numpy.random.rand(1000000, 2).astype('float32')
-    return data
-```
-
-The generator and discriminator network are built using fully-connected layer and batch_norm layer, and are defined in gan_conf.py. 
-
-To train the GAN model, one can use the command below. The flag `-d` specifies the training data (cifar, mnist or uniform) and the flag `--useGpu` specifies whether to use the GPU for training (0 for CPU, 1 for GPU).
-```bash
-$ python gan_trainer.py -d uniform --useGpu 1
-```
-The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2-D uniform distribution.
-
-<center>![](./uniform_sample.png)</center>
-<p align="center">
-    Figure 2. Uniform Sample
-</p>
-
-## MNIST Example
-### Data preparation
-To download the MNIST data, one can use the following commands:
-```bash
-$ cd data/
-$ ./get_mnist_data.sh
-```
-
-### Model description
-Following the [DCGAN paper](https://arxiv.org/abs/1511.06434), we use convolution/convolution-transpose layers in the discriminator/generator networks to better deal with images. The details of the network structures are defined in gan_conf_image.py.
-
-### Training the model
-To train the GAN model on the MNIST data, one can use the following command:
-```bash
-$ python gan_trainer.py -d mnist --useGpu 1
-```
-The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<center>![](./mnist_sample.png)</center>
-<p align="center">
-    Figure 3. MNIST Sample
-</p>
diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/tutorials/gan/mnist_sample.png
deleted file mode 100644
index f9c7bf7ddd..0000000000
Binary files a/doc/tutorials/gan/mnist_sample.png and /dev/null differ
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
deleted file mode 100644
index e716c48e78..0000000000
Binary files a/doc/tutorials/gan/uniform_sample.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png
deleted file mode 100644
index f54a0c5883..0000000000
Binary files a/doc/tutorials/image_classification/cifar.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png
deleted file mode 100644
index 14f2558050..0000000000
Binary files a/doc/tutorials/image_classification/image_classification.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/index_cn.md b/doc/tutorials/image_classification/index_cn.md
deleted file mode 100644
index 87f465522a..0000000000
--- a/doc/tutorials/image_classification/index_cn.md
+++ /dev/null
@@ -1,205 +0,0 @@
-Image Classification Tutorial
-=============================
-
-In this tutorial, we will train a convolutional neural network on the CIFAR-10 dataset and use it to classify images. As shown in the figure below, a convolutional neural network can recognize the main object in an image and output the classification result.
-<center>![Image Classification](./image_classification.png)</center>
-
-## Data Preparation
-First, download the CIFAR-10 dataset. Its official website is:
-
-<https://www.cs.toronto.edu/~kriz/cifar.html>
-
-We provide a script that downloads the CIFAR-10 dataset from the official website, converts it to jpeg files and stores them in the required directory layout. Before running the script, make sure pillow and its dependencies are installed. They can be installed with the following commands:
-
-1. Install pillow
-
-```bash
-sudo apt-get install libjpeg-dev
-pip install pillow
-```
-
-2. Download the dataset
-
-```bash
-cd demo/image_classification/data/
-sh download_cifar.sh
-```
-
-The CIFAR-10 dataset contains 60000 32x32 color images in 10 classes, with 6000 images per class. 50000 images are used for training and 10000 for testing.
-
-The figure below shows all the image classes, with 10 images randomly sampled from each class.
-<center>![Image Classification](./cifar.png)</center>
-
-After the script finishes, we should see a folder named cifar-out with the following subdirectory structure:
-
-
-```
-train
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-test
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-```
-
-cifar-out contains two folders, `train` and `test`, which hold the training set and the test set of CIFAR-10, respectively. Each of them has 10 sub-folders, and each sub-folder stores the images of the corresponding class. Once the images are organized into this structure, we are ready to train the classification model.
-
-## Preprocessing
-After the data has been downloaded, it still needs to be preprocessed into the Paddle format. We can do this with the following commands:
-
-```
-cd demo/image_classification/
-sh preprocess.sh
-```
-
-`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess the images:
-```sh
-export PYTHONPATH=$PYTHONPATH:../../
-data_dir=./data/cifar-out
-python preprocess.py -i $data_dir -s 32 -c 1
-```
-
-`./demo/image_classification/preprocess.py` accepts the following arguments:
-
-- `-i` or `--input` specifies the input data directory;
-- `-s` or `--size` specifies the image size;
-- `-c` or `--color` indicates whether the images are color or grayscale.
-
-## Model Training
-Before training, we first need to create a model config file. An example configuration is given below. **Note** that it differs slightly from the `vgg_16_cifar.py` file, because that file is also usable for prediction.
-
-```python
-from paddle.trainer_config_helpers import *
-data_dir='data/cifar-out/batches/'
-meta_path=data_dir+'batches.meta'
-args = {'meta':meta_path, 'mean_img_size': 32,
-        'img_size': 32, 'num_classes': 10,
-        'use_jpeg': 1, 'color': "color"}
-define_py_data_sources2(train_list=data_dir+"train.list",
-                        test_list=data_dir+'test.list',
-                        module='image_provider',
-                        obj='processData',
-                        args=args)
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128))
-
-img = data_layer(name='image', size=3*32*32)
-lbl = data_layer(name="label", size=10)
-# small_vgg is predefined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-outputs(classification_cost(input=predict, label=lbl))
-```
-
-The first line imports the functions used to define networks.
-```python
-from paddle.trainer_config_helpers import *
-```
-
-`define_py_data_sources2` uses a Python data provider. The arguments in `args` are passed to `image_provider.py`, which generates the image data and feeds it to the Paddle system:
- - `meta`: the mean image of the training set.
- - `mean_img_size`: the height and width of the mean feature map.
- - `img_size`: the height and width of the input images.
- - `num_classes`: the number of classes.
- - `use_jpeg`: the data storage format used during preprocessing.
- - `color`: whether the images are in color.
-
- `settings` configures the training algorithm. In the example below, the learning rate is set to 0.1 divided by the batch size, and the weight decay is 0.0005 multiplied by the batch size.
-
- ```python
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128)
-)
-```
-
-`small_vgg` defines the network structure. Here we use a small VGG network. For a description of the VGG convolutional network, see [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
-```python
-# small_vgg is predefined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-```
-Once the config file is ready, we can train the model by running the script train.sh.
-
-```bash
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=1 \
---save_dir=$output \
-2>&1 | tee $log
-
-python -m paddle.utils.plotcurve -i $log > plot.png
-```
-- Here we train in GPU mode. If you have no GPU environment, set `use_gpu=0`.
-- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. A detailed description of each parameter can be found in the command-line-argument documentation.
-- The script `plotcurve.py` depends on the Python module `matplotlib`; if it fails, you may need to install `matplotlib`.
-After training finishes, the training and test error curves are saved to `plot.png` by the `plotcurve.py` script. An example error-curve plot is shown below:
-
-<center>![Training and testing curves.](./plot.png)</center>
-
-## Prediction
-After training completes, the model and its parameters are saved under `./cifar_vgg_model/pass-%05d`. For example, the model of the 300th pass is saved at `./cifar_vgg_model/pass-00299`.
-
-To predict the class of an image, we can use `predict.sh`, which prints the predicted class label:
-
-```
-sh predict.sh
-```
-
-predict.sh:
-```
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
-```
-
-## Exercise
-Train a bird image classification model using the VGG model on the CUB-200 dataset. The dataset, which contains photos of 200 bird species (mostly North American), can be downloaded from:
-
-<http://www.vision.caltech.edu/visipedia/CUB-200.html>
-
-
-
-
-## Delve into Details
-### Convolutional Neural Network
-A convolutional neural network is a feedforward neural network that uses convolution layers. It is well suited to building models that understand image content. A typical network is shown below:
-
-![Convolutional Neural Network](./lenet.png)
-
-A convolutional neural network contains the following layers:
-
-- Convolution layer: extracts features from an image or a feature map by convolution.
-- Pooling layer: downsamples feature maps using max-pooling.
-- Fully connected layer: connects every neuron of the input to every neuron of the hidden layer.
-
-Convolutional neural networks achieve remarkable performance on image classification because they exploit two important properties of images: local correlation and spatial invariance. By alternating convolution and pooling operations, a convolutional neural network can represent both properties well.
-
-For how to define layers and how to connect them, please refer to the Layer documentation.
diff --git a/doc/tutorials/image_classification/index_en.md b/doc/tutorials/image_classification/index_en.md
deleted file mode 100644
index 60c81a6a53..0000000000
--- a/doc/tutorials/image_classification/index_en.md
+++ /dev/null
@@ -1,221 +0,0 @@
-Image Classification Tutorial
-==============================
-
-This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset.
-As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result.
-
-<center>![Image Classification](./image_classification.png)</center>
-
-## Data Preparation
-First, download CIFAR-10 dataset. CIFAR-10 dataset can be downloaded from its official website.
-
-<https://www.cs.toronto.edu/~kriz/cifar.html>
-
-We have prepared a script to download and process the CIFAR-10 dataset. The script downloads the CIFAR-10 dataset from the official website,
-converts it to jpeg images and organizes them into a directory with the structure required by this tutorial. Make sure that you have installed pillow and its dependencies.
-Run the following commands:
-
-1. Install pillow and its dependencies
-
-```bash
-sudo apt-get install libjpeg-dev
-pip install pillow
-```
-
-2. Download and prepare the data
-
-```bash
-cd demo/image_classification/data/
-sh download_cifar.sh
-```
-
-The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.
-
-Here are the classes in the dataset, as well as 10 random images from each:
-<center>![Image Classification](./cifar.png)</center>
-
-
-After downloading and converting, we should find a directory (cifar-out) containing the dataset in the following format:
-
-```
-train
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-test
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-```
-
-It has two directories: `train` and `test`. These two directories contain the training data and testing data of CIFAR-10, respectively. Each of these two folders contains 10 sub-folders, ranging from `airplane` to `truck`. Each sub-folder contains images with the corresponding label. After the images are organized into this structure, we are ready to train an image classification model.
-
-## Preprocess
-After the data has been downloaded, it needs to be pre-processed into the Paddle format. We can run the following command for preprocessing.
-
-```
-cd demo/image_classification/
-sh preprocess.sh
-```
-
-`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess image data.
-```sh
-export PYTHONPATH=$PYTHONPATH:../../
-data_dir=./data/cifar-out
-python preprocess.py -i $data_dir -s 32 -c 1
-```
-
-`./demo/image_classification/preprocess.py` has the following arguments:
-
-- `-i` or `--input` specifies the input data directory.
-- `-s` or `--size` specifies the processed size of the images.
-- `-c` or `--color` specifies whether the images are color or grayscale.
-
-
-## Model Training
-We need to create a model config file before training the model. An example config file (vgg_16_cifar.py) is listed below. **Note** that it is slightly different from the `vgg_16_cifar.py` file, because that file is also usable for prediction.
-
-```python
-from paddle.trainer_config_helpers import *
-data_dir='data/cifar-out/batches/'
-meta_path=data_dir+'batches.meta'
-args = {'meta':meta_path, 'mean_img_size': 32,
-        'img_size': 32, 'num_classes': 10,
-        'use_jpeg': 1, 'color': "color"}
-define_py_data_sources2(train_list=data_dir+"train.list",
-                        test_list=data_dir+'test.list',
-                        module='image_provider',
-                        obj='processData',
-                        args=args)
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128))
-
-img = data_layer(name='image', size=3*32*32)
-lbl = data_layer(name="label", size=10)
-# small_vgg is predefined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-outputs(classification_cost(input=predict, label=lbl))
-```
-
-The first line imports the Python functions for defining networks.
-```python
-from paddle.trainer_config_helpers import *
-```
-
-Then `define_py_data_sources2` sets up the Python data provider
-interface. The arguments in `args` are used in `image_provider.py`, which
-yields image data and passes it to Paddle.
- - `meta`: the mean image of the training set.
- - `mean_img_size`: the size of the mean feature map.
- - `img_size`: the height and width of the input images.
- - `num_classes`: the number of classes.
- - `use_jpeg`: the data storage format used during preprocessing.
- - `color`: whether the images are in color.
-
-`settings` specifies the training algorithm. In the following example,
-the learning rate is set to 0.1 divided by the batch size, and the weight decay
-is 0.0005 multiplied by the batch size.
-```python
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128)
-)
-```
-
-The `small_vgg` function specifies the network. We use a small version of the VGG convolutional network as our network
-for classification. A description of the VGG network can be found here: [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
-```python
-# small_vgg is predefined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-```
-After writing the config, we can train the model by running the script train.sh.
-
-```bash
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=1 \
---save_dir=$output \
-2>&1 | tee $log
-
-python -m paddle.utils.plotcurve -i $log > plot.png
-```
-
-- Here we use GPU mode to train. If you have no GPU environment, just set `use_gpu=0`.
-
-- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. The meaning of the other flags can be found in the documentation of the command line flags.
-
-- The script `plotcurve.py` requires the Python module `matplotlib`; if it fails, you may need to install `matplotlib`.
-
-
-After training finishes, the training and testing error curves will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below:
-
-<center>![Training and testing curves.](./plot.png)</center>
-
-
-## Prediction
-After we train the model, the model file and the model parameters are stored under `./cifar_vgg_model/pass-%05d`. For example, the model of the 300th pass is stored at `./cifar_vgg_model/pass-00299`.
-
-To make a prediction for an image, one can run `predict.sh` as follows. The script will output the predicted classification label:
-
-```
-sh predict.sh
-```
-
-predict.sh:
-```
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
-```
-
-## Exercise
-Train an image classifier for birds using the VGG model and the CUB-200 dataset. The dataset, which contains photos of 200 bird species (mostly North American), can be downloaded here:
-
-<http://www.vision.caltech.edu/visipedia/CUB-200.html>
-
-
-
-
-## Delve into Details
-### Convolutional Neural Network
-A Convolutional Neural Network is a feedforward neural network that uses convolution layers. It is very suitable for building neural networks that process and understand images. A standard convolutional neural network is shown below:
-
-![Convolutional Neural Network](./lenet.png)
-
-Convolutional Neural Network contains the following layers:
-
-- Convolutional layer: It uses convolution operation to extract features from an image or a feature map.
-- Pooling layer: It uses max-pooling to downsample feature maps.
-- Fully Connected layer: It uses fully connected connections to transform features.
-
-Convolutional Neural Networks achieve amazing performance on image classification because they exploit two important characteristics of images: *local correlation* and *spatial invariance*. By iteratively applying convolution and max-pooling operations, a convolutional neural network can represent these two characteristics of images well.
-
-
-For more details of how to define layers and their connections, please refer to the documentation of layers.
diff --git a/doc/tutorials/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png
deleted file mode 100644
index 1e6f2b32ba..0000000000
Binary files a/doc/tutorials/image_classification/lenet.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/plot.png b/doc/tutorials/image_classification/plot.png
deleted file mode 100644
index a31f99791c..0000000000
Binary files a/doc/tutorials/image_classification/plot.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png
deleted file mode 100644
index f54a0c5883..0000000000
Binary files a/doc/tutorials/image_classification/src/cifar.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png
deleted file mode 100644
index 14f2558050..0000000000
Binary files a/doc/tutorials/image_classification/src/image_classification.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png
deleted file mode 100644
index 1e6f2b32ba..0000000000
Binary files a/doc/tutorials/image_classification/src/lenet.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png
deleted file mode 100644
index a31f99791c..0000000000
Binary files a/doc/tutorials/image_classification/src/plot.png and /dev/null differ
diff --git a/doc/tutorials/imagenet_model/resnet_block.jpg b/doc/tutorials/imagenet_model/resnet_block.jpg
deleted file mode 100644
index e16bd3c624..0000000000
Binary files a/doc/tutorials/imagenet_model/resnet_block.jpg and /dev/null differ
diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/tutorials/imagenet_model/resnet_model_cn.md
deleted file mode 100644
index 82ec9d70b3..0000000000
--- a/doc/tutorials/imagenet_model/resnet_model_cn.md
+++ /dev/null
@@ -1,284 +0,0 @@
-# Model Zoo - ImageNet #
-
-[ImageNet](http://www.image-net.org/) is a well-known database for generic object classification. This tutorial provides convolutional classification network models for ImageNet.
-
-## ResNet Introduction
-
-The ResNet architecture proposed in the paper [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) won first place in the classification task of the 2015 ImageNet Large Scale Visual Recognition Challenge (ILSVRC 2015). The authors propose a residual learning framework to ease the training of networks, allowing structures substantially deeper than those used before. The figure below shows the residual connections: the building block on the left is used in the 34-layer network, while the bottleneck block on the right is used in the 50-, 101- and 152-layer networks.
-
-<center>![resnet_block](./resnet_block.jpg)</center>
-<center>Figure 1. ResNet Block</center>
-
-This tutorial provides three ResNet models, all converted from the models released by the original authors at <https://github.com/KaimingHe/deep-residual-networks>. We measured the classification error with PaddlePaddle on the 50,000 images of the ILSVRC validation set, with input images in **BGR** channel order, rescaled so that the shorter side is 256 while keeping the aspect ratio, and cropped to the central square region. The classification errors and model sizes are given in the table below.
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">ResNet</th>
-<th scope="col" class="left">Top-1</th>
-<th scope="col" class="left">Model Size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">ResNet-50</td>
-<td class="left">24.9%</td>
-<td class="left">99M</td>
-</tr>
-<tr>
-<td class="left">ResNet-101</td>
-<td class="left">23.7%</td>
-<td class="left">173M</td>
-</tr>
-<tr>
-<td class="left">ResNet-152</td>
-<td class="left">23.2%</td>
-<td class="left">234M</td>
-</tr>
-</tbody>
-
-</table></center>
-<br>
-
-## ResNet Model
-
-The config file for the 50-, 101- and 152-layer networks is ```demo/model_zoo/resnet/resnet.py```. You can select the number of layers by adding a command-line argument such as ```--config_args=layer_num=50```.
-
-### Network Visualization
-
-You can obtain a visualization of the ResNet network structure by running the commands below. The script generates a dot file and then converts it to an image; graphviz must be installed for the conversion.
-
-```
-cd demo/model_zoo/resnet
-./net_diagram.sh
-```
-
-### Model Download
-
-```
-cd demo/model_zoo/resnet
-./get_model.sh
-```
-Run the commands above to download all the models and the mean file. If the download succeeds, the files are saved under ```demo/model_zoo/resnet/model```.
-
-```
-mean_meta_224  resnet_101  resnet_152  resnet_50
-```
-   * resnet_50: the 50-layer model.
-   * resnet_101: the 101-layer model.
-   * resnet_152: the 152-layer model.
-   * mean\_meta\_224: the mean image file, of size 3 x 224 x 224 in **BGR** channel order. You can also use the three per-channel mean values: 103.939, 116.779, 123.68.
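-
-For reference, subtracting the three per-channel means from a BGR image can be sketched with numpy as follows (a minimal sketch; the demo's data providers handle this for you):
-
-```python
-import numpy as np
-
-# img: an H x W x 3 float array in BGR channel order.
-BGR_MEAN = np.array([103.939, 116.779, 123.68], dtype=np.float32)
-
-def subtract_mean(img):
-    # Broadcast the per-channel means over the spatial dimensions.
-    return img.astype(np.float32) - BGR_MEAN
-```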
-
-### Parameter Info
-
-* **Convolution layer weights**
-
-  Since a batch normalization layer follows every convolution layer, the convolution layer has no bias parameter and only one weight.
-  Shape: `(Co, ky, kx, Ci)`
-   * Co: number of channels of the output feature map
-   * ky: filter size in the vertical direction
-   * kx: filter size in the horizontal direction
-   * Ci: number of channels of the input feature map
-
-  2-D matrix: (Co * ky * kx, Ci), stored in row-major order.
-
-* **Fully connected layer weights**
-
-  2-D matrix: (input layer size, this layer size), stored in row-major order.
-
-* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) layer weights**
-
-This layer has four parameters. In fact, only .w0 and .wbias are learned parameters; the other two are the running mean and variance, which are loaded into the model at test time. The table below lists the parameters of a batch normalization layer.
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">Parameter Name</th>
-<th scope="col" class="left">Size</th>
-<th scope="col" class="left">Meaning</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">_res2_1_branch1_bn.w0</td>
-<td class="left">256</td>
-<td class="left">gamma, the scale parameter</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w1</td>
-<td class="left">256</td>
-<td class="left">mean of the feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w2</td>
-<td class="left">256</td>
-<td class="left">variance of the feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.wbias</td>
-<td class="left">256</td>
-<td class="left">beta, the shift parameter</td>
-</tr>
-</tbody>
-
-</table></center>
-<br>
-
-### Parameter Reading
-
-You can read the parameter values with the following Python script:
-
-```
-import sys
-import numpy as np
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-
-if __name__=='__main__':
-    weight = load(sys.argv[1])
-```
-
-or simply use the following shell command:
-
-```
-od -j 16 -f _res2_1_branch1_bn.w0
-```
-
-## Feature Extraction
-
-We provide both C++ and Python interfaces for feature extraction. The examples below use the data in `demo/model_zoo/resnet/example` to walk through the whole extraction process in detail.
-
-### C++ Interface
-
-First, specify the image data list in `define_py_data_sources2` in the config; see the example `demo/model_zoo/resnet/resnet.py`.
-
-```
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args={
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224, 'crop_size': 224,
-        'color': True,'swap_channel:': [2, 1, 0]}
-    define_py_data_sources2(train_list,
-                           'example/test.list',
-                           module="example.image_list_provider",
-                           obj="processData",
-                           args=args)
-```
-
-Second, specify in `resnet.py` the names of the layers whose features are to be extracted. For example,
-
-```
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-```
-
-Third, specify the model path and the output directory in `extract_fea_c++.sh`, then run the following commands.
-
-```
-cd demo/model_zoo/resnet
-./extract_fea_c++.sh
-```
-
-If the run succeeds, the features are saved in `fea_output/rank-00000` as shown below. You can load this file with the `load_feature_c` interface in `load_feature.py`.
-
-```
--0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
--0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
-```
-
-* Each line stores the features of one sample: the first line holds the features of the image `example/dog.jpg`, and the second line those of `example/cat.jpg`.
-* Features of different layers are separated by semicolons `;`, in the same order as the layers listed in `Outputs()`. Here, the left part is the feature of the `res5_3_branch2c_conv` layer and the right part is that of the `res5_3_branch2c_bn` layer.
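-
-Given that layout, one line can be split back into per-layer feature vectors with a few lines of Python (a minimal parsing sketch; `load_feature_c` in `load_feature.py` is the supported loader):
-
-```python
-def parse_feature_line(line):
-    # Each ';'-separated field holds one layer's feature vector
-    # as space-separated floats.
-    return [[float(v) for v in field.split()]
-            for field in line.strip().rstrip(';').split(';') if field]
-```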
-
-### Python Interface
-
-The example `demo/model_zoo/resnet/classify.py` shows how to extract features with Python. The example below again uses the data in `./example/test.list`. Run the following commands:
-
-```
-cd demo/model_zoo/resnet
-./extract_fea_py.sh
-```
-
-extract_fea_py.sh:
-
-```
-python classify.py \
-     --job=extract \
-     --conf=resnet.py\
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
-
-```
-* \--job=extract:              set the job mode to feature extraction.
-* \--conf=resnet.py:           the network config file.
-* \--use_gpu=1:                whether to use the GPU.
-* \--model=model/resnet_50:    the model path.
-* \--data=./example/test.list: the data list.
-* \--output_layer="xxx,xxx":   the layers to extract features from.
-* \--output_dir=features:      the output directory.
-
-If the run succeeds, the features are saved in `features/batch_0`, a file produced with cPickle. You can open it with the `load_feature_py` interface in `load_feature.py`, which returns a dictionary like the following:
-
-```
-{
-'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
-'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
-}
-```
-
-A close look shows that these feature values are consistent with the results extracted through the C++ interface above.
-
-## Prediction
-
-The `classify.py` file can also be used to predict on samples. We provide an example script `predict.sh` that uses the 50-layer ResNet model to predict the data in `example/test.list`.
-
-```
-cd demo/model_zoo/resnet
-./predict.sh
-```
-
-predict.sh calls `classify.py`:
-
-```
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --multi_crop \
-     --model=model/resnet_50 \
-     --use_gpu=1 \
-     --data=./example/test.list
-```
-* \--job=predict:              set the job mode to prediction.
-* \--conf=resnet.py:           the network config file.
-* \--multi_crop:               use 10 cropped image patches and average the predicted probabilities.
-* \--use_gpu=1:                whether to use the GPU.
-* \--model=model/resnet_50:    the model path.
-* \--data=./example/test.list: the data list.
-
-If the run succeeds, you will see the following results, where 156 and 282 are the class labels of the images.
-
-```
-Label of example/dog.jpg is: 156
-Label of example/cat.jpg is: 282
-```
diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/tutorials/imagenet_model/resnet_model_en.md
deleted file mode 100644
index 478ad06193..0000000000
--- a/doc/tutorials/imagenet_model/resnet_model_en.md
+++ /dev/null
@@ -1,284 +0,0 @@
-# Model Zoo - ImageNet #
-
-[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provides convolutional neural network (CNN) models for ImageNet.
-
-## ResNet Introduction
-
-ResNets from the paper [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) won first place on the ILSVRC 2015 classification task. They present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. The residual connections are shown in the following figure. The left building block is used in the 34-layer network, and the right bottleneck building block is used in the 50-, 101- and 152-layer networks.
-
-<center>![resnet_block](./resnet_block.jpg)</center>
-<center>Figure 1. ResNet Block</center>
-
-We present three ResNet models, which are converted from the models provided by the authors at <https://github.com/KaimingHe/deep-residual-networks>. We tested the classification error with PaddlePaddle on the 50,000-image ILSVRC validation set, feeding input images in **BGR** channel order at a single scale with the shorter side resized to 256, using a single center crop.
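-
-That protocol (resize the shorter side to 256, take a single center crop) can be sketched with PIL and numpy as follows; this is an illustrative sketch of the protocol, not code from the demo:
-
-```python
-import numpy as np
-from PIL import Image
-
-def center_crop_bgr(path, crop=224):
-    img = Image.open(path)
-    w, h = img.size
-    # Rescale so that the shorter side becomes 256, keeping the aspect ratio.
-    scale = 256.0 / min(w, h)
-    img = img.resize((int(round(w * scale)), int(round(h * scale))))
-    w, h = img.size
-    # Take the central crop x crop square.
-    left, top = (w - crop) // 2, (h - crop) // 2
-    arr = np.asarray(img.crop((left, top, left + crop, top + crop)))
-    # Reverse the channel axis to get the expected BGR order.
-    return arr[:, :, ::-1]
-```
-
-The classification errors and model sizes measured under this protocol are listed in the following table.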
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">ResNet</th>
-<th scope="col" class="left">Top-1</th>
-<th scope="col" class="left">Model Size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">ResNet-50</td>
-<td class="left">24.9%</td>
-<td class="left">99M</td>
-</tr>
-<tr>
-<td class="left">ResNet-101</td>
-<td class="left">23.7%</td>
-<td class="left">173M</td>
-</tr>
-<tr>
-<td class="left">ResNet-152</td>
-<td class="left">23.2%</td>
-<td class="left">234M</td>
-</tr>
-</tbody>
-
-</table></center>
-<br>
-
-## ResNet Model
-
-See ```demo/model_zoo/resnet/resnet.py```. This config supports networks of 50, 101 and 152 layers. You can specify the number of layers by adding a command-line argument such as ```--config_args=layer_num=50```.
-
-### Network Visualization
-
-You can get a diagram of the ResNet network by running the following commands. The script generates a dot file and converts it to a PNG image; graphviz must be installed for the conversion.
-
-```
-cd demo/model_zoo/resnet
-./net_diagram.sh
-```
-
-### Model Download
-
-```
-cd demo/model_zoo/resnet
-./get_model.sh
-```
-Run the commands above to download all the models and the mean file; on success they are saved in ```demo/model_zoo/resnet/model```.
-
-```
-mean_meta_224  resnet_101  resnet_152  resnet_50
-```
-   * resnet_50: model of 50 layers.
-   * resnet_101: model of 101 layers.
-   * resnet_152: model of 152 layers.
-   * mean\_meta\_224: the mean file, of size 3 x 224 x 224 in **BGR** channel order. You can also use the three per-channel mean values: 103.939, 116.779, 123.68.
-
-### Parameter Info
-
-* **Convolution Layer Weight**
-
-  Since a batch normalization layer follows each convolution layer, the convolution layer has no bias parameter and only one weight.
-  shape: `(Co, ky, kx, Ci)`
-   * Co: channel number of the output feature map.
-   * ky: filter size in the vertical direction.
-   * kx: filter size in the horizontal direction.
-   * Ci: channel number of the input feature map.
-
-  2-Dim matrix: (Co * ky * kx, Ci), saved in row-major order.
-
-* **Fully connected Layer Weight**
-
-  2-Dim matrix: (input layer size, this layer size), saved in row-major order.
-
-* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) Layer Weight**
-
-There are four parameters in this layer. In fact, only .w0 and .wbias are learned parameters; the other two are the running mean and variance, respectively. They are loaded into the model at test time. The following table shows the parameters of a batch normalization layer.
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-<thead>
-<tr>
-<th scope="col" class="left">Parameter Name</th>
-<th scope="col" class="left">Number</th>
-<th scope="col" class="left">Meaning</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">_res2_1_branch1_bn.w0</td>
-<td class="left">256</td>
-<td class="left">gamma, scale parameter</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w1</td>
-<td class="left">256</td>
-<td class="left">mean value of feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.w2</td>
-<td class="left">256</td>
-<td class="left">variance of feature map</td>
-</tr>
-<tr>
-<td class="left">_res2_1_branch1_bn.wbias</td>
-<td class="left">256</td>
-<td class="left">beta, shift parameter</td>
-</tr>
-</tbody>
-
-</table></center>
-<br>
-
-### Parameter Observation
-
-Users who want to inspect the parameters can read them with Python:
-
-```
-import sys
-import numpy as np
-
-def load(file_name):
-    with open(file_name, 'rb') as f:
-        f.read(16) # skip header for float type.
-        return np.fromfile(f, dtype=np.float32)
-
-if __name__=='__main__':
-    weight = load(sys.argv[1])
-```
-
-or simply use the following shell command:
-
-```
-od -j 16 -f _res2_1_branch1_bn.w0
-```
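-
-As a usage example, a convolution weight loaded with the Python snippet above can be reshaped into the `(Co * ky * kx, Ci)` matrix described earlier. The layer name and dimensions below are placeholders; substitute the real layer's values:
-
-```python
-co, ky, kx, ci = 64, 3, 3, 64          # illustrative dimensions
-w = load('_hypothetical_conv.w0')      # flat float32 array, as loaded above
-w2d = w.reshape(co * ky * kx, ci)      # row-major, as stored by Paddle
-```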
-
-## Feature Extraction
-
-We provide both C++ and Python interfaces to extract features. The following examples use data in `demo/model_zoo/resnet/example` to show the extracting process in detail.
-
-### C++ Interface
-
-First, specify image data list in `define_py_data_sources2` in the config, see example `demo/model_zoo/resnet/resnet.py`.
-
-```
-    train_list = 'train.list' if not is_test else None
-    # mean.meta is mean file of ImageNet dataset.
-    # mean.meta size : 3 x 224 x 224.
-    # If you use three mean value, set like:
-    # "mean_value:103.939,116.779,123.68;"
-    args={
-        'mean_meta': "model/mean_meta_224/mean.meta",
-        'image_size': 224, 'crop_size': 224,
-        'color': True,'swap_channel:': [2, 1, 0]}
-    define_py_data_sources2(train_list,
-                           'example/test.list',
-                           module="example.image_list_provider",
-                           obj="processData",
-                           args=args)
-```
-
-Second, specify layers to extract features in `Outputs()` of `resnet.py`. For example,
-
-```
-Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-```
-
-Third, specify model path and output directory in `extract_fea_c++.sh`, and then run the following commands.
-
-```
-cd demo/model_zoo/resnet
-./extract_fea_c++.sh
-```
-
-If successful, the features are saved in `fea_output/rank-00000` as follows. You can use the `load_feature_c` interface in `load_feature.py` to load such a file.
-
-```
--0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
--0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
-```
-
-* Each line stores the features of one sample. Here, the first line stores the features of `example/dog.jpg` and the second line stores the features of `example/cat.jpg`.
-* Features of different layers are separated by `;`, in the same order as the layers listed in `Outputs()`. Here, the left part is the feature of the `res5_3_branch2c_conv` layer and the right part is that of the `res5_3_branch2c_bn` layer.
-
-### Python Interface
-
-`demo/model_zoo/resnet/classify.py` is an example showing how to extract features with Python. The following example again uses the data in `./example/test.list`. Run the following commands:
-
-```
-cd demo/model_zoo/resnet
-./extract_fea_py.sh
-```
-
-extract_fea_py.sh:
-
-```
-python classify.py \
-     --job=extract \
-     --conf=resnet.py\
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
-
-```
-* \--job=extract:              specify the job mode as feature extraction.
-* \--conf=resnet.py:           the network config file.
-* \--use_gpu=1:                whether to use the GPU.
-* \--model=model/resnet_50:    the model path.
-* \--data=./example/test.list: the data list.
-* \--output_layer="xxx,xxx":   the layers to extract features from.
-* \--output_dir=features:      the output directory.
-
-If the run succeeds, the features are saved in `features/batch_0`, a file produced with cPickle. You can use the `load_feature_py` interface in `load_feature.py` to open it; it returns a dictionary like the following:
-
-```
-{
-'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
-'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
-}
-```
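-
-Loading and inspecting the saved dictionary can be sketched like this, assuming `load_feature_py` takes the file path as its argument:
-
-```python
-from load_feature import load_feature_py
-
-# Load the cPickle'd feature dictionary written by extract_fea_py.sh.
-feats = load_feature_py('features/batch_0')
-for name, layers in feats.items():
-    # Each entry maps an image name to {layer_name: feature array}.
-    print(name, {k: v.shape for k, v in layers.items()})
-```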
-
-A close look shows that these feature values are consistent with the results extracted by the C++ interface above.
-
-## Prediction
-
-`classify.py` can also be used for prediction. We provide an example script `predict.sh` to predict the data in `example/test.list` using a ResNet model with 50 layers.
-
-```
-cd demo/model_zoo/resnet
-./predict.sh
-```
-
-predict.sh calls `classify.py`:
-
-```
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --multi_crop \
-     --model=model/resnet_50 \
-     --use_gpu=1 \
-     --data=./example/test.list
-```
-* \--job=predict:              specify the job mode as prediction.
-* \--conf=resnet.py:           the network config file.
-* \--multi_crop:               use 10 crops and average the predicted probabilities.
-* \--use_gpu=1:                whether to use the GPU.
-* \--model=model/resnet_50:    the model path.
-* \--data=./example/test.list: the data list.
-
-If the run succeeds, you will see the following results, where 156 and 282 are the labels of the images.
-
-```
-Label of example/dog.jpg is: 156
-Label of example/cat.jpg is: 282
-```
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
deleted file mode 100644
index 6a27004d58..0000000000
--- a/doc/tutorials/index_cn.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Tutorials
-
-* [Quick Start](quick_start/index_cn.rst)
-* [Personalized Recommendation](rec/ml_regression_cn.rst)
-* [Image Classification](image_classification/index_cn.md)
-* [Sentiment Analysis](sentiment_analysis/index_cn.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_cn.md)
-* [Machine Translation](text_generation/index_cn.md)
-
-## Common Models
-
-* [ResNet Model](imagenet_model/resnet_model_cn.md)
-* [Word Embedding Model](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
deleted file mode 100644
index 77331a703b..0000000000
--- a/doc/tutorials/index_en.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# TUTORIALS
-This section collects several examples and demos.
-
-* [Quick Start](quick_start/index_en.md)
-* [MovieLens Regression](rec/ml_regression_en.rst)
-* [Image Classification](image_classification/index_en.md)
-* [Sentiment Analysis](sentiment_analysis/index_en.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_en.md)
-* [Text Generation](text_generation/index_en.md)
-* [Image Auto-Generation](gan/index_en.md)
-
-## Model Zoo
-* [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
-* [Embedding: Chinese Word](embedding_model/index_en.md)
diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/tutorials/quick_start/index_cn.rst
deleted file mode 100644
index d565fcf95e..0000000000
--- a/doc/tutorials/quick_start/index_cn.rst
+++ /dev/null
@@ -1,397 +0,0 @@
-====================
-Quick Start Tutorial
-====================
-
-Taking the `text classification problem <https://en.wikipedia.org/wiki/Document_classification>`_ as an example,
-this tutorial introduces the basic usage of PaddlePaddle.
-
-Install
-=======
-
-Please refer to :ref:`install_steps` to install PaddlePaddle.
-
-Overview
-========
-
-**Text classification problem**: given a piece of text, select its category from a predefined set of categories.
-
-For example, on a shopping website, the quality of a product can be assessed from the buyers' review feedback:
-
-- This monitor is fantastic! (positive)
-- The monitor screen broke after two months of use. (negative)
-
-With PaddlePaddle, every task can be divided into the following five steps.
-
-    ..  image:: src/Pipeline_cn.jpg
-        :align: center
-        :scale: 80%
-
-1. Prepare the data format
-    - In this example, each line stores one sample: the category id and the text are separated by a ``Tab``, and the words of the text are separated by spaces (if the text is not word-segmented, separate the characters by spaces), for example: ``category-id '\t' This monitor is great !``
-2. Feed the data to the system
-    - PaddlePaddle can execute user-provided Python scripts to read data files in various formats.
-    - In this example, every character is converted to an id represented by consecutive integers before being passed to the model.
-3. Describe the network structure and the optimization algorithm
-    - From easy to hard, this example shows 4 different text classification network configurations: a logistic regression model, a word embedding model, a convolutional model and a sequence model.
-    - Common optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, Adamax, etc. This example uses the Adam optimizer with L2 regularization and gradient clipping.
-4. Train the model
-5. Apply the model
-
-Prepare the data format
------------------------
-
-Next we show how to train a text classification model with PaddlePaddle that classifies `Amazon electronics reviews <http://jmcauley.ucsd.edu/data/amazon/>`_ into positive reviews (positive samples) and negative reviews (negative samples).
-The ``demo/quick_start`` directory of the `source code <https://github.com/PaddlePaddle/Paddle>`_ provides a download script and a preprocessing script for this data; just type the following commands to download the data and finish the preprocessing.
-
-.. code-block:: bash
-
-    cd demo/quick_start
-    ./data/get_data.sh
-    ./preprocess.sh
-
-After the preprocessing, a data-reading script like ``dataprovider_*.py`` and a model config script like ``trainer_config.*.py`` are set up, and PaddlePaddle wires them together through configuration parameters. We explain these two steps in detail next. You can also skip the explanations, jump straight to the model training section and start training with ``sh train.sh``,
-then read the contents of `train.sh` to understand PaddlePaddle's inner workings **bottom-up**.
-
-
-Feed the data to the system
-===========================
-
-Read data with a Python script
-------------------------------
-
-`DataProvider` is the PaddlePaddle module responsible for providing data. Its main job is to move the training data into CPU or GPU memory so the model can be trained and updated. It consists of two functions:
-
-* initializer: PaddlePaddle calls the initializer function before invoking the data-reading Python script. In the example below, we initialize the word dictionary in the initializer function and fill it while the data is read.
-* process: PaddlePaddle calls the process function to read data. Each time a sample is read, the process function emits it with a yield statement so that it can be harvested by PaddlePaddle.
-
-``dataprovider_bow.py`` gives the complete example:
-
-..  literalinclude:: ../../../demo/quick_start/dataprovider_bow.py
-     :language: python
-     :lines: 21-70
-     :linenos:
-     :emphasize-lines: 8,33
-
-See :ref:`api_dataprovider` for details.
-
-Define data loading in the config
----------------------------------
-
-Load the data through the ``define_py_data_sources2`` interface in the model config:
-
-..  literalinclude:: ../../../demo/quick_start/trainer_config.emb.py
-     :language: python
-     :lines: 19-35
-     :linenos:
-     :emphasize-lines: 12
-
-
-The data loading above is explained as follows:
-
-- data/train.list, data/test.list: specify the training data and test data
-- module="dataprovider_bow": the Python script that processes the data
-- obj="process": the function that generates the data
-- args={"dictionary": word_dict}: extra arguments, here the dictionary
-
-See :ref:`api_pydataprovider2` for more detailed data formats and use cases.
-
-Model network structure
-=======================
-
-This section introduces the model network structures.
-
-    ..  image:: src/PipelineNetwork_cn.jpg
-        :align: center
-        :scale: 80%
-
-
-We start from the most basic logistic regression network and gradually demonstrate more advanced functionality. For more detailed network configurations, please refer to :ref:`api_trainer_config_helpers_layers`.
-All the configurations can be found in the ``demo/quick_start`` directory of the `source code <https://github.com/PaddlePaddle/Paddle>`_.
-
-Logistic regression model
--------------------------
-
-The flow is as follows:
-
-    ..  image:: src/NetLR_cn.jpg
-        :align: center
-        :scale: 80%
-
-- Get each word as a `one-hot vector <https://en.wikipedia.org/wiki/One-hot>`_ representation, whose dimension is the dictionary size
-
-    .. code-block:: python
-
-        word = data_layer(name="word",  size=word_dim)
-
-- Get the category id of the sample, whose dimension is the number of categories.
-
-    .. code-block:: python
-
-        label = data_layer(name="label", size=label_dim)
-
-- Classify the vector with a logistic regression model, computing the classification accuracy at the same time
-
-    .. code-block:: python
-
-        # Define a fully connected layer with logistic activation (also called softmax activation).
-        output = fc_layer(input=word,
-                        size=label_dim,
-                        act_type=SoftmaxActivation())
-        # Define cross-entropy classification loss and error.
-        classification_cost(input=output, label=label)
-
-
- - input: every layer except the data layer has one or more inputs; multiple inputs are passed as a list
- - size: the number of neurons in the layer
- - act_type: the activation function type
-
-**Performance summary**: the training and prediction scripts are introduced later. To make it easy to compare the different network structures, we summarize the complexity and accuracy of each network here.
-
-    =====================  ===============================  =================
-    Network                Parameters                       Error rate
-    =====================  ===============================  =================
-    Logistic regression    252 KB                           8.652 %
-    =====================  ===============================  =================
-
-Word embedding model
---------------------
-
-The embedding model needs a slightly different data-provider script, ``dataprovider_emb.py``, which is shared by the word embedding model,
-the convolutional model and the sequence model. The text input type is defined as the integer sequence type integer_value_sequence.
-
-.. code-block:: python
-
-    def initializer(settings, dictionary, **kwargs):
-        settings.word_dict = dictionary
-        settings.input_types = [
-            # Define the type of the first input as sequence of integer.
-            # The value of the integers range from 0 to len(dictrionary)-1
-            integer_value_sequence(len(dictionary)),
-            # Define the second input for label id
-            integer_value(2)]
-
-    @provider(init_hook=initializer)
-    def process(settings, file_name):
-        ...
-        # omitted, it is same as the data provider for LR model
-
-This model still follows the framework of the logistic regression network; it only replaces the sparse-vector sentence representation with a dense-vector one, i.e. it replaces step 3. The sentence representation is now computed in two steps:
-
-..  image:: src/NetContinuous_cn.jpg
-    :align: center
-    :scale: 80%
-
-- Look up the dense vector (of dimension word_dim) corresponding to each word id; for N input words, the output is N vectors of dimension word_dim
-
-    .. code-block:: python
-
-        emb = embedding_layer(input=word, size=word_dim)
-
-- Average all the word vectors contained in the sentence to obtain the sentence representation
-
-    .. code-block:: python
-
-        avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-
-The other parts are identical to the logistic regression network.
-
-**Performance summary:**
-
-    =====================  ===============================  ==================
-    Network                Parameters                       Error rate
-    =====================  ===============================  ==================
-    Word embedding model   15 MB                            8.484 %
-    =====================  ===============================  ==================
-
-Convolutional model
--------------------
-
-A convolutional network is a special way of going from word-vector representations to a sentence representation; it evolves the word embedding model into three new steps.
-
-..  image:: src/NetConv_cn.jpg
-    :align: center
-    :scale: 80%
-
-Text convolution consists of three steps:
-
-1. First, fetch the k neighboring words on each side of every word and concatenate them into a new vector;
-
-2. Next, apply a nonlinear transformation (e.g. a Sigmoid) to the vector, turning it into a new vector of dimension hidden_dim;
-
-3. Finally, take the maximum over the whole set of new vectors in each dimension to represent the sentence.
-
-These three steps can be configured as:
-
-.. code-block:: python
-
-    text_conv = sequence_conv_pool(input=emb,
-                                context_start=k,
-                                context_len=2 * k + 1)
-
-**Performance summary:**
-
-    =====================  ===============================  ========================
-    Network                Parameters                       Error rate
-    =====================  ===============================  ========================
-    Convolutional model    16 MB                            5.628 %
-    =====================  ===============================  ========================
-
-Sequence model
---------------
-
-..  image:: src/NetRNN_cn.jpg
-    :align: center
-    :scale: 80%
-
-Sequence models, also called RNN models, include the simple `RNN model <https://en.wikipedia.org/wiki/Recurrent_neural_network>`_, the `GRU model <https://en.wikipedia.org/wiki/Gated_recurrent_unit>`_, the `LSTM model <https://en.wikipedia.org/wiki/Long_short-term_memory>`_ and so on.
-
-- GRU model configuration:
-
-    .. code-block:: python
-
-        gru = simple_gru(input=emb, size=gru_size)
-
-
-- LSTM model configuration:
-
-    .. code-block:: python
-
-        lstm = simple_lstm(input=emb, size=lstm_size)
-
-In this experiment we use a single-layer LSTM model with Dropout. **Performance summary:**
-
-    =====================  ===============================  =========================
-    Network                Parameters                       Error rate
-    =====================  ===============================  =========================
-    Sequence model         16 MB                            4.812 %
-    =====================  ===============================  =========================
-
-Optimization algorithm
-======================
-
-`Optimization algorithms <http://www.paddlepaddle.org/doc/ui/api/trainer_config_helpers/optimizers_index.html>`_ include
-Momentum, RMSProp, AdaDelta, AdaGrad, Adam, Adamax, etc. Here we use the Adam optimizer together with L2 regularization and gradient clipping.
-
-.. code-block:: python
-
-    settings(batch_size=128,
-            learning_rate=2e-3,
-            learning_method=AdamOptimizer(),
-            regularization=L2Regularization(8e-4),
-            gradient_clipping_threshold=25)
-
-Train the model
-===============
-
-With the data loading and the network configured, we can now train the model.
-
-..  image:: src/PipelineTrain_cn.jpg
-    :align: center
-    :scale: 80%
-
-To train the model, we only need to run the ``train.sh`` training script:
-
-    .. code-block:: bash
-
-        ./train.sh
-
-``train.sh`` contains the basic commands for training the model. The main parameters to set during training are as follows:
-
-    .. code-block:: bash
-
-        paddle train \
-        --config=trainer_config.py \
-        --log_period=20 \
-        --save_dir=./output \
-        --num_passes=15 \
-        --use_gpu=false
-
-This only briefly covers single-machine training; for distributed training, please refer to :ref:`cluster_train`.
-
-Prediction
-==========
-
-Once the model is trained, we can use it for prediction.
-
-..  image:: src/PipelineTest_cn.jpg
-    :align: center
-    :scale: 80%
-
-The data specified by ``test.list`` in the earlier config file will be tested; here we simply predict through the prediction script ``predict.sh``.
-For a more detailed explanation, please refer to :ref:`api_swig_py_paddle` .
-
-    .. code-block:: bash
-
-        model="output/pass-00003"
-        paddle train \
-            --config=trainer_config.lstm.py \
-            --use_gpu=false \
-            --job=test \
-            --init_model_path=$model \
-            --config_args=is_predict=1 \
-            --predict_output_dir=. \
-
-        mv rank-00000 result.txt
-
-Here we predict with ``output/pass-00003`` as an example; based on the training log, you can select the model with the best test results for prediction.
-
-The predictions are saved as text in ``result.txt``, one line per sample, in the following format:
-
-    .. code-block:: bash
-
-        predicted-id;probability-of-id-0 probability-of-id-1
-        predicted-id;probability-of-id-0 probability-of-id-1
-
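-A few lines of Python suffice to read this format back (a minimal parsing sketch for the layout above):
-
-    .. code-block:: python
-
-        # Parse result.txt: "predicted-id;prob-of-0 prob-of-1" per line.
-        with open('result.txt') as f:
-            for line in f:
-                pred, probs = line.strip().split(';')
-                p0, p1 = (float(v) for v in probs.split())
-                print(int(pred), p0, p1)
-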
-Overall performance summary
-===========================
-
-All the data, network configurations and training scripts used here can be found under ``/demo/quick_start``.
-The following table summarizes the performance of the networks above on the Amazon-Elec test set (25k):
-
-    =====================  ===============================  =============  ==================================
-    Network                Parameters                       Error rate     Config file
-    =====================  ===============================  =============  ==================================
-    Logistic regression    252 KB                           8.652%         trainer_config.lr.py
-    Word embedding model   15 MB                            8.484%         trainer_config.emb.py
-    Convolutional model    16 MB                            5.628%         trainer_config.cnn.py
-    Sequence model         16 MB                            4.812%         trainer_config.lstm.py
-    =====================  ===============================  =============  ==================================
-
-
-Appendix
-========
-
-Command-line arguments
-----------------------
-
-* \--config: the network configuration
-* \--save_dir: the directory for saving models
-* \--log_period: print a log every this many batches
-* \--num_passes: the number of training passes; one pass goes through all training samples once
-* \--config_args: arguments given on the command line are passed into the network configuration.
-* \--init_model_path: the path of the initialization model, usable to specify an initial model for testing or training.
-
-By default a model is saved once per pass; a model can also be saved every given number of batches via saving_period_by_batches.
-Parameter statistics and other information can be printed via show_parameter_stats_period.
-For other arguments, please refer to the command-line-argument documentation (link to be added).
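-
-For example, a training command that saves a model every 500 batches and prints parameter statistics every 100 batches could look like the following (the flag values are illustrative):
-
-    .. code-block:: bash
-
-        paddle train \
-        --config=trainer_config.py \
-        --save_dir=./output \
-        --saving_period_by_batches=500 \
-        --show_parameter_stats_period=100 \
-        --num_passes=15 \
-        --use_gpu=false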
-
-Log output
-----------
-
-.. code-block:: bash
-
-    TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-
-During model training you will see log lines like the one above. The fields are explained in the following table:
-
-    ============================================  ==============================================================
-    Name                                          Meaning
-    ============================================  ==============================================================
-    Batch=20                                      20 batches have been processed
-    samples=2560                                  2560 samples have been processed
-    AvgCost                                       average cost over all samples from the 0th batch of this pass to the current batch
-    CurrentCost                                   average cost over all samples of the last log_period batches
-    Eval: classification_error_evaluator         average classification error from the 0th batch of this pass to the current batch
-    CurrentEval: classification_error_evaluator   average classification error of the last log_period batches
-    ============================================  ==============================================================
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
deleted file mode 100644
index ca110431cf..0000000000
--- a/doc/tutorials/quick_start/index_en.md
+++ /dev/null
@@ -1,562 +0,0 @@
-# Quick Start
-
-This tutorial will teach the basics of deep learning (DL), including how to implement many different models in PaddlePaddle. You will learn how to:
-  - Prepare data into the standardized format that PaddlePaddle accepts.
-  - Write data providers that read data into PaddlePaddle.
-  - Configure neural networks in PaddlePaddle layer by layer.
-  - Train models.
-  - Perform inference with trained models.
-
-
-## Install
-
-To get started, please install PaddlePaddle on your computer. Throughout this tutorial, you will learn by implementing different DL models for text classification.
-
-To install PaddlePaddle, please follow the instructions here: <a href = "../../getstarted/build_and_install/index_en.html" >Build and Install</a>.
-
-## Overview
-For the first step, you will use PaddlePaddle to build a **text classification** system. For example, suppose you run an e-commerce website, and you want to analyze the sentiment of user reviews to evaluate product quality.
-
-For example, given the input
-
-```
-This monitor is fantastic.
-```
-
-Your classifier should output “positive”, since this text snippet shows that the user is satisfied with the product. Given this input:
-
-```
-The monitor breaks down two months after purchase.
-```
-
-the classifier should output “negative“.
-
-To build your text classification system, your code will need to perform five steps:
-<center> ![](./src/Pipeline_en.jpg) </center>
-
-  - Preprocess data into a standardized format.
-  - Provide data to the learning model.
-  - Specify the neural network structure.
-  - Train the model.
-  - Inference (make prediction on test examples).
-
-
-1. Preprocess data into standardized format
-    - In the text classification example, you will start with a text file with one training example per line. Each line contains category id (in machine learning, often denoted the target y), followed by the input text (often denoted x); these two elements are separated by a Tab. For example: ```positive [tab] This monitor is fantastic```. You will preprocess this raw data into a format that Paddle can use.
-
-2. Provide data to the learning model.
-    - You can write data providers in Python. For any required data preprocessing step, you can add the preprocessing code to the PyDataProvider Python file.
-    - In our text classification example, every word or character will be converted into an integer id, as specified in a dictionary file. The provider performs a dictionary lookup in PyDataProvider to get the id.
-3. Specify neural network structure.  (From easy to hard, we provide 4 kinds of network configurations)
-    - A logistic regression model.
-    - A word embedding model.
-    - A convolutional neural network model.
-    - A sequential recurrent neural network model.
-    - You will also learn different learning algorithms.
-4. Train the model.
-5. Inference.
-
-## Preprocess data into standardized format
-In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.
-
-`demo/quick_start` in the [source code](https://github.com/PaddlePaddle/Paddle) provides a script for downloading the preprocessed data, as shown below. (If you want to process the raw data yourself, you can use the script `demo/quick_start/data/proc_from_raw_data/get_data.sh`.)
-
-```bash
-cd demo/quick_start
-./data/get_data.sh
-```
-
-## Transfer Data to Model
-### Write Data Provider with Python
-The following `dataprovider_bow.py` gives a complete example of writing a data provider in Python. It includes the following parts:
-
-* initializer: defines the additional meta-data of the data provider and the types of the input data.
-* process: each `yield` returns a data sample, in this case the text representation and the category id. The order of features in the returned result must be consistent with the definition of the input types in `initializer`.
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and set up the
-# necessary data structures for later use.
-# `settings` is an object. initializer needs to properly fill settings.input_types.
-# initializer can also store other data structures needed at process() time.
-# In this example, the dictionary is stored in settings.
-# `dictionary` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # settings.input_types specifies the data types that the data provider
-    # generates.
-    settings.input_types = [
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        integer_value(2)]
-
-# Declaring a data provider that uses the initializer defined above.
-# It will cache the generated data of the first pass in memory, so that
-# no on-the-fly data generation is needed during later passes.
-# `settings` is the same object used by initializer()
-# `file_name` is the name of a file listed in the train_list or test_list file
-# given to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield word_vector, int(label)
-```
-
-### Define Python Data Provider in Configuration files.
-You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
-
-- The path of the training and testing data (`data/train.list`, `data/test.list`).
-- The location of the data provider file (`dataprovider_bow`).
-- The function to call to get data. (`process`).
-- Additional arguments or data. Here it passes the path of word dictionary.
-
-```python
-from paddle.trainer_config_helpers import *
-
-dict_file = "data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includes word IDs.
-define_py_data_sources2(train_list='data/train.list',
-                        test_list='data/test.list',
-                        module="dataprovider_bow",
-                        obj="process",
-                        args={"dictionary": word_dict})
-```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../api/v1/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
-
-## Network Architecture
-We will describe four kinds of network architectures in this section.
-<center> ![](./src/PipelineNetwork_en.jpg) </center>
-
-First, you will build a logistic regression model. Later, you will also get the chance to build other, more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../api/v1/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
-
-### Logistic Regression
-The architecture is illustrated in the following picture:
-<center> ![](./src/NetLR_en.png) </center>
-
-- You need to define the data for the text features. The size of the data layer is the number of words in the dictionary.
-
-```python
-word = data_layer(name="word",  size=voc_dim)
-```
-
-- You also need to define the category id for each example. The size of the data layer is the number of labels.
-
-```python
-label = data_layer(name="label", size=label_dim)
-```
-
-- We use a logistic regression model to classify the vector; the classification error is reported during training.
-    - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
-    - *size* for each layer means the number of neurons of the layer.
-    - *act_type* means the activation function applied to the output of each neuron independently.
-    - Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
-```python
-# Define a fully connected layer with softmax activation (a multi-class generalization of logistic activation).
-output = fc_layer(input=word,
-                  size=label_dim,
-                  act_type=SoftmaxActivation())
-# Define cross-entropy classification loss and error.
-classification_cost(input=output, label=label)
-```
-
-Performance summary: You can refer to the training and testing scripts later. In order to compare different network architectures, the model complexity and test classification error are listed in the following table:
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Logistic regression</td>
-<td class="left">252 KB</td>
-<td class="left">8.652%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### Word Embedding Model
-In order to use the word embedding model, you need to change the data provider a little bit so that the input words are represented as a sequence of word IDs. The revised data provider `dataprovider_emb.py` is listed below. You only need to change initializer() for the type of the first input: it is changed from sparse_binary_vector to a sequence of integers. process() remains the same. This data provider can also be used for the later sequence models.
-
-```python
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-    settings.input_types = [
-        # Define the type of the first input as a sequence of integers.
-        integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        integer_value(2)]
-
-@provider(init_hook=initializer)
-def process(settings, file_name):
-    ...
-    # omitted, it is same as the data provider for LR model
-```
-
-This model is very similar to the logistic regression framework, but it uses word embedding vectors instead of sparse vectors to represent words.
-<center> ![](./src/NetContinuous_en.png) </center>
-
-- It looks up the dense word embedding vector for each word in the dictionary (the dimension of each word embedding vector is `word_dim`). The input is a sequence of N words, and the output is N word_dim-dimensional vectors.
-
-```python
-emb = embedding_layer(input=word, dim=word_dim)
-```
-
-- It averages all the word embeddings in the sentence to get the sentence representation.
-
-```python
-avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-```
-
-The other parts of the model are the same as in the logistic regression network.
-
-The performance is summarized in the following table:
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Word embedding model</td>
-<td class="left">15 MB</td>
-<td class="left">8.484%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### Convolutional Neural Network Model
-A convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model into 3 new sub-steps.
-<center> ![](./src/NetConv_en.png) </center>
-
-
-Text convolution has 3 steps:
-1. Get the K nearest neighbor context of each word in the sentence and stack them into a 2D vector representation.
-2. Apply temporal convolution to this representation to produce a new hidden_dim-dimensional vector.
-3. Apply max-pooling to the new vectors at all time steps in the sentence to get the sentence representation.
-
-```python
-# context_len is the convolution kernel size.
-# context_start is the start offset of the convolution window. It can be negative; in that case, zero padding is applied.
-# A window of size 2k+1 centered on each word starts at offset -k.
-text_conv = sequence_conv_pool(input=emb,
-                               context_start=-k,
-                               context_len=2 * k + 1)
-```
-
-The performance is summarized in the following table:
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Convolutional model</td>
-<td class="left">16 MB</td>
-<td class="left">5.628%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### Recurrent Model
-<center> ![](./src/NetRNN_en.png) </center>
-
-You can use a recurrent neural network as the time sequence model; options include the simple RNN, GRU, and LSTM models.
-
-- A GRU model can be specified via:
-
-```python
-gru = simple_gru(input=emb, size=gru_size)
-```
-
-- An LSTM model can be specified via:
-
-```python
-lstm = simple_lstm(input=emb, size=lstm_size)
-```
-
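-One way to attach the dropout mentioned below is through an extra layer attribute on the LSTM cell. This is a minimal sketch, assuming the v1 config helpers' `ExtraAttr` (an alias of `ExtraLayerAttribute`) and its `drop_rate` argument apply here; it is not copied from the demo config:
-
-```python
-# Sketch only: the lstm_cell_attr=ExtraAttr(drop_rate=...) usage is an assumption.
-lstm = simple_lstm(input=emb,
-                   size=lstm_size,
-                   lstm_cell_attr=ExtraAttr(drop_rate=0.25))
-```
-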
-You can use a single-layer LSTM model with dropout (as sketched above) for this text classification problem. The performance is summarized in the following table:
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Test error</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Recurrent model</td>
-<td class="left">16 MB</td>
-<td class="left">4.812%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-## Optimization Algorithm
-<a href = "../../api/v1/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use the Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been shown to work very well for training recurrent neural networks.
-
-```python
-settings(batch_size=128,
-         learning_rate=2e-3,
-         learning_method=AdamOptimizer(),
-         regularization=L2Regularization(8e-4),
-         gradient_clipping_threshold=25)
-```
-
-## Training Model
-After completing data preparation and network architecture specification, you will run the training script.
-<center> ![](./src/PipelineTrain_en.png) </center>
-
-Training script: the training script is `train.sh`. The training arguments are listed below:
-
-```bash
-paddle train \
---config=trainer_config.py \
---log_period=20 \
---save_dir=./output \
---num_passes=15 \
---use_gpu=false
-```
-
-We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/usage/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
-
-## Inference
-You can use the trained model to perform prediction on a dataset with no labels. You can also evaluate the model on a dataset with labels to obtain its test accuracy.
-<center> ![](./src/PipelineTest_en.png) </center>
-
-The test script is listed below. PaddlePaddle can evaluate a model on the data with labels specified in `test.list`.
-
-```bash
-paddle train \
---config=trainer_config.lstm.py \
---use_gpu=false \
---job=test \
---init_model_path=./output/pass-0000x
-```
-
-We will give an example of performing prediction using the Recurrent model on a dataset with no labels. You can refer to the <a href = "../../api/v1/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial, or other <a href = "../../tutorials/index_en.html">demos</a>, for the prediction process using Python. You can also use the following script for inference or evaluation.
-
-Inference script (predict.sh):
-
-```bash
-model="output/pass-00003"
-paddle train \
-    --config=trainer_config.lstm.py \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=.
-
-mv rank-00000 result.txt
-```
-You can choose the best model based on the training log instead of the model `output/pass-00003`. There are several differences between the training and inference network configurations.
-- You do not need labels during inference.
-- Outputs need to be set to the classification probability layer (the output of the softmax layer), or to the id of the maximum-probability label (the `max_id` layer). An example that outputs both the id and the probability is given in the code snippet below.
-- batch_size = 1.
-- You need to specify the location of the test data with `test_list`.
-
-The results in `result.txt` are as follows; each line is one sample.
-
-```
-predicted_label_id;probability_of_label_0 probability_of_label_1  # the first sample
-predicted_label_id;probability_of_label_0 probability_of_label_1  # the second sample
-```
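-
-A small post-processing script can turn this output into labels and confidences. The snippet below is an illustrative sketch: it assumes the `label;p0 p1` layout shown above (the trailing `#` comments in the sample are explanatory annotations, not part of the real file):
-
-```python
-# Parse each "predicted_label_id;prob_0 prob_1" line of result.txt.
-with open('result.txt') as f:
-    for line in f:
-        label, probs = line.strip().split(';')
-        prob_0, prob_1 = map(float, probs.split())
-        print('label=%s  P(0)=%.4f  P(1)=%.4f' % (label, prob_0, prob_1))
-```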
-
-
-```python
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-obj = 'process' if not is_predict else 'process_pre'
-batch_size = 128 if not is_predict else 1
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid,output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-```
-
-## Summary
-The scripts for data downloading, network configuration, and training are in `/demo/quick_start`. The following table summarizes the performance of our network architectures on the Amazon-Elec dataset (25k):
-
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Network name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">Error rate</th>
-<th scope="col" class="left">Configuration file name</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Logistic regression model(BOW)</td>
-<td class="left"> 252KB </td>
-<td class="left">8.652%</td>
-<td class="left">trainer_config.lr.py</td>
-</tr>
-
-<tr>
-<td class="left">Word embedding</td>
-<td class="left"> 15MB </td>
-<td class="left"> 8.484%</td>
-<td class="left">trainer_config.emb.py</td>
-</tr>
-
-<tr>
-<td class="left">Convolution model</td>
-<td class="left"> 16MB </td>
-<td class="left"> 5.628%</td>
-<td class="left">trainer_config.cnn.py</td>
-</tr>
-
-<tr>
-<td class="left">Time sequence model</td>
-<td class="left"> 16MB </td>
-<td class="left"> 4.812%</td>
-<td class="left">trainer_config.lstm.py</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
-
-## Appendix
-### Command Line Argument
-
-* \--config: network architecture configuration file.
-* \--save_dir: directory to save the model to.
-* \--log_period: the logging period in batches.
-* \--num_passes: number of training passes. One pass means going over the whole training dataset once.
-* \--config_args: other configuration arguments.
-* \--init_model_path: the path of the initial model parameters.
-
-By default, the trainer saves the model after every pass. You can also specify `saving_period_by_batches` to save the model every so many batches. You can use `show_parameter_stats_period` to print parameter statistics, which are very useful for tuning. Other command line arguments can be found in the <a href = "../../howto/usage/cmd_parameter/index_en.html">command line argument documentation</a>.
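-
-For example, the two flags mentioned above can be appended to the training command. The values here are arbitrary placeholders for illustration, not recommended settings:
-
-```bash
-paddle train \
---config=trainer_config.py \
---save_dir=./output \
---saving_period_by_batches=500 \
---show_parameter_stats_period=100 \
---num_passes=15
-```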
-
-### Log
-
-```
-TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-```
-During model training, you will see log lines like the example above. Their fields are explained in the following table:
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<tr>
-<th scope="col" class="left">Name</th>
-<th scope="col" class="left">Explanation</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">Batch=20</td>
-<td class="left"> You have trained 20 batches. </td>
-</tr>
-
-<tr>
-<td class="left">samples=2560</td>
-<td class="left"> You have trained 2560 examples. </td>
-</tr>
-
-<tr>
-<td class="left">AvgCost</td>
-<td class="left"> The average cost from the first batch to the current batch. </td>
-</tr>
-
-<tr>
-<td class="left">CurrentCost</td>
-<td class="left"> The average cost of the last log_period batches. </td>
-</tr>
-
-<tr>
-<td class="left">Eval: classification_error_evaluator</td>
-<td class="left"> The average classification error from the first batch to the current batch.</td>
-</tr>
-
-<tr>
-<td class="left">CurrentEval: classification_error_evaluator</td>
-<td class="left"> The average classification error of the last log_period batches. </td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
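-
-If you want to track these quantities programmatically, a simple regular expression over the log lines suffices. A sketch based on the sample line above:
-
-```python
-import re
-
-# Extract Batch, samples, AvgCost and CurrentCost from a trainer log line.
-LOG_RE = re.compile(r'Batch=(\d+) samples=(\d+) AvgCost=([\d.]+) CurrentCost=([\d.]+)')
-
-line = ('TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 '
-        'CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  '
-        'CurrentEval: classification_error_evaluator=0.304297')
-m = LOG_RE.search(line)
-if m:
-    batch, samples = int(m.group(1)), int(m.group(2))
-    avg_cost, current_cost = float(m.group(3)), float(m.group(4))
-    print(batch, samples, avg_cost, current_cost)
-```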
diff --git a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
deleted file mode 100755
index b18e452a48..0000000000
Binary files a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetContinuous_en.png b/doc/tutorials/quick_start/src/NetContinuous_en.png
deleted file mode 100644
index 7bdef1aa36..0000000000
Binary files a/doc/tutorials/quick_start/src/NetContinuous_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetConv_cn.jpg b/doc/tutorials/quick_start/src/NetConv_cn.jpg
deleted file mode 100755
index 0f5ebfa52f..0000000000
Binary files a/doc/tutorials/quick_start/src/NetConv_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetConv_en.png b/doc/tutorials/quick_start/src/NetConv_en.png
deleted file mode 100644
index ad618d1d6f..0000000000
Binary files a/doc/tutorials/quick_start/src/NetConv_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetLR_cn.jpg b/doc/tutorials/quick_start/src/NetLR_cn.jpg
deleted file mode 100755
index ee65d1f412..0000000000
Binary files a/doc/tutorials/quick_start/src/NetLR_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetLR_en.png b/doc/tutorials/quick_start/src/NetLR_en.png
deleted file mode 100644
index 9d514bf1b1..0000000000
Binary files a/doc/tutorials/quick_start/src/NetLR_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetRNN_cn.jpg b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
deleted file mode 100755
index f8bc081827..0000000000
Binary files a/doc/tutorials/quick_start/src/NetRNN_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/NetRNN_en.png b/doc/tutorials/quick_start/src/NetRNN_en.png
deleted file mode 100644
index 180f273d32..0000000000
Binary files a/doc/tutorials/quick_start/src/NetRNN_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
deleted file mode 100755
index 7e68891d7a..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
deleted file mode 100644
index e779aed06d..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
deleted file mode 100755
index 01715db886..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineTest_en.png b/doc/tutorials/quick_start/src/PipelineTest_en.png
deleted file mode 100644
index 7e7ef520b5..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineTest_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
deleted file mode 100755
index 8049d3e53c..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/PipelineTrain_en.png b/doc/tutorials/quick_start/src/PipelineTrain_en.png
deleted file mode 100644
index 132d29bfd5..0000000000
Binary files a/doc/tutorials/quick_start/src/PipelineTrain_en.png and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/Pipeline_cn.jpg b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
deleted file mode 100755
index d5d99253ea..0000000000
Binary files a/doc/tutorials/quick_start/src/Pipeline_cn.jpg and /dev/null differ
diff --git a/doc/tutorials/quick_start/src/Pipeline_en.jpg b/doc/tutorials/quick_start/src/Pipeline_en.jpg
deleted file mode 100644
index 21a7a7bb6a..0000000000
Binary files a/doc/tutorials/quick_start/src/Pipeline_en.jpg and /dev/null differ
diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md
deleted file mode 100644
index 2207a776f0..0000000000
--- a/doc/tutorials/rec/ml_dataset_cn.md
+++ /dev/null
@@ -1,105 +0,0 @@
-```eval_rst
-.. _demo_ml_dataset:
-
-```
-
-# MovieLens Dataset
-
-The [MovieLens dataset](http://grouplens.org/datasets/movielens/) was collected by the GroupLens Research lab.
-The dataset contains user information, movie information, and movie ratings in the range \[1-5\]. It comes in several versions, depending on size.
-We use the [MovieLens 1M dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as the example
-dataset, which contains 1,000,000 ratings from 6,000 users on 4,000 movies. It was released in February 2003.
-
-## Dataset Features
-
-The [ml-1m dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) contains many features. The data files
-(with the ".dat" suffix) in the [ml-1m dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) are essentially
-CSV files whose delimiter is "::". Below we translate the descriptions from the dataset's README file:
-
-### Ratings File Description (ratings.dat)
-
-
-All ratings are contained in the file "ratings.dat", in the following format:
-
-UserID::MovieID::Rating::Timestamp
-
-- UserIDs range from 1 to 6040
-- MovieIDs range from 1 to 3952
-- Ratings are made on a 5-star scale (whole-star ratings only)
-- Timestamps are in seconds since 1970-01-01 (UTC), consistent with the return value of time(2)
-- Each user has at least 20 ratings
-
-### Users File Description (users.dat)
-
-All user information is contained in the file "users.dat", in the following format:
-
-UserID::Gender::Age::Occupation::Zip-code
-
-All demographic information was provided voluntarily by the users and was not
-checked for accuracy. Only users who provided demographic information are included in the dataset.
-
-- Gender is denoted by "M" for male and "F" for female
-- Age is chosen from the following ranges:
-
-	*   1:	"Under 18"
-	*  18:	"18-24"
-	*  25:	"25-34"
-	*  35:	"35-44"
-	*  45:	"45-49"
-	*  50:	"50-55"
-	*  56:	"56+"
-
-- Occupation is chosen from the following choices:
-
-	*   0:  "other" or not specified
-	*   1:  "academic/educator"
-	*   2:  "artist"
-	*   3:  "clerical/admin"
-	*   4:  "college/grad student"
-	*   5:  "customer service"
-	*   6:  "doctor/health care"
-	*   7:  "executive/managerial"
-	*   8:  "farmer"
-	*   9:  "homemaker"
-	*  10:  "K-12 student"
-	*  11:  "lawyer"
-	*  12:  "programmer"
-	*  13:  "retired"
-	*  14:  "sales/marketing"
-	*  15:  "scientist"
-	*  16:  "self-employed"
-	*  17:  "technician/engineer"
-	*  18:  "tradesman/craftsman"
-	*  19:  "unemployed"
-	*  20:  "writer"
-
-### Movies File Description (movies.dat)
-
-All movie information is contained in the file "movies.dat", in the following format:
-
-MovieID::Title::Genres
-
-- Titles (including year of release) are identical to those provided by IMDB
-- Genres are pipe-separated ('|') and are selected from the following genres (see the parsing sketch at the end of this section):
-
-	*	Action
-	*	Adventure
-	*	Animation
-	*	Children's
-	*	Comedy
-	*	Crime
-	*	Documentary
-	*	Drama
-	*	Fantasy
-	*	Film-Noir
-	*	Horror
-	*	Musical
-	*	Mystery
-	*	Romance
-	*	Sci-Fi
-	*	Thriller
-	*	War
-	*	Western
-
-- Some MovieIDs do not correspond to actual movies due to accidental duplicate entries and/or test entries
-- Movies were mostly entered by hand, so errors and inconsistencies may exist
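-
-As noted above, the genre field is pipe-separated. A minimal parsing sketch in Python (the `ml-1m/movies.dat` path follows the layout described earlier; the latin-1 file encoding is an assumption about the raw data, not something stated in this document):
-
-```python
-# Split each movies.dat record on "::" and its genre field on "|".
-with open('ml-1m/movies.dat', encoding='latin-1') as f:  # encoding is an assumption
-    for line in f:
-        movie_id, title, genres = line.rstrip('\n').split('::')
-        genre_list = genres.split('|')
-        print(movie_id, title, genre_list)
-```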
diff --git a/doc/tutorials/rec/ml_dataset_en.md b/doc/tutorials/rec/ml_dataset_en.md
deleted file mode 100644
index 25dea5c4af..0000000000
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ /dev/null
@@ -1,111 +0,0 @@
-```eval_rst
-..  _demo_ml_dataset:
-```
-
-# MovieLens Dataset
-
-The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research.
-The dataset contains user information, movie information, and movie ratings in the range \[1-5\].
-The dataset comes in several versions, depending on size.
-We use the [MovieLens 1M Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as a demo dataset, which contains
-1 million ratings from 6000 users on 4000 movies. It was released in February 2003.
-
-## Dataset Features
-
-The [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) contains many features.
-The data files (which have the ".dat" extension) in the [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip)
-are essentially CSV files whose delimiter is "::". We quote the description from the README here.
-
-### RATINGS FILE DESCRIPTION (ratings.dat)
-
-
-All ratings are contained in the file "ratings.dat" and are in the
-following format:
-
-UserID::MovieID::Rating::Timestamp
-
-- UserIDs range between 1 and 6040
-- MovieIDs range between 1 and 3952
-- Ratings are made on a 5-star scale (whole-star ratings only)
-- Timestamp is represented in seconds since the epoch as returned by time(2)
-- Each user has at least 20 ratings
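-
-Since the ".dat" files are "::"-delimited, loading them is a one-line split per record. Below is a short illustrative sketch for ratings.dat (the path assumes the ml-1m layout above; this is an example, not part of the dataset tooling):
-
-```python
-# Each ratings.dat record is UserID::MovieID::Rating::Timestamp.
-ratings = []
-with open('ml-1m/ratings.dat') as f:
-    for line in f:
-        user_id, movie_id, rating, timestamp = line.rstrip('\n').split('::')
-        ratings.append((int(user_id), int(movie_id), int(rating), int(timestamp)))
-print(len(ratings), 'ratings loaded')
-```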
-
-### USERS FILE DESCRIPTION (users.dat)
-
-User information is in the file "users.dat" and is in the following
-format:
-
-UserID::Gender::Age::Occupation::Zip-code
-
-All demographic information is provided voluntarily by the users and is
-not checked for accuracy.  Only users who have provided some demographic
-information are included in this data set.
-
-- Gender is denoted by a "M" for male and "F" for female
-- Age is chosen from the following ranges:
-
-	*  1:  "Under 18"
-	* 18:  "18-24"
-	* 25:  "25-34"
-	* 35:  "35-44"
-	* 45:  "45-49"
-	* 50:  "50-55"
-	* 56:  "56+"
-
-- Occupation is chosen from the following choices:
-
-	*  0:  "other" or not specified
-	*  1:  "academic/educator"
-	*  2:  "artist"
-	*  3:  "clerical/admin"
-	*  4:  "college/grad student"
-	*  5:  "customer service"
-	*  6:  "doctor/health care"
-	*  7:  "executive/managerial"
-	*  8:  "farmer"
-	*  9:  "homemaker"
-	* 10:  "K-12 student"
-	* 11:  "lawyer"
-	* 12:  "programmer"
-	* 13:  "retired"
-	* 14:  "sales/marketing"
-	* 15:  "scientist"
-	* 16:  "self-employed"
-	* 17:  "technician/engineer"
-	* 18:  "tradesman/craftsman"
-	* 19:  "unemployed"
-	* 20:  "writer"
-
-### MOVIES FILE DESCRIPTION (movies.dat)
-
-Movie information is in the file "movies.dat" and is in the following
-format:
-
-MovieID::Title::Genres
-
-- Titles are identical to titles provided by the IMDB (including
-year of release)
-- Genres are pipe-separated and are selected from the following genres:
-
-	* Action
-	* Adventure
-	* Animation
-	* Children's
-	* Comedy
-	* Crime
-	* Documentary
-	* Drama
-	* Fantasy
-	* Film-Noir
-	* Horror
-	* Musical
-	* Mystery
-	* Romance
-	* Sci-Fi
-	* Thriller
-	* War
-	* Western
-
-- Some MovieIDs do not correspond to a movie due to accidental duplicate
-entries and/or test entries
-- Movies are mostly entered by hand, so errors and inconsistencies may exist
diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst
deleted file mode 100644
index 9278c9f603..0000000000
--- a/doc/tutorials/rec/ml_regression_cn.rst
+++ /dev/null
@@ -1,349 +0,0 @@
-Regressing MovieLens Ratings
-=============================
-
-Here we describe a **cosine similarity regression** task on the MovieLens dataset.
-This example shows how paddle does (word) embedding, handles similarity
-regression and convolutional networks for text, and how paddle handles
-multiple types of input.
-Note that the model network is only a demo to show how paddle works;
-its structure has not been fine-tuned.
-
-
-**You are very welcome to build a better demo with PADDLEPADDLE, and please
-let us know how to make this demo better.**
-
-Data Preparation
-`````````````````
-Download and extract the dataset
-'''''''''''''''''''''''''''''''''
-We use :ref:`demo_ml_dataset` here.
-To download and extract the dataset, simply run the following commands.
-
-.. code-block:: bash
-
-	cd demo/recommendation/data
-	./ml_data.sh
-
-The directory structure of :code:`demo/recommendation/data/ml-1m` is:
-
-.. code-block:: text
-
-	+--ml-1m
-		+--- movies.dat 	# movie features
-		+--- ratings.dat 	# ratings
-		+--- users.dat 		# user features
-		+--- README 		# dataset description
-
-Field config file
-''''''''''''''''''
-The **field config file** specifies the fields of the dataset and the file format,
-i.e., **what** type each field in every feature file is.
-
-The field config file of ml-1m is :code:`demo/recommendation/data/config.json`.
-It specifies the field types and file names:
-
-1) there are four types of fields in the user file\: id, gender, age and occupation;
-
-2) the file is named "users.dat", and its delimiter is "::".
-
-.. include:: ../../../demo/recommendation/data/config.json
-   :code: json
-   :literal:
-
-Preprocess Data
-````````````````
-You need to install third-party python libraries.
-**It is strongly recommended to use VIRTUALENV to create a clean python environment.**
-
-.. code-block:: bash
-
-	pip install -r requirements.txt
-
-The general command for preprocessing the dataset is:
-
-.. code-block:: bash
-
-	cd demo/recommendation
-	./preprocess.sh
-
-The concrete preprocessing steps are introduced below.
-
-Extract movie/user features into python objects
-''''''''''''''''''''''''''''''''''''''''''''''''
-
-Movies and users have many features in the movielens 1m dataset.
-Each line of the rating file only provides a movie or user id to refer to the corresponding movie or user.
-We first process the movie/user feature files, and then pickle the feature (**Meta**) objects into a file.
-
-Meta config file
-.................
-
-The **meta config file** describes **how** to parse each field of the dataset.
-It can be generated from the field config file, or written by hand. Its format
-can be either json or yaml; the parser recognizes the format automatically by the file extension.
-
-To convert the field config file into a meta config file, just run:
-
-.. code-block:: bash
-
-	cd demo/recommendation/data
-	python config_generator.py config.json > meta_config.json
-
-The generated meta config file is shown below:
-
-.. include:: ../../../demo/recommendation/data/meta_config.json
-	:code: json
-	:literal:
-
-There are two kinds of features in the meta\: movie and user.
-
-* in the movie file movies.dat
-	* we just split each line by "::"
-	* pos 0 is the id
-	* pos 1 feature:
-		* its name is title
-		* a regular expression is used to parse this feature
-		* it is a character-based word embedding feature
-		* it is a sequence
-	* pos 2 feature:
-		* its name is genres
-		* its type is a one hot dense vector
-		* the dictionary is generated automatically while parsing; keys are separated by '|'
-* in the user file users.dat
-	* we just split each line by "::"
-	* pos 0 is the id
-	* pos 1 feature:
-		* its name is gender
-		* a simple character-based embedding
-	* pos 2 feature:
-		* its name is age
-		* a whole-word embedding
-		* embedding ids are sorted by word
-	* pos 3 feature:
-		* its name is occupation
-		* a simple whole-word embedding
-
-
-Meta file
-''''''''''
-
-With the meta config file we can generate the **meta file**, a python pickle object
-that stores the movie/user information. Run the following command to generate it.
-
-.. code-block:: bash
-
-	python meta_generator.py ml-1m meta.bin --config=meta_config.json
-
-The structure of the meta file :code:`meta.bin` is:
-
-.. code-block:: text
-
-    +--+ movie
-    |      +--+ __meta__
-    |      |       +--+ raw_meta  # meta config of each feature. a list
-    |      |       |       +
-    |      |       |       |     # id field, we use the id as key
-    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
-    |      |       |       |
-    |      |       |       |     # title field, the dictionary of the embedding feature
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
-    |      |       |       |
-    |      |       |       |     # genres field, the genres dictionary
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
-    |      |       |
-    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
-    |      |                               # it means there are 2 features for each key.
-    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
-    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
-    |      |
-    |      +--+ 1 # features of movie 1
-    |      |    +
-    |      |    +---+ [[...], [...]] # title ids, genres dense vector
-    |      |
-    |      +--+ 2
-    |      |
-    |      +--+ ...
-    |
-    +--- user
-           +--+ __meta__
-           |       +
-           |       +--+ raw_meta
-           |       |       +
-           |       |       +--+ id field as user
-           |       |       |
-           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
-           |       |
-           |       +--+ feature_map [1, 2, 3]
-           |
-           +--+ 1 # features of user 1
-           |
-           +--+ 2
-           +--+ ...
-
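-Since the meta file is a python pickle object, it can be inspected directly.
-A small sketch (the nested keys follow the structure pictured above; this
-snippet is illustrative and not part of the demo scripts):
-
-..  code-block:: python
-
-    import pickle
-
-    # Load meta.bin and look at the raw meta config of the movie id field.
-    with open('meta.bin', 'rb') as f:
-        meta = pickle.load(f)
-    print(meta['movie']['__meta__']['raw_meta'][0])
-    print(meta['movie'][1])  # features of movie 1: [title ids, genres vector]
-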
-
-Split training/testing files
-'''''''''''''''''''''''''''''
-
-We split the :code:`ml-1m/ratings.dat` file into a training file and a testing file. The split works as follows: for each user, we split his or her ratings into two parts,
-so every user in the testing file also has rating information in the training file.
-
-Use :code:`split.py` to separate the training and testing files.
-
-.. code-block:: bash
-
-	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
-
-This generates two files\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/ratings.dat.test`.
-Move them to the :code:`data` directory, shuffle the training file, and prepare the file lists for paddle training.
-
-..  code-block:: bash
-
-    shuf ml-1m/ratings.dat.train > ratings.dat.train
-    cp ml-1m/ratings.dat.test .
-    echo "./data/ratings.dat.train" > train.list
-    echo "./data/ratings.dat.test" > test.list
-
-
-Neural Network Configuration
-`````````````````````````````
-
-Trainer config file
-''''''''''''''''''''
-
-The network structure is shown in the following figure:
-
-..  image:: rec_regression_network.png
-    :align: center
-    :alt: rec_regression_network
-
-The example's neural network config file :code:`trainer_config.py` is shown below:
-
-..  literalinclude:: ../../../demo/recommendation/trainer_config.py
-    :language: python
-    :lines: 15-
-
-In :code:`trainer_config.py`, we simply map each feature type to a feature vector.
-The following shows how each feature is mapped to a vector.
-
-* :code:`id`\: just a simple embedding, followed by a fully connected layer.
-* :code:`embedding`\:
-    - if it is a sequence, take the embedding, apply a text convolution operation,
-      and then average pooling.
-    - if it is not a sequence, take the embedding and add a fully connected layer.
-* :code:`one_hot_dense`\:
-    - just two fully connected layers.
-
-We then combine all movie features into a single movie feature with a multi-input
-:code:`fc_layer`, do the same for the user features to obtain a single user feature,
-and compute the cosine similarity of the two features.
-
-In these networks, we use the following interfaces from :ref:`api_trainer_config` .
-
-*  Data layer, :ref:`api_trainer_config_helpers_layers_data_layer`
-*  Fully connected layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
-*  Embedding layer, :ref:`api_trainer_config_helpers_layers_embedding_layer`
-*  Context projection layer, :ref:`api_trainer_config_helpers_layers_context_projection`
-*  Pooling layer, :ref:`api_trainer_config_helpers_layers_pooling_layer`
-*  Cosine similarity layer, :ref:`api_trainer_config_helpers_layers_cos_sim`
-*  Text convolution pooling layer, :ref:`api_trainer_config_helpers_network_text_conv_pool`
-*  Declare python data sources, :ref:`api_trainer_config_helpers_data_sources`
-
-Data provider script
-'''''''''''''''''''''
-
-..  literalinclude:: ../../../demo/recommendation/dataprovider.py
-    :language: python
-    :lines: 15-
-
-The data provider simply reads meta.bin and the rating file and yields each training sample.
-In :code:`dataprovider.py`, we need to set:
-
-* obj.slots\: the types and dimensions of the features.
-* use_seq\: whether the data in :code:`dataprovider.py` is in sequence mode or not.
-* process\: yield each data sample to :code:`paddle`.
-
-For the detailed documentation of data provider scripts, see :ref:`api_pydataprovider2` .
-
-Training
-`````````
-
-With the data prepared, the network configured, and the data provider written, we can now start paddle training.
-
-The :code:`run.sh` script is as follows:
-
-..  literalinclude:: ../../../demo/recommendation/run.sh
-    :language: bash
-    :lines: 16-
-
-The script simply starts a paddle training process, writes the log to the file :code:`log.txt`,
-and prints it on the screen.
-
-For each command line argument in :code:`run.sh`, please refer to :ref:`cmd_line_index` .
-A short description of these arguments:
-
-*  config\: tells paddle which file is the neural network configuration.
-*  save_dir\: tells paddle to save the model into :code:`./output` .
-*  use_gpu\: whether to use the GPU; defaults to false.
-*  trainer_count\: the number of threads on one machine.
-*  test_all_data_in_one_period\: test all data in one test period. Otherwise,
-   test :code:`batch_size` batches of data in each test period.
-*  log_period\: print a log after training :code:`log_period` batches.
-*  dot_period\: print a :code:`.` after training :code:`dot_period` batches.
-*  num_passes\: train for at most :code:`num_passes` passes.
-
-If the training process starts successfully, the output will look like:
-
-..  code-block:: text
-
-    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
-
-    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
-
-    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
-
-    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
-
-    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
-    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
-    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
-    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
-    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
-    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
-    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
-
-The model is saved in the :code:`output/` directory. You can press :code:`Ctrl-C` to stop training at any time.
-
-Evaluate and Predict
-`````````````````````
-
-After training for several passes, you can evaluate them and get the best pass. Just run:
-
-.. code-block:: bash
-
-    ./evaluate.sh
-
-You will see messages like the following:
-
-.. code-block:: text
-
-    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
-    evaluating from pass output/pass-00009
-
-Then, you can predict how any user would rate any movie. Just run:
-
-..  code-block:: bash
-
-    python prediction.py 'output/pass-00009/'
-
-The predictor reads user input and outputs a prediction score. Its command-line interface looks like:
-
-..  code-block:: text
-
-    Input movie_id: 9
-    Input user_id: 4
-    Prediction Score is 2.56
-    Input movie_id: 8
-    Input user_id: 2
-    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst
deleted file mode 100644
index 993b9a516f..0000000000
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ /dev/null
@@ -1,348 +0,0 @@
-Regressing MovieLens Ratings
-============================
-
-Here we demonstrate a **cosine similarity regression** task on the MovieLens dataset.
-This demo shows how paddle performs (word) embedding,
-handles similarity regression,
-applies character-level convolutional networks to text, and how paddle handles
-multiple types of inputs.
-Note that the model structure is not fine-tuned; it is just a demo to show how paddle works.
-
-
-YOU ARE WELCOME TO BUILD A BETTER DEMO
-BY USING PADDLEPADDLE, AND LET US KNOW HOW TO MAKE THIS DEMO BETTER.
-
-Data Preparation
-````````````````
-Download and extract dataset
-''''''''''''''''''''''''''''
-We use :ref:`demo_ml_dataset` here. 
-To download and unzip the dataset, simply run the following commands.
-
-..  code-block:: bash
-
-    cd demo/recommendation/data 
-    ./ml_data.sh
-
-And the directory structure of :code:`demo/recommendation/data/ml-1m` is:
-
-..  code-block:: text
-
-    +--ml-1m
-         +--- movies.dat    # movie features
-         +--- ratings.dat   # ratings
-         +--- users.dat     # user features
-         +--- README        # dataset description
-
-Field config file
-'''''''''''''''''
-The **field config file** is used to specify the fields of the dataset and the file format,
-i.e., **WHAT** type each field in every feature file is.
-
-The field config file of ml-1m is :code:`demo/recommendation/data/config.json`.
-It specifies the field types and file names: 1) there are four types of fields in the user file\: id, gender, age and occupation;
-2) the file is named "users.dat", and its delimiter is "::".
-
-..  include:: ../../../demo/recommendation/data/config.json
-    :code: json
-    :literal:
-
-Preprocess Data
-```````````````
-You need to install third-party python libraries.
-IT IS HIGHLY RECOMMENDED TO USE VIRTUALENV TO MAKE A CLEAN PYTHON ENVIRONMENT.
-
-..  code-block:: bash
-
-    pip install -r requirements.txt
-
-The general command for preprocessing the dataset is:
-
-..  code-block:: bash
-
-    cd demo/recommendation
-    ./preprocess.sh
-    
-The detailed steps are introduced as follows.
-
-Extract Movie/User features to python object
-'''''''''''''''''''''''''''''''''''''''''''''
-
-Movies and users have many features in the movielens 1m dataset.
-Each line of the rating file just provides movie/user ids to refer to a movie or user.
-We process the movie/user feature files first, and pickle the feature (**Meta**) objects into a file.
-
-Meta config file
-................
-
-The **meta config file** is used to specify **HOW** to parse each field in the dataset.
-It can be translated from the field config file, or written by hand.
-Its format can be either json or yaml. The parser automatically chooses the format by the file extension.
-
-To convert Field config file to meta config file, just run:
-
-..  code-block:: bash
-
-    cd demo/recommendation/data
-    python config_generator.py config.json > meta_config.json
-
-The meta config file is shown below:
-
-..  include:: ../../../demo/recommendation/data/meta_config.json
-    :code: json
-    :literal:
-
-There are two kinds of features in the meta\: movie and user.
-
-* in the movie file, whose name is movies.dat
-   * we just split each line by "::"
-   * pos 0 is the id.
-   * pos 1 feature:
-      * its name is title.
-      * it uses a regex to parse this feature.
-      * it is a char-based word embedding feature.
-      * it is a sequence.
-   * pos 2 feature:
-      * its name is genres.
-      * its type is a one hot dense vector.
-      * the dictionary is auto-generated by parsing; each key is separated by '|'
-* in the user file, whose name is users.dat
-   * we just split each line by "::"
-   * pos 0 is the id.
-   * pos 1 feature:
-       * its name is gender
-       * just a simple char-based embedding.
-   * pos 2 feature:
-       * its name is age
-       * just a whole-word embedding.
-       * embedding ids are sorted by word.
-   * pos 3 feature:
-       * its name is occupation.
-       * just a simple whole-word embedding.
-
-
-Meta file
-'''''''''
-
-Once we have the meta config file, we can generate the **meta file**, a python pickle object which stores the movie/user information.
-The following command can be run to generate it.
-
-..  code-block:: bash
-
-    python meta_generator.py ml-1m meta.bin --config=meta_config.json
-
-And the structure of the meta file :code:`meta.bin` is:
-
-..  code-block:: text
-
-    +--+ movie
-    |      +--+ __meta__
-    |      |       +--+ raw_meta  # each feature meta config. list
-    |      |       |       +
-    |      |       |       |     # ID Field, we use id as key
-    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
-    |      |       |       |
-    |      |       |       |     # Title field, the dictionary list of the embedding.
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
-    |      |       |       |
-    |      |       |       |     # Genres field, the genres dictionary
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
-    |      |       |
-    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
-    |      |                               # it means there are 2 features for each key.
-    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
-    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
-    |      |
-    |      +--+ 1 # movie 1 features
-    |      |    +
-    |      |    +---+ [[...], [...]] # title ids, genres dense vector
-    |      |
-    |      +--+ 2
-    |      |
-    |      +--+ ...
-    |
-    +--- user
-           +--+ __meta__
-           |       +
-           |       +--+ raw_meta
-           |       |       +
-           |       |       +--+ id field as user
-           |       |       |
-           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
-           |       |
-           |       +--+ feature_map [1, 2, 3]
-           |
-           +--+ 1 # user 1 features
-           |
-           +--+ 2
-           +--+ ...
-
-
-Split Training/Testing files
-''''''''''''''''''''''''''''
-
-We split :code:`ml-1m/ratings.dat` into a training file and a testing file. The split works as follows: for each user, we split the
-ratings into two parts. So each user in the testing file will also have some rating information in the training file.
-
-Use :code:`split.py` to separate the training and testing files.
-
-..  code-block:: bash
-
-    python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
-
-Then two files will be generated\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/ratings.dat.test`.
-Move them to the workspace :code:`data`, shuffle the training file, and prepare the file list for paddle training.
-
-..  code-block:: bash
-
-    shuf ml-1m/ratings.dat.train > ratings.dat.train
-    cp ml-1m/ratings.dat.test .
-    echo "./data/ratings.dat.train" > train.list
-    echo "./data/ratings.dat.test" > test.list
-
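-The per-user split that :code:`split.py` performs can be pictured as in the
-sketch below. This is an illustration of the idea only, not the actual script;
-:code:`test_ratio` mirrors the command line flag above.
-
-..  code-block:: python
-
-    import random
-    from collections import defaultdict
-
-    def split_ratings(lines, test_ratio=0.1, delimiter='::'):
-        """Hold out test_ratio of each user's ratings, so every user in the
-        testing set also has ratings in the training set. Sketch only."""
-        by_user = defaultdict(list)
-        for line in lines:
-            by_user[line.split(delimiter)[0]].append(line)
-        train, test = [], []
-        for user_lines in by_user.values():
-            random.shuffle(user_lines)
-            n_test = int(len(user_lines) * test_ratio)
-            test.extend(user_lines[:n_test])
-            train.extend(user_lines[n_test:])
-        return train, test
-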
-
-Neural Network Configuration
-````````````````````````````
-
-Trainer Config File
-'''''''''''''''''''
-
-The network structure is shown below.
-
-..  image:: rec_regression_network.png
-    :align: center
-    :alt: rec_regression_network
-
-The demo's neural network config file :code:`trainer_config.py` is shown below.
-
-..  literalinclude:: ../../../demo/recommendation/trainer_config.py
-    :language: python
-    :lines: 15-
-
-In :code:`trainer_config.py`, we simply map each feature type to
-a feature vector. The following shows how each feature is mapped to a vector.
-
-* :code:`id`\: just a simple embedding, followed by a fully connected layer.
-* :code:`embedding`\:
-    - if it is a sequence, take the embedding, apply a text convolutional operation,
-      and take the average pooling result.
-    - if it is not a sequence, take the embedding and add a fully connected layer.
-* :code:`one_hot_dense`\:
-    - just two fully connected layers.
-
-Then we combine the features of a movie into one movie feature with a
-:code:`fc_layer` with multiple inputs, do the same thing for the user features to
-get one user feature, and calculate the cosine similarity of these two
-features (see the sketch below).
-
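-In outline, that combination step looks like the sketch below. The layer names
-(:code:`movie_id_emb`, :code:`title_pool`, and so on) are made up for
-illustration; the actual wiring lives in the :code:`trainer_config.py` included above.
-
-..  code-block:: python
-
-    # Sketch: merge each side's feature vectors with a multi-input fc_layer,
-    # then take the cosine similarity of the two resulting vectors.
-    movie_feature = fc_layer(input=[movie_id_emb, title_pool, genres_fc], size=200)
-    user_feature = fc_layer(input=[user_id_emb, gender_emb, age_emb, occup_emb], size=200)
-    similarity = cos_sim(a=movie_feature, b=user_feature)
-    # The rating is then regressed against this similarity score.
-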
-In these networks, we use several APIs from :ref:`api_trainer_config` . They are:
-
-*  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
-*  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
-*  Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer`
-*  Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection`
-*  Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer`
-*  Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim`
-*  Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool`
-*  Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`.
-
-Data Provider
-'''''''''''''
-
-..  literalinclude:: ../../../demo/recommendation/dataprovider.py
-    :language: python
-    :lines: 15-
-
-The data provider just reads meta.bin and the rating file, and yields each sample for training.
-In this :code:`dataprovider.py`, we should set\:
-
-* obj.slots\: the feature types and dimensions.
-* use_seq\: whether this :code:`dataprovider.py` runs in sequence mode or not.
-* process\: yield each sample of data to :code:`paddle`.
-
-For details on the data provider, see :ref:`api_pydataprovider2`.
-
-Train
-`````
-
-After preparing the data, configuring the network, and writing the data provider, we can now run paddle training.
-
-The :code:`run.sh` script is shown as follows:
-
-..  literalinclude:: ../../../demo/recommendation/run.sh
-    :language: bash
-    :lines: 16-
-
-It just starts a paddle training process, writes the log to :code:`log.txt`,
-and then prints it on screen.
-
-For each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. A short description of these arguments is shown below.
-
-*  config\: Tells paddle which file is the neural network configuration.
-*  save_dir\: Tells paddle to save the model into :code:`./output`.
-*  use_gpu\: Whether to use the gpu or not. Default is false.
-*  trainer_count\: The number of compute threads on one machine.
-*  test_all_data_in_one_period\: Test all data during one test period. Otherwise,
-   test a :code:`batch_size` batch of data in each test period.
-*  log_period\: Print a log after training :code:`log_period` batches.
-*  dot_period\: Print a :code:`.` after training :code:`dot_period` batches.
-*  num_passes\: Train for at most :code:`num_passes` passes.
-
-If the training process starts successfully, the output looks as follows:
-
-..  code-block:: text
-
-    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
-
-    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
-
-    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
-
-    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
-
-    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
-    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
-    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
-    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
-    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
-    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
-    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
-
-The model is saved in :code:`output/` directory. You can use :code:`Ctrl-C` to stop training whenever you want.
-
-Evaluate and Predict
-````````````````````
-
-After training several passes, you can evaluate them and get the best pass. Just run
-
-.. code-block:: bash
-
-    ./evaluate.sh 
-
-You will see messages like this:
-
-.. code-block:: text
-
-    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
-    evaluating from pass output/pass-00009
-
-Then, you can predict how any user would rate any movie. Just run
-
-..  code-block:: bash
-
-    python prediction.py 'output/pass-00009/'
-
-The predictor will read user input and predict scores. It has a command-line user interface as follows:
-
-..  code-block:: text
-
-    Input movie_id: 9
-    Input user_id: 4
-    Prediction Score is 2.56
-    Input movie_id: 8
-    Input user_id: 2
-    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png
deleted file mode 100644
index 7d2b54d4fc..0000000000
Binary files a/doc/tutorials/rec/rec_regression_network.png and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg
deleted file mode 100644
index 0e3310e4ac..0000000000
Binary files a/doc/tutorials/semantic_role_labeling/feature.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md
deleted file mode 100644
index f6061766c0..0000000000
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ /dev/null
@@ -1,201 +0,0 @@
-# Semantic Role Labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in many natural language processing tasks, such as information extraction, automatic document categorization, and question answering. An example follows [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ].
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: attribute
-- AM-MOD: modal
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in the sentence play certain semantic roles. Here, the label scheme comes from the Penn Proposition Bank.
-
-To date, most successful SRL systems have been built on top of some form of syntactic parsing results, using pre-defined feature templates over the syntactic structure. This tutorial presents an end-to-end system based on deep bidirectional long short-term memory (DB-LSTM) [2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system treats the SRL task as a sequence labeling problem.
-
-## Data Description
-The relevant paper [2] uses the data of the CoNLL-2005 & 2012 shared tasks for training and testing. Because of the data license, this demo adopts the test data set of CoNLL-2005, which can be found on its website.
-
-Users only need to execute the following commands to download and process the raw data:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files will appear in the `data` directory:
-```bash
-conll05st-release: the test data set of the CoNLL-2005 shared task
-test.wsj.words: the Wall Street Journal data sentences
-test.wsj.props: the propositional arguments
-feature: the extracted features from the data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit.
-
-Unlike the bidirectional LSTM used in the Sentiment Analysis demo, DB-LSTM stacks LSTM layers in a different way. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken as input by the next LSTM layer, which processes the sequence in the reversed direction. These two standard LSTM layers compose a pair of LSTMs. We then stack LSTM layers pair after pair to obtain the deep LSTM model.
-
-The following figure shows a temporally expanded 2-layer DB-LSTM network.
-<center>
-![pic](./network_arch.png)
-</center>
-
-### Features
-Two input features play an essential role in this pipeline: the predicate (pred) and the argument (argu). Two other features are also adopted: the predicate context (ctx-p) and the region mark (mr). A single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, the ambiguity can largely be eliminated. Similarly, we use the region mark m<sub>r</sub> = 1 to denote an argument position inside the predicate context region, and m<sub>r</sub> = 0 otherwise. These four simple features are all our SRL system needs. The features of one sample with context size 1 are shown below [2]:
-<center>
-![pic](./feature.jpg)
-</center>
-
-In this example, the corresponding labeled sentence is:
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] .
-
-In the demo, we adopt the feature template above, which consists of `argument`, `predicate`, `ctx-p (p=-1,0,1)` and `mark`, and use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file that wraps the data. The `hook()` function defines the data slots for the network. All input features and the label are index slots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as follows:
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-The `process` function yields 8 feature lists and 1 label list.
-
-### Neural Network Config
-
-`db_lstm.py` is the neural network config file; it loads the dictionaries and defines the data provider module and the network architecture during training.
-
-Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively and mixed by a `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of the labels. A sketch of this wiring follows.
-
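-A minimal sketch using the v1 trainer_config_helpers, with made-up names (`word_dict_len`, `emb_dim`, `hidden_dim` are placeholders, and `full_matrix_projection` is an assumption from the v1 helpers; the real configuration is in `db_lstm.py`):
-
-```python
-# Sketch only: one of the nine data_layers, its embedding, and the mixing step.
-word = data_layer(name='word_data', size=word_dict_len)
-word_emb = embedding_layer(input=word, size=emb_dim)
-# The eight feature embeddings are mixed into one hidden representation;
-# only one projection is shown here.
-hidden = mixed_layer(size=hidden_dim,
-                     input=[full_matrix_projection(input=word_emb)])
-```
-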
-### Run Training
-The training script is `train.sh`; users only need to execute:
-```bash
-  ./train.sh
-```
-The content of `train.sh`:
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : network config file
--  \--use_gpu=false: train with the CPU (set to true to train with the GPU if a PaddlePaddle GPU build is installed; currently crf_layer does not support the GPU)
--  \--log_period=5000: print a log every log_period batches
--  \--trainer_count=1: set the number of threads (or GPUs)
--  \--show_parameter_stats_period=5000: show parameter statistics every show_parameter_stats_period batches
--  \--save_dir=./output: output path of the model
--  \--num_passes=10000: number of passes over the data; one pass means all samples in the training dataset are traversed once
--  \--average_test_period=10000000: test the averaged parameters every average_test_period batches
--  \--init_model_path=./data: path for parameter initialization
--  \--load_missing_parameter_strategy=rand: initialize missing parameters randomly
--  \--test_all_data_in_one_period=1: test all data in one period
-
-
-After training, the model will be saved in the directory `output`. Our training curve is as follows:
-<center>
-![pic](./src/curve.jpg)
-</center>
-
-### Testing
-The testing script is `test.sh`; execute:
-```bash
-  ./test.sh
-```
-The main part of `test.sh`:
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: network config file
-  - \--model_list=$model_list.list: model list file
-  - \--job=test: indicates the test job
-  - \--config_args=is_test=1: a flag to indicate the test job
-  - \--test_all_data_in_one_period=1: test all data in one period
-
-
-### Prediction
-The prediction script is `predict.sh`; users only need to execute:
-```bash
-  ./predict.sh
-```
-In `predict.sh`, users should provide the network config file, the model path, the label file, the predicate dictionary file, the word dictionary file, the input feature file, and the output file.
-```
-python predict.py \
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` is the main executable python script; it includes functions for loading the model, loading the data, and making predictions. The network model outputs a probability distribution over the labels. In the demo, we take the label with the maximum probability as the result. Users can also implement beam search or Viterbi decoding on top of the probability distribution matrix.
-
-After prediction, the results are saved in `predict.res`.
-
-## References
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles, Computational Linguistics, 31(1), 2005.
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md
deleted file mode 100644
index 92d7c63483..0000000000
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ /dev/null
@@ -1,204 +0,0 @@
-```eval_rst
-..  _semantic_role_labeling:
-```
-
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization and question answering. An instance is as follows [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal 
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
-
-To date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards the SRL task as a sequence labelling problem.
-
-## Data Description
-The relevant paper [2] takes the data set of the CoNLL-2005 & 2012 shared tasks for training and testing. According to the data license, the demo adopts the test data set of CoNLL-2005, which can be obtained from its website.
-
-To download and process the original data, users just need to execute the following commands:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data` directory as follows.
-```bash
-conll05st-release:the test data set of CoNll-2005 shared task 
-test.wsj.words:the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
-
-Unlike the Bidirectional-LSTM used in the Sentiment Analysis demo, DB-LSTM adopts another way to stack LSTM layers. First, a standard LSTM processes the sequence in the forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, and processed in the reversed direction. These two standard LSTM layers compose a pair of LSTMs. Then we stack LSTM layers pair after pair to obtain the deep LSTM model.
-
-The following figure shows a temporally expanded 2-layer DB-LSTM network.
-<center>
-![pic](./src/network_arch.png)
-</center>
-
-### Features
-Two input features play an essential role in this pipeline: the predicate (pred) and the argument (argu). Two other features are also adopted: the predicate context (ctx-p) and the region mark (mr). A single predicate word can not exactly describe the predicate information, especially when the same word appears more than once in a sentence; with the predicate context, the ambiguity can be largely eliminated. Similarly, we use the region mark m<sub>r</sub> = 1 to denote an argument position if it locates in the predicate context region, and m<sub>r</sub> = 0 if it does not. These four simple features are all we need for our SRL system. The features of one sample with context size set to 1 are shown below [2]:
-<center>
-![pic](./src/feature.jpg)
-</center>
-
-In this sample, the corresponding labeled sentence is:
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-In the demo, we adopt the feature template above, which consists of `argument`, `predicate`, `ctx-p (p=-1,0,1)`, and `mark`, and we use the `B/I/O` scheme to label each argument. These features and labels are stored in the `feature` file, separated by `\t`.
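-
-As a minimal illustration of this feature template (plain Python; the function and the sample sentence are ours, not the demo's actual preprocessing code), the ctx-p and mark features for a given predicate position can be derived as follows:
-
-```python
-def extract_features(tokens, pred_idx, ctx_size=1):
-    """Build predicate-context (ctx-p) and region-mark features (sketch)."""
-    n = len(tokens)
-    # ctx-p: the words at offsets -ctx_size..ctx_size around the predicate,
-    # clipped at the sentence boundaries
-    ctx = [tokens[min(max(pred_idx + p, 0), n - 1)]
-           for p in range(-ctx_size, ctx_size + 1)]
-    # mark: 1 if a token lies inside the predicate context region, else 0
-    mark = [1 if pred_idx - ctx_size <= i <= pred_idx + ctx_size else 0
-            for i in range(n)]
-    return ctx, mark
-
-tokens = ["A", "record", "date", "has", "n't", "been", "set", "."]
-ctx, mark = extract_features(tokens, pred_idx=6)  # predicate: "set"
-```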
-
-### Data Provider
-
-`dataprovider.py` is the Python file that wraps the data. The `hook()` function defines the data slots for the network. The eight features and the label are all index slots.
-```python
-def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    settings.predicate_dict = predicate_dict
-    # all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as follows:
-```python
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-The `process` function yields 9 lists: 8 features and the label.
-
-### Neural Network Config
-`db_lstm.py` is the neural network config file. It loads the dictionaries and defines the data provider module and the network architecture for the training procedure.
-
-Nine `data_layer`s load instances from the data provider. The eight features are transformed into embeddings respectively and mixed by a `mixed_layer`. Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is the cross entropy of the labels.
-
-### Run Training
-The training script is `train.sh`; simply execute:
-```bash
-  ./train.sh
-```
-The content of `train.sh`:
-```bash
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py: network config file.
--  \--use_gpu=0: use the CPU to train; set it to 1 if you have installed the GPU version of PaddlePaddle and want to train on GPU, but note that crf_layer does not support GPU yet.
--  \--log_period=5000: print a log every 5000 batches.
--  \--trainer_count=1: set the thread number (or GPU count).
--  \--show_parameter_stats_period=5000: show parameter statistics every 5000 batches.
--  \--save_dir=./output: output path to save the models.
--  \--num_passes=10000: set the number of passes; one pass in PaddlePaddle means training all samples in the dataset once.
--  \--average_test_period=10000000: test on the averaged parameters every average_test_period batches.
--  \--init_model_path=./data: parameter initialization path.
--  \--load_missing_parameter_strategy=rand: randomly initialize parameters that are missing from the initialization path.
--  \--test_all_data_in_one_period=1: test all data in one period.
-
-
-After training, the models will be saved in the `output` directory. Our training curve is as follows:
-<center>
-![pic](./src/curve.jpg)
-</center>
-
-### Run Testing
-The testing script is `test.sh`; simply execute:
-```bash
-  ./test.sh
-```
-The main part of `test.sh`:
-```bash
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: network config file
-  - \--model_list=$model_list: file containing the list of models to test
-  - \--job=test: indicate the test job
-  - \--config_args=is_test=1: flag to indicate testing
-  - \--test_all_data_in_one_period=1: test all data in one period
-  
-
-### Run Prediction
-The prediction script is `predict.sh`; simply execute:
-```bash
-  ./predict.sh
-```
-
-In `predict.sh`, you should provide the network config file, the model path, the label file, the predicate dictionary file, the word dictionary file, and the feature file:
-```bash
-python predict.py \
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` is the main executable Python script, which loads the model, loads the data, and performs prediction. The network model outputs a probability distribution over labels. In the demo, we take the label with the maximum probability as the result. You can also implement beam search or Viterbi decoding on top of the probability matrix.
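-
-A minimal sketch of this greedy decoding step (assuming a NumPy array `probs` of shape [sequence_length, num_labels] and a list `labels` mapping label indices to label strings; both names are illustrative, not the demo's code):
-
-```python
-import numpy as np
-
-def greedy_decode(probs, labels):
-    """Pick the most probable label for every token independently."""
-    return [labels[i] for i in np.argmax(probs, axis=1)]
-
-probs = np.array([[0.7, 0.2, 0.1],
-                  [0.1, 0.8, 0.1]])
-print(greedy_decode(probs, ["B-A0", "I-A0", "O"]))  # ['B-A0', 'I-A0']
-```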
-
-After prediction, the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles, Computational Linguistics, 31(1), 2005.
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/tutorials/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png
deleted file mode 100644
index 4ae7864212..0000000000
Binary files a/doc/tutorials/semantic_role_labeling/network_arch.png and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg
deleted file mode 100644
index baa35ae7f0..0000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/curve.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg
deleted file mode 100644
index 0e3310e4ac..0000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/feature.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png
deleted file mode 100644
index 4ae7864212..0000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/network_arch.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/index_cn.md b/doc/tutorials/sentiment_analysis/index_cn.md
deleted file mode 100644
index 1323ec1a6a..0000000000
--- a/doc/tutorials/sentiment_analysis/index_cn.md
+++ /dev/null
@@ -1,325 +0,0 @@
-# Sentiment Analysis Tutorial
-
-Sentiment analysis has many application scenarios. A basic one is classifying the polarity of a given text, which can be a document, a sentence, or a short text fragment. A simple example is to classify the reviews that users post on shopping, travel, and group-buying websites (Amazon, Tmall, Taobao, etc.) into positive and negative reviews.
-
-Sentiment analysis is also commonly used to monitor social media based on large numbers of reviews and personal blogs. For example, researchers analyzed several surveys on consumer confidence and political opinion and found that they correlate with the frequency of sentiment words in contemporaneous Twitter messages [1]. Another example is forecasting stock movements by analyzing the text of daily Twitter posts [2].
-
-On the other hand, collecting user reviews of products and analyzing their sentiment helps to understand user preferences for different companies, different products, and even competing products.
-
-This tutorial will guide you through the process of training a Long Short-Term Memory (LSTM) network to classify the sentiment of sentences from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes called the [Internet Movie Database (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf). This dataset contains movie reviews along with their associated category labels, namely positive and negative.
-
-## Data Preparation
-
-### IMDB Data Introduction
-
-Before training the model, we need to preprocess the data and build a dictionary. First, you can use the script below to download the IMDB dataset and [Moses](http://www.statmt.org/moses/), a statistical machine translation system. We provide a data preprocessing script that can handle not only the IMDB data but also other user-defined data. To use this pre-written script, the labeled training and test samples need to be moved to another path, which has already been done in `get_imdb.sh`.
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-If the data is downloaded successfully, you will see the following files in the ```./demo/sentiment/data``` directory:
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: the raw dataset downloaded from the external website.
-* imdb: contains only the training and test datasets.
-* mosesdecoder-master: the Moses tool.
-
-The IMDB dataset contains 25,000 highly polar movie reviews for training and 25,000 for testing. A negative review has a score ≤ 4 and a positive review has a score ≥ 7, on a 10-point scale. After running the script `./get_imdb.sh`, the dataset in the `aclImdb` directory has the following structure:
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: the training set.
-* test : the test set.
-* imdb.vocab: the dictionary file.
-* imdbEr.txt: the expected rating for each token in imdb.vocab.
-* README: the data documentation.
-
-The test and training set directories contain the following files:
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: positive samples, containing 12,500 txt files, each of which is one movie review.
-* neg: negative samples, containing 12,500 txt files, each of which is one movie review.
-* unsup: unlabeled samples, containing 50,000 txt files.
-* urls_xx.txt: the URL of each review.
-* xxBow.feat: bag-of-words (BoW) features for word-frequency statistics.
-
-### IMDB Data Preparation
-
-In this example, we only use the labeled training and test sets, and by default we build the dictionary on the training set instead of using imdb.vocab from the IMDB dataset. The training set has been shuffled randomly while the test set has not. The script `tokenizer.perl` in the Moses tool is used to tokenize words and punctuation. Execute the following commands to preprocess the data:
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-```
-
-* data_dir: the directory of the input data.
-* preprocess.py: the preprocessing script.
-
-If it runs successfully, the directory `demo/sentiment/data/pre-imdb` has the following structure:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: all labeled test and training sets; the training set has been shuffled.
-* train.list and test.list: the training and test file lists.
-* dict.txt: the dictionary generated from the training set.
-* labels.list: neg 0, pos 1, meaning label 0 marks a negative review and label 1 a positive review.
-
-### User-defined Data Preparation
-
-If you want to perform another text classification task with sentiment analysis, you can prepare the data as follows. We provide scripts to build the dictionary and preprocess the data, so you only need to organize the data in the following structure:
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: the first-level directory.
-* train, test: the second-level directories.
-* class1, class2, ...: the third-level directories.
-* text_files: the sample files in text format.
-
-All text sample files under the same directory belong to the same category. Each text file contains one or more samples, one sample per line. To shuffle the training set fully, the preprocessing is slightly different for text files containing multiple lines: pass `-m True` when running the `preprocess.sh` script. tokenizer.perl is used by default to tokenize words and punctuation; if you do not need this step, pass `-t False` when running `preprocess.sh`.
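-
-For example, following the flags just described (an illustrative invocation; the exact flag handling lives inside `preprocess.sh`), a dataset with multiple samples per text file and no tokenization would be preprocessed as:
-
-```bash
-cd demo/sentiment/
-./preprocess.sh -m True -t False
-```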
-
-## Training
-
-In this task, we use the LSTM architecture of recurrent neural networks (RNNs) to train a sentiment analysis model. The LSTM model was introduced mainly to overcome the vanishing gradient problem. An LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate, and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it can memorize information over long time intervals without the loss of short-term memory. At each time step with a newly arriving word, the historical information stored in the memory block is updated to iteratively learn the sequence representation.
-
-<center>![LSTM](src/lstm.png)</center>
-<center>Figure 1. LSTM [3]</center>
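-
-To make the gate structure concrete, here is a minimal single-step LSTM sketch in NumPy. It only illustrates the standard equations; the weight matrices `W`, `U` and biases `b` are assumed given, and this is unrelated to PaddlePaddle's internal implementation:
-
-```python
-import numpy as np
-
-def sigmoid(x):
-    return 1.0 / (1.0 + np.exp(-x))
-
-def lstm_step(x, h_prev, c_prev, W, U, b):
-    """One LSTM time step: input/forget/output gates plus cell update."""
-    i = sigmoid(W["i"] @ x + U["i"] @ h_prev + b["i"])  # input gate
-    f = sigmoid(W["f"] @ x + U["f"] @ h_prev + b["f"])  # forget gate
-    o = sigmoid(W["o"] @ x + U["o"] @ h_prev + b["o"])  # output gate
-    g = np.tanh(W["g"] @ x + U["g"] @ h_prev + b["g"])  # candidate cell
-    c = f * c_prev + i * g             # new cell state
-    h = o * np.tanh(c)                 # new hidden state
-    return h, c
-```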
-
-Sentiment analysis is among the most typical problems in natural language understanding. It aims to predict the sentiment expressed in a sequence. Usually only some keywords, such as adjectives and adverbs, play a major role in predicting the sentiment of a sequence or paragraph, yet some review contexts are very long, such as those in the IMDB dataset. We use an LSTM to perform this task because of its improved design with the gate mechanism. First, it is able to summarize representations from the word level up to the context level with variable context lengths. Second, it can exploit expanded context at the sentence level, while most other methods only exploit n-gram-level knowledge. Third, it learns the paragraph representation directly rather than combining context-level information.
-
-In this demo, we provide two networks, namely a bidirectional LSTM and a three-layer stacked LSTM.
-
-#### Bidirectional-LSTM
-
-Figure 2 shows the bidirectional LSTM network, followed by a fully connected layer and a softmax layer.
-
-<center>![BiLSTM](src/bi_lstm.jpg)</center>
-<center>Figure 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-Figure 3 shows the three-layer LSTM structure. The bottom of the figure is the word embedding (the word vectors produced from the processed documents). Next, three LSTM hidden layers are connected, with the second one reversed. Then the maximum over all time steps of the last LSTM layer's hidden vectors is extracted as the representation of the entire sequence. Finally, a fully connected feed-forward layer with softmax activation performs the classification task. See reference [5] for more details.
-
-<center>![StackedLSTM](src/stacked_lstm.jpg)</center>
-<center>Figure 3. Stacked-LSTM for sentiment analysis </center>
-
-**Config**
-
-Enter the `demo/sentiment` directory. `trainer_config.py` is an example config file containing the algorithm and network configuration. Its first line imports the predefined networks from `sentiment_net.py`.
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **Data Definition**:
-   * get\_config\_arg(): get command-line arguments set via `--config_args=xx`.
-   * Define the training and test data providers; here PaddlePaddle's Python interface is used to load the data. For more details, refer to the PyDataProvider documentation.
-
-* **Algorithm Configuration**:
-   * use the stochastic gradient descent (SGD) algorithm.
-   * use the Adam optimization method.
-   * set the batch size to 128.
-   * set the global learning rate.
-   * set the L2 regularization and the gradient clipping threshold.
-* **Network Configuration**:
-   * dict_dim: get the dictionary dimension.
-   * class_dim: set the number of categories; IMDB has two labels, namely a positive and a negative label.
-   * `stacked_lstm_net`: the predefined network shown in Figure 3, used by default.
-   * `bidirectional_lstm_net`: the predefined network shown in Figure 2.
-
-**Training**
-
-Install PaddlePaddle first. Then use the script `train.sh` below to launch local training.
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: set the network config.
-* \--save\_dir=$output: set the output path to save the trained models.
-* \--job=train: set the job mode to train.
-* \--use\_gpu=false: train on CPU; set it to true if you have installed the GPU version of PaddlePaddle and want to train on GPU.
-* \--trainer\_count=4: set the number of threads (or GPUs).
-* \--num\_passes=10: set the number of passes; one pass in PaddlePaddle means training all samples in the dataset once.
-* \--log\_period=20: print a log every 20 batches.
-* \--show\_parameter\_stats\_period=100: print parameter statistics every 100 batches.
-* \--test\_all_data\_in\_one\_period=1: test on all data in each test period.
-
-If it runs successfully, the output log is saved in `demo/sentiment/train.log` and the models are saved in the directory `demo/sentiment/model_output/`. The output log is explained as follows:
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: xx batches have been processed.
-- samples=xx: xx samples have been processed.
-- AvgCost=xx: the average cost from the 0th batch to the current batch.
-- CurrentCost=xx: the cost over the latest log_period batches.
-- Eval: classification\_error\_evaluator=xx: the classification error from the 0th batch to the current batch.
-- CurrentEval: classification\_error\_evaluator: the classification error over the latest log_period batches.
-- Pass=0: going through the whole training set once is called one pass; 0 means the first pass over the training set.
-
-By default, we use the `stacked_lstm_net` network, which converges faster than `bidirectional_lstm_net` on the same number of samples. If you want to use the bidirectional LSTM, just uncomment the last line and comment out `stacked_lstm_net`.
-
-## Testing
-
-Testing means evaluating the labeled validation set with the trained model.
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-The function `get_best_pass` finds the best model for testing according to the classification error rate. In this example, we use the IMDB test dataset as the validation set by default. Unlike training, testing requires specifying `--job=test` and the model path, namely `--model_list=$model_list`. If it runs successfully, the log is saved in `demo/sentiment/test.log`. For example, in our test the best model is `model_output/pass-00002` with a classification error of 0.115645, as follows:
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## Prediction
-
-The script `predict.py` provides a prediction interface. Install PaddlePaddle's Python API before using it. An example of predicting unlabeled IMDB reviews:
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002; you should make sure the model path
-#exists, or change the model path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input prediction sample.
-* `predict.py` : the prediction interface script.
-* `--tconf=$config` : set the network config.
-* `--model=$model` : set the model path.
-* `--label=$label` : set the label dictionary, which maps integer labels to string labels.
-* `--dict=data/pre-imdb/dict.txt` : set the dictionary file.
-* `--batch_size=1` : set the batch size.
-
-Note that you should make sure the default model path `model_output/pass-00002` exists, or change it to another model path.
-
-The prediction result of this example:
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-We sincerely appreciate your interest and welcome your contributions.
-
-## Reference
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernandez, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md
deleted file mode 100644
index bb7681db44..0000000000
--- a/doc/tutorials/sentiment_analysis/index_en.md
+++ /dev/null
@@ -1,328 +0,0 @@
-# Sentiment Analysis Tutorial
-
-Sentiment analysis has many applications. A basic task in sentiment analysis is classifying the polarity of a given text at the document, sentence, or feature/aspect level. One simple example is to classify customer reviews on shopping, tourism, and group-buying websites such as Amazon, Taobao, and Tmall.
-
-Sentiment analysis is also used to monitor social media based on large amounts of reviews or blogs. For example, researchers analyzed several surveys on consumer confidence and political opinion and found that they correlate to sentiment word frequencies in contemporaneous Twitter messages [1]. Another example is to forecast stock movements by analyzing the text content of daily Twitter posts [2].
-
-On the other hand, grabbing user comments about products and analyzing their sentiment is useful for understanding user preferences for companies, products, and even competing products.
-
-This tutorial will guide you through the process of training a Long Short-Term Memory (LSTM) network to classify the sentiment of sentences from the [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative, so random guessing yields 50% accuracy.
-
-## Data Preparation
-
-### IMDB Data Introduction
-
-Before training models, we need to preprocess the data and build a dictionary. First, you can use the following script to download the IMDB dataset and the [Moses](http://www.statmt.org/moses/) tool, which is a statistical machine translation system. We provide a data preprocessing script that is capable of handling not only IMDB data but also other user-defined data. In order to use the pre-written script, the labeled train and test samples need to be moved to another path, which has already been done in `get_imdb.sh`.
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-If the data is obtained successfully, you will see the following files at ```./demo/sentiment/data```:
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: the raw dataset downloaded from the website.
-* imdb: contains only the train and test data.
-* mosesdecoder-master: the Moses tool.
-
-The IMDB dataset contains 25,000 highly polar movie reviews for training and 25,000 for testing. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. After running `./get_imdb.sh`, we can see that the dataset has the following structure in `aclImdb`:
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: the training set.
-* test : the test set.
-* imdb.vocab: the dictionary.
-* imdbEr.txt: the expected rating for each token in imdb.vocab.
-* README: the data documentation.
-
-The training set directory contains the following files. The test set contains the same files except `unsup` and `urls_unsup.txt`.
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: positive samples, containing 12,500 txt files, each of which is one movie review.
-* neg: negative samples, containing 12,500 txt files, each of which is one movie review.
-* unsup: unlabeled samples, containing 50,000 txt files.
-* urls_xx.txt: the URL of each review.
-* xxBow.feat: already-tokenized bag-of-words (BoW) features.
-
-### IMDB Data Preparation
-
-In this demo, we only use the labeled train and test sets, and we do not use imdb.vocab as the dictionary. By default, the dictionary is built on the train set. The train set is shuffled while the test set is not. `tokenizer.perl` in the Moses tool is used to tokenize the words and punctuation. Simply execute the following commands to preprocess the data.
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i $data_dir
-```
-
-* data_dir: input data directory.
-* preprocess.py: preprocess script.
-
-If running successfully, you will see `demo/sentiment/data/pre-imdb` directory as follows:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: all labeled test and train sets; the train set has been shuffled.
-* train.list and test.list: the train and test file lists.
-* dict.txt: the dictionary generated on the train set by default.
-* labels.list: neg 0, pos 1, meaning label 0 is a negative review and label 1 is a positive review.
-
-### User-defined Data Preparation
-
-If you are performing another sentiment classification task, you can prepare the data as follows. We have provided the scripts to build the dictionary and preprocess the data, so you just need to organize the data as follows.
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: the 1st-level directory.
-* train, test: the 2nd-level directories.
-* class1, class2, ...: the 3rd-level directories.
-* text_files: samples in text file format.
-
-All text files under the same folder belong to the same category. Each text file contains one or more samples, and each line is one sample. In order to shuffle the data fully, the preprocessing is a little different for data with multiple lines in one text file: you need to set `-m True` in `preprocess.sh`. tokenizer.perl is used by default; if you don't need it, just set `-t False` in `preprocess.sh`.
-
-## Training
-
-In this task, we use a Recurrent Neural Network (RNN) with the LSTM architecture to train a sentiment analysis model. The LSTM model was introduced primarily to overcome the problem of vanishing gradients. An LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate, and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it learns to memorize information over long time intervals without the loss of short-term memory. At each time step with a newly arriving word, the historical information stored in the memory block is updated to iteratively learn the sequence representation.
-
-<center>![LSTM](./lstm.png)</center>
-<center>Figure 1. LSTM [3]</center>
-
-Sentiment analysis is among the most typical problems in natural language understanding. It aims at predicting the attitude expressed in a sequence. Usually, only some key words, like adjectives and adverbs, play a major role in predicting the sentiment of sequences or paragraphs. However, some review or comment contexts are very long, such as those in the IMDB dataset. We use an LSTM to perform this task because of its improved design with the gate mechanism. First, it is able to summarize the representation from the word level to the context level with variable context lengths, which are adapted by the gate values. Second, it can utilize expanded context at the sentence level, while most methods are only good at utilizing n-gram-level knowledge. Third, it learns the paragraph representation directly rather than combining context-level information. This results in an end-to-end framework.
-
-In this demo, we provide two networks, namely a bidirectional LSTM and a three-layer stacked LSTM.
-
-#### Bidirectional-LSTM
-
-One is a bidirectional LSTM network, followed by a fully connected layer and softmax, as shown in Figure 2.
-
-<center>![BiLSTM](./bi_lstm.jpg)</center>
-<center>Figure 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-The other is the three-layer LSTM structure in Figure 3. The bottom of the figure is the word embedding. Next, three LSTM hidden layers are connected, and the second LSTM is reversed. Then the maximum of the hidden vectors over all time steps of the last LSTM layer is extracted as the representation of the entire sequence. Finally, a fully connected feed-forward layer with softmax activation is used to perform the classification task. This network is described in [5]. A small sketch of the max-over-time step follows the figure.
-
-<center>![StackedLSTM](./stacked_lstm.jpg)</center>
-<center>Figure 3. Stacked-LSTM for sentiment analysis </center>
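-
-The max-over-time pooling mentioned above reduces the per-step hidden vectors to a single vector for the whole sequence. A minimal NumPy sketch (the shapes are assumptions for illustration, not PaddlePaddle code):
-
-```python
-import numpy as np
-
-# hidden states of the last LSTM layer: [time_steps, hidden_dim]
-hidden = np.random.randn(7, 128)
-
-# element-wise maximum over all time steps -> [hidden_dim]
-sequence_repr = hidden.max(axis=0)
-```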
-
-**Config**
-
-Switch into the `demo/sentiment` directory. The `trainer_config.py` file is an example of the config, containing the algorithm and network configurations. The first line imports the predefined networks from `sentiment_net.py`.
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  average_window=0.5,
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **Data Definition**:
-   * get\_config\_arg(): gets the arguments set by `--config_args=xx` on the command line.
-   * Define the data provider, here using the Python interface to load data. For details, you can refer to the documentation of PyDataProvider2.
-
-* **Algorithm Configuration**:
-   * set the batch size to 128.
-   * set the global learning rate.
-   * use Adam optimization.
-   * set the average SGD window.
-   * set L2 regularization.
-   * set the gradient clipping threshold.
-* **Network Configuration**:
-   * dict_dim: the dictionary dimension.
-   * class_dim: the number of categories; IMDB has two labels, namely a positive and a negative label.
-   * `stacked_lstm_net`: the predefined network shown in Figure 3, used by default.
-   * `bidirectional_lstm_net`: the predefined network shown in Figure 2.
-
-**Training**
-
-Install PaddlePaddle first if necessary. Then you can use script `train.sh` as follows to launch local training.
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: set the network config.
-* \--save\_dir=$output: set the output path to save the trained models.
-* \--job=train: set the job mode to train.
-* \--use\_gpu=false: use CPU to train; set it to true if you have installed the GPU version of PaddlePaddle and want to use GPU to train.
-* \--trainer\_count=4: set the thread number (or GPU count).
-* \--num\_passes=10: set the pass number; one pass in PaddlePaddle means training all samples in the dataset one time.
-* \--log\_period=20: print a log every 20 batches.
-* \--show\_parameter\_stats\_period=100: show parameter statistics every 100 batches.
-* \--test\_all_data\_in\_one\_period=1: test all data in each test period.
-
-If the run succeeds, the output log is saved in `demo/sentiment/train.log` and the model is saved in `demo/sentiment/model_output/`. The output log is explained as follows.
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: xx batches have been processed.
-- samples=xx: xx samples have been processed.
-- AvgCost=xx: the averaged cost from the 0th batch to the current batch.
-- CurrentCost=xx: the cost of the latest log_period batches.
-- Eval: classification\_error\_evaluator=xx: the classification error from the 0th batch to the current batch.
-- CurrentEval: classification\_error\_evaluator: the classification error of the latest log_period batches.
-- Pass=0: going through the whole training set one time is called one pass; 0 means going through the training set for the first time.
-
-By default, we use the `stacked_lstm_net` network, which converges at a faster rate than `bidirectional_lstm_net` when passing the same number of samples. If you want to use the bidirectional LSTM, just uncomment the last line and comment out `stacked_lstm_net`.
-
-## Testing
-
-Testing means evaluating the labeled validation set using the trained model.
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-The function `get_best_pass` gets the best model by classification error rate for testing. In this example, we use the test dataset of IMDB as the validation set by default. Unlike training, it needs to specify `--job=test` and the model path, namely `--model_list=$model_list`, here. If it runs successfully, the log is saved in `demo/sentiment/test.log`. For example, in our test, the best model is `model_output/pass-00002` and the classification error is 0.115645, as follows.
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## Prediction
-
-`predict.py` provides a predicting interface. You should install the Python API of PaddlePaddle before using it. One example of predicting an unlabeled review from IMDB follows. Simply run:
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002; you should make sure the model path
-#exists, or change the model path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample.
-* `predict.py` : the predicting interface script.
-* `--tconf=$config` : set the network config.
-* `--model=$model` : set the model path.
-* `--label=$label` : set the label dictionary, which maps integer labels to string labels.
-* `--dict=data/pre-imdb/dict.txt` : set the dictionary file.
-* `--batch_size=1` : set the batch size.
-
-Note you should make sure the default model path `model_output/pass-00002`
-exists or change the model path.
-
-Predicting result of this example:
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-We sincerely appreciate your interest and welcome your contributions.
-
-## Reference
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernandez, Roman Bertolami, Horst Bunke, and Jürgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine Intelligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png
deleted file mode 100644
index aaf1fc690d..0000000000
Binary files a/doc/tutorials/sentiment_analysis/lstm.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
deleted file mode 100644
index adec1606d6..0000000000
Binary files a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png
deleted file mode 100644
index aaf1fc690d..0000000000
Binary files a/doc/tutorials/sentiment_analysis/src/lstm.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
deleted file mode 100644
index 4239055050..0000000000
Binary files a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
deleted file mode 100644
index 4239055050..0000000000
Binary files a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/text_generation/index_cn.md b/doc/tutorials/text_generation/index_cn.md
deleted file mode 100644
index 41a87b926d..0000000000
--- a/doc/tutorials/text_generation/index_cn.md
+++ /dev/null
@@ -1,339 +0,0 @@
-# Text Generation Tutorial #
-
-In the field of language generation, sequence-to-sequence methods have been proven to be a powerful model. They can be applied to machine translation, query rewriting, image captioning, and so on.
-
-This tutorial will guide you through training a sequence-to-sequence neural machine translation (NMT) model that translates French to English.
-
-We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473), which details the model architecture and the training procedure that achieves good performance on the WMT-14 dataset. This tutorial reproduces that result in PaddlePaddle.
-
-We thank @caoying for the pull request that defines the model architecture and the solver configuration.
-
-## Data Preparation ##
-### Download and Extract ###
-Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and place the Develop and Test data in separate folders.
-
-- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
-- **Develop and Test data**: [dev and test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
-
-On Linux, simply run the following commands. Otherwise, you need to download, extract, split the data into different folders, and rename the file suffixes yourself.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_data.sh
-```
-
-You will find that the dataset `wmt14` contains the three folders shown in the following table.
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-
-<thead>
-<tr>
-<th scope="col" class="left">folder name</th>
-<th scope="col" class="left">French-English parallel corpora file</th>
-<th scope="col" class="left">number of total file</th>
-<th scope="col" class="left">size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">train_data</td>
-<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
-<td class="left">12</td>
-<td class="left">3.55G</td>
-</tr>
-
-<tr>
-<td class="left">test_data</td>
-<td class="left">ntst1213.src, ntst1213.trg</td>
-<td class="left">2</td>
-<td class="left">1636k</td>
-</tr>
-
-<tr>
-<td class="left">gen_data</td>
-<td class="left">ntst14.src, ntst14.trg</td>
-<td class="left">2</td>
-<td class="left">864k</td>
-</tr>
-</tbody>
-</table>
-<br/>
-
-- Each folder contains a French-English parallel corpus.
-- **XXX.src** are the source French files; **XXX.trg** are the target English files.
-- **XXX.src** and **XXX.trg** should have the same number of lines.
-- Each line is a French or English sentence.
-- There is a one-to-one correspondence between the sentence on the i-th line of **XXX.src** and that of **XXX.trg**.
-
-### User-Defined Dataset ###
-
-If you want to perform other sequence-to-sequence tasks, such as paraphrasing, you only need to organize the data as follows and place it in the `demo/seqToseq/data` directory:
-
-    dataset
-      train
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      test
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      gen
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-  
-- 1st-level directory: the dataset folder name
-- 2nd-level directories: the three folders train, test, and gen; these names are fixed
-- 3rd level: the source-to-target parallel corpus files
-  - **XXX.src** are the source-language files and **XXX.trg** are the target-language files
-  - Each line of a file must be one sentence
-  - There must be a one-to-one correspondence between the sentence on the i-th line of **XXX.src** and that of **XXX.trg**
-
-## Data Preprocessing ##
-### Preprocessing Workflow ###
-- Merge each source-to-target parallel corpus file pair into one file:
-  - merge each **XXX.src** and **XXX.trg** into **XXX**
-  - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg**
-- Build the "source dictionary" and "target dictionary" from the training data (a sketch follows this list); each dictionary has DICTSIZE words, comprising:
-  - the (DICTSIZE - 3) most frequent words
-  - 3 special tokens:
-  - `<s>`: the start of a sequence
-  - `<e>`: the end of a sequence
-  - `<unk>`: a word not included in the dictionary
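-
-A minimal sketch of this dictionary-building step in plain Python (illustrative only, not the demo's preprocess.py):
-
-```python
-from collections import Counter
-
-def build_dict(sentences, dict_size):
-    """Keep the (dict_size - 3) most frequent words plus 3 special tokens."""
-    counts = Counter(w for s in sentences for w in s.split())
-    words = [w for w, _ in counts.most_common(dict_size - 3)]
-    return {w: i for i, w in enumerate(['<s>', '<e>', '<unk>'] + words)}
-```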
-
-### Preprocessing Command and Result
-The basic command for preprocessing the dataset is:
-
-```bash
-cd demo/seqToseq/
-python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-```
-
-- `-i INPUT`: the path of the input original dataset
-- `-d DICTSIZE`: the specified number of words in the dictionary; if not set, the dictionary will contain all the words in the input dataset
-- `-m --mergeDict`: merge the source and target dictionaries, so that the two dictionaries have the same context
-
-You will see messages like this:
-
-    concat parallel corpora for dataset
-    build source dictionary for train data
-    build target dictionary for train data
-    dictionary size is XXX
-
-Then you only need to run this command:
-
-```bash
-python preprocess.py -i data/wmt14 -d 30000
-```
-
-This will take several minutes and store the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`. The directory structure is as follows:
-
-    train test gen train.list test.list gen.list src.dict trg.dict
-
-- **train, test, gen**: contain the training, test, and generation data of the French-English parallel corpus, respectively. Each line in every file has two parts: a French sequence followed by the corresponding English sequence.
-- **train.list, test.list, gen.list**: the file lists of the train, test, and gen folders, respectively
-- **src.dict, trg.dict**: the source (French) / target (English) dictionaries; each dictionary contains 30,000 words in total: the 29,997 most frequent words and 3 special tokens
-
-## Model Training ##
-### Introduction ###
-
-Neural machine translation (NMT) aims to build a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to the family of encoder-decoder models. An encoder-decoder model encodes a source sentence into a fixed-length vector, from which a decoder generates a target sentence.
-
-In this task, we use an extension of the encoder-decoder model that learns to align and translate jointly. Whenever the model generates a word in a translation, it searches for the set of positions in the source sentence where the most relevant information is located. The decoder predicts a target word based on the context vectors associated with these source positions and all the previously generated target words. For a more detailed explanation, please refer to [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473).
-
-The most distinguishing feature of this model compared with the basic encoder-decoder model is that it does not encode the input sentence into a single fixed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where each vector corresponds to an element of the input sentence; a subset of these vectors is then chosen adaptively while decoding the translation. This frees the NMT model from having to squash all the information of a source sentence, regardless of its length, into a fixed-length vector. The improvement of this model is more apparent for longer sentences, but it can be observed for sentences of any length.
-<center>![](./encoder-decoder-attention-model.png)</center>
-<center>Figure 1. Encoder-Decoder-Attention-Model</center>
-
-### Training the Model with PaddlePaddle ###
-We need to create a model config file before training. Here is an example, `demo/seqToseq/translation/train.conf`. The first three lines import the Python function that defines the network, and define job_mode and attention_mode.
-
-```python
-from seqToseq_net import *
-is_generating = False
-
-### Data Definition
-train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
-```
-
-1. **Data Definition**: In the example we define sequence-to-sequence training and test data. It returns train_conf as the configuration. Its input arguments are:
-  - data_dir: the directory of the training and test data
-  - is_generating: whether this config is used for generation; here it is set to False
-2. **Algorithm Configuration**: In the example we use the SGD training algorithm (the default) with the Adam learning method, and specify a batch_size of 50 and a learning_rate of 5e-4.
-3. **Network Architecture**: In the example we use an attention version of the GRU encoder-decoder network. It consists of a bidirectional GRU as the encoder, and a decoder that emulates searching through the source sentence while decoding the translation.
-
-### Training Command and Result ###
-After writing the model config, we can train the model with the following commands:
-
-```bash
-cd demo/seqToseq/translation
-./train.sh
-```
-
-The content of `train.sh` is shown below:
-
-```bash
-paddle train \
---config='translation/train.conf' \
---save_dir='translation/model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'translation/train.log'
-```
-- config: set the config file of the neural network
-- save_dir: set the output path to save the models
-- use_gpu: whether to use the GPU for training; here we use the CPU
-- num_passes: set the number of passes; one pass in PaddlePaddle means training all samples in the dataset once
-- show_parameter_stats_period: here, show parameter statistics every 100 batches
-- trainer_count: set the number of CPU threads or GPU devices
-- log_period: here, print a log every 10 batches
-- dot_period: here, print a dot "." every 5 batches
-
-The training loss is printed every 10 batches by default, and you will see messages like this:
-
-    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
-    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
-    .....
-- AvgCost: the average cost from the 0th batch to the current batch
-- CurrentCost: the cost of the current batch
-- classification\_error\_evaluator (Eval): the per-word prediction error rate from the 0th evaluation to the current evaluation
-- classification\_error\_evaluator (CurrentEval): the per-word prediction error rate of the current evaluation
-
-When the value of classification\_error\_evaluator drops below 0.35, the model has been trained successfully.
-
-## Text Generation ##
-### Introduction ###
-
-Generally speaking, an NMT model is conditioned on the encodings of the source sentence and predicts the next target word given the current target word. During training, the current word is always taken as the ground truth; during generation, by contrast, the current word is the decoder's output from the last time step, which is accessed through a memory in PaddlePaddle.
-
-Moreover, we use beam search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level and sorts them in ascending order of heuristic cost, but it keeps only a predetermined number of best states at each level (this number is called the beam size).
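-
-A minimal sketch of one level of this search in plain Python (illustrative only; `next_word_scores` is an assumed scoring function that returns (word, log-probability) pairs for extending a partial sequence):
-
-```python
-def beam_step(beams, next_word_scores, beam_size):
-    """Expand every (sequence, score) hypothesis, keep the best beam_size."""
-    candidates = []
-    for seq, score in beams:
-        for word, logp in next_word_scores(seq):
-            candidates.append((seq + [word], score + logp))
-    # a higher total log-probability is better
-    candidates.sort(key=lambda c: c[1], reverse=True)
-    return candidates[:beam_size]
-```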
-
-### Pretrained Model ###
-We trained the model on a cluster with 50 nodes, each of which has two 6-core CPUs. We trained 16 passes in 5 days, where each pass took 7 hours. model_dir contains 16 subdirectories, each of which holds the complete model parameters, 202MB in size. We found that the pass-00012 model has the highest BLEU score, 27.77 (see the paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands on Linux:
-
-```bash
-cd demo/seqToseq/data
-./wmt14_model.sh
-```
-
-### Generating with PaddlePaddle ###
-Before translating French sentences, we need to create a model config file. Here is an example, `demo/seqToseq/translation/gen.conf`. The first three lines import the Python function that defines the network, and define job_mode and attention_mode.
-
-```python
-from seqToseq_net import *
-is_generating = True
-
-################## Data Definition #####################
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-############## Algorithm Configuration ##################
-settings(
-  learning_method = AdamOptimizer(),
-  batch_size = 1,
-  learning_rate = 0)
-
-################# Network configure #####################
-gru_encoder_decoder(gen_conf, is_generating)
-```
-
-1. **Data Definition**: In the example we define sequence-to-sequence generation data. It returns gen_conf as the configuration. Its input arguments are:
-  - data_dir: the directory of the generation data
-  - is_generating: whether this config is used for generation; here it is set to True
-  - gen_result: the file that saves the generation results
-2. **Algorithm Configuration**: During generation we use the SGD training algorithm and specify a batch_size of 1 (generating one sequence at a time) and a learning_rate of 0.
-3. **Network Architecture**: essentially the same as in training.
-
-### Generation Command and Result ###
-After writing the model config, we can translate French into English with the following commands:
-
-```bash
-cd demo/seqToseq/translation
-./gen.sh
-```
-
-The content of `gen.sh` is shown below. Unlike training, there are some different parameters to specify:
-
-```bash
-paddle train \
---job=test \
---config='translation/gen.conf' \
---save_dir='data/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
-2>&1 | tee 'translation/gen.log'
-```
-- job: set the job mode to test
-- save_dir: the path of the saved models
-- num_passes and test_pass: load the model parameters from test_pass to (num_passes - 1); here we only load `data/wmt14_model/pass-00012`
-
-You will see messages like this:
-
-    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
-    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
-    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
-    ...
-
-The generation results in `demo/seqToseq/translation/gen_result` will look like this:
-
-    0
-    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
-    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
-    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
-
-    1
-    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
-    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
-    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
-    ...
-
-- These are the results of beam search with a beam size of 3.
-- The "0" on the first line and the "1" on the sixth line are the sequence IDs of the generation data.
-- The other six lines list the results of beam search:
-  - the second column is the score of beam search (from large to small)
-  - the third column is the generated English sequence
-- There are two special tokens:
-  - `<e>`: the end of a sequence
-  - `<unk>`: a word not included in the dictionary
-
-### BLEU Evaluation ###
-Human evaluation of machine translation is extensive but expensive. The paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents an automatic method that substitutes for skilled human judges when quick or frequent evaluations are needed. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use its [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) script for BLEU evaluation. Run the following commands to download this script:
-
-```bash
-cd demo/seqToseq/translation
-./moses_bleu.sh
-```
-
-Since the reference translation results have already been downloaded to `data/wmt14/gen/ntst14.trg`, we can run the following commands for BLEU evaluation.
-
-```bash
-cd demo/seqToseq/translation
-./eval_bleu.sh FILE BEAMSIZE
-```
-
-- FILE: the generated result file
-- BEAMSIZE: the expansion width in beam search
diff --git a/doc/tutorials/text_generation/index_en.md b/doc/tutorials/text_generation/index_en.md
deleted file mode 100644
index 5d8e667c20..0000000000
--- a/doc/tutorials/text_generation/index_en.md
+++ /dev/null
@@ -1,338 +0,0 @@
-# Text generation Tutorial #
-
-Sequence to sequence has been proven to be a powerful model for language generation. It can be used for machine translation, query rewriting, image captioning, etc.
-
-This tutorial guides you through training a sequence-to-sequence neural machine translation (NMT) model that translates French to English.
-
-We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473), which details the model architecture and training procedure for good performance on the WMT-14 dataset. This tutorial reproduces that result in PaddlePaddle.
-
-We thank @caoying for the pull request that defines the model architecture and solver configurations.
-
-## Data Preparation ##
-### Download and Extract ###
-Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and divide the Develop and Test data into separate folders.
-
-- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
-- **Develop and Test data**: [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
-
-To do this, simply run the following commands on Linux; otherwise, you need to download, extract, divide, and rename the file suffixes yourself.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_data.sh
-```
-
-You should find that the dataset `wmt14` has the three folders shown in the following table.
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-
-<thead>
-<tr>
-<th scope="col" class="left">folder name</th>
-<th scope="col" class="left">French-English parallel corpora file</th>
-<th scope="col" class="left">number of total file</th>
-<th scope="col" class="left">size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">train_data</td>
-<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
-<td class="left">twelve</td>
-<td class="left">3.55G</td>
-</tr>
-
-<tr>
-<td class="left">test_data</td>
-<td class="left">ntst1213.src, ntst1213.trg</td>
-<td class="left">two</td>
-<td class="left">1636k</td>
-</tr>
-
-<tr>
-<td class="left">gen_data</td>
-<td class="left">ntst14.src, ntst14.trg</td>
-<td class="left">two</td>
-<td class="left">864k</td>
-</tr>
-</tbody>
-</table>
-<br/>
-
-- Each folder has a French-English parallel corpus.
-- **XXX.src** are source French files; **XXX.trg** are target English files.
-- The number of lines of **XXX.src** and **XXX.trg** should be the same.
-- Each line is a French/English sentence.
-- There is a one-to-one correspondence between the sentence on the i-th line of **XXX.src** and **XXX.trg**.
-
-### User Defined Dataset ###
-
-If you need to do other sequence-to-sequence tasks, such as Paraphrasing, you only need to organize the data as follows, and place them in `demo/seqToseq/data`:
-
-    dataset
-      train
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      test
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      gen
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-
-- 1st directory: the dataset folder name
-- 2nd directory: the train, test, and gen folders. The names of these three folders are fixed.
-- 3rd level: the source-target parallel corpora files.
-  - **XXX.src** are source files, **XXX.trg** are target files.
-  - Each line of a file must be one sequence.
-  - There should be a one-to-one correspondence between the i-th sequence of **XXX.src** and **XXX.trg**.
-
-## Data Preprocess ##
-### Preprocessing Workflow ###
-- Concatenate each source-target parallel corpus into one file (a sketch follows this list):
-  - concatenate each **XXX.src** and **XXX.trg** into **XXX**.
-  - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg**
-- Build the source and target dictionaries from the train data; each dictionary has DICTSIZE words:
-  - the most frequent (DICTSIZE - 3) words
-  - 3 special tokens:
-    - `<s>`: the start of a sequence
-    - `<e>`: the end of a sequence
-    - `<unk>`: a word not included in the dictionary
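-
-A minimal sketch of the concatenation step in plain Python (illustrative only, not the demo's preprocess.py):
-
-```python
-def merge_corpus(src_path, trg_path, out_path):
-    """Join the i-th source line and the i-th target line with a tab."""
-    with open(src_path) as src, open(trg_path) as trg, \
-         open(out_path, 'w') as out:
-        for s, t in zip(src, trg):
-            out.write(s.rstrip('\n') + '\t' + t.rstrip('\n') + '\n')
-```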
-
-### Preprocessing Command and Result
-The general command for preprocessing the dataset is:
-
-```bash
-cd demo/seqToseq/
-python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-```
-
-- `-i INPUT`: the path of the input original dataset
-- `-d DICTSIZE`: the specified word count of the dictionary; if not set, the dictionary will contain all the words in the input dataset
-- `-m --mergeDict`: merge the source and target dictionaries, so that the two dictionaries have the same context
-
-And you will see messages like this:
-
-    concat parallel corpora for dataset
-    build source dictionary for train data
-    build target dictionary for train data
-    dictionary size is XXX
-
-Here, you can simply run the command:
-
-```bash
-python preprocess.py -i data/wmt14 -d 30000
-```
-
-It will take several minutes, and it stores the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`. The directory has the following structure.
-
-    train test gen train.list test.list gen.list src.dict trg.dict
-
-- **train, test, gen**: folders containing the French-English parallel corpora of the train data, test data, and gen data respectively. Each line of the files in these folders contains two parts: the former is a French sequence and the latter is the corresponding English sequence.
-- **train.list, test.list, gen.list**: text files containing the file lists of the train, test, and gen folders respectively
-- **src.dict, trg.dict**: the source (French) / target (English) dictionaries; each dictionary has 30,000 words: the 29,997 most frequent words and 3 special tokens
-
-## Model Training ##
-### Introduction ###
-
-Neural machine translation (NMT) aims at building a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to a family of encoder-decoder models. Encoder-decoder models encode a source sentence into a fixed-length vector, from which a decoder generates a target sentence.
-
-In this task, we use an extension of the encoder-decoder model which learns to align and translate jointly. Each time the model generates a word in a translation, it searches for a set of positions in the source sentence where the most relevant information is located. The decoder predicts a target word based on the context vectors associated with these source positions and all the previously generated target words. For a more detailed explanation, readers can refer to the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473).
-
-The most distinguishing feature of this model is that it doesn't encode an input sentence into a single fixed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where one vector corresponds to an input element. A subset of these vectors is chosen adaptively while decoding the translated sentence. This frees an NMT model from having to squash all the information of a source sentence, regardless of its length, into a fixed-length vector. The improvement of this model is more apparent for longer sentences, but the improvement can be observed for sentences of any length.
-<center>![](./encoder-decoder-attention-model.png)</center>
-<center>Figure 1. Encoder-Decoder-Attention-Model</center>
-
-### Training Model in PaddlePaddle ###
-We need to create a model config file before training. Here is an example, `demo/seqToseq/translation/train.conf`. The first three lines import the Python function for defining the network, and define the job_mode and attention_mode.
-
-```python
-from seqToseq_net import *
-is_generating = False
-
-### Data Definition
-train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
-```
-
-1. **Data Definition**: We define sequence-to-sequence train and test data in our example. It returns train_conf as the configuration. Its input arguments are as follows:
-   - data_dir: the directory of the train data and test data
-   - is\_generating: whether this config is used for generating; here it is false
-2. **Algorithm Configuration**: We use the SGD training algorithm (default) with the ADAM learning method in our example, and specify batch_size as 50 and learning rate as 5e-4.
-3. **Network Architecture**: We use an attention version of the GRU encoder-decoder network in our example. It consists of a bidirectional GRU as an encoder, and a decoder that emulates searching through a source sentence while decoding the translation.
-
-### Training Command and Result ###
-After writing the model config, we can train the model by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./train.sh
-```
-
-The `train.sh` is shown as follows:
-
-```bash
-paddle train \
---config='translation/train.conf' \
---save_dir='translation/model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'translation/train.log'
-```
-- config: set the config file of the neural network
-- save_dir: set the output path for saving models
-- use_gpu: whether to use the GPU for training; here we use the CPU
-- num_passes: set the number of passes. One pass in PaddlePaddle means training on all samples in the dataset once
-- show_parameter_stats_period: show parameter statistics every 100 batches
-- trainer_count: set the number of CPU threads or GPU devices
-- log_period: print a log every 10 batches
-- dot_period: print a '.' every 5 batches
-
-The training loss is printed every 10 batches by default, and you will see messages like this:
-
-    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
-    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
-    .....
-- AvgCost: average cost from the 0th batch to the current batch
-- CurrentCost: cost of the current batch
-- classification\_error\_evaluator(Eval): false prediction rate for each word from the 0th evaluation to the current evaluation
-- classification\_error\_evaluator(CurrentEval): false prediction rate for each word in the current evaluation
-
-When the classification\_error\_evaluator drops below 0.35, the model is considered trained successfully.
-
-## Text Generation ##
-### Introduction ###
-
-Generally speaking, the NMT model is conditioned on the encodings of the source sentence and predicts the next target word given the current target word. In the training process, the current word is always known, since it is the ground truth. In the generating process, by contrast, the current word is the output of the decoder at the last time step, which is accessed through a memory in PaddlePaddle.
-
-Besides, we use beam search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level and sorts them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level, called the beam size.
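-
-To make the procedure concrete, below is a minimal, self-contained Python sketch of beam search (illustrative only, not PaddlePaddle's implementation). It assumes a hypothetical `step_fn(prefix)` that returns a probability distribution over the next token:
-
-```python
-import heapq
-import math
-
-def beam_search(step_fn, start_token, end_token, beam_size, max_len):
-    beams = [(0.0, [start_token])]  # (log-probability, token sequence) pairs
-    finished = []
-    for _ in range(max_len):
-        candidates = []
-        for score, seq in beams:
-            for token, prob in step_fn(tuple(seq)).items():
-                candidates.append((score + math.log(prob), seq + [token]))
-        # keep only the `beam_size` best states at this level
-        beams = heapq.nlargest(beam_size, candidates, key=lambda c: c[0])
-        still_open = []
-        for score, seq in beams:
-            if seq[-1] == end_token:
-                finished.append((score, seq))  # hypothesis ended with the end token
-            else:
-                still_open.append((score, seq))
-        beams = still_open
-        if not beams:
-            break
-    return sorted(finished + beams, key=lambda c: c[0], reverse=True)
-
-# toy next-token distribution, only to make the sketch runnable
-def toy_step(prefix):
-    if len(prefix) > 2:
-        return {"<e>": 0.6, "a": 0.3, "b": 0.1}
-    return {"a": 0.5, "b": 0.5}
-
-print(beam_search(toy_step, "<s>", "<e>", beam_size=3, max_len=5))
-```
-
-Each level expands every live hypothesis and then prunes back to the beam size, which is the breadth-first-search-with-pruning behavior described above.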
-
-### Pretrained model ###
-We trained the model on a cluster with 50 nodes, each with two 6-core CPUs. We trained for 16 passes over 5 days, where each pass took about 7 hours. The model_dir has 16 sub-folders, each of which contains the whole set of model parameters, about 202MB in size. We find that the pass-00012 model has the highest BLEU score of 27.77 (see the paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands in Linux.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_model.sh
-```
-
-### Generating Model in PaddlePaddle ###
-We need to create a model config file before translating French sequences. Here is an example `demo/seqToseq/translation/gen.conf`. The first few lines import the Python functions for defining the network, and define the job\_mode and attention\_mode.
-
-```python
-from seqToseq_net import *
-is_generating = True
-
-################## Data Definition ######################
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-############## Algorithm Configuration ##################
-settings(
-  learning_method = AdamOptimizer(),
-  batch_size = 1,
-  learning_rate = 0)
-
-################ Network Architecture ##################
-gru_encoder_decoder(gen_conf, is_generating)
-```
-
-1. **Data Definition**: We define SeqToSeq gen data in our example. It returns gen_conf as the configuration. The following are its input arguments:
-   - data\_dir: directory of the gen data
-   - is\_generating: whether this config is used for generating; here it is true
-   - gen\_result: file to store the generation result
-2. **Algorithm Configuration**: We use the SGD training algorithm for generation, and specify batch_size as 1 (generating one sequence each time) and the learning rate as 0.
-3. **Network Architecture**: Essentially the same as the training model.
-
-### Generating Command and Result ###
-After writing the model config, we can do text translation from French to English by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./gen.sh
-```
-
-The `gen.sh` is shown as follows. Unlike training, there are some different arguments to specify:
-
-```bash
-paddle train \
---job=test \
---config='translation/gen.conf' \
---save_dir='data/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
-2>&1 | tee 'translation/gen.log'
-```
-- job: set the job mode to test
-- save_dir: the path of the saved models
-- num_passes and test_pass: load model parameters from test_pass to (num_passes - 1); here it only loads `data/wmt14_model/pass-00012`
-
-You will see messages like this:
-
-    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
-    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
-    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
-    ...
-
-And the generation result in `demo/seqToseq/translation/gen_result` looks like:
-
-    0
-    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
-    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
-    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
-
-    1
-    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
-    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
-    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
-    ...
-
-- This is the beam search result, where the beam size is 3
-- The '0' on the 1st line and the '1' on the 6th line are the sequence IDs in the gen data
-- The other six lines list the beam search results
-  - The 2nd column is the beam search score (from large to small)
-  - The 3rd column is the generated English sequence
-- There are 2 special tokens:
-  - `<e>`: the end of a sequence
-  - `<unk>`: a word not included in the dictionary
-
-### BLEU Evaluation ###
-Human evaluations of machine translation are extensive but expensive. The paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents a method that acts as an automated understudy to skilled human judges, substituting for them when there is a need for quick or frequent evaluations. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use its [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) script to do the BLEU evaluation (a simplified sketch of the computation appears at the end of this section). To download this script, simply run the following command:
-
-```bash
-cd demo/seqToseq/translation
-./moses_bleu.sh
-```
-
-Since the reference translation is already downloaded as `data/wmt14/gen/ntst14.trg`, we can do the BLEU evaluation by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./eval_bleu.sh FILE BEAMSIZE
-```
-
-- FILE: the generation result file
-- BEAMSIZE: the expansion width used in beam search
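-
-For intuition about what the script computes, here is a simplified single-sentence, single-reference BLEU sketch in Python. The real multi-bleu.perl aggregates n-gram counts over the whole corpus (here, against the reference file `ntst14.trg`), and the smoothing of zero counts below is an assumption for illustration:
-
-```python
-import math
-from collections import Counter
-
-def bleu(candidate, reference, max_n=4):
-    precisions = []
-    for n in range(1, max_n + 1):
-        cand = Counter(tuple(candidate[i:i + n]) for i in range(len(candidate) - n + 1))
-        ref = Counter(tuple(reference[i:i + n]) for i in range(len(reference) - n + 1))
-        overlap = sum((cand & ref).values())  # clipped n-gram matches
-        total = max(sum(cand.values()), 1)
-        precisions.append(max(overlap, 1e-9) / total)  # smooth zeros for illustration
-    # the brevity penalty punishes candidates shorter than the reference
-    if len(candidate) >= len(reference):
-        bp = 1.0
-    else:
-        bp = math.exp(1.0 - len(reference) / max(len(candidate), 1))
-    return bp * math.exp(sum(math.log(p) for p in precisions) / max_n)
-
-print(bleu("the cat sat on the mat".split(),
-           "the cat sat on a mat".split()))
-```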
diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html
index 65e61c5f29..9fca69dc4e 100644
--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
@@ -101,7 +101,7 @@
     </div>
     <div class="site-nav-links">
       <div class="site-menu">
-        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
+        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
         <div class="language-switcher dropdown">
           <a type="button" data-toggle="dropdown">
             <span>English</span>
diff --git a/go/.gitignore b/go/.gitignore
new file mode 100644
index 0000000000..398d70ca37
--- /dev/null
+++ b/go/.gitignore
@@ -0,0 +1,3 @@
+vendor/
+.glide/
+proto/*.go
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
new file mode 100644
index 0000000000..29ce909c64
--- /dev/null
+++ b/go/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(pserver/client/c)
+add_subdirectory(cmd/pserver)
+add_subdirectory(cmd/master)
+add_subdirectory(master/c)
+add_subdirectory(master)
+add_subdirectory(pserver)
+add_subdirectory(pserver/client)
+add_subdirectory(utils/networkhelper)
diff --git a/go/cmake/CMakeDetermineGoCompiler.cmake b/go/cmake/CMakeDetermineGoCompiler.cmake
deleted file mode 100644
index a9bb6906c7..0000000000
--- a/go/cmake/CMakeDetermineGoCompiler.cmake
+++ /dev/null
@@ -1,44 +0,0 @@
-if(NOT CMAKE_Go_COMPILER)
-  if(NOT $ENV{GO_COMPILER} STREQUAL "")
-    get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT)
-
-    if(CMAKE_Go_FLAGS_ENV_INIT)
-      set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler")
-    endif()
-
-    if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT})
-      message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.")
-    endif()
-
-  endif()
-
-  set(Go_BIN_PATH
-    $ENV{GOPATH}
-    $ENV{GOROOT}
-    $ENV{GOROOT}/../bin
-    $ENV{GO_COMPILER}
-    /usr/bin
-    /usr/local/bin
-    )
-
-  if(CMAKE_Go_COMPILER_INIT)
-    set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler")
-  else()
-    find_program(CMAKE_Go_COMPILER
-      NAMES go
-      PATHS ${Go_BIN_PATH}
-    )
-    EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION)
-    STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}")
-    message("-- The Golang compiler identification is ${VERSION}")
-    message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}")
-  endif()
-
-endif()
-
-mark_as_advanced(CMAKE_Go_COMPILER)
-
-configure_file(${CMAKE_MODULE_PATH}/CMakeGoCompiler.cmake.in
-  ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY)
-
-set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/go/cmake/CMakeGoCompiler.cmake.in b/go/cmake/CMakeGoCompiler.cmake.in
deleted file mode 100644
index a71f08e064..0000000000
--- a/go/cmake/CMakeGoCompiler.cmake.in
+++ /dev/null
@@ -1,8 +0,0 @@
-set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@")
-set(CMAKE_Go_COMPILER_LOADED 1)
-
-set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go)
-set(CMAKE_Go_LINKER_PREFERENCE 40)
-set(CMAKE_Go_OUTPUT_EXTENSION .o)
-set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1)
-set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER")
diff --git a/go/cmake/CMakeGoInformation.cmake b/go/cmake/CMakeGoInformation.cmake
deleted file mode 100644
index ba51ac93fc..0000000000
--- a/go/cmake/CMakeGoInformation.cmake
+++ /dev/null
@@ -1,7 +0,0 @@
-if(NOT CMAKE_Go_COMPILE_OBJECT)
-  set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o <OBJECT> <SOURCE> ")
-endif()
-
-if(NOT CMAKE_Go_LINK_EXECUTABLE)
-  set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o <TARGET> <OBJECTS>  ")
-endif()
diff --git a/go/cmake/CMakeTestGoCompiler.cmake b/go/cmake/CMakeTestGoCompiler.cmake
deleted file mode 100644
index b9891b015b..0000000000
--- a/go/cmake/CMakeTestGoCompiler.cmake
+++ /dev/null
@@ -1 +0,0 @@
-set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "")
diff --git a/go/cmake/flags.cmake b/go/cmake/flags.cmake
deleted file mode 100644
index a167c432a9..0000000000
--- a/go/cmake/flags.cmake
+++ /dev/null
@@ -1,45 +0,0 @@
-# Setting Paddle Compile Flags
-include(CheckCXXCompilerFlag)
-include(CheckCCompilerFlag)
-include(CheckCXXSymbolExists)
-include(CheckTypeSize)
-
-function(CheckCompilerCXX11Flag)
-    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
-            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
-        endif()
-    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
-        # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
-        # https://gist.github.com/yamaya/2924292
-        if(APPLE)  # cmake < 3.0 compiler id "Clang" on Mac OS X
-            if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1)
-                message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.")
-            endif()
-        else()
-            if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3)
-                message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.")
-            endif()
-        endif()
-    endif()
-endfunction()
-
-CheckCompilerCXX11Flag()
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-      list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-      list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-      list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake
deleted file mode 100644
index d38d06de23..0000000000
--- a/go/cmake/golang.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
-file(MAKE_DIRECTORY ${GOPATH})
-set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle")
-file(MAKE_DIRECTORY ${PADDLE_IN_GOPATH})
-
-function(GO_LIBRARY NAME BUILD_TYPE)
-  if(BUILD_TYPE STREQUAL "STATIC")
-    set(BUILD_MODE -buildmode=c-archive)
-    set(LIB_NAME "lib${NAME}.a")
-  else()
-    set(BUILD_MODE -buildmode=c-shared)
-    if(APPLE)
-      set(LIB_NAME "lib${NAME}.dylib")
-    else()
-      set(LIB_NAME "lib${NAME}.so")
-    endif()
-  endif()
-
-  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
-
-  # find Paddle directory.
-  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-  get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-  get_filename_component(PADDLE_DIR ${PARENT_DIR} DIRECTORY)
-
-  # automatically get all dependencies specified in the source code
-  # for given target.
-  add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
-
-  # make a symlink that references Paddle inside $GOPATH, so go get
-  # will use the local changes in Paddle rather than checkout Paddle
-  # in github.
-  add_custom_target(copyPaddle
-    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
-  add_dependencies(goGet copyPaddle)
-
-  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-
-  add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
-  add_dependencies(${NAME} goGet)
-
-  if(NOT BUILD_TYPE STREQUAL "STATIC")
-    install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
-  endif()
-endfunction(GO_LIBRARY)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
new file mode 100644
index 0000000000..9e149967e7
--- /dev/null
+++ b/go/cmd/master/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+go_binary(master SRC master.go)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index d1f3d7d76c..f57db1c0a0 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 import (
@@ -6,88 +20,101 @@ import (
 	"net/http"
 	"net/rpc"
 	"os"
-	"path/filepath"
+	"os/signal"
 	"strconv"
 	"strings"
 	"time"
 
+	log "github.com/inconshreveable/log15"
 	"github.com/namsral/flag"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
 )
 
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-	dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports golb patterns.")
-	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
+	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timeout duration.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timeout count for each task before it is declared a failed task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warn, error, crit")
 	flag.Parse()
 
-	if *dataset == "" {
-		panic("no dataset specified.")
+	lvl, err := log.LvlFromString(*logLevel)
+	if err != nil {
+		panic(err)
 	}
 
-	if *faultTolerance {
-		panic("fault tolernance not implemented.")
+	log.Root().SetHandler(
+		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
+
+	if *endpoints == "" {
+		log.Warn("-endpoints not set, fault tolerance will not be enabled.")
 	}
 
-	var chunks []master.Chunk
-	var paths []string
-	ss := strings.Split(*dataset, ",")
-	fmt.Println(ss)
-	for _, s := range ss {
-		match, err := filepath.Glob(s)
+	var store master.Store
+	if *endpoints != "" {
+		eps := strings.Split(*endpoints, ",")
+		ip, err := networkhelper.GetExternalIP()
 		if err != nil {
+			log.Crit("get external ip error", log.Ctx{"error": err})
 			panic(err)
 		}
-		paths = append(paths, match...)
-	}
-
-	if len(paths) == 0 {
-		panic("no valid datset specified.")
-	}
 
-	idx := 0
-	for _, path := range paths {
-		f, err := os.Open(path)
+		addr := fmt.Sprintf("%s:%d", ip, *port)
+		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
 		if err != nil {
+			log.Crit("error creating etcd client.", log.Ctx{"error": err})
 			panic(err)
 		}
+	} else {
+		store = &master.InMemStore{}
+	}
 
-		index, err := recordio.LoadIndex(f)
+	shutdown := func() {
+		log.Info("shutting down gracefully")
+		err := store.Shutdown()
 		if err != nil {
-			panic(err)
-		}
-		f.Close()
-
-		count := index.NumChunks()
-		for i := 0; i < count; i++ {
-			chunk := master.Chunk{
-				Idx:   idx,
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
+			log.Error("shutdown error", log.Ctx{"error": err})
 		}
 	}
 
-	s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
-	err := rpc.Register(s)
+	// Guaranteed to run even panic happens.
+	defer shutdown()
+
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, os.Interrupt)
+
+	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
 	if err != nil {
+		log.Crit("error creating new service.", log.Ctx{"error": err})
 		panic(err)
 	}
 
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
+	err = rpc.Register(s)
 	if err != nil {
+		log.Crit("error registering RPC service.", log.Ctx{"error": err})
 		panic(err)
 	}
 
-	err = http.Serve(l, nil)
+	rpc.HandleHTTP()
+	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
 	if err != nil {
+		log.Crit("error listening on port", log.Ctx{"error": err, "port": *port})
 		panic(err)
 	}
+
+	go func() {
+		err = http.Serve(l, nil)
+		if err != nil {
+			log.Crit("error serving HTTP", log.Ctx{"error": err})
+			panic(err)
+		}
+	}()
+
+	<-c
 }
diff --git a/go/cmd/pserver/CMakeLists.txt b/go/cmd/pserver/CMakeLists.txt
new file mode 100644
index 0000000000..51db6dff04
--- /dev/null
+++ b/go/cmd/pserver/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index f0be251c24..1358801c1c 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -1,34 +1,108 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 import (
 	"net"
 	"net/http"
 	"net/rpc"
+	"os"
+	"os/signal"
 	"strconv"
+	"time"
 
 	"github.com/namsral/flag"
+	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/inconshreveable/log15"
 )
 
 func main() {
-	port := flag.Int("port", 0, "port of the pserver")
+	port := flag.Int("port", 8001, "port of the pserver")
+	index := flag.Int("index", -1, "index of the pserver; set to -1 to use etcd for automatic pserver index registration")
+	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
+		"comma separated endpoint string for pserver to connect to etcd")
+	dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
+	etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds")
+	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "interval between checkpoint saves")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warn, error, crit")
 	flag.Parse()
 
-	s := pserver.NewService()
-	err := rpc.Register(s)
+	lvl, err := log.LvlFromString(*logLevel)
 	if err != nil {
 		panic(err)
 	}
 
-	rpc.HandleHTTP()
-	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		panic(err)
+	log.Root().SetHandler(
+		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
+
+	var idx int
+
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
+	if *index >= 0 {
+		idx = *index
+	} else {
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL)
+		idx, err = e.Register(*port)
+		candy.Must(err)
+
+		cp, err = pserver.LoadCheckpoint(e, idx)
+		if err != nil {
+			if err == pserver.ErrCheckpointNotFound {
+				log.Info("load checkpoint error", log.Ctx{"error": err})
+			} else {
+				panic(err)
+			}
+		}
 	}
 
-	err = http.Serve(l, nil)
-	if err != nil {
-		panic(err)
+	shutdown := func() {
+		log.Info("shutting down gracefully")
+		sErr := e.Shutdown()
+		if sErr != nil {
+			log.Error("error shutting down", log.Ctx{"error": sErr})
+		}
 	}
+
+	// Guaranteed to run even panic happens.
+	defer shutdown()
+
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, os.Interrupt)
+
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
+	candy.Must(err)
+
+	err = rpc.Register(s)
+	candy.Must(err)
+
+	rpc.HandleHTTP()
+	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
+	candy.Must(err)
+
+	go func() {
+		log.Info("serving pserver", log.Ctx{"port": *port})
+		err = http.Serve(l, nil)
+		candy.Must(err)
+	}()
+
+	<-c
 }
diff --git a/go/pserver/internal/connection/conn.go b/go/connection/conn.go
similarity index 60%
rename from go/pserver/internal/connection/conn.go
rename to go/connection/conn.go
index 1c04f11725..ffa8db689d 100644
--- a/go/pserver/internal/connection/conn.go
+++ b/go/connection/conn.go
@@ -1,9 +1,25 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package connection
 
 import (
 	"errors"
 	"net/rpc"
 	"sync"
+
+	log "github.com/sirupsen/logrus"
 )
 
 // TODO(helin): add TCP re-connect logic
@@ -21,6 +37,18 @@ func New() *Conn {
 	return c
 }
 
+// Close closes the connection.
+func (c *Conn) Close() error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	if c.client == nil {
+		return nil
+	}
+
+	return c.client.Close()
+}
+
 // Connect connects the connection to an address.
 func (c *Conn) Connect(addr string) error {
 	c.mu.Lock()
@@ -50,12 +78,20 @@ func (c *Conn) Connect(addr string) error {
 			c.waitConn = nil
 		}
 	} else {
+		err := client.Close()
+		if err != nil {
+			log.Errorln(err)
+		}
+
 		return errors.New("client already set from a concurrent goroutine")
 	}
 
 	return nil
 }
 
+// TODO(helin): refactor Call to be able to perform given retry
+// policy.
+
 // Call makes an RPC call.
 //
 // Call will be blocked until the connection to remote RPC service
diff --git a/go/glide.lock b/go/glide.lock
new file mode 100644
index 0000000000..d15fc934db
--- /dev/null
+++ b/go/glide.lock
@@ -0,0 +1,233 @@
+hash: 107c058cf5c9163a75d40eef2273a793c36112683c25d72aa8288827fdde3a19
+updated: 2017-10-30T03:46:19.137696069Z
+imports:
+- name: github.com/alecthomas/gometalinter
+  version: bae2f1293d092fd8167939d5108d1b025eaef9de
+- name: github.com/beorn7/perks
+  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
+  subpackages:
+  - quantile
+- name: github.com/boltdb/bolt
+  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
+- name: github.com/cockroachdb/cmux
+  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
+- name: github.com/coreos/etcd
+  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
+  subpackages:
+  - alarm
+  - auth
+  - auth/authpb
+  - client
+  - clientv3
+  - clientv3/concurrency
+  - compactor
+  - discovery
+  - embed
+  - error
+  - etcdserver
+  - etcdserver/api
+  - etcdserver/api/etcdhttp
+  - etcdserver/api/v2http
+  - etcdserver/api/v2http/httptypes
+  - etcdserver/api/v3client
+  - etcdserver/api/v3election
+  - etcdserver/api/v3election/v3electionpb
+  - etcdserver/api/v3election/v3electionpb/gw
+  - etcdserver/api/v3lock
+  - etcdserver/api/v3lock/v3lockpb
+  - etcdserver/api/v3lock/v3lockpb/gw
+  - etcdserver/api/v3rpc
+  - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/auth
+  - etcdserver/etcdserverpb
+  - etcdserver/etcdserverpb/gw
+  - etcdserver/membership
+  - etcdserver/stats
+  - lease
+  - lease/leasehttp
+  - lease/leasepb
+  - mvcc
+  - mvcc/backend
+  - mvcc/mvccpb
+  - pkg/adt
+  - pkg/contention
+  - pkg/cors
+  - pkg/cpuutil
+  - pkg/crc
+  - pkg/debugutil
+  - pkg/fileutil
+  - pkg/httputil
+  - pkg/idutil
+  - pkg/ioutil
+  - pkg/logutil
+  - pkg/monotime
+  - pkg/netutil
+  - pkg/pathutil
+  - pkg/pbutil
+  - pkg/runtime
+  - pkg/schedule
+  - pkg/srv
+  - pkg/tlsutil
+  - pkg/transport
+  - pkg/types
+  - pkg/wait
+  - proxy/grpcproxy/adapter
+  - raft
+  - raft/raftpb
+  - rafthttp
+  - snap
+  - snap/snappb
+  - store
+  - version
+  - wal
+  - wal/walpb
+- name: github.com/coreos/go-semver
+  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
+  subpackages:
+  - semver
+- name: github.com/coreos/go-systemd
+  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
+  subpackages:
+  - daemon
+  - journal
+  - util
+- name: github.com/coreos/pkg
+  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
+  subpackages:
+  - capnslog
+- name: github.com/dgrijalva/jwt-go
+  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
+- name: github.com/ghodss/yaml
+  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/go-stack/stack
+  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
+- name: github.com/gogo/protobuf
+  version: 909568be09de550ed094403c2bf8a261b5bb730a
+  subpackages:
+  - proto
+- name: github.com/golang/protobuf
+  version: 4bd1920723d7b7c925de087aa32e2187708897f7
+  subpackages:
+  - jsonpb
+  - proto
+- name: github.com/golang/snappy
+  version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/google/btree
+  version: 925471ac9e2131377a91e1595defec898166fe49
+- name: github.com/grpc-ecosystem/go-grpc-prometheus
+  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
+- name: github.com/grpc-ecosystem/grpc-gateway
+  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
+  subpackages:
+  - runtime
+  - runtime/internal
+  - utilities
+- name: github.com/inconshreveable/log15
+  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
+- name: github.com/jonboulle/clockwork
+  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/mattn/go-colorable
+  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
+- name: github.com/mattn/go-isatty
+  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
+- name: github.com/matttproud/golang_protobuf_extensions
+  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
+  subpackages:
+  - pbutil
+- name: github.com/namsral/flag
+  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
+- name: github.com/PaddlePaddle/recordio
+  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
+- name: github.com/prometheus/client_golang
+  version: c5b7fccd204277076155f10851dad72b76a49317
+  subpackages:
+  - prometheus
+- name: github.com/prometheus/client_model
+  version: 6f3806018612930941127f2a7c6c453ba2c527d2
+  subpackages:
+  - go
+- name: github.com/prometheus/common
+  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
+  subpackages:
+  - expfmt
+  - internal/bitbucket.org/ww/goautoneg
+  - model
+- name: github.com/prometheus/procfs
+  version: a1dba9ce8baed984a2495b658c82687f8157b98f
+  subpackages:
+  - xfs
+- name: github.com/satori/go.uuid
+  version: 879c5887cd475cd7864858769793b2ceb0d44feb
+- name: github.com/sirupsen/logrus
+  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: github.com/ugorji/go
+  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
+  subpackages:
+  - codec
+- name: github.com/xiang90/probing
+  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
+- name: golang.org/x/crypto
+  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+  subpackages:
+  - bcrypt
+  - blowfish
+  - ssh/terminal
+- name: golang.org/x/net
+  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
+  subpackages:
+  - context
+  - http2
+  - http2/hpack
+  - idna
+  - internal/timeseries
+  - lex/httplex
+  - trace
+- name: golang.org/x/sys
+  version: e48874b42435b4347fc52bdee0424a52abc974d7
+  repo: https://github.com/golang/sys.git
+  vcs: git
+  subpackages:
+  - unix
+  - windows
+- name: golang.org/x/text
+  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
+  repo: https://github.com/golang/text.git
+  vcs: git
+  subpackages:
+  - secure/bidirule
+  - transform
+  - unicode/bidi
+  - unicode/norm
+- name: google.golang.org/grpc
+  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
+  subpackages:
+  - codes
+  - credentials
+  - grpclog
+  - internal
+  - keepalive
+  - metadata
+  - naming
+  - peer
+  - stats
+  - tap
+  - transport
+- name: gopkg.in/yaml.v2
+  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
+testImports:
+- name: github.com/davecgh/go-spew
+  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
+  subpackages:
+  - spew
+- name: github.com/pmezard/go-difflib
+  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
+  subpackages:
+  - difflib
+- name: github.com/stretchr/testify
+  version: 05e8a0eda380579888eb53c394909df027f06991
+  subpackages:
+  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
new file mode 100644
index 0000000000..c5d66694ac
--- /dev/null
+++ b/go/glide.yaml
@@ -0,0 +1,33 @@
+package: github.com/PaddlePaddle/Paddle/go
+import:
+- package: github.com/PaddlePaddle/recordio
+- package: github.com/coreos/etcd
+  version: ^3.2.1
+  subpackages:
+  - clientv3
+  - clientv3/concurrency
+  - embed
+  - etcdserver
+- package: github.com/namsral/flag
+  version: ^1.7.4-pre
+- package: github.com/sirupsen/logrus
+  version: ^1.0.0
+- package: github.com/topicai/candy
+- package: golang.org/x/crypto
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+- package: golang.org/x/sys
+  repo: https://github.com/golang/sys.git
+  vcs: git
+- package: golang.org/x/text
+  repo: https://github.com/golang/text.git
+  vcs: git
+- package: github.com/satori/go.uuid
+  version: v1.1.0
+- package: github.com/alecthomas/gometalinter
+  version: v1.2.1
+- package: github.com/inconshreveable/log15
+  version: v2.13
+- package: github.com/go-stack/stack
+  version: v1.6.0
+- package: github.com/golang/protobuf
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
new file mode 100644
index 0000000000..93efa4eaf7
--- /dev/null
+++ b/go/master/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(master_test)
+endif()
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
new file mode 100644
index 0000000000..082d9f3f59
--- /dev/null
+++ b/go/master/c/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+go_library(paddle_master SHARED DEPS paddle_go_optimizer)
diff --git a/go/master/c/client.go b/go/master/c/client.go
new file mode 100644
index 0000000000..9a3960d59c
--- /dev/null
+++ b/go/master/c/client.go
@@ -0,0 +1,196 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+/*
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#define PADDLE_MASTER_OK    0
+#define PADDLE_MASTER_ERROR -1
+
+#define PADDLE_SAVE_MODEL_OK   1
+#define PADDLE_SAVE_MODEL_SKIP 0
+
+typedef int paddle_master_client;
+*/
+import "C"
+
+import (
+	"strings"
+	"sync"
+	"time"
+	"unsafe"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	log "github.com/inconshreveable/log15"
+)
+
+var mu sync.Mutex
+var handleMap = make(map[C.paddle_master_client]*master.Client)
+var curHandle C.paddle_master_client
+
+func init() {
+	log.Root().SetHandler(
+		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
+}
+
+func add(c *master.Client) C.paddle_master_client {
+	mu.Lock()
+	defer mu.Unlock()
+	client := curHandle
+	curHandle++
+	handleMap[client] = c
+	return client
+}
+
+func get(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	return handleMap[client]
+}
+
+func remove(client C.paddle_master_client) *master.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	h := handleMap[client]
+	delete(handleMap, client)
+	return h
+}
+
+//export paddle_new_etcd_master_client
+//
+// bufSize is the record buffer size.
+func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
+	p := C.GoString(etcdEndpoints)
+	endpoints := strings.Split(p, ",")
+	c, err := master.NewClient(
+		master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
+		master.WithBuffer(bufSize),
+	)
+	if err != nil {
+		panic(err)
+	}
+
+	return add(c)
+}
+
+//export paddle_new_master_client
+//
+// bufSize is the record buffer size.
+func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
+	a := C.GoString(addr)
+	c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
+	if err != nil {
+		panic(err)
+	}
+
+	return add(c)
+}
+
+//export paddle_release_master_client
+func paddle_release_master_client(client C.paddle_master_client) {
+	remove(client)
+}
+
+//export paddle_start_get_records
+func paddle_start_get_records(client C.paddle_master_client, pass C.int) {
+	c := get(client)
+	c.StartGetRecords(int(pass))
+}
+
+//export paddle_set_dataset
+func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
+	c := get(client)
+	var paths []string
+	for i := 0; i < int(size); i++ {
+		ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
+		str := C.GoString(*ptr)
+		paths = append(paths, str)
+	}
+	err := c.SetDataset(paths)
+	if err != nil {
+		log.Error("error set dataset",
+			log.Ctx{"error": err, "paths": paths})
+		return C.PADDLE_MASTER_ERROR
+	}
+
+	return C.PADDLE_MASTER_OK
+}
+
+// paddle_next_record gets the next training record.
+//
+// returns the number of bytes of the record on success, -1 on failure, -2 if the pass has ended.
+//
+//export paddle_next_record
+func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
+	c := get(client)
+	r, err := c.NextRecord()
+	if err != nil {
+		// NOTE: use errors to indicate pass ends
+		if err.Error() == master.ErrAllTaskFailed.Error() ||
+			err.Error() == master.ErrNoMoreAvailable.Error() ||
+			err.Error() == master.ErrPassBefore.Error() {
+			return -2
+		}
+		*record = (*C.uchar)(nil)
+		return -1
+	}
+
+	if len(r) == 0 {
+		// Empty record
+		*record = (*C.uchar)(nil)
+		return 0
+	}
+
+	size := C.size_t(len(r))
+	*record = (*C.uchar)(C.malloc(size))
+	C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
+	return C.int(size)
+}
+
+// paddle_request_save_model requests the master server to approve the
+// caller to save the model.
+//
+// returns 1 if the save-model request is approved, 0 if the
+// request is rejected because another trainer is saving the model, -1
+// if an error happened.
+//
+//export paddle_request_save_model
+func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
+	c := get(client)
+	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
+	if err != nil {
+		log.Error("error request save model", log.Ctx{"error": err})
+		return C.PADDLE_MASTER_ERROR
+	}
+
+	if need {
+		return C.PADDLE_SAVE_MODEL_OK
+	}
+
+	return C.PADDLE_SAVE_MODEL_SKIP
+}
+
+//export mem_free
+func mem_free(p unsafe.Pointer) {
+	// "free" may be a better name for this function, but doing so
+	// will cause calling any function of this library from Python
+	// ctypes hanging.
+	C.free(p)
+}
+
+func main() {}
diff --git a/go/master/client.go b/go/master/client.go
new file mode 100644
index 0000000000..7bcf869553
--- /dev/null
+++ b/go/master/client.go
@@ -0,0 +1,255 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package master
+
+import (
+	"os"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/inconshreveable/log15"
+)
+
+// Client is the client of the master server.
+type Client struct {
+	conn    *connection.Conn
+	ch      chan record
+	bufSize int
+}
+
+type record struct {
+	r   []byte
+	err error
+}
+
+// WithBuffer sets the client to buffer the training record.
+//
+// bufSize is the record buffer size. NextRecord will read from this
+// buffer.
+func WithBuffer(bufSize int) func(*Client) error {
+	return func(c *Client) error {
+		if bufSize <= 0 {
+			return nil
+		}
+		c.bufSize = bufSize
+		return nil
+	}
+}
+
+// WithAddr sets the client to use fixed master address.
+func WithAddr(addr string) func(c *Client) error {
+	return func(c *Client) error {
+		ch := make(chan string, 1)
+		ch <- addr
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// WithEtcd sets the client to use etcd for master discovery.
+func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
+	return func(c *Client) error {
+		var cli *clientv3.Client
+		f := func() error {
+			var err error
+			cli, err = clientv3.New(clientv3.Config{
+				Endpoints:   endpoints,
+				DialTimeout: timeout,
+			})
+			return err
+		}
+		for {
+			err := f()
+			if err != nil {
+				log.Warn("create etcd client error", log.Ctx{"error": err})
+			} else {
+				break
+			}
+			time.Sleep(time.Second)
+		}
+
+		ch := make(chan string, 1)
+		a, err := GetKey(cli, DefaultAddrPath, timeout)
+		if err != nil {
+			return err
+		}
+
+		if a != "" {
+			// Master is registered, send to the master address
+			// channel.
+			ch <- a
+		}
+
+		go watchKey(cli, DefaultAddrPath, ch)
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// NewClient creates a new Client.
+func NewClient(opts ...func(*Client) error) (*Client, error) {
+	c := &Client{}
+	c.conn = connection.New()
+
+	for _, opt := range opts {
+		err := opt(c)
+		if err != nil {
+			return nil, err
+		}
+	}
+	c.ch = make(chan record, c.bufSize)
+	return c, nil
+}
+
+// StartGetRecords must be called at the beginning of each pass.
+func (c *Client) StartGetRecords(passID int) {
+	go c.getRecords(passID)
+}
+
+func (c *Client) getRecords(passID int) {
+	i := 0
+	for {
+		t, err := c.getTask(passID)
+		if err != nil {
+			if err.Error() == ErrPassBefore.Error() ||
+				err.Error() == ErrNoMoreAvailable.Error() ||
+				err.Error() == ErrAllTaskFailed.Error() {
+				c.ch <- record{nil, err}
+				break
+			}
+
+			if i%60 == 0 {
+				log.Debug("getTask of passID error.",
+					log.Ctx{"error": err, "passID": passID})
+				i = 0
+			}
+
+			// if err.Error() == ErrPassAfter.Error()
+			//   wait until the last pass finishes
+			// if other error such as network error
+			//   wait to reconnect or task time out
+			time.Sleep(time.Second * 3)
+			i += 3
+			continue
+		}
+
+		for _, chunk := range t.Chunks {
+			f, e := os.Open(chunk.Path)
+			if e != nil {
+				log.Error("error open chunk", log.Ctx{"error": e})
+				continue
+			}
+
+			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
+			for s.Scan() {
+				c.ch <- record{s.Record(), nil}
+			}
+
+			if s.Err() != nil {
+				c.ch <- record{nil, s.Err()}
+				log.Error(
+					"error scan chunk",
+					log.Ctx{"error": err, "path": chunk.Path},
+				)
+			}
+
+			err = f.Close()
+			if err != nil {
+				log.Error("error close record file", log.Ctx{"error": err})
+			}
+		}
+
+		// We treat a task as finished whenever the last data
+		// instance of the task is read. This is not exactly
+		// correct, but a reasonable approximation.
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Error("task finish callback error.", log.Ctx{"error": err})
+		}
+	}
+}
+
+func (c *Client) monitorMaster(addrCh <-chan string) {
+	lastMaster := ""
+	for curMaster := range addrCh {
+		// connect to the new address once address changed.
+		if curMaster != lastMaster {
+			if curMaster == "" {
+				err := c.conn.Close()
+				if err != nil {
+					log.Error("close old master addr error", log.Ctx{"error": err})
+				}
+			} else {
+				err := c.conn.Connect(curMaster)
+				if err != nil {
+					log.Error("connect to new master addr error", log.Ctx{"error": err})
+
+					// connect to addr failed, set
+					// to last known addr in order
+					// to retry next time.
+					curMaster = lastMaster
+				}
+			}
+		}
+		lastMaster = curMaster
+	}
+}
+
+// SetDataset sets the dataset for the master server to dispatch.
+//
+// SetDataset can be called multiple times in one pass, but only the
+// first call will be honored.
+//
+// After all tasks are done, another call of SetDataset will start another pass.
+func (c *Client) SetDataset(globPaths []string) error {
+	err := c.conn.Call("Service.SetDataset", globPaths, nil)
+	return err
+}
+
+// getTask gets a new task from the master server.
+func (c *Client) getTask(passID int) (Task, error) {
+	var t Task
+	err := c.conn.Call("Service.GetTask", passID, &t)
+	return t, err
+}
+
+// taskFinished tells the master server a task is finished.
+func (c *Client) taskFinished(taskID int) error {
+	return c.conn.Call("Service.TaskFinished", taskID, nil)
+}
+
+// taskFailed tells the master server that a task has failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+	return c.conn.Call("Service.TaskFailed", meta, nil)
+}
+
+// NextRecord returns next record in the dataset.
+//
+// NextRecord will block until the next record is available. It is
+// thread-safe.
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
+}
+
+// RequestSaveModel requests the master server to approve the caller
+// to save the model.
+func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
+	var need bool
+	err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need)
+	return need, err
+}
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
new file mode 100644
index 0000000000..2f13fd0dcd
--- /dev/null
+++ b/go/master/client_internal_test.go
@@ -0,0 +1,152 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package master
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/recordio"
+)
+
+const (
+	totalTask    = 20
+	chunkPerTask = 10
+)
+
+func TestGetFinishTask(t *testing.T) {
+	const path = "/tmp/master_client_test_0"
+
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+	go func(l net.Listener) {
+		s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
+		if sErr != nil {
+			panic(sErr)
+		}
+
+		server := rpc.NewServer()
+		sErr = server.Register(s)
+		if sErr != nil {
+			panic(sErr)
+		}
+
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		sErr = http.Serve(l, mux)
+		if sErr != nil {
+			panic(sErr)
+		}
+	}(l)
+
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+
+	for i := 0; i < totalTask*chunkPerTask; i++ {
+		w := recordio.NewWriter(f, -1, -1)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
+
+		// call Close to force RecordIO writing a chunk.
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
+	}
+
+	// Manually initialize the client to avoid calling c.getRecords()
+	c := &Client{}
+	c.conn = connection.New()
+	addr := fmt.Sprintf(":%d", p)
+	ch := make(chan string, 1)
+	ch <- addr
+	go c.monitorMaster(ch)
+
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
+	checkOnePass := func(i int) {
+		var tasks []Task
+		for idx := 0; idx < totalTask; idx++ {
+			task, cErr := c.getTask(i)
+			if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
+				t.Fatalf("error: %v, pass: %d\n", cErr, i)
+			}
+			tasks = append(tasks, task)
+		}
+
+		// getting task before task finishes should return error
+		_, cErr := c.getTask(i)
+		if cErr == nil {
+			t.Fatalf("Should get error, pass: %d\n", i)
+		}
+
+		cErr = c.taskFinished(tasks[0].Meta.ID)
+		if cErr != nil {
+			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
+		}
+		// calling taskFailed once won't put the task into the failed
+		// queue; it just exercises the call
+		cErr = c.taskFailed(tasks[0].Meta)
+		if cErr != nil {
+			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
+		}
+
+		tasks = tasks[1:]
+		_, cErr = c.getTask(i)
+		if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
+			t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr)
+		}
+
+		for _, task := range tasks {
+			cErr = c.taskFinished(task.Meta.ID)
+			if cErr != nil {
+				t.Fatal(cErr)
+			}
+		}
+	}
+
+	for i := 0; i < 10; i++ {
+		// init pass data
+		c.StartGetRecords(i)
+		checkOnePass(i)
+	}
+}
diff --git a/go/master/client_test.go b/go/master/client_test.go
new file mode 100644
index 0000000000..1963dbfd73
--- /dev/null
+++ b/go/master/client_test.go
@@ -0,0 +1,150 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package master_test
+
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"runtime"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/recordio"
+)
+
+// goid is a test helper that returns the current goroutine id.
+func goid() int {
+	var buf [64]byte
+	n := runtime.Stack(buf[:], false)
+	idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0]
+	id, err := strconv.Atoi(idField)
+	if err != nil {
+		panic(fmt.Sprintf("cannot get goroutine id: %v", err))
+	}
+	return id
+}
+
+func TestNextRecord(t *testing.T) {
+	const (
+		path  = "/tmp/master_client_TestFull"
+		total = 50
+	)
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+	go func(l net.Listener) {
+		s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1)
+		if err != nil {
+			panic(err)
+		}
+
+		server := rpc.NewServer()
+		err = server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+
+	f, err := os.Create(path)
+	if err != nil {
+		panic(err)
+	}
+
+	w := recordio.NewWriter(f, 1, -1)
+	for i := 0; i < total; i++ {
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	err = w.Close()
+	if err != nil {
+		panic(err)
+	}
+
+	err = f.Close()
+	if err != nil {
+		panic(err)
+	}
+
+	// Start several clients to test task fetching.
+	var wg sync.WaitGroup
+	for i := 0; i < 4; i++ {
+		wg.Add(1)
+		// test for multiple concurrent clients
+		go func() {
+			defer wg.Done()
+			// each goroutine needs its own client connection instance
+			c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1))
+			if e != nil {
+				t.Fatal(e)
+			}
+			e = c.SetDataset([]string{path})
+			if e != nil {
+				panic(e)
+			}
+
+			// test for n passes
+			for pass := 0; pass < 10; pass++ {
+				c.StartGetRecords(pass)
+
+				received := make(map[byte]bool)
+				taskid := 0
+				for {
+					r, e := c.NextRecord()
+					if e != nil {
+						// ErrPassAfter makes the client wait; on the errors below, break for the next pass
+						if e.Error() == master.ErrPassBefore.Error() ||
+							e.Error() == master.ErrNoMoreAvailable.Error() {
+							break
+						}
+						t.Fatal(pass, taskid, "Read error:", e)
+					}
+					if len(r) != 1 {
+						t.Fatal(pass, taskid, "Length should be 1.", r)
+					}
+					if received[r[0]] {
+						t.Fatal(pass, taskid, "Received duplicate.", received, r)
+					}
+					taskid++
+					received[r[0]] = true
+				}
+			}
+		}()
+	}
+	wg.Wait()
+}
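The test above mirrors how a trainer consumes records: set the dataset once, then for each pass call StartGetRecords and drain NextRecord until the pass ends. A minimal sketch of that loop, assuming the same exported client API (the address and dataset path are hypothetical):

package main

import (
	"fmt"

	"github.com/PaddlePaddle/Paddle/go/master"
)

func main() {
	// Hypothetical master address; WithBuffer sets the size of the
	// buffered record channel, as in the test above.
	c, err := master.NewClient(master.WithAddr(":8080"), master.WithBuffer(100))
	if err != nil {
		panic(err)
	}
	if err = c.SetDataset([]string{"/data/part-*.recordio"}); err != nil {
		panic(err)
	}

	for pass := 0; pass < 2; pass++ {
		c.StartGetRecords(pass)
		for {
			r, err := c.NextRecord()
			if err != nil {
				// ErrPassBefore or ErrNoMoreAvailable ends the pass; a
				// real caller would distinguish the errors as the test does.
				break
			}
			fmt.Println("record bytes:", len(r))
		}
	}
}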
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
new file mode 100644
index 0000000000..2a41d36949
--- /dev/null
+++ b/go/master/etcd_client.go
@@ -0,0 +1,201 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package master
+
+import (
+	"context"
+	"time"
+
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/inconshreveable/log15"
+)
+
+const (
+	// DefaultLockPath is the default etcd master lock path.
+	DefaultLockPath = "/master/lock"
+	// DefaultStatePath is the default etcd key for master state.
+	DefaultStatePath = "/master/state"
+	// DefaultAddrPath is the default etcd key for master address.
+	DefaultAddrPath = "/master/addr"
+)
+
+// EtcdClient is the etcd client that the master uses for fault
+// tolerance and service registry.
+type EtcdClient struct {
+	lockPath  string
+	statePath string
+	client    *clientv3.Client
+	lock      *concurrency.Mutex
+	sess      *concurrency.Session
+}
+
+// NewEtcdClient creates a new EtcdClient.
+func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
+	log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints})
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   endpoints,
+		DialTimeout: dialTimeout,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
+	if err != nil {
+		return nil, err
+	}
+
+	lock := concurrency.NewMutex(sess, lockPath)
+	// It's fine for lock acquisition to block: that means multiple
+	// master servers are running (only one master is configured, but
+	// a split-brain problem may leave several running), and the
+	// cluster management software will kill one of them.
+	log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath})
+	err = lock.Lock(context.TODO())
+	if err != nil {
+		return nil, err
+	}
+	log.Info("Successfully acquired lock at %s.", log.Ctx{"path": lockPath})
+
+	put := clientv3.OpPut(addrPath, addr)
+	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return nil, err
+	}
+
+	if !resp.Succeeded {
+		log.Crit("No longer owns the master lock. Exiting.")
+		panic("No longer owns the master lock. Exiting.")
+	}
+
+	e := &EtcdClient{
+		lockPath:  lockPath,
+		statePath: statePath,
+		client:    cli,
+		lock:      lock,
+		sess:      sess,
+	}
+
+	return e, nil
+}
+
+// Save saves the state to etcd.
+func (e *EtcdClient) Save(state []byte) error {
+	ctx := context.TODO()
+	put := clientv3.OpPut(e.statePath, string(state))
+	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return err
+	}
+
+	if !resp.Succeeded {
+		log.Error("No longer owns the lock, trying to lock again")
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		err := e.lock.Lock(ctx)
+		cancel()
+		if err != nil {
+			// We lost the master lock and cannot acquire it
+			// back: some other master has already started. We
+			// don't want the cluster management system to kill
+			// the master that holds the lock and is running
+			// correctly, so the most feasible solution is to
+			// kill the current master server. The current
+			// state is not saved, but the trainer's RPC call
+			// will fail, so the trainer will retry.
+			log.Crit("Could not acquire the lock. Exiting.", log.Ctx{"path": e.lockPath, "error": err})
+			panic("could not acquire the master lock, exiting")
+		}
+		log.Info("Successfully acquired lock at %s.", e.lockPath)
+		return e.Save(state)
+	}
+
+	return nil
+}
+
+// Load loads the state from etcd.
+func (e *EtcdClient) Load() ([]byte, error) {
+	ctx := context.TODO()
+	get := clientv3.OpGet(e.statePath)
+
+	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit()
+	if err != nil {
+		return nil, err
+	}
+
+	if !resp.Succeeded {
+		log.Error("No longer owns the lock, trying to lock and load again.")
+		err = e.lock.Lock(context.Background())
+		if err != nil {
+			return nil, err
+		}
+
+		return e.Load()
+	}
+
+	kvs := resp.Responses[0].GetResponseRange().Kvs
+	if len(kvs) == 0 {
+		// No state exists
+		return nil, nil
+	}
+
+	state := kvs[0].Value
+	return state, nil
+}
+
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	err := e.sess.Close()
+	newErr := e.client.Close()
+	if newErr != nil {
+		if err == nil {
+			err = newErr
+		} else {
+			log.Error("shutdown error", log.Ctx{"error": newErr})
+		}
+	}
+
+	return err
+}
+
+// GetKey gets the value of the specified key.
+func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	resp, err := c.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return "", err
+	}
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return "", nil
+	}
+	v := kvs[0].Value
+	return string(v), nil
+}
+
+// watchKey watches the specified key and sends the value to valChan on each event.
+func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
+	rch := c.Watch(context.Background(), key)
+	for wresp := range rch {
+		for _, ev := range wresp.Events {
+			// If the received event is DELETE, the value will be an empty string.
+			log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value})
+			valChan <- string(ev.Kv.Value)
+		}
+	}
+}
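Save and Load above share one idea: every etcd operation runs inside a transaction guarded by lock.IsOwner(), so a master whose session silently expired can never overwrite state written by its successor. A sketch of that guard in isolation, with illustrative names, assuming the same clientv3 and concurrency packages:

package masterutil

import (
	"context"
	"errors"

	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/etcd/clientv3/concurrency"
)

// guardedPut writes key=val only while the caller still owns the lock.
func guardedPut(cli *clientv3.Client, lock *concurrency.Mutex, key, val string) error {
	resp, err := cli.Txn(context.TODO()).
		If(lock.IsOwner()). // commit only if the lock is still held
		Then(clientv3.OpPut(key, val)).
		Commit()
	if err != nil {
		return err
	}
	if !resp.Succeeded {
		// The If clause failed: another master owns the lock now.
		return errors.New("lost master lock")
	}
	return nil
}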
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
new file mode 100644
index 0000000000..a5bd2d4fe1
--- /dev/null
+++ b/go/master/inmem_store.go
@@ -0,0 +1,47 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package master
+
+import "sync"
+
+// InMemStore is an in-memory implementation of the Store interface.
+//
+// It does not tolerate faults that cause the program to crash.
+type InMemStore struct {
+	mu  sync.Mutex
+	buf []byte
+}
+
+// Save saves the state into the in-memory store.
+func (m *InMemStore) Save(state []byte) error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	m.buf = state
+	return nil
+}
+
+// Load loads the state from the in-memory store.
+func (m *InMemStore) Load() ([]byte, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	return m.buf, nil
+}
+
+// Shutdown shuts down the in-memory store.
+func (m *InMemStore) Shutdown() error {
+	return nil
+}
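InMemStore is the Store implementation the tests hand to NewService when etcd is not needed; it is interchangeable with EtcdClient behind the interface. A short wiring sketch, assuming the exported API from this patch:

package main

import (
	"time"

	"github.com/PaddlePaddle/Paddle/go/master"
)

func main() {
	// Any Store works here; swap in a NewEtcdClient result for fault tolerance.
	store := &master.InMemStore{}
	if _, err := master.NewService(store, 10, 60*time.Second, 3); err != nil {
		panic(err)
	}
}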
diff --git a/go/master/service.go b/go/master/service.go
index ab17a62f38..f350102880 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -1,41 +1,115 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import (
+	"bytes"
+	"compress/gzip"
+	"encoding/gob"
 	"errors"
-	"log"
+	"math/rand"
+	"os"
+	"path/filepath"
 	"sync"
 	"time"
 
+	log "github.com/inconshreveable/log15"
+
 	"github.com/PaddlePaddle/recordio"
 )
 
 const (
-	targetTaskCount = 300
+	dialTimeout = 5 * time.Second
 )
 
-// errors
-var (
-	ErrNoMoreTask          = errors.New("no more task for current pass")
-	ErrPendingTaskNotFound = errors.New("pending task not found")
-)
+// ErrAllTaskFailed occurs when every task is in the done or failed state.
+var ErrAllTaskFailed = errors.New("all task finished")
+
+// ErrNoMoreAvailable occurs when the todo queue is empty but not all
+// tasks are done or failed.
+var ErrNoMoreAvailable = errors.New("no more available task")
+
+// ErrPassBefore occurs when the client-side pass number is smaller
+// than the master's counter.
+var ErrPassBefore = errors.New("pass number smaller than master")
+
+// ErrPassAfter occurs when the client-side pass number is larger
+// than the master's counter.
+var ErrPassAfter = errors.New("pass number larger than master")
+
+// Store is the interface for saving and loading the master state.
+type Store interface {
+	Save([]byte) error
+	Load() ([]byte, error)
+	Shutdown() error
+}
+
+// Chunk is a chunk of data consisting of several data instances.
+type Chunk struct {
+	Path  string
+	Index recordio.Index // chunk index
+}
+
+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+	ID    int
+	Epoch int
+}
+
+// Task is the basic unit of data instances assigned to trainers.
+type Task struct {
+	Meta   TaskMeta
+	Chunks []Chunk
+}
+
+type taskEntry struct {
+	Task Task
+	// A task fails if it times out or the trainer reports that it exited abnormally.
+	NumFailure int
+}
+
+type masterState struct {
+	Todo    []taskEntry
+	Pending map[int]taskEntry // map from task ID to task entry
+	Done    []taskEntry
+	Failed  []taskEntry
+	CurPass int
+}
 
 // Service is the master server service.
 type Service struct {
-	timeoutDur time.Duration
-	timeoutMax int
+	chunksPerTask int
+	timeoutDur    time.Duration
+	failureMax    int
+	store         Store
 
-	mu         sync.Mutex
-	taskQueues taskQueues
-}
+	ready    chan struct{}
+	initDone bool
 
-// Recover recovers service state from etcd.
-func Recover() (*Service, error) {
-	// TODO(helin): recover from snapshot state from etcd.
-	return nil, nil
+	mu sync.Mutex
+	// State to be persisted to snapshot.
+	state masterState
+	// The trainer that is currently saving model. This state is
+	// transient, does not need to be persisted to snapshot.
+	savingTrainer string
 }
 
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	id := 0
+	// Generate an ID unique across the job using nanosecond timestamp + random start + counter.
+	// FIXME(typhoonzero): this is a workaround, use UUIDs instead.
+	randStart := rand.Int()
+	counter := 0
+	timestamp := time.Now().Nanosecond()
+	id := timestamp + randStart + counter
 	if chunksPerTask <= 0 {
 		chunksPerTask = 1
 	}
@@ -44,8 +118,9 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	var cur taskEntry
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.ID = id
-			id++
+			cur.Task.Meta.ID = id
+			counter++
+			id = timestamp + randStart + counter
 			result = append(result, cur)
 			cur.Task.Chunks = nil
 		}
@@ -54,8 +129,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	}
 
 	if len(cur.Task.Chunks) > 0 {
-		cur.Task.ID = id
-		id++
+		cur.Task.Meta.ID = id
 		result = append(result, cur)
 	}
 
@@ -63,116 +137,374 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 
 // NewService creates a new service.
-func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
 	s := &Service{}
+	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
-	s.timeoutMax = timeoutMax
-	s.taskQueues = taskQueues{}
-	s.taskQueues.Pending = make(map[int]taskEntry)
-	s.taskQueues.Todo = partition(chunks, chunksPerTask)
-	return s
-}
+	s.failureMax = failureMax
+	s.state = masterState{}
+	s.state.Pending = make(map[int]taskEntry)
+	s.ready = make(chan struct{})
+	s.store = store
+	recovered, err := s.recover()
+	if err != nil {
+		return nil, err
+	}
 
-// Chunk is a chunk of data consisted of several data instances.
-type Chunk struct {
-	Idx   int // index of the chunk within the file
-	Path  string
-	Index recordio.Index // block index
-}
+	if recovered {
+		// Recovered. Now the state is already initialized,
+		// and the master is ready.
+		s.initDone = true
+		close(s.ready)
+		log.Info("Master recovered from saved state.")
+	}
 
-// Task is the basic unit of data instances assigned to trainers.
-type Task struct {
-	ID     int
-	Chunks []Chunk
+	return s, nil
 }
 
-type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
-}
+// recover recovers service state from etcd.
+func (s *Service) recover() (bool, error) {
+	state, err := s.store.Load()
+	if err != nil {
+		return false, err
+	}
 
-type taskQueues struct {
-	Todo    []taskEntry
-	Pending map[int]taskEntry // map from task ID to task entry
-	Done    []taskEntry
-	Failed  []Task
+	if state == nil {
+		log.Info("No state exists, not recovered.")
+		return false, nil
+	}
+
+	log.Info("Loaded snapshot.", log.Ctx{"size": len(state)})
+	gr, err := gzip.NewReader(bytes.NewReader(state))
+	if err != nil {
+		return false, err
+	}
+
+	dec := gob.NewDecoder(gr)
+	var tqs masterState
+	err = dec.Decode(&tqs)
+	if err != nil {
+		return false, err
+	}
+
+	err = gr.Close()
+	if err != nil {
+		// Only the close failed; the recovery actually succeeded,
+		// so just log the error.
+		log.Error("error closing snapshot reader.", log.Ctx{"error": err})
+	}
+
+	s.state = tqs
+	log.Info("Master recovered from snapshot, scheduling pending task timeout check.", s.logCtx())
+	for _, t := range s.state.Pending {
+		time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
+	}
+
+	return true, nil
 }
 
-// *must* be called with s.mu being held.
+// snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TODO(helin): snapshot state on etcd.
-	return nil
+	// TODO(helin): etcd request has a size limit, so the snapshot
+	// size is limited by the max request size. We should either
+	// divide the snapshot into smaller chunks and save under
+	// different keys, or configure the request size to be big
+	// enough:
+	// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
+	var buf bytes.Buffer
+	gw := gzip.NewWriter(&buf)
+	enc := gob.NewEncoder(gw)
+	err := enc.Encode(s.state)
+	if err != nil {
+		return err
+	}
+	err = gw.Close()
+	if err != nil {
+		return err
+	}
+
+	state := buf.Bytes()
+	log.Info("Saving snapshot.", log.Ctx{"size bytes": len(state)})
+	return s.store.Save(state)
 }
 
-// GetTask gets a new task from the service.
-func (s *Service) GetTask(dummy int, task *Task) error {
+func readChunks(globPaths []string) ([]Chunk, error) {
+	var chunks []Chunk
+	var paths []string
+
+	for _, s := range globPaths {
+		match, err := filepath.Glob(s)
+		if err != nil {
+			return nil, err
+		}
+		paths = append(paths, match...)
+	}
+
+	if len(paths) == 0 {
+		return nil, errors.New("no valid dataset specified")
+	}
+
+	for _, path := range paths {
+		f, err := os.Open(path)
+		if err != nil {
+			return nil, err
+		}
+
+		index, err := recordio.LoadIndex(f)
+		if err != nil {
+			return nil, err
+		}
+		err = f.Close()
+		if err != nil {
+			return nil, err
+		}
+
+		count := index.NumChunks()
+		log.Info("reading chunks.", log.Ctx{"path": path, "num chunks": count})
+		for i := 0; i < count; i++ {
+			chunk := Chunk{
+				Path:  path,
+				Index: *index.ChunkIndex(i),
+			}
+			chunks = append(chunks, chunk)
+		}
+	}
+
+	return chunks, nil
+}
+
+// SetDataset sets the dataset the master server dispatches.
+//
+// SetDataset can be called multiple times, but only the first call
+// will be honored.
+func (s *Service) SetDataset(globPaths []string, _ *int) error {
+	if len(globPaths) == 0 {
+		return errors.New("no dataset specified")
+	}
+
 	s.mu.Lock()
 	defer s.mu.Unlock()
+	if s.initDone {
+		// Already initialized. All trainer will call
+		// SetDataset, but we only handle the first one. Treat
+		// other calls as successful but do nothing.
+		return nil
+	}
 
-	if len(s.taskQueues.Todo) == 0 {
-		return ErrNoMoreTask
+	chunks, err := readChunks(globPaths)
+	if err != nil {
+		return err
 	}
 
-	t := s.taskQueues.Todo[0]
-	t.Epoch++
-	s.taskQueues.Todo = s.taskQueues.Todo[1:]
-	s.taskQueues.Pending[t.Task.ID] = t
-	err := s.snapshot()
+	s.state.Todo = partition(chunks, s.chunksPerTask)
+
+	err = s.snapshot()
 	if err != nil {
+		log.Error("snapshot error", log.Ctx{"error": err})
 		return err
 	}
+	close(s.ready)
+	s.initDone = true
+	return nil
+}
 
-	time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
-		return func() {
-			s.mu.Lock()
-			defer s.mu.Unlock()
+// processFailedTask re-dispatches a failed task until it has failed
+// more than s.failureMax times, then moves it to the failed queue.
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
+	if t.Task.Meta.Epoch != epoch {
+		// A new epoch of the task was launched after this timeout
+		// check or failure report was scheduled.
+		return
+	}
 
-			t, ok := s.taskQueues.Pending[taskID]
-			if !ok {
-				return
-			}
+	defer func() {
+		err := s.snapshot()
+		if err != nil {
+			log.Error("snapshot error", log.Ctx{"error": err})
+		}
+	}()
 
-			if t.Epoch != epoch {
-				// new epoch, task launched after the
-				// schedule of this timeout check.
-				return
-			}
+	delete(s.state.Pending, t.Task.Meta.ID)
 
-			defer func() {
-				err := s.snapshot()
-				if err != nil {
-					log.Println(err)
-				}
-			}()
+	t.NumFailure++
+	if t.NumFailure > s.failureMax {
+		log.Warn("Task failed to many times, discard.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
+		s.state.Failed = append(s.state.Failed, t)
+		return
+	}
 
-			delete(s.taskQueues.Pending, t.Task.ID)
+	log.Warn("Task failed, re-dispatch.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
+	s.state.Todo = append(s.state.Todo, t)
+}
 
-			t.NumTimeout++
-			if t.NumTimeout > s.timeoutMax {
-				s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-				return
-			}
+func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
+	return func() {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+
+		t, ok := s.state.Pending[taskID]
+		if !ok {
+			return
+		}
+
+		s.processFailedTask(t, epoch)
+	}
+}
+
+// must be called with lock held.
+func (s *Service) logCtx() log.Ctx {
+	return log.Ctx{
+		"todoLen":    len(s.state.Todo),
+		"pendingLen": len(s.state.Pending),
+		"doneLen":    len(s.state.Done),
+		"failedLen":  len(s.state.Failed),
+		"curPass":    s.state.CurPass,
+	}
+}
+
+// GetTask gets a new task from the service.
+// passID is the client-side pass count.
+func (s *Service) GetTask(passID int, task *Task) error {
+	// Block until SetDataset (or recovery) has initialized the service.
+	<-s.ready
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if passID < s.state.CurPass {
+		return ErrPassBefore
+	}
+	if passID > s.state.CurPass {
+		// A client may run one pass ahead of the master when it is
+		// faster than the other clients.
+		return ErrPassAfter
+	}
 
-			s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	if len(s.state.Todo) == 0 {
+		if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
+			log.Warn("All tasks failed, may start next pass", s.logCtx())
+			return ErrAllTaskFailed
 		}
-	}(t.Task.ID, t.Epoch))
+		log.Warn("No more available task.", s.logCtx())
+		return ErrNoMoreAvailable
+	}
+
+	t := s.state.Todo[0]
+	t.Task.Meta.Epoch++
+	s.state.Todo = s.state.Todo[1:]
+	s.state.Pending[t.Task.Meta.ID] = t
+	err := s.snapshot()
+	if err != nil {
+		return err
+	}
+
+	*task = t.Task
+	ctx := s.logCtx()
+	ctx["task meta"] = t.Task.Meta
+	log.Info("Task dispatched.", ctx)
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }
 
 // TaskFinished tells the service that a task is finished.
 func (s *Service) TaskFinished(taskID int, dummy *int) error {
+	// Block until the service is initialized.
+	<-s.ready
+
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	t, ok := s.taskQueues.Pending[taskID]
+	t, ok := s.state.Pending[taskID]
 	if !ok {
-		return ErrPendingTaskNotFound
+		ctx := s.logCtx()
+		ctx["task id"] = taskID
+		log.Warn("Pending task not found.", ctx)
+		return nil
 	}
 
 	// task finished, reset timeout
-	t.NumTimeout = 0
-	s.taskQueues.Done = append(s.taskQueues.Done, t)
-	delete(s.taskQueues.Pending, taskID)
-	return s.snapshot()
+	t.NumFailure = 0
+	s.state.Done = append(s.state.Done, t)
+	delete(s.state.Pending, taskID)
+
+	ctx := s.logCtx()
+	ctx["task id"] = taskID
+	log.Info("Task finished.", ctx)
+	if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
+		// Increase the master-side pass count once all tasks finish.
+		s.state.CurPass++
+		s.state.Todo = append(s.state.Done, s.state.Failed...)
+		s.state.Done = []taskEntry{}
+		// TODO(typhoonzero): deal with failed tasks
+		s.state.Failed = []taskEntry{}
+		ctx := s.logCtx()
+		ctx["new pass"] = s.state.CurPass
+		log.Warn("all task finished, add new pass data.", ctx)
+	}
+
+	err := s.snapshot()
+	if err != nil {
+		log.Error("snapshot error", log.Ctx{"error": err})
+	}
+	return err
+}
+
+// TaskFailed tells the service that a task has failed.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+	// Block until the service is initialized.
+	<-s.ready
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	t, ok := s.state.Pending[meta.ID]
+	if !ok {
+		log.Warn("TaskFailed:Pending task not found.", log.Ctx{"task": t.Task.Meta})
+		return nil
+	}
+
+	s.processFailedTask(t, meta.Epoch)
+	return nil
+}
+
+// SaveModelRequest is the request for saving the model.
+type SaveModelRequest struct {
+	TrainerID string
+	BlockDur  time.Duration
+}
+
+// RequestSaveModel asks the master server whether the caller should
+// save the model.
+func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if req.TrainerID == "" {
+		return errors.New("trainer id is empty")
+	}
+
+	if s.savingTrainer == "" {
+		*need = true
+	} else {
+		if req.TrainerID == s.savingTrainer {
+			// The trainer chosen to save asked to save again.
+			*need = true
+		} else {
+			*need = false
+		}
+	}
+
+	if *need {
+		s.savingTrainer = req.TrainerID
+		time.AfterFunc(req.BlockDur, func() {
+			s.mu.Lock()
+			s.savingTrainer = ""
+			s.mu.Unlock()
+		})
+	}
+
+	return nil
 }
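The snapshot format used by snapshot and recover above is simply a gob stream wrapped in gzip. A standalone round-trip of that encoding, using a stand-in struct since masterState is unexported:

package main

import (
	"bytes"
	"compress/gzip"
	"encoding/gob"
	"fmt"
)

// state stands in for masterState in this sketch.
type state struct {
	CurPass int
}

func main() {
	// Encode: gob into a gzip writer, as in Service.snapshot.
	var buf bytes.Buffer
	gw := gzip.NewWriter(&buf)
	if err := gob.NewEncoder(gw).Encode(state{CurPass: 3}); err != nil {
		panic(err)
	}
	if err := gw.Close(); err != nil { // Close flushes the gzip frame.
		panic(err)
	}

	// Decode: gzip reader into a gob decoder, as in Service.recover.
	gr, err := gzip.NewReader(bytes.NewReader(buf.Bytes()))
	if err != nil {
		panic(err)
	}
	var s state
	if err := gob.NewDecoder(gr).Decode(&s); err != nil {
		panic(err)
	}
	fmt.Println(s.CurPass) // prints 3
}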
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index bc435b505c..bd1a939a55 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import "testing"
@@ -30,7 +44,8 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.ID != i {
+		// Test that task IDs auto-increment.
+		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
 			t.Error(ts[i], i)
 		}
 	}
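The auto-increment property checked above comes from the ID scheme in partition: a base of nanosecond timestamp plus a random start, bumped by a per-task counter. A sketch of the scheme in isolation, carrying the same caveat as the FIXME in service.go (cross-job collisions are possible; UUIDs would be the proper fix):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

func main() {
	// The base mixes wall-clock nanoseconds with a random start; task
	// i then gets base + i, so IDs from one partition call are consecutive.
	randStart := rand.Int()
	timestamp := time.Now().Nanosecond()
	for counter := 0; counter < 3; counter++ {
		fmt.Println(timestamp + randStart + counter)
	}
}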
diff --git a/go/master/service_test.go b/go/master/service_test.go
new file mode 100644
index 0000000000..2d00c22d6f
--- /dev/null
+++ b/go/master/service_test.go
@@ -0,0 +1,72 @@
+package master_test
+
+import (
+	"io/ioutil"
+	"net/url"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/embed"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNewServiceWithEtcd(t *testing.T) {
+	// set up an embedded etcd server
+	etcdDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	lpurl, _ := url.Parse("http://localhost:0")
+	lcurl, _ := url.Parse("http://localhost:0")
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	<-e.Server.ReadyNotify()
+
+	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
+	endpoint := "127.0.0.1:" + port
+
+	ep := []string{endpoint}
+	masterAddr := "127.0.0.1:3306"
+	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = master.NewService(store, 10, 10, 3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   ep,
+		DialTimeout: 3 * time.Second,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := cli.Close(); err != nil {
+		t.Fatal(err)
+	}
+	// Test that the master process registers itself with the etcd server.
+	assert.Equal(t, masterAddr, v, "master process should register itself with the etcd server.")
+}
diff --git a/go/proto/.gitignore b/go/proto/.gitignore
new file mode 100644
index 0000000000..5e7d2734cf
--- /dev/null
+++ b/go/proto/.gitignore
@@ -0,0 +1,4 @@
+# Ignore everything in this directory
+*
+# Except this file
+!.gitignore
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
new file mode 100644
index 0000000000..9ac05199e7
--- /dev/null
+++ b/go/pserver/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(pserver_test DEPS paddle_go_optimizer gen_proto_go)
+endif()
diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/cclient/CMakeLists.txt
deleted file mode 100644
index c017d74656..0000000000
--- a/go/pserver/cclient/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-project(cxx_go C Go)
-
-include(golang)
-include(flags)
-
-go_library(client STATIC)
-add_subdirectory(test)
diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go
deleted file mode 100644
index 0b4aa79806..0000000000
--- a/go/pserver/cclient/cclient.go
+++ /dev/null
@@ -1,260 +0,0 @@
-package main
-
-/*
-#include <stdlib.h>
-#include <string.h>
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32   = 0,
-  PADDLE_ELEMENT_TYPE_UINT32  = 1,
-  PADDLE_ELEMENT_TYPE_INT64   = 2,
-  PADDLE_ELEMENT_TYPE_UINT64  = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-typedef struct {
-  char*               name;
-  paddle_element_type element_type;
-  unsigned char*      content;
-  int                 content_len;
-} paddle_parameter, paddle_gradient;
-
-static inline void paddle_release_param(paddle_parameter* param) {
-  if (param != NULL) {
-    if (param->name != NULL) {
-      free(param->name);
-    }
-
-    if (param->content != NULL) {
-      free(param->content);
-    }
-
-    free(param);
-  }
-}
-
-typedef int client;
-*/
-import "C"
-
-import (
-	"log"
-	"strings"
-	"sync"
-	"unsafe"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-)
-
-var nullPtr = unsafe.Pointer(uintptr(0))
-var mu sync.Mutex
-var handleMap = make(map[C.client]*pserver.Client)
-var curHandle C.client
-
-func add(c *pserver.Client) C.client {
-	mu.Lock()
-	defer mu.Unlock()
-	client := curHandle
-	curHandle++
-	handleMap[client] = c
-	return client
-}
-
-func get(client C.client) *pserver.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	return handleMap[client]
-}
-
-func remove(client C.client) *pserver.Client {
-	mu.Lock()
-	defer mu.Unlock()
-	h := handleMap[client]
-	delete(handleMap, client)
-	return h
-}
-
-func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
-		return nil
-	}
-
-	// create a Go clice backed by a C array, reference:
-	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
-	//
-	// Go garbage collector will not interact with this data, need
-	// to be freed properly.
-	return (*[1 << 30]byte)(p)[:len:len]
-}
-
-type selector bool
-
-func (s selector) Select() bool {
-	return bool(s)
-}
-
-type lister []pserver.Server
-
-func (l lister) List() []pserver.Server {
-	return l
-}
-
-//export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
-	a := C.GoString(addrs)
-	as := strings.Split(a, ",")
-	servers := make([]pserver.Server, len(as))
-	for i := range as {
-		servers[i].Index = i
-		servers[i].Addr = as[i]
-	}
-	c := pserver.NewClient(lister(servers), len(as), selector(selected != 0))
-	return add(c)
-}
-
-//export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
-	// TODO(helin): fault tolerant pserver client using etcd.
-	panic("not implemented.")
-}
-
-//export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.client) {
-	remove(client)
-}
-
-//export paddle_begin_init_params
-func paddle_begin_init_params(client C.client) C.int {
-	c := get(client)
-	if selected := c.BeginInitParams(); selected {
-		return 1
-	}
-	return 0
-}
-
-//export paddle_init_param
-func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
-	et := pserver.ElementType(param.element_type)
-	name := C.GoString(param.name)
-	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
-	pc := pserver.ParameterWithConfig{
-		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
-	}
-	c := get(client)
-	err := c.InitParam(pc)
-	if err != nil {
-		log.Println(err)
-		return -1
-	}
-
-	return 0
-}
-
-//export paddle_finish_init_params
-func paddle_finish_init_params(client C.client) C.int {
-	c := get(client)
-	err := c.FinishInitParams()
-	if err != nil {
-		log.Println(err)
-		return -1
-	}
-
-	return 0
-}
-
-//export paddle_send_grads
-func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
-	var gs []pserver.Gradient
-	for i := 0; i < int(total); i++ {
-		grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
-		et := pserver.ElementType(grad.element_type)
-		name := C.GoString(grad.name)
-		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
-		gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content})
-	}
-
-	c := get(client)
-	err := c.SendGrads(gs)
-	if err != nil {
-		log.Println(err)
-		return -1
-	}
-
-	return 0
-}
-
-//export paddle_get_params
-func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
-	var ns []string
-	for i := 0; i < int(total); i++ {
-		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
-		ns = append(ns, C.GoString(name))
-	}
-	c := get(client)
-	ps, err := c.GetParams(ns)
-	if err != nil {
-		log.Println(err)
-		return -1
-	}
-
-	for i := 0; i < int(total); i++ {
-		if i >= len(ps) {
-			break
-		}
-
-		p := ps[i]
-		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		nameReady := false
-		contentAllocated := false
-
-		if unsafe.Pointer(param) == nullPtr {
-			param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
-		} else {
-			if unsafe.Pointer(param.name) != nullPtr {
-				if n := C.GoString(param.name); n != p.Name {
-					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
-					C.free(unsafe.Pointer(param.name))
-				} else {
-					nameReady = true
-				}
-			}
-
-			if unsafe.Pointer(param.content) != nullPtr {
-				if int(param.content_len) == len(p.Content) {
-					contentAllocated = true
-				} else {
-					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
-					C.free(unsafe.Pointer(param.content))
-				}
-			}
-		}
-
-		if !nameReady {
-			param.name = C.CString(p.Name)
-		}
-		if !contentAllocated {
-			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
-		}
-		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
-		param.content_len = C.int(len(p.Content))
-		param.element_type = C.paddle_element_type(p.ElementType)
-	}
-
-	return 0
-}
-
-//export paddle_save_model
-func paddle_save_model(client C.client, path *C.char) C.int {
-	p := C.GoString(path)
-	c := get(client)
-	err := c.Save(p)
-	if err != nil {
-		log.Println(err)
-		return -1
-	}
-
-	return 0
-}
-
-func main() {} // Required but ignored
diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt
deleted file mode 100644
index 16f84648c1..0000000000
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-cmake_minimum_required(VERSION 3.0)
-
-include_directories(${CMAKE_BINARY_DIR})
-
-add_executable(main main.c)
-add_dependencies(main client)
-
-if(APPLE)
-  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
-endif()
-target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
diff --git a/go/pserver/cclient/test/main.c b/go/pserver/cclient/test/main.c
deleted file mode 100644
index f75a2110b9..0000000000
--- a/go/pserver/cclient/test/main.c
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <stdio.h>
-
-#include "libclient.h"
-
-void fail() {
-  // TODO(helin): fix: gtest using cmake is not working, using this
-  // hacky way for now.
-  printf("test failed.\n");
-  exit(-1);
-}
-
-int main() {
-  char addr[] = "localhost:3000";
-  client c = paddle_new_pserver_client(addr, 1);
-retry:
-  if (paddle_begin_init_params(c)) {
-    paddle_parameter param;
-    char name_a[] = "param_a";
-    char name_b[] = "param_b";
-    unsigned char content[] = {0x00, 0x11, 0x22};
-    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-    param.name = name_a;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
-      goto retry;
-    }
-    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
-    param.name = name_b;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
-      goto retry;
-    }
-    if (paddle_finish_init_params(c) != 0) {
-      goto retry;
-    }
-  } else {
-    fail();
-  }
-
-  unsigned char content[] = {0x00, 0x11, 0x22};
-  paddle_gradient grads[2] = {
-      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
-      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
-
-  if (!paddle_send_grads(c, grads, 2)) {
-    fail();
-  }
-
-  paddle_parameter* params[2] = {NULL, NULL};
-  char* names[] = {"param_a", "param_b"};
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
-  }
-
-  // get parameters again by reusing the allocated parameter buffers.
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
-  }
-
-  paddle_release_param(params[0]);
-  paddle_release_param(params[1]);
-
-  if (!paddle_save_model(c, "/tmp/")) {
-    fail();
-  }
-
-  return 0;
-}
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
new file mode 100644
index 0000000000..e295611060
--- /dev/null
+++ b/go/pserver/client/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(pserver_client_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
new file mode 100644
index 0000000000..4bf05c8538
--- /dev/null
+++ b/go/pserver/client/c/.gitignore
@@ -0,0 +1 @@
+libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
new file mode 100644
index 0000000000..a932791c7c
--- /dev/null
+++ b/go/pserver/client/c/CMakeLists.txt
@@ -0,0 +1,30 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
+target_link_libraries(paddle_go_optimizer stdc++ m)
+
+# Copy library to the required place.
+# See: go/pserver/optimizer.go:
+# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+add_custom_command(TARGET paddle_go_optimizer POST_BUILD
+  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
+  )
+
+go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
+if(WITH_TESTING)
+  # FIXME: this test requires a pserver that is not managed by the test;
+  # we need some kind of e2e testing mechanism.
+  # add_subdirectory(test)
+endif()
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
new file mode 100644
index 0000000000..2eeec1b6b3
--- /dev/null
+++ b/go/pserver/client/c/cclient.go
@@ -0,0 +1,300 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package main
+
+/*
+#include <string.h>
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32   = 0,
+  PADDLE_ELEMENT_TYPE_UINT32  = 1,
+  PADDLE_ELEMENT_TYPE_INT64   = 2,
+  PADDLE_ELEMENT_TYPE_UINT64  = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+typedef struct {
+  char*               name;
+  paddle_element_type element_type;
+  unsigned char*      content;
+  int                 content_len;
+} paddle_parameter, paddle_gradient;
+
+typedef int paddle_pserver_client;
+#define PSERVER_ERROR -1
+#define PSERVER_OK 0
+*/
+import "C"
+
+import (
+	"strings"
+	"sync"
+	"unsafe"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	log "github.com/inconshreveable/log15"
+)
+
+func init() {
+	log.Root().SetHandler(
+		log.LvlFilterHandler(log.LvlWarn, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
+}
+
+var mu sync.Mutex
+var handleMap = make(map[C.paddle_pserver_client]*client.Client)
+var curHandle C.paddle_pserver_client
+
+func add(c *client.Client) C.paddle_pserver_client {
+	mu.Lock()
+	defer mu.Unlock()
+	cli := curHandle
+	curHandle++
+	handleMap[cli] = c
+	return cli
+}
+
+func get(client C.paddle_pserver_client) *client.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	return handleMap[client]
+}
+
+func remove(client C.paddle_pserver_client) *client.Client {
+	mu.Lock()
+	defer mu.Unlock()
+	h := handleMap[client]
+	delete(handleMap, client)
+	return h
+}
+
+func cArrayToSlice(p unsafe.Pointer, len int) []byte {
+	if p == nil {
+		return nil
+	}
+
+	// Create a Go slice backed by a C array; reference:
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	//
+	// The Go garbage collector will not manage this data; it must
+	// be freed properly.
+	return (*[1 << 30]byte)(p)[:len:len]
+}
+
+type selector bool
+
+func (s selector) Select() (bool, error) {
+	return bool(s), nil
+}
+
+func (s selector) Done() error {
+	return nil
+}
+
+type lister []client.Server
+
+func (l lister) List() []client.Server {
+	return l
+}
+
+//export paddle_new_pserver_client
+func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
+	a := C.GoString(addrs)
+	as := strings.Split(a, ",")
+	servers := make([]client.Server, len(as))
+	for i := range as {
+		servers[i].Index = i
+		servers[i].Addr = as[i]
+	}
+	c := client.NewClient(lister(servers), len(as), selector(selected != 0))
+	return add(c)
+}
+
+//export paddle_new_etcd_pserver_client
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
+	addr := C.GoString(etcdEndpoints)
+	etcdClient := client.NewEtcd(addr)
+	c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
+	return add(c)
+}
+
+//export paddle_pserver_client_release
+func paddle_pserver_client_release(client C.paddle_pserver_client) {
+	remove(client)
+}
+
+// paddle_begin_init_params tells the trainer whether it needs to
+// initialize the parameters.
+//
+// Returns 1 if the trainer needs to initialize the parameters, 0
+// otherwise.
+//
+//export paddle_begin_init_params
+func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
+	c := get(client)
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		panic(err)
+	}
+
+	if selected {
+		return 1
+	}
+	return 0
+}
+
+//export paddle_init_param
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
+	et := pserver.ElementType(param.element_type)
+	name := C.GoString(param.name)
+	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
+	pc := pserver.ParameterWithConfig{
+		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
+		Config: cArrayToSlice(paramConfig, int(configLen)),
+	}
+	c := get(client)
+	err := c.InitParam(pc)
+
+	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Warn(
+				"parameter already initialized, treat paddle_init_param as successful.",
+				log.Ctx{"parameter": name},
+			)
+			return C.PSERVER_OK
+		}
+		log.Error("error init param", log.Ctx{"error": err})
+		return C.PSERVER_ERROR
+	}
+
+	return C.PSERVER_OK
+}
+
+//export paddle_finish_init_params
+func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
+	c := get(client)
+	err := c.FinishInitParams()
+	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Warn("parameters already initialized, treat paddle_finish_init_params as successful.")
+			return C.PSERVER_OK
+		}
+
+		log.Error("error finish init params", log.Ctx{"error": err})
+		return C.PSERVER_ERROR
+	}
+
+	return C.PSERVER_OK
+}
+
+//export paddle_send_grads
+func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
+	var gs []pserver.Gradient
+	for i := 0; i < int(total); i++ {
+		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
+		et := pserver.ElementType(grad.element_type)
+		name := C.GoString(grad.name)
+		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
+		gs = append(gs, pserver.Gradient{Name: name, ElementType: et, Content: content})
+	}
+
+	c := get(client)
+	err := c.SendGrads(gs)
+	if err != nil {
+		log.Error("error send grads", log.Ctx{"error": err})
+		return C.PSERVER_ERROR
+	}
+
+	return C.PSERVER_OK
+}
+
+//export paddle_get_params
+func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
+	var ns []string
+	for i := 0; i < int(total); i++ {
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
+		ns = append(ns, C.GoString(param.name))
+	}
+	c := get(client)
+	ps, err := c.GetParams(ns)
+	if err != nil {
+		log.Error("error get params", log.Ctx{"error": err})
+		return C.PSERVER_ERROR
+	}
+
+	if len(ps) != len(ns) {
+		pn := make([]string, len(ps))
+		for i, p := range ps {
+			pn[i] = p.Name
+		}
+		log.Error(
+			"pserver returned wrong number of parameters.",
+			log.Ctx{
+				"Requested": strings.Join(pn, ", "),
+				"Returned":  strings.Join(ns, ", "),
+			},
+		)
+		return C.PSERVER_ERROR
+	}
+
+	for i := range ps {
+		if ns[i] != ps[i].Name {
+			pn := make([]string, len(ps))
+			for i, p := range ps {
+				pn[i] = p.Name
+			}
+			log.Error(
+				"pserver returned wrong parameters, or not in requested order.",
+				log.Ctx{
+					"Requested": strings.Join(pn, ", "),
+					"Returned":  strings.Join(ns, ", "),
+				},
+			)
+			return C.PSERVER_ERROR
+		}
+	}
+
+	for i := 0; i < int(total); i++ {
+		p := ps[i]
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
+
+		if unsafe.Pointer(param) == nil {
+			log.Error("must pre-allocate parameter.")
+			return C.PSERVER_ERROR
+		}
+
+		if unsafe.Pointer(param.content) != nil {
+			if int(param.content_len) != len(p.Content) {
+				log.Error(
+					"the pre-allocated content len does not match parameter content len.",
+					log.Ctx{
+						"Pre-allocated len": param.content_len,
+						"Returned len":      len(p.Content),
+					},
+				)
+				return C.PSERVER_ERROR
+			}
+		}
+
+		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+		param.content_len = C.int(len(p.Content))
+		param.element_type = C.paddle_element_type(p.ElementType)
+	}
+
+	return C.PSERVER_OK
+}
+
+func main() {} // Required but ignored
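The bridge above follows the standard cgo discipline of never storing Go pointers on the C side: each *client.Client is registered in a Go-side map under an opaque integer handle, and C code only ever holds the handle. The pattern in isolation, with illustrative names:

package main

import "sync"

type handle int

type object struct{ name string }

var (
	mu      sync.Mutex
	objects = make(map[handle]*object)
	next    handle
)

// register stores o and returns the integer handle the C side will hold.
func register(o *object) handle {
	mu.Lock()
	defer mu.Unlock()
	h := next
	next++
	objects[h] = o
	return h
}

// lookup resolves a handle back to the Go object.
func lookup(h handle) *object {
	mu.Lock()
	defer mu.Unlock()
	return objects[h]
}

// release drops the mapping so the object can be garbage collected.
func release(h handle) {
	mu.Lock()
	defer mu.Unlock()
	delete(objects, h)
}

func main() {
	h := register(&object{name: "pserver client"})
	_ = lookup(h)
	release(h)
}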
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
new file mode 100644
index 0000000000..3724ccb60b
--- /dev/null
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
+add_style_check_target(test_cclient test_cclient.c)
diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c
new file mode 100644
index 0000000000..05ec421fff
--- /dev/null
+++ b/go/pserver/client/c/test/test_cclient.c
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "libpaddle_pserver_cclient.h"
+
+// TODO(helin): Fix: gtest using cmake is not working, using this
+// hacky way for now.
+#define fail()                                              \
+  do {                                                      \
+    fprintf(stderr, "error: %s:%d\n", __FILE__, __LINE__);  \
+    exit(-1);                                               \
+  } while (0)
+
+void sendGrads(paddle_pserver_client c) {
+  unsigned char grad_a[2000] = {2};
+  unsigned char grad_b[3000] = {3};
+  paddle_gradient grad1 = {
+      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
+  paddle_gradient grad2 = {
+      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
+  paddle_gradient *grads[2] = {&grad1, &grad2};
+  if (paddle_send_grads(c, grads, 2)) {
+    fail();
+  }
+}
+
+void getParams(paddle_pserver_client c) {
+  paddle_parameter param_a;
+  paddle_parameter param_b;
+  char name_a[] = "param_a";
+  char name_b[] = "param_b";
+  // Must pre-allocate the parameter content before calling paddle_get_params.
+  unsigned char content_a[2000] = {};
+  unsigned char content_b[3000] = {};
+  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_a.name = name_a;
+  param_a.content = content_a;
+  param_a.content_len = 2000;
+  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_b.name = name_b;
+  param_b.content = content_b;
+  param_b.content_len = 3000;
+
+  paddle_parameter *params[2] = {&param_a, &param_b};
+  if (paddle_get_params(c, params, 2)) {
+    fail();
+  }
+}
+
+int main() {
+  char addr[] = "localhost:3000";
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
+  char *config_proto = NULL;  // getline requires a NULL or malloc'd buffer
+  size_t config_proto_len = 0;
+  ssize_t nread;
+  FILE *fp = fopen("testdata/optimizer.pb", "r");
+  if (!fp) {
+    fail();
+  }
+  while ((nread = getline(&config_proto, &config_proto_len, fp)) != -1) {
+    printf("%s", config_proto);
+  }
+  fclose(fp);
+retry:
+  if (paddle_begin_init_params(c)) {
+    paddle_parameter param;
+    char name_a[] = "param_a";
+    char name_b[] = "param_b";
+    unsigned char content_a[2000] = {1};
+    unsigned char content_b[3000] = {0};
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+    param.name = name_a;
+    param.content = content_a;
+    param.content_len = 2000;
+    int error =
+        paddle_init_param(c, param, (void *)config_proto, config_proto_len);
+    if (error != 0) {
+      goto retry;
+    }
+
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+    param.name = name_b;
+    param.content = content_b;
+    param.content_len = 3000;
+    error = paddle_init_param(c, param, (void *)config_proto, config_proto_len);
+    if (error != 0) {
+      goto retry;
+    }
+
+    error = paddle_finish_init_params(c);
+    if (error != 0) {
+      goto retry;
+    }
+  }
+
+  int i;
+  for (i = 0; i < 100; i++) {
+    sendGrads(c);
+    getParams(c);
+  }
+
+  return 0;
+}
diff --git a/go/pserver/client/c/test/test_mnist.py b/go/pserver/client/c/test/test_mnist.py
new file mode 100644
index 0000000000..821d9adfcb
--- /dev/null
+++ b/go/pserver/client/c/test/test_mnist.py
@@ -0,0 +1,145 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import gzip
+
+
+def softmax_regression(img):
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def multilayer_perceptron(img):
+    # The first fully-connected layer
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer and the corresponding activation function
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The third fully-connected layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def convolutional_neural_network(img):
+    # first conv layer
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # second conv layer
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The first fully-connected layer
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncommenting the corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+    parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
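+    # NOTE: is_local=False together with pserver_spec makes the trainer use
+    # the remote parameter updater, pushing gradients to and pulling
+    # parameters from the pserver at localhost:3000 (through the Go pserver
+    # C client exercised by this test) instead of updating locally.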
+
+    lists = []
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+        elif isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)
+
+    # find the best pass
+    best = sorted(lists, key=lambda lst: float(lst[1]))[0]
+    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+
+    test_creator = paddle.dataset.mnist.test()
+    test_data = []
+    for item in test_creator():
+        test_data.append((item[0], ))
+        if len(test_data) == 100:
+            break
+
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output_layer=predict, parameters=parameters, input=test_data)
+    print probs.shape
+
+
+if __name__ == '__main__':
+    main()
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
new file mode 100644
index 0000000000..445a8d3aa4
--- /dev/null
+++ b/go/pserver/client/c/test/test_train.py
@@ -0,0 +1,89 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+from paddle.v2.reader.creator import cloud_reader
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoints = "http://" + etcd_ip + ":2379"
+print "etcd endpoints: ", etcd_endpoints
+
+
+def main():
+    # init
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # network config
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y_predict = paddle.layer.fc(input=x,
+                                param_attr=paddle.attr.Param(name='w'),
+                                size=1,
+                                act=paddle.activation.Linear(),
+                                bias_attr=paddle.attr.Param(name='b'))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer of new remote updater to pserver
+    optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec=etcd_endpoints,
+                                 use_etcd=True)
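+    # NOTE: with use_etcd=True, pserver_spec is treated as a comma separated
+    # list of etcd endpoints and pserver addresses are discovered from etcd
+    # (see go/pserver/client/etcd_client.go) rather than listed statically.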
+
+    # event_handler to print training and testing info
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            # FIXME: for the cloud data reader, the pass number is managed by
+            # the master; we should print the server-side pass number instead.
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+        if isinstance(event, paddle.event.EndPass):
+            if (event.pass_id + 1) % 10 == 0:
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
+                             'y': 1})
+                print "Test %d, %.2f" % (event.pass_id, result.cost)
+
+    # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                cloud_reader(
+                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
+                    etcd_endpoints),
+                buf_size=500),
+            batch_size=2),
+        feeding={'x': 0,
+                 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/go/pserver/client/c/test/testdata/optimizer.pb b/go/pserver/client/c/test/testdata/optimizer.pb
new file mode 100644
index 0000000000..27dd3bc5f1
Binary files /dev/null and b/go/pserver/client/c/test/testdata/optimizer.pb differ
diff --git a/go/pserver/client.go b/go/pserver/client/client.go
similarity index 60%
rename from go/pserver/client.go
rename to go/pserver/client/client.go
index f8bd0aa59f..18fce34b37 100644
--- a/go/pserver/client.go
+++ b/go/pserver/client/client.go
@@ -1,19 +1,39 @@
-package pserver
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package client
 
 import (
+	"errors"
 	"hash/fnv"
-	"log"
 	"sort"
 	"time"
 
-	"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/inconshreveable/log15"
 )
 
 // TODO(helin): add RPC call retry logic
 
-// Selector selects if the client should initialize parameter servers.
+// Selector selects whether the client should initialize parameters and
+// reports when the initialization process is done.
 type Selector interface {
-	Select() bool
+	// Select selects if the client should initialize parameter servers.
+	Select() (bool, error)
+	// Done indicates the initialization process is done.
+	Done() error
 }
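+
+// A minimal static Selector, sufficient for a single-trainer job, can wrap
+// a bool (compare the selector type in client_test.go), e.g.:
+//
+//	type constSelector bool
+//
+//	func (s constSelector) Select() (bool, error) { return bool(s), nil }
+//	func (s constSelector) Done() error           { return nil }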
 
 // Server is the identification of a parameter Server.
@@ -47,7 +67,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client {
 // monitorPservers monitors pserver addresses, and updates connection
 // when the address changes.
 func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	knownServers := make([]Server, pserverNum)
+	lastServers := make([]Server, pserverNum)
 	ticker := time.NewTicker(10 * time.Second)
 	monitor := func() {
 		curServers := make([]Server, pserverNum)
@@ -56,25 +76,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 			curServers[l.Index] = l
 		}
 
-		for i := range knownServers {
-			if knownServers[i].Addr != curServers[i].Addr {
-				err := c.pservers[i].Connect(curServers[i].Addr)
-				if err != nil {
-					log.Println(err)
+		for i := range lastServers {
+			if lastServers[i].Addr == curServers[i].Addr {
+				continue
+			}
 
-					// connect to addr failed, set
-					// to last known addr in order
-					// to retry next time.
-					curServers[i].Addr = knownServers[i].Addr
+			if curServers[i].Addr == "" {
+				err := c.pservers[i].Close()
+				if err != nil {
+					log.Error("error closing connection to pserver", log.Ctx{"error": err})
 				}
+
+				continue
 			}
+
+			err := c.pservers[i].Connect(curServers[i].Addr)
+			if err != nil {
+				log.Error("error connecting to pserver", log.Ctx{"error": err})
+
+				// connect to addr failed, set
+				// to last known addr in order
+				// to retry next time.
+				curServers[i].Addr = lastServers[i].Addr
+			}
+
 		}
 
-		knownServers = curServers
+		lastServers = curServers
 	}
 
 	monitor()
-	for _ = range ticker.C {
+	for range ticker.C {
 		monitor()
 	}
 }
@@ -87,37 +119,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 // servers. Other trainers will be blocked until the initialization is
 // done, and they need to get the initialized parameters from
 // parameter servers using GetParams.
-func (c *Client) BeginInitParams() bool {
+func (c *Client) BeginInitParams() (bool, error) {
 	return c.sel.Select()
 }
 
 // InitParam initializes the parameter on parameter servers.
-func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
-	var dummy int
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
+func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error {
+	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
 }
 
 // FinishInitParams tells parameter servers client has sent all
 // parameters to parameter servers as initialization.
 func (c *Client) FinishInitParams() error {
 	for _, p := range c.pservers {
-		var dummy int
-		err := p.Call("Service.FinishInitParams", dummy, &dummy)
+		err := p.Call("Service.FinishInitParams", 0, nil)
 		if err != nil {
 			return err
 		}
 	}
-	return nil
+	return c.sel.Done()
 }
 
 // SendGrads sends gradients to parameter servers for updating
 // parameters.
-func (c *Client) SendGrads(grads []Gradient) error {
+func (c *Client) SendGrads(grads []pserver.Gradient) error {
+	if len(grads) == 0 {
+		return errors.New("no gradient received")
+	}
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
-		go func(g Gradient) {
-			var dummy int
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
+		go func(g pserver.Gradient) {
+			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
 			errCh <- err
 		}(g)
 	}
@@ -138,7 +170,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
 
 type result struct {
 	idx   int
-	param Parameter
+	param pserver.Parameter
 	err   error
 }
 
@@ -157,12 +189,12 @@ func (r results) Swap(i int, j int) {
 }
 
 // GetParams gets parameters from parameter servers.
-func (c *Client) GetParams(names []string) ([]Parameter, error) {
+func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
 	rCh := make(chan result, len(names))
 
 	for idx, name := range names {
 		go func(name string, idx int) {
-			var parameter Parameter
+			var parameter pserver.Parameter
 			err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
 			rCh <- result{idx: idx, param: parameter, err: err}
 		}(name, idx)
@@ -183,7 +215,7 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) {
 	}
 	sort.Sort(rs)
 
-	ps := make([]Parameter, len(rs))
+	ps := make([]pserver.Parameter, len(rs))
 	for i := range rs {
 		ps[i] = rs[i].param
 	}
@@ -191,36 +223,9 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) {
 	return ps, nil
 }
 
-// Save indicates parameters to save the parameter to the given path.
-func (c *Client) Save(path string) error {
-	errCh := make(chan error, len(c.pservers))
-
-	for _, p := range c.pservers {
-		var dummy int
-		err := p.Call("Service.Save", path, &dummy)
-		errCh <- err
-	}
-
-	recv := 0
-	for err := range errCh {
-		if err != nil {
-			return err
-		}
-
-		recv++
-		if recv == len(c.pservers) {
-			break
-		}
-	}
-
-	// TODO(helin): there will be many files under path, need to
-	// merge them into a single file.
-	return nil
-}
-
 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }
 
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
new file mode 100644
index 0000000000..ec832305ee
--- /dev/null
+++ b/go/pserver/client/client_test.go
@@ -0,0 +1,268 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package client_test
+
+import (
+	"context"
+	"io/ioutil"
+	"math/rand"
+	"net"
+	"net/http"
+	"net/rpc"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/inconshreveable/log15"
+)
+
+const (
+	numPserver    = 10
+	etcdEndpoints = "127.0.0.1:2379"
+	timeout       = 2 * time.Second
+)
+
+var pserverClientPorts [numPserver]int
+
+// initClient starts numPserver pserver services and returns their listening ports in an array.
+func initClient() [numPserver]int {
+	var ports [numPserver]int
+	for i := 0; i < numPserver; i++ {
+		l, err := net.Listen("tcp", ":0")
+		if err != nil {
+			panic(err)
+		}
+
+		ss := strings.Split(l.Addr().String(), ":")
+		p, err := strconv.Atoi(ss[len(ss)-1])
+		if err != nil {
+			panic(err)
+		}
+		ports[i] = p
+
+		go func(l net.Listener) {
+			var cp pserver.Checkpoint
+			s, err := pserver.NewService(0, time.Hour, "", nil, cp)
+			if err != nil {
+				panic(err)
+			}
+			server := rpc.NewServer()
+			err = server.Register(s)
+			if err != nil {
+				panic(err)
+			}
+
+			mux := http.NewServeMux()
+			mux.Handle(rpc.DefaultRPCPath, server)
+			err = http.Serve(l, mux)
+			if err != nil {
+				panic(err)
+			}
+		}(l)
+	}
+	return ports
+}
+
+func initNativeClient() {
+	pserverClientPorts = initClient()
+}
+
+func initEtcdClient() {
+	client, err := clientv3.New(clientv3.Config{
+		Endpoints:   []string{etcdEndpoints},
+		DialTimeout: time.Second * time.Duration(1),
+	})
+	if err != nil {
+		log.Error("error initializing etcd client", log.Ctx{"error": err})
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	_, err = client.Delete(ctx, pserver.PsDesired)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
+
+	ports := initClient()
+	for i := 0; i < numPserver; i++ {
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
+	}
+	cancel()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
+}
+
+type selector bool
+
+func (s selector) Select() (bool, error) {
+	return bool(s), nil
+}
+
+func (s selector) Done() error {
+	return nil
+}
+
+type lister []client.Server
+
+func (l lister) List() []client.Server {
+	return l
+}
+
+func testClient(t *testing.T, c *client.Client) {
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if !selected {
+		t.Fatal("should be selected.")
+	}
+
+	const numParameter = 1000
+	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
+	if err != nil {
+		t.Fatalf("read optimizer proto failed")
+	}
+
+	var wg sync.WaitGroup
+	for i := 0; i < numParameter; i++ {
+		wg.Add(1)
+		go func(i int) {
+			var p pserver.Parameter
+			p.Name = "p_" + strconv.Itoa(i)
+			p.ElementType = pserver.Float32
+			p.Content = make([]byte, (i+1)*100)
+			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(i)
+	}
+	wg.Wait()
+
+	err = c.FinishInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var grads []pserver.Gradient
+	for i := 0; i < numParameter; i++ {
+		var g pserver.Gradient
+		g.Name = "p_" + strconv.Itoa(i)
+		g.ElementType = pserver.Float32
+		g.Content = make([]byte, (i+1)*100)
+		grads = append(grads, g)
+	}
+
+	const paramPerGroup = 10
+	const numGroups = numParameter / paramPerGroup
+
+	// shuffle the order in which gradients are sent
+	for i := range grads {
+		j := rand.Intn(i + 1)
+		grads[i], grads[j] = grads[j], grads[i]
+	}
+
+	for i := 0; i < numGroups; i++ {
+		var gs []pserver.Gradient
+		if i == numGroups-1 {
+			gs = grads[i*paramPerGroup:]
+		} else {
+			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+
+		wg.Add(1)
+		go func(gs []pserver.Gradient) {
+			err := c.SendGrads(gs)
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(gs)
+	}
+
+	names := make([]string, numParameter)
+	for i := 0; i < numParameter; i++ {
+		names[i] = "p_" + strconv.Itoa(i)
+	}
+
+	for i := 0; i < numGroups; i++ {
+		var ns []string
+		if i == numGroups-1 {
+			ns = names[i*paramPerGroup:]
+		} else {
+			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+
+		wg.Add(1)
+		go func(ns []string) {
+			params, err := c.GetParams(ns)
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			if len(ns) != len(params) {
+				t.Fatalf("parameter size does not match, need: %d, have: %d", len(ns), len(params))
+			}
+
+			for i := range params {
+				if ns[i] != params[i].Name {
+					t.Fatalf("order of returned parameters does not match the requested order, requested name: %s, returned name: %s", ns[i], params[i].Name)
+				}
+			}
+			wg.Done()
+		}(ns)
+	}
+
+	wg.Wait()
+}
+
+func TestNativeClient(t *testing.T) {
+	initNativeClient()
+	servers := make([]client.Server, numPserver)
+	for i := 0; i < numPserver; i++ {
+		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
+	}
+	c1 := client.NewClient(lister(servers), len(servers), selector(true))
+	testClient(t, c1)
+}
+
+// EtcdClient is a disabled test, since we have not embedded etcd into
+// our test.
+func EtcdClient(t *testing.T) {
+	initEtcdClient()
+	etcdClient := client.NewEtcd(etcdEndpoints)
+	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
+	testClient(t, c2)
+}
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
new file mode 100644
index 0000000000..16d0c3b943
--- /dev/null
+++ b/go/pserver/client/etcd_client.go
@@ -0,0 +1,266 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package client
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/inconshreveable/log15"
+)
+
+const (
+	defaultEtcdTimeout time.Duration = 5 * time.Second
+
+	initLockPath = "/init_ps/lock"
+	initDonePath = "/init_ps/done"
+	initDoneVal  = "1"
+)
+
+// Etcd is used by the pserver client, which is part of the trainer process.
+// TODO:
+// 1. add watcher to watch the change state of pservers.
+type Etcd struct {
+	client    *clientv3.Client
+	timeout   time.Duration
+	endpoints []string
+	lock      *concurrency.Mutex
+}
+
+// Desired reads the desired number of pservers from etcd.
+func (e *Etcd) Desired() int {
+	var psDesired int
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+		resp, err := e.client.Get(ctx, pserver.PsDesired)
+		cancel()
+		if err != nil {
+			log.Error(
+				"Get ps desired number failed! reconnecting...",
+				log.Ctx{"error": err},
+			)
+			time.Sleep(e.timeout)
+			continue
+		}
+
+		kvs := resp.Kvs
+		if len(kvs) == 0 {
+			log.Info("Waiting for ps desired to be registered ...")
+			time.Sleep(e.timeout)
+			continue
+		}
+
+		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+		if err != nil {
+			log.Error("atoi failed", log.Ctx{"error": err})
+			time.Sleep(e.timeout)
+			continue
+		}
+
+		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
+		break
+	}
+	return psDesired
+}
+
+// List returns the pserver list read from etcd.
+func (e *Etcd) List() []Server {
+	psDesired := e.Desired()
+
+	servers := make([]Server, psDesired)
+	for {
+		for i := 0; i < psDesired; i++ {
+			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+			psKey := pserver.PsPath + strconv.Itoa(i)
+			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
+			resp, err := e.client.Get(ctx, psKey)
+			cancel()
+			if err != nil {
+				log.Info(
+					"Get psKey error",
+					log.Ctx{"ps key": psKey, "error": err},
+				)
+				time.Sleep(e.timeout)
+				continue
+			}
+			kvs := resp.Kvs
+			if len(kvs) == 0 {
+				log.Info("Waiting for ps addr to be registered ...")
+				time.Sleep(e.timeout)
+				continue
+			}
+
+			psAddr := string(resp.Kvs[0].Value)
+			// TODO(Longfei) check the ps address
+			if psAddr == "" {
+				log.Info(
+					"Value under psKey is empty",
+					log.Ctx{"psKey": psKey},
+				)
+				time.Sleep(e.timeout)
+				continue
+			}
+			log.Debug(
+				"got psAddr given psKey",
+				log.Ctx{"psAddr": psAddr, "psKey": psKey},
+			)
+			servers[i].Index = i
+			servers[i].Addr = psAddr
+		}
+		break
+	}
+	return servers
+}
+
+// NewEtcd creates an etcd client that reads the state of pservers from etcd.
+func NewEtcd(endpoints string) *Etcd {
+	ep := strings.Split(endpoints, ",")
+	var cli *clientv3.Client
+	var err error
+	for {
+		cli, err = clientv3.New(clientv3.Config{
+			Endpoints:   ep,
+			DialTimeout: defaultEtcdTimeout,
+		})
+		if err != nil {
+			log.Error("Init etcd connection failed", log.Ctx{"error": err})
+			time.Sleep(defaultEtcdTimeout)
+			continue
+		}
+		break
+	}
+	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
+	client := &Etcd{
+		client:    cli,
+		timeout:   defaultEtcdTimeout,
+		endpoints: ep,
+	}
+	return client
+}
+
+// Select indicates if the current trainer is selected to initialize
+// the pserver parameters.
+func (e *Etcd) Select() (bool, error) {
+	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
+	if err != nil {
+		return false, err
+	}
+
+	lock := concurrency.NewMutex(sess, initLockPath)
+	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
+	// Do not use a timeout context here, since we don't know how
+	// long it takes for other trainers to initialize the
+	// parameters.
+	err = lock.Lock(context.Background())
+	if err != nil {
+		return false, err
+	}
+	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
+
+	get := clientv3.OpGet(initDonePath)
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
+	cancel()
+	if err != nil {
+		return false, err
+	}
+
+	if !tresp.Succeeded {
+		return false, errors.New("no longer the owner of the lock")
+	}
+
+	resp := tresp.Responses[0].GetResponseRange()
+
+	if len(resp.Kvs) == 0 {
+		// Key value not set, select current trainer.
+		e.lock = lock
+		log.Info("Trainer selected.")
+		return true, nil
+	}
+
+	if string(resp.Kvs[0].Value) == initDoneVal {
+		log.Info("Initialization is already done.")
+		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
+		err = lock.Unlock(ctx)
+		cancel()
+		if err != nil {
+			log.Error("error unlocking", log.Ctx{"error": err})
+		}
+		return false, nil
+	}
+
+	return false, fmt.Errorf("key %s has unexpected value: %v", initDonePath, resp.Kvs[0].Value)
+}
+
+// Done indicates the parameter initialization process is done.
+func (e *Etcd) Done() error {
+	if e.lock == nil {
+		return errors.New("lock is nil, Done called unexpectedly")
+	}
+
+	put := clientv3.OpPut(initDonePath, initDoneVal)
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
+	cancel()
+	if err != nil {
+		return err
+	}
+
+	if !tresp.Succeeded {
+		return errors.New("no longer the owner of the lock")
+	}
+
+	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
+	err = e.lock.Unlock(ctx)
+	cancel()
+	if err != nil {
+		log.Error("error unlocking", log.Ctx{"error": err})
+	} else {
+		e.lock = nil
+	}
+
+	return nil
+}
+
+// Close closes the etcd client.
+func (e *Etcd) Close() error {
+	var err error
+	if e.lock != nil {
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+		err = e.lock.Unlock(ctx)
+		cancel()
+		if err == nil {
+			e.lock = nil
+		}
+	}
+
+	cErr := e.client.Close()
+	if cErr != nil {
+		if err != nil {
+			log.Error("error closing etcd client", log.Ctx{"error": cErr})
+			return err
+		}
+		return cErr
+	}
+
+	return err
+}
diff --git a/go/pserver/client/etcd_client_test.go b/go/pserver/client/etcd_client_test.go
new file mode 100644
index 0000000000..08742433e7
--- /dev/null
+++ b/go/pserver/client/etcd_client_test.go
@@ -0,0 +1,106 @@
+package client_test
+
+import (
+	"io/ioutil"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	"github.com/coreos/etcd/embed"
+)
+
+func TestSelector(t *testing.T) {
+	etcdDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	lpurl, _ := url.Parse("http://localhost:0")
+	lcurl, _ := url.Parse("http://localhost:0")
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	<-e.Server.ReadyNotify()
+
+	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
+	endpoint := "127.0.0.1:" + port
+
+	var mu sync.Mutex
+	selectedCount := 0
+	var wg sync.WaitGroup
+	selectAndDone := func(c *client.Etcd) {
+		defer wg.Done()
+
+		selected, err := c.Select()
+		if err != nil {
+			panic(err)
+		}
+
+		if selected {
+			mu.Lock()
+			selectedCount++
+			mu.Unlock()
+			err = c.Done()
+			if err != nil {
+				t.Fatal(err)
+			}
+		}
+	}
+
+	c0 := client.NewEtcd(endpoint)
+	c1 := client.NewEtcd(endpoint)
+	c2 := client.NewEtcd(endpoint)
+	c3 := client.NewEtcd(endpoint)
+	wg.Add(3)
+	go selectAndDone(c0)
+	go selectAndDone(c1)
+	go selectAndDone(c2)
+	wg.Wait()
+
+	// simulate a trainer that crashed and restarted after the
+	// initialization process.
+	wg.Add(1)
+	go selectAndDone(c3)
+	wg.Wait()
+
+	mu.Lock()
+	if selectedCount != 1 {
+		t.Fatal("selected count wrong:", selectedCount)
+	}
+	mu.Unlock()
+
+	err = c0.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c1.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c2.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c3.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}
diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
deleted file mode 100644
index a9a0948a51..0000000000
--- a/go/pserver/client_test.go
+++ /dev/null
@@ -1,123 +0,0 @@
-package pserver_test
-
-import (
-	"net"
-	"net/http"
-	"net/rpc"
-	"strconv"
-	"strings"
-	"testing"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-)
-
-const numPserver = 10
-
-var port [numPserver]int
-
-func init() {
-	for i := 0; i < numPserver; i++ {
-		l, err := net.Listen("tcp", ":0")
-		if err != nil {
-			panic(err)
-		}
-
-		ss := strings.Split(l.Addr().String(), ":")
-		p, err := strconv.Atoi(ss[len(ss)-1])
-		if err != nil {
-			panic(err)
-		}
-		port[i] = p
-
-		go func(l net.Listener) {
-			s := pserver.NewService()
-			server := rpc.NewServer()
-			err := server.Register(s)
-			if err != nil {
-				panic(err)
-			}
-
-			mux := http.NewServeMux()
-			mux.Handle(rpc.DefaultRPCPath, server)
-			err = http.Serve(l, mux)
-			if err != nil {
-				panic(err)
-			}
-		}(l)
-	}
-}
-
-type selector bool
-
-func (s selector) Select() bool {
-	return bool(s)
-}
-
-type lister []pserver.Server
-
-func (l lister) List() []pserver.Server {
-	return l
-}
-
-func TestClientFull(t *testing.T) {
-	servers := make([]pserver.Server, numPserver)
-	for i := 0; i < numPserver; i++ {
-		servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])}
-	}
-	c := pserver.NewClient(lister(servers), len(servers), selector(true))
-	selected := c.BeginInitParams()
-	if !selected {
-		t.Fatal("should be selected.")
-	}
-
-	const numParameter = 100
-	for i := 0; i < numParameter; i++ {
-		var p pserver.Parameter
-		p.Name = "p_" + strconv.Itoa(i)
-		p.ElementType = pserver.Float32
-		p.Content = make([]byte, (i+1)*100)
-		err := c.InitParam(pserver.ParameterWithConfig{Param: p})
-		if err != nil {
-			t.Fatal(err)
-		}
-	}
-
-	err := c.FinishInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var grads []pserver.Gradient
-	for i := 0; i < numParameter/2; i++ {
-		var g pserver.Gradient
-		g.Name = "p_" + strconv.Itoa(i)
-		g.ElementType = pserver.Float32
-		g.Content = make([]byte, (i+1)*100)
-		grads = append(grads, g)
-	}
-
-	err = c.SendGrads(grads)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	names := make([]string, numParameter)
-	for i := 0; i < numParameter; i++ {
-		names[i] = "p_" + strconv.Itoa(i)
-	}
-
-	params, err := c.GetParams(names)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if len(names) != len(params) {
-		t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-	}
-
-	for i := range params {
-		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
-		}
-	}
-}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
new file mode 100644
index 0000000000..08ddb247f2
--- /dev/null
+++ b/go/pserver/etcd_client.go
@@ -0,0 +1,253 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pserver
+
+import (
+	"context"
+	"errors"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/inconshreveable/log15"
+)
+
+const (
+	// PsDesired is the etcd path storing the desired pserver count
+	PsDesired = "/ps_desired"
+	// PsPath is the base dir under which pservers store their addresses
+	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path storing checkpoint information
+	PsCheckpoint = "/checkpoints/"
+
+	retryTimeout = 5 * time.Second
+)
+
+// EtcdClient is the etcd client that the pserver uses for fault
+// tolerance, service registry and coordination.
+type EtcdClient struct {
+	numPservers int
+	endpoints   string
+	client      *clientv3.Client
+	sess        *concurrency.Session
+	dialTimeout time.Duration
+	ttlSec      int
+	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
+	externalIP string
+	// desired number of pservers in the job.
+	// assume desired will not change during one training job.
+	desired int
+}
+
+// NewEtcdClient creates an EtcdClient
+func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient {
+	return &EtcdClient{
+		dialTimeout: dialtimeout,
+		ttlSec:      ttlSec,
+		numPservers: numPservers,
+		endpoints:   endpoints,
+	}
+}
+
+// Register registers the pserver on etcd
+//
+// Register returns the index of the current pserver.
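+//
+// After registration the etcd layout looks like the following (addresses
+// are illustrative, assuming two pservers):
+//
+//	/ps_desired -> "2"
+//	/ps/0       -> "172.19.0.2:8000"
+//	/ps/1       -> "172.19.0.3:8000"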
+func (e *EtcdClient) Register(port int) (int, error) {
+	var err error
+	e.externalIP, err = networkhelper.GetExternalIP()
+	if err != nil {
+		return 0, err
+	}
+
+	// initialize connection to etcd.
+	ep := strings.Split(e.endpoints, ",")
+	for {
+		cli, err := clientv3.New(clientv3.Config{
+			Endpoints:   ep,
+			DialTimeout: e.dialTimeout,
+		})
+		if err != nil {
+			log.Error("connect to etcd error", log.Ctx{"error": err})
+			time.Sleep(retryTimeout)
+			continue
+		}
+		e.client = cli
+		sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
+		if err != nil {
+			log.Error("create etcd session error", log.Ctx{"error": err})
+			time.Sleep(retryTimeout)
+			continue
+		}
+		e.sess = sess
+		log.Debug("connected to etcd", log.Ctx{"endpoint": e.endpoints})
+		break
+	}
+	// init /ps_desired using a transaction, since multiple pservers may try
+	// to write it at the same time.
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		_, err := e.initDesiredPservers(ctx, e.numPservers)
+		cancel()
+		if err != nil {
+			log.Warn("pserver init error", log.Ctx{"error": err, "num pservers": e.numPservers})
+			time.Sleep(retryTimeout)
+			continue
+		}
+		break
+	}
+	// TODO: when adding or removing pservers is implemented, /ps_desired will
+	// change, and we will need to watch the /ps_desired node for events. For
+	// now, just write it once at init and read from it.
+	// wait for and set the initial value of e.desired
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		resp, err := e.client.Get(ctx, PsDesired)
+		cancel()
+		if err != nil {
+			log.Error("get etcd key error", log.Ctx{"key": PsDesired, "error": err})
+			time.Sleep(retryTimeout)
+			continue
+		}
+		if len(resp.Kvs) != 0 {
+			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
+			if err != nil {
+				log.Error(
+					"psDesired atoi error",
+					log.Ctx{"error": err, "value": string(resp.Kvs[0].Value)},
+				)
+				time.Sleep(retryTimeout)
+				// NOTE: wait until the ps_desired value changes
+				continue
+			}
+			break
+		}
+	}
+
+	var pserverIdx int
+	// try to register the pserver node on etcd
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
+		var err error
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
+		cancel()
+		if err != nil {
+			log.Warn("register pserver on etcd error", log.Ctx{"error": err})
+			time.Sleep(retryTimeout)
+			continue
+		}
+		break
+	}
+
+	return pserverIdx, nil
+}
+
+func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
+	return concurrency.NewSTM(e.client, func(c concurrency.STM) error {
+		dsStr := c.Get(PsDesired)
+		if dsStr == "" {
+			c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease()))
+		}
+		return nil
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+}
+
+// registerPserverEtcd registers the pserver node on etcd using a transaction.
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
+	var idx int
+	_, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error {
+		registered := false
+		for i := 0; i < e.desired; i++ {
+			psKey := PsPath + strconv.Itoa(i)
+			ps := c.Get(psKey)
+			log.Debug(
+				"register pserver got value",
+				log.Ctx{"value": ps, "key": psKey},
+			)
+
+			if ps == "" {
+				// find the first free index and write this pserver's address
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
+				log.Debug("register finished", log.Ctx{"key": psKey, "value": pserverAddr})
+				idx = i
+				registered = true
+				break
+			}
+		}
+		if registered {
+			return nil
+		}
+		return errors.New("not registered, may be due to already having enough pservers")
+	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
+
+	if err != nil {
+		return 0, err
+	}
+
+	return idx, nil
+}
+
+// GetKey gets the value by the specified key
+func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	resp, err := e.client.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return []byte{}, err
+	}
+
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return []byte{}, nil
+	}
+	v := kvs[0].Value
+	return v, nil
+}
+
+// PutKey puts a value into etcd under the specified key
+func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	var err error
+	if withLease {
+		_, err = e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
+	} else {
+		_, err = e.client.Put(ctx, key, string(value))
+	}
+	cancel()
+	return err
+}
+
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	var err error
+	if e.sess != nil {
+		err = e.sess.Close()
+	}
+
+	if e.client != nil {
+		newErr := e.client.Close()
+		if newErr != nil {
+			if err != nil {
+				log.Error("shutdown error", log.Ctx{"error": newErr})
+			} else {
+				err = newErr
+			}
+		}
+	}
+	return err
+}
diff --git a/go/pserver/optimizer.c b/go/pserver/optimizer.c
deleted file mode 100644
index b8da3ec959..0000000000
--- a/go/pserver/optimizer.c
+++ /dev/null
@@ -1,52 +0,0 @@
-#include <stdlib.h>
-
-#include "optimizer.h"
-
-typedef int (*update_func)(void*, void*, paddle_element_type, const void*, int);
-typedef void (*release_func)(void*);
-
-typedef struct paddle_optimizer {
-  update_func update;
-  release_func release;
-  void* optimizer;
-} paddle_optimizer;
-
-void paddle_release_optimizer(paddle_optimizer* o) {
-  o->release(o->optimizer);
-  free(o);
-}
-
-int paddle_update_parameter(paddle_optimizer* o,
-                            void* buffer,
-                            paddle_element_type element_type,
-                            const void* gradient,
-                            int num_bytes) {
-  return o->update(o->optimizer, buffer, element_type, gradient, num_bytes);
-}
-
-typedef struct { double learning_rate; } SGD_optimizer;
-
-int update_SGD(void* optimizer,
-               void* buffer,
-               paddle_element_type element_type,
-               const void* gradient,
-               int num_bytes) {
-  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // TODO
-  return 0;
-}
-
-void release_SGD(void* optimizer) {
-  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // nothing allocated on heap
-}
-
-paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate) {
-  SGD_optimizer* impl = (SGD_optimizer*)malloc(sizeof(SGD_optimizer));
-  impl->learning_rate = learning_rate;
-  paddle_optimizer* opt = (paddle_optimizer*)malloc(sizeof(paddle_optimizer));
-  opt->update = update_SGD;
-  opt->release = release_SGD;
-  opt->optimizer = impl;
-  return opt;
-}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index 417f8c5093..6d28cad25a 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -1,42 +1,123 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
-/*
-#include "optimizer.h"
-*/
+// #cgo CFLAGS: -I ../../
+// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #include "paddle/optimizer/optimizer.h"
+// #include <stdlib.h>
+// #include <string.h>
 import "C"
+
 import (
 	"fmt"
 	"unsafe"
-)
-
-type optimizerType int
 
-const (
-	sgd optimizerType = iota
+	log "github.com/inconshreveable/log15"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
-
 type optimizer struct {
-	opt *C.struct_paddle_optimizer
+	opt         *C.struct_paddle_optimizer
+	elementType ElementType
+	contentLen  int
+	config      []byte
+}
+
+func cArrayToSlice(p unsafe.Pointer, len int) []byte {
+	if p == nil {
+		return nil
+	}
+
+	// create a Go slice backed by a C array, reference:
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	//
+	// The Go garbage collector will not manage this data; it needs
+	// to be freed properly.
+	return (*[1 << 30]byte)(p)[:len:len]
 }
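+
+// GetWeights below uses cArrayToSlice to wrap the C-owned weight buffer
+// without copying, while GetStates copies the bytes out and frees the
+// C buffer it owns.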
 
-func newOptimizer(t optimizerType, learning_rate float64) *optimizer {
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
-	o.opt = C.paddle_create_SGD_optimizer(C.double(learning_rate))
+	o.elementType = paramWithConfigs.Param.ElementType
+	o.contentLen = len(paramWithConfigs.Param.Content)
+	p := paramWithConfigs.Param
+	c := paramWithConfigs.Config
+	s := State
+	paramBufferSize := C.size_t(len(p.Content))
+	log.Info("New Optimizer Created with config", log.Ctx{
+		"ElementType": p.ElementType,
+		"ParamSize":   paramBufferSize,
+		"ConfigSize":  len(c),
+		"StateSize":   len(s),
+	})
+	cbuffer := C.malloc(paramBufferSize)
+
+	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
+	var cptr (*C.uchar)
+	if len(c) > 0 {
+		cptr = (*C.uchar)(&c[0])
+	} else {
+		log.Error("empty config", "param name", paramWithConfigs.Param.Name)
+	}
+	o.config = c
+	o.opt = C.paddle_create_optimizer(
+		cptr,
+		C.int(len(c)),
+		C.paddle_element_type(p.ElementType),
+		cbuffer,
+		C.int(paramBufferSize),
+		(*C.char)(cstate),
+		C.int(len(s)),
+	)
 	return o
 }
 
-func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
-	if len(p.Content) != len(g.Content) {
-		return fmt.Errorf("Name: %s, parameter and gradient length not match, parameter: %d, gradient: %d", p.Name, len(p.Content), len(g.Content))
+func (o *optimizer) GetWeights() []byte {
+	var buffer unsafe.Pointer
+	// we do not own the buffer, so there is no need to free it later.
+	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
+	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
+}
+
+func (o *optimizer) GetStates() []byte {
+	var cbuffer *C.char
+	// we own the state buffer and need to free it later.
+	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
+	buf := cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+	cpy := make([]byte, len(buf))
+	copy(cpy, buf)
+	C.free(unsafe.Pointer(cbuffer))
+	return cpy
+}
+
+func (o *optimizer) UpdateParameter(g Gradient) error {
+	if o.elementType != g.ElementType {
+		return fmt.Errorf("Name: %s, parameter and gradient element types do not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
 	}
 
-	if p.ElementType != g.ElementType {
-		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", p.Name, p.ElementType, g.ElementType)
+	if o.contentLen != len(g.Content) {
+		return fmt.Errorf("Name: %s, parameter and gradient do not have the same content length, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
 	}
 
-	r := C.paddle_update_parameter(o.opt, unsafe.Pointer(&p.Content[0]), C.paddle_element_type(p.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
+	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
 	if r != 0 {
 		return fmt.Errorf("optimizer update returned error code: %d", r)
 	}
@@ -44,8 +125,8 @@ func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
 }
 
 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
diff --git a/go/pserver/optimizer.h b/go/pserver/optimizer.h
deleted file mode 100644
index a7e3ff0530..0000000000
--- a/go/pserver/optimizer.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#ifndef PADDLE_PSERVER_OPTIMIZER_H
-#define PADDLE_PSERVER_OPTIMIZER_H
-
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32 = 0,
-  PADDLE_ELEMENT_TYPE_UINT32 = 1,
-  PADDLE_ELEMENT_TYPE_INT64 = 2,
-  PADDLE_ELEMENT_TYPE_UINT64 = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-struct paddle_optimizer;
-struct paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate);
-void paddle_release_optimizer(struct paddle_optimizer* o);
-int paddle_update_parameter(struct paddle_optimizer* o,
-                            void* buffer,
-                            paddle_element_type element_type,
-                            const void* gradient,
-                            int num_bytes);
-
-#endif /* PADDLE_PSERVER_OPTIMIZER_H */
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
index 64d6d092aa..565f56dc28 100644
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -1,8 +1,78 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
-import "testing"
+import (
+	"encoding/binary"
+	"io/ioutil"
+	"math"
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestOptimizerCreateRelease(t *testing.T) {
+	p := Parameter{
+		Name:        "a",
+		ElementType: Int32,
+	}
+	p.Content = []byte{1, 3}
+	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
+	if err != nil {
+		t.Fatalf("read optimizer proto failed")
+	}
+	param := ParameterWithConfig{
+		Param:  p,
+		Config: config,
+	}
+	o := newOptimizer(param, nil)
+	o.Cleanup()
+}
+
+func float32Bytes(float float32) []byte {
+	bits := math.Float32bits(float)
+	bytes := make([]byte, 4)
+	binary.LittleEndian.PutUint32(bytes, bits)
+	return bytes
+}
+
+func TestOptimizerState(t *testing.T) {
+	p := Parameter{
+		Name:        "a",
+		ElementType: Int32,
+	}
+	weights := float32Bytes(100)
+	p.Content = weights
+	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
+	if err != nil {
+		t.Fatalf("read optimizer proto failed")
+	}
+	param := ParameterWithConfig{
+		Param:  p,
+		Config: config,
+	}
+	o := newOptimizer(param, nil)
+	s := o.GetStates()
 
-func TestSGDCreateRelease(t *testing.T) {
-	o := newOptimizer(sgd, 1)
+	// overwrite the param content and check that the state and weights are restored.
+	param.Param.Content = float32Bytes(300)
+	o1 := newOptimizer(param, s)
+	s1 := o1.GetStates()
+	assert.Equal(t, s, s1)
+	assert.Equal(t, weights, o.GetWeights())
+	assert.Equal(t, weights, o1.GetWeights())
 	o.Cleanup()
+	o1.Cleanup()
 }
diff --git a/go/pserver/service.go b/go/pserver/service.go
index d5787b9708..7484ec90b1 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,18 +1,59 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
 import (
+	"bufio"
+	"bytes"
+	"encoding/binary"
+	"encoding/gob"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"hash/crc32"
+	"io/ioutil"
+	"os"
+	"path"
+	"strconv"
+	"strings"
 	"sync"
+	"time"
+
+	"github.com/golang/protobuf/proto"
+	uuid "github.com/satori/go.uuid"
+
+	pb "github.com/PaddlePaddle/Paddle/go/proto"
+
+	log "github.com/inconshreveable/log15"
 )
 
 // ElementType is the type of elements of a Parameter.
 type ElementType int
 
-var ErrAlreadyInitialized = errors.New("pserver already initialized")
-var ErrUninitialized = errors.New("pserver not fully initialized")
+// ErrCheckpointNotFound indicates that the pserver checkpoint could
+// not be found.
+var ErrCheckpointNotFound = errors.New("checkpoint not found in etcd")
 
-// Supported element types
+// RPC error message.
+const (
+	AlreadyInitialized = "pserver already initialized"
+	Uninitialized      = "pserver not fully initialized"
+	WrongChecksum      = "checkpoint file checksum validation failed"
+)
+
+// Supported element types.
 const (
 	Int32 ElementType = iota
 	UInt32
@@ -29,41 +70,173 @@ type Parameter struct {
 	Content     []byte
 }
 
+func float32ToString(b []byte) string {
+	f := make([]float32, len(b)/4)
+	buf := bytes.NewReader(b)
+	err := binary.Read(buf, binary.LittleEndian, &f)
+	if err != nil {
+		return ""
+	}
+	return fmt.Sprintf("%v", f)
+}
+
+func float32ByteToString(c []byte) string {
+	var a []byte
+	var b []byte
+	if len(c) <= 80 {
+		a = c
+	} else {
+		a = c[0:40]
+		b = c[len(c)-40:]
+	}
+
+	s := float32ToString(a)
+
+	if b == nil {
+		return s
+	}
+
+	s = strings.Replace(s, "]", "", -1) + "..." + strings.Replace(float32ToString(b), "[", "", -1)
+	return s
+}
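+
+// For example, a Float32 content longer than 80 bytes prints only its
+// first ten and last ten values, joined by "...".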
+
+func (p Parameter) String() string {
+	if p.ElementType != Float32 {
+		return fmt.Sprintf("name:%v ElementType:%v",
+			p.Name, p.ElementType)
+	}
+
+	return float32ByteToString(p.Content)
+}
+
 // ParameterWithConfig contains the parameter and the configuration.
 type ParameterWithConfig struct {
 	Param  Parameter
 	Config []byte // parameter configuration in Proto Buffer format
 }
 
+// checkpointMeta saves checkpoint metadata
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	Path      string `json:"path"`
+	CRC32     uint32 `json:"crc32"`
+	Timestamp int64  `json:"timestamp"`
+}
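+
+// A checkpointMeta value stored under PsCheckpoint+idx serializes to JSON
+// of the form (field values are illustrative):
+//
+//	{"uuid": "...", "path": "...", "crc32": 3405643706, "timestamp": 1498546458}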
+
+// Checkpoint is the pserver shard state persisted to file.
+type Checkpoint []parameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
 
 // Service is the RPC service for pserver.
 type Service struct {
-	initialized chan struct{}
+	initialized        chan struct{}
+	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             KVStore
+
+	mu     sync.Mutex
+	optMap map[string]*optimizer
+}
+
+// parameterCheckpoint saves parameter checkpoint.
+type parameterCheckpoint struct {
+	ParameterWithConfig
+	State []byte
+}
+
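+// KVStore is the key-value store interface (implemented by EtcdClient)
+// used to load and save checkpoint metadata.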
+type KVStore interface {
+	GetKey(key string, timeout time.Duration) ([]byte, error)
+	PutKey(key string, value []byte, timeout time.Duration, withLease bool) error
+}
+
+func loadMeta(e KVStore, idx int) (meta checkpointMeta, err error) {
+	v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
+	if err != nil {
+		return
+	}
+
+	if len(v) == 0 {
+		err = ErrCheckpointNotFound
+		return
+	}
+
+	if err = json.Unmarshal(v, &meta); err != nil {
+		return
+	}
+
+	return
+}
 
-	mu       sync.Mutex
-	opt      *optimizer
-	paramMap map[string]Parameter
+// LoadCheckpoint loads checkpoint from file.
+func LoadCheckpoint(e KVStore, idx int) (Checkpoint, error) {
+	log.Info("Loading checkpoint", "pserver index", idx)
+	defer traceTime(time.Now(), "load checkpoint")
+
+	cpMeta, err := loadMeta(e, idx)
+	if err != nil {
+		return nil, err
+	}
+
+	content, err := ioutil.ReadFile(cpMeta.Path)
+	if err != nil {
+		return nil, err
+	}
+
+	crc32 := crc32.ChecksumIEEE(content)
+	if crc32 != cpMeta.CRC32 {
+		return nil, errors.New(WrongChecksum)
+	}
+
+	dec := gob.NewDecoder(bytes.NewReader(content))
+	var cp Checkpoint
+	if err = dec.Decode(&cp); err != nil {
+		return nil, err
+	}
+
+	return cp, nil
 }
 
-// NewService creates a new service.
-func NewService() *Service {
-	s := &Service{opt: newOptimizer(sgd, 0.01)}
-	s.paramMap = make(map[string]Parameter)
+// NewService creates a new service. It bypasses etcd registration if no
+// endpoints are specified, and recovers state from cp if a checkpoint is given.
+func NewService(idx int, interval time.Duration, path string, client KVStore, cp Checkpoint) (*Service, error) {
+	s := &Service{
+		idx:                idx,
+		checkpointInterval: interval,
+		checkpointPath:     path,
+		client:             client,
+	}
+	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
-	return s
+
+	if cp != nil {
+		for _, item := range cp {
+			p := ParameterWithConfig{
+				Param:  item.Param,
+				Config: item.Config,
+			}
+			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
+		}
+		close(s.initialized)
+	}
+	return s, nil
 }
 
 // InitParam initializes a parameter.
-func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
+func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		log.Warn("init param called but parameters already initialized.")
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
-	// TODO(helin): parse parameter config
+	c := &pb.OptimizerConfig{}
+	if err := proto.Unmarshal(paramWithConfigs.Config, c); err != nil {
+		log.Error("optimizer config unmarshal error", log.Ctx{"error": err})
+	}
+	log.Debug(fmt.Sprintf("OptimizerConfig:%v", c))
 
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -71,41 +244,67 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.paramMap[paramWithConfigs.Param.Name] = paramWithConfigs.Param
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
+	log.Info(
+		"init parameter",
+		"name", paramWithConfigs.Param.Name,
+		"config len", len(paramWithConfigs.Config),
+		"param len", len(paramWithConfigs.Param.Content),
+		"type", paramWithConfigs.Param.ElementType,
+	)
 	return nil
 }
 
 // FinishInitParams tells the parameter server that the parameter
 // initialization has finished.
-func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
+func (s *Service) FinishInitParams(_ int, _ *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		log.Warn("FinishInitParams called but parameters are already initialized.")
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
 	close(s.initialized)
+	go func() {
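+		// Periodically snapshot all parameters for the lifetime of the
+		// process; time.Tick never stops, which is acceptable for a
+		// goroutine that runs until the pserver exits.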
+		t := time.Tick(s.checkpointInterval)
+		for range t {
+			err := s.checkpoint()
+			if err != nil {
+				log.Error("checkpoint error", log.Ctx{"error": err})
+			}
+		}
+	}()
+
+	log.Info("parameter initialization finished.")
 	return nil
 }
 
 // SendGrad sends gradient to parameter servers for parameter
 // optimization.
-func (s *Service) SendGrad(g Gradient, dummy *int) error {
+func (s *Service) SendGrad(g Gradient, _ *int) error {
 	select {
 	case <-s.initialized:
 	default:
-		return ErrUninitialized
+		log.Warn("received gradient before initialization.",
+			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
+		return errors.New(Uninitialized)
 	}
 
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	p, ok := s.paramMap[g.Name]
+	o, ok := s.optMap[g.Name]
 	if !ok {
+		log.Warn("received gradient for unknown parameter.",
+			"name", g.Name, "size", len(g.Content), "type", g.ElementType)
 		return fmt.Errorf("parameter: %s does not exist", g.Name)
 	}
 
-	return s.opt.UpdateParameter(p, g)
+	log.Debug(Parameter(g).String())
+	log.Info("received gradient from trainer, updating parameter.",
+		"name", g.Name, "size", len(g.Content), "type", g.ElementType)
+	return o.UpdateParameter(g)
 }
 
 // GetParam gets parameters from the parameter server.
@@ -114,8 +313,9 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	p, ok := s.paramMap[name]
+	opt, ok := s.optMap[name]
 	if !ok {
+		log.Warn("trainer wants to get a parameter that does not exist.", "name", name)
 		return fmt.Errorf("parameter: %s does not exist", name)
 	}
 
@@ -125,15 +325,126 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
-	*parameter = p
+	// parameter content.
+	parameter.Name = name
+	parameter.ElementType = opt.elementType
+	parameter.Content = opt.GetWeights()
+	log.Debug(parameter.String())
+	log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
 	return nil
 }
 
-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
-	<-s.initialized
+func traceTime(start time.Time, name string) {
+	elapsed := time.Since(start)
+	log.Info("time elapsed", log.Ctx{"name": name, "elapsed": elapsed})
+}
 
-	// TODO
-	return nil
+// checkpoint saves checkpoint to disk.
+//
+// checkpoint should only be called after the parameters are
+// initialized.
+func (s *Service) checkpoint() (err error) {
+	log.Info("Begin saving checkpoint.")
+	defer traceTime(time.Now(), "save checkpoint")
+
+	s.mu.Lock()
+	cp := make([]parameterCheckpoint, len(s.optMap))
+	index := 0
+	// TODO(helin): write checkpoint incrementally to reduce memory
+	// footprint during checkpoint.
+	for name, opt := range s.optMap {
+		var pc parameterCheckpoint
+		pc.Param.Name = name
+		pc.Param.ElementType = opt.elementType
+		pc.Param.Content = opt.GetWeights()
+		pc.Config = opt.config
+		pc.State = opt.GetStates()
+		cp[index] = pc
+		index++
+	}
+	s.mu.Unlock()
+
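+	// Serialize the snapshot with gob; LoadCheckpoint decodes the same
+	// stream, so the two must stay in sync.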
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err = encoder.Encode(cp)
+	if err != nil {
+		return
+	}
+
+	if _, err = os.Stat(s.checkpointPath); os.IsNotExist(err) {
+		err = os.MkdirAll(s.checkpointPath, os.ModePerm)
+		if err != nil {
+			return
+		}
+	}
+
+	id := uuid.NewV4().String()
+	p := path.Join(s.checkpointPath, id)
+	f, err := os.Create(p)
+	if err != nil {
+		return
+	}
+
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Error("error closing checkpoint file", log.Ctx{"error": closeErr})
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
+			}
+		}
+	}()
+
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	if err != nil {
+		return
+	}
+
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+
+	oldMeta, err := loadMeta(s.client, s.idx)
+	if err == ErrCheckpointNotFound {
+		log.Info("old meta not found, skip removing old meta")
+		err = nil
+	} else if err == nil {
+		log.Info("removing old meta")
+		if oldMeta.Path != "" {
+			rmErr := os.Remove(oldMeta.Path)
+			if rmErr != nil {
+				// log error, but still treat checkpoint as
+				// successful.
+				log.Error("remove old meta file error", log.Ctx{"error": rmErr})
+			}
+		}
+	}
+
+	if err != nil {
+		return
+	}
+
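+	// Publish the new checkpoint metadata; the CRC32 recorded here is what
+	// LoadCheckpoint uses to detect a truncated or corrupted data file.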
+	checksum := crc32.ChecksumIEEE(buf.Bytes())
+	cpMeta := checkpointMeta{
+		UUID:      id,
+		Timestamp: time.Now().UnixNano(),
+		CRC32:     checksum,
+		Path:      p,
+	}
+
+	metaBytes, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
+
+	err = s.client.PutKey(PsCheckpoint+strconv.Itoa(s.idx), metaBytes, 3*time.Second, false)
+	if err != nil {
+		return
+	}
+
+	return
 }
diff --git a/go/pserver/service_internal_test.go b/go/pserver/service_internal_test.go
new file mode 100644
index 0000000000..36eca5112b
--- /dev/null
+++ b/go/pserver/service_internal_test.go
@@ -0,0 +1,86 @@
+package pserver
+
+import (
+	"bytes"
+	"encoding/binary"
+	"fmt"
+	"testing"
+	"time"
+
+	"github.com/stretchr/testify/assert"
+)
+
+const testDir = "./test_data"
+
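+// myKV is an in-memory KVStore stub standing in for etcd in these tests.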
+type myKV struct {
+	m map[string][]byte
+}
+
+func (m *myKV) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	if m.m == nil {
+		m.m = make(map[string][]byte)
+	}
+	return m.m[key], nil
+}
+
+func (m *myKV) PutKey(key string, value []byte, timeout time.Duration, withLease bool) error {
+	if m.m == nil {
+		m.m = make(map[string][]byte)
+	}
+	m.m[key] = value
+	return nil
+}
+
+func TestCheckpoint(t *testing.T) {
+	kv := &myKV{}
+	s, err := NewService(0, time.Hour, testDir, kv, nil)
+	assert.Nil(t, err)
+	err = s.checkpoint()
+	assert.Nil(t, err)
+	_, err = LoadCheckpoint(kv, 0)
+	assert.Nil(t, err)
+}
+
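+// float32ToByte returns the little-endian IEEE-754 encoding of f; the tests
+// use it to build raw Parameter.Content byte slices.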
+func float32ToByte(f float32) []byte {
+	var buf bytes.Buffer
+	err := binary.Write(&buf, binary.LittleEndian, f)
+	if err != nil {
+		fmt.Println("binary.Write failed:", err)
+	}
+	return buf.Bytes()
+}
+
+func TestCheckpointWithData(t *testing.T) {
+	kv := &myKV{}
+	s, err := NewService(0, time.Hour, testDir, kv, nil)
+	assert.Nil(t, err)
+
+	var content []byte
+	for i := 0; i < 50000; i++ {
+		content = append(content, float32ToByte(float32(i))...)
+	}
+
+	p1 := Parameter{Name: "p1", ElementType: 1, Content: content}
+	err = s.InitParam(ParameterWithConfig{Param: p1}, nil)
+	assert.Nil(t, err)
+
+	err = s.FinishInitParams(0, nil)
+	assert.Nil(t, err)
+
+	var p2 Parameter
+	err = s.GetParam(p1.Name, &p2)
+	assert.Nil(t, err)
+	assert.Equal(t, p1, p2)
+
+	err = s.checkpoint()
+	assert.Nil(t, err)
+	cp, err := LoadCheckpoint(kv, 0)
+	assert.Nil(t, err)
+	s1, err := NewService(0, time.Hour, testDir, kv, cp)
+	assert.Nil(t, err)
+
+	var p3 Parameter
+	err = s1.GetParam(p1.Name, &p3)
+	assert.Nil(t, err)
+	assert.Equal(t, p1, p3)
+}
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index 4c9fac4536..58a743e1fa 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -1,6 +1,22 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver_test
 
 import (
+	"fmt"
+	"io/ioutil"
 	"reflect"
 	"sync"
 	"testing"
@@ -9,57 +25,70 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 )
 
-func TestFull(t *testing.T) {
-	s := pserver.NewService()
+const (
+	OptimizerConfig = "./client/c/test/testdata/optimizer.pb"
+)
+
+func TestServiceFull(t *testing.T) {
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
+	if err != nil {
+		t.Fatal(err)
+	}
 	var p pserver.Parameter
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	config, err := ioutil.ReadFile(OptimizerConfig)
 	if err != nil {
-		t.FailNow()
+		t.Fatalf("read optimizer proto failed")
+	}
+
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
+	if err != nil {
+		t.Fatal(err)
 	}
 
 	var p1 pserver.Parameter
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param pserver.Parameter
 	err = s.GetParam("param_b", &param)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	if !reflect.DeepEqual(param, p1) {
-		t.FailNow()
+		t.Fatal("not equal:", param, p1)
 	}
 
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-	err = s.SendGrad(g1, &dummy)
+
+	err = s.SendGrad(g1, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
-	err = s.SendGrad(g2, &dummy)
+	err = s.SendGrad(g2, nil)
 
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param1 pserver.Parameter
 	err = s.GetParam("param_a", &param1)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	// don't compare content, since it's already changed by
@@ -68,54 +97,51 @@ func TestFull(t *testing.T) {
 	p.Content = nil
 
 	if !reflect.DeepEqual(param1, p) {
-		t.FailNow()
+		t.Fatal("not equal:", param1, p)
 	}
 }
 
 func TestMultipleInit(t *testing.T) {
-	s := pserver.NewService()
-	var dummy int
-	err := s.FinishInitParams(0, &dummy)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
+	}
+	err = s.FinishInitParams(0, nil)
+	if err != nil {
+		t.Fatal(err)
 	}
 
-	err = s.FinishInitParams(0, &dummy)
-	if err != pserver.ErrAlreadyInitialized {
-		t.FailNow()
+	err = s.FinishInitParams(0, nil)
+	if err.Error() != pserver.AlreadyInitialized {
+		t.Fatal(err)
 	}
 }
 
 func TestUninitialized(t *testing.T) {
-	s := pserver.NewService()
-	var dummy int
-	err := s.SendGrad(pserver.Gradient{}, &dummy)
-	if err != pserver.ErrUninitialized {
-		t.FailNow()
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = s.SendGrad(pserver.Gradient{}, nil)
+	if err.Error() != pserver.Uninitialized {
+		t.Fatal(err)
 	}
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s := pserver.NewService()
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, time.Hour, "", nil, cp)
+	if err != nil {
+		t.Fatal(err)
+	}
 	ch := make(chan struct{}, 2)
+	errCh := make(chan error, 2)
 	var wg sync.WaitGroup
 	wg.Add(1)
 	go func() {
 		var param pserver.Parameter
 		err := s.GetParam("param_a", &param)
 		if err != nil {
-			t.FailNow()
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
-	wg.Add(1)
-	go func() {
-		var dummy int
-		err := s.Save("", &dummy)
-		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -127,6 +153,8 @@ func TestBlockUntilInitialized(t *testing.T) {
 	case <-ch:
 		// some function returned before initialization is completed.
 		t.FailNow()
+	case <-errCh:
+		t.FailNow()
 	default:
 	}
 
@@ -134,16 +162,50 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	config, err := ioutil.ReadFile(OptimizerConfig)
 	if err != nil {
-		t.FailNow()
+		t.Fatalf("read optimizer proto failed")
 	}
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 
-	err = s.FinishInitParams(0, &dummy)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
+	}
+
+	err = s.FinishInitParams(0, nil)
+	if err != nil {
+		t.Fatal(err)
 	}
 
 	wg.Wait()
 }
+
+func TestGradientString(t *testing.T) {
+	g := pserver.Parameter{}
+	g.ElementType = pserver.Float32
+	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40, 0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
+	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
+		t.Fatal("unexpected float string:", g.String())
+	}
+
+	g.Content = []byte{0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40,
+		0x18, 0x2d, 0x44, 0x54, 0xfb, 0x21, 0x09, 0x40}
+	if g.String() != "[3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699...3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699 3.3702806e+12 2.142699]" {
+		t.Fatal("unexpected float string:", g.String())
+	}
+	fmt.Println(g)
+}
diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt
new file mode 100644
index 0000000000..9233264ff3
--- /dev/null
+++ b/go/utils/networkhelper/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(network_helper_test)
+endif()
diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go
new file mode 100644
index 0000000000..c3fc747bda
--- /dev/null
+++ b/go/utils/networkhelper/helper.go
@@ -0,0 +1,59 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package networkhelper
+
+import (
+	"errors"
+	"net"
+)
+
+// GetExternalIP returns the IPv4 address of a non-loopback local network
+// interface.
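+//
+// One plausible use (an assumption, not shown in this package): a pserver
+// advertises GetExternalIP() plus its RPC port to etcd so trainers can
+// reach it.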
+func GetExternalIP() (string, error) {
+	ifaces, err := net.Interfaces()
+	if err != nil {
+		return "", err
+	}
+	for _, iface := range ifaces {
+		if iface.Flags&net.FlagUp == 0 {
+			continue // interface down
+		}
+		if iface.Flags&net.FlagLoopback != 0 {
+			continue // loopback interface
+		}
+		addrs, err := iface.Addrs()
+		if err != nil {
+			return "", err
+		}
+		for _, addr := range addrs {
+			var ip net.IP
+			switch v := addr.(type) {
+			case *net.IPNet:
+				ip = v.IP
+			case *net.IPAddr:
+				ip = v.IP
+			}
+			if ip == nil || ip.IsLoopback() {
+				continue
+			}
+			ip = ip.To4()
+			if ip == nil {
+				continue // not an ipv4 address
+			}
+			return ip.String(), nil
+		}
+	}
+	return "", errors.New("are you connected to the network?")
+}
diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go
new file mode 100644
index 0000000000..0bc02ad42a
--- /dev/null
+++ b/go/utils/networkhelper/helper_test.go
@@ -0,0 +1,24 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package networkhelper
+
+import "testing"
+
+func TestGetIP(t *testing.T) {
+	_, err := GetExternalIP()
+	if err != nil {
+		t.Errorf("GetExternalIP returns error : %v\n", err)
+	}
+}
diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh
index fa7baccc86..8fd58925ee 100755
--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -21,22 +21,15 @@
 # 
 # It same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
 #
-
-if ! python -c "import paddle" >/dev/null 2>/dev/null; then
-  PYPATH=""
-  set -x
-  while getopts "d:" opt; do
-    case $opt in
-      d)
-        PYPATH=$OPTARG
-        ;;
-    esac
-  done
-  shift $(($OPTIND - 1))
-  export PYTHONPATH=$PYPATH:$PYTHONPATH
-  $@
-else
-  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
-  echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
-  exit 1
-fi
+PYPATH=""
+set -x
+while getopts "d:" opt; do
+  case $opt in
+    d)
+      PYPATH=$OPTARG
+      ;;
+  esac
+done
+shift $(($OPTIND - 1))
+export PYTHONPATH=$PYPATH:$PYTHONPATH
+"$@"
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9898dc083e..3f9c132ef6 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,33 +1,33 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
-add_subdirectory(testing)
 add_subdirectory(math)
-add_subdirectory(parameter)
 add_subdirectory(gserver)
-add_subdirectory(pserver)
-add_subdirectory(trainer)
-add_subdirectory(scripts)
-
-# Do not build go directory until go cmake is working smoothly.
-# if(CMAKE_Go_COMPILER)
-#   add_subdirectory(go)
-# endif()
-
-find_package(Boost QUIET)
+add_subdirectory(parameter)
+add_subdirectory(testing)
 
-if(Boost_FOUND)
-  include_directories(${Boost_INCLUDE_DIRS})
-  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-  add_subdirectory(majel)
-endif()
+if(MOBILE_INFERENCE)
+  add_subdirectory(capi)
+else()
+  add_subdirectory(pserver)
+  add_subdirectory(trainer)
+  add_subdirectory(string)
+  add_subdirectory(scripts)
 
-if(WITH_C_API)
+  if(WITH_C_API)
     add_subdirectory(capi)
-endif()
+  endif()
+
+  if(NOT ANDROID AND NOT IOS)
+    add_subdirectory(memory)
+    add_subdirectory(platform)
+    add_subdirectory(framework)
+    add_subdirectory(operators)
+    add_subdirectory(pybind)
+    add_subdirectory(inference)
+  endif()
 
-if(WITH_SWIG_PY)
-  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-          ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
-  add_subdirectory(api)
+  if(WITH_SWIG_PY)
+    add_subdirectory(api)
+  endif()
 endif()
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index e147659566..cf84568ecd 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,17 +16,27 @@ set(API_HEADER
     Internal.h)
 
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp)
+add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
 
 INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
+INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
 
-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
 
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
+SET(SWIG_NEED_FLAGS
+    -ftls-model=global-dynamic
+    -Wno-parentheses-equality
+    -Wno-self-assign
+    -Wno-maybe-uninitialized
+    -Wno-missing-field-initializers)
+FOREACH(flag ${SWIG_NEED_FLAGS})
+  safe_set_cxxflag(SWIG_CXX_FLAGS ${flag})
+ENDFOREACH()
+
 SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}")
 
 SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_parameter
@@ -41,10 +51,11 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_network
     paddle_proto
     ${external_project_dependencies}
+    ${RDMA_LIBS}
 )
 
 IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
 ELSE(APPLE)
     SET(START_GROUP "-Xlinker -start-group")
     SET(END_GROUP "-Xlinker -end-group")
@@ -65,6 +76,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
     paddle_trainer_lib
     paddle_network
     paddle_parameter
+    paddle_optimizer
     paddle_math
     paddle_utils
     paddle_proto
@@ -73,25 +85,20 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${CMAKE_DL_LIBS}
     ${EXTERNAL_LIBS}
     ${CMAKE_THREAD_LIBS_INIT}
+    ${RDMA_LD_FLAGS}
     ${START_END}
 )
 
-add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
-    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
-    COMMAND rm -rf py_paddle.egg-info build
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
     DEPENDS _swig_paddle
 )
 
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
-
-install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/
-    DESTINATION opt/paddle/share/wheels
-)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
 
 if(WITH_TESTING)
     IF(NOT PY_PIP_FOUND)
@@ -105,7 +112,7 @@ if(WITH_TESTING)
             BUILD_COMMAND       ""
             INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
             BUILD_IN_SOURCE     1
-            DEPENDS python setuptools python_api_wheel
+            #DEPENDS python setuptools python_api_wheel
         )
     ENDIF()
     add_subdirectory(test)
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 2f45173bfd..b6ff6ec789 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {
 
 ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
 
-ParameterConfig::~ParameterConfig() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterConfig::~ParameterConfig() { delete m; }
 
 ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
     void* ptr) {
@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
 
 OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
 
-OptimizationConfig::~OptimizationConfig() {
-  if (m) {
-    delete m;
-  }
-}
+OptimizationConfig::~OptimizationConfig() { delete m; }
 
 std::string OptimizationConfig::toProtoString() {
   return m->getConfig().SerializeAsString();
diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp
index 681e3a3809..fcda6eaf03 100644
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
@@ -37,7 +37,7 @@ std::vector<std::string> Evaluator::getNames() const {
 double Evaluator::getValue(const std::string name) const {
   paddle::Error err;
   double v = m->rawPtr->getValue(name, &err);
-  if (err) {
+  if (!err.isOK()) {
     throw std::runtime_error(err.msg());
   }
   return v;
diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i
index 068ba286c0..3237e73745 100644
--- a/paddle/api/Paddle.i
+++ b/paddle/api/Paddle.i
@@ -179,6 +179,7 @@ namespace std {
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
 %newobject ParameterUpdater::createRemoteUpdater;
+%newobject ParameterUpdater::createNewRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index da0f157abd..0b9b83d429 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -841,6 +841,10 @@ public:
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                                int passCount,
                                                bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config,
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
   ~ParameterUpdater();
 
   /**
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index 21b851dd5e..120eea3f70 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {
 
 ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
 
-ParameterOptimizer::~ParameterOptimizer() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterOptimizer::~ParameterOptimizer() { delete m; }
 
 ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
   CHECK(config != nullptr);
@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
 ParameterTraverseCallback::ParameterTraverseCallback()
     : m(new ParameterTraverseCallbackPrivate()) {}
 
-ParameterTraverseCallback::~ParameterTraverseCallback() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
 
 void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
                                       const ParameterConfig& conf,
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 79921ea6e7..8cd73b348c 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,6 +15,9 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "PaddleAPIPrivate.h"
+#ifndef PADDLE_WITHOUT_GOLANG
+#include "paddle/trainer/NewRemoteParameterUpdater.h"
+#endif
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
 
@@ -28,6 +31,20 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
   return updater;
 }
 
+ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
+    OptimizationConfig *config,
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
+#ifndef PADDLE_WITHOUT_GOLANG
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
+      config->m->getConfig(), pserverSpec, useEtcd));
+  return updater;
+#else
+  throw UnsupportError("not compiled with WITH_GOLANG");
+#endif
+}
+
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
     OptimizationConfig *config, int passCount, bool useSparseUpdater) {
   auto updater = new ParameterUpdater();
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index d369df5d4e..11bd05c09d 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index db8f005929..500bc448c9 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -171,11 +171,7 @@ struct VectorPrivate {
 
 Vector::Vector() : m(new VectorPrivate()) {}
 
-Vector::~Vector() {
-  if (m) {
-    delete m;
-  }
-}
+Vector::~Vector() { delete m; }
 
 Vector* Vector::createZero(size_t sz, bool useGpu) {
   auto retVec = new Vector();
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index f3b1c2c4d4..761aeb5b17 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,6 @@
-add_python_test(test_swig_api
-    testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py)
+py_test(testTrain SRCS testTrain.py)
+py_test(testMatrix SRCS testMatrix.py)
+py_test(testVector SRCS testVector.py)
+py_test(testTrainer SRCS testTrainer.py)
+py_test(testArguments SRCS testArguments.py)
+py_test(testGradientMachine SRCS testGradientMachine.py)
diff --git a/paddle/api/test/testTrainConfig.py b/paddle/api/test/testTrainConfig.py
index 77e0cd37d5..1a1283e116 100644
--- a/paddle/api/test/testTrainConfig.py
+++ b/paddle/api/test/testTrainConfig.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=100, learning_method=AdamOptimizer())
diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp
index 8b81ec69e6..1ec403077e 100644
--- a/paddle/capi/Arguments.cpp
+++ b/paddle/capi/Arguments.cpp
@@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args,
   return kPD_NO_ERROR;
 }
 
+paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                              uint64_t ID,
+                                              uint64_t frameHeight,
+                                              uint64_t frameWidth) {
+  if (args == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].setFrameHeight(frameHeight);
+  a->args[ID].setFrameWidth(frameWidth);
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
                                                      uint64_t ID,
                                                      uint32_t nestedLevel,
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index 206f512563..ebb083c5a4 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -4,6 +4,16 @@ else ()
   set(PADDLE_FLOAT_TYPE float)
 endif()
 
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
+  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT PADDLE_GIT_COMMIT)
+  set(PADDLE_GIT_COMMIT "no commit information")
+endif()
+
 # config.h used for C-API. It will store Paddle building configuration as a
 # header. Make user just include PaddleCAPI.h then can get building
 # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
@@ -26,47 +36,79 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
 add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
   ${CAPI_PRIVATE_HEADER})
 
-add_dependencies(paddle_capi gen_proto_cpp)
+add_dependencies(paddle_capi paddle_proto)
 
+# TODO: paddle_capi_whole will be removed.
+set(PADDLE_CAPI_LAYERS_LIBS
+    paddle_function
+    paddle_gserver)
+if(MOBILE_INFERENCE)
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto)
+else()
+  set(PADDLE_CAPI_ENGINE_LIBS
+      paddle_utils
+      paddle_parameter
+      paddle_math
+      paddle_cuda
+      paddle_proto
+      paddle_pserver
+      paddle_network)
+endif()
+set(PADDLE_CAPI_INFER_LIBS ${PADDLE_CAPI_LAYERS_LIBS} ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
-# combine all paddle static libraries together, into libpaddle_capi_whole.a
-# user should use PaddleCAPI as -lpaddle_capi_whole
-set(capi_whole_library libpaddle_capi_whole.a)
-add_custom_target(paddle_capi_whole ALL
-        COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
-        COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
-        COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
-        COMMAND mkdir -p o_files/math && cd o_files/math/  && ar -x $<TARGET_FILE:paddle_math>
-        COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
-        COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
-        COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
-        COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
-        COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
-        COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
-        COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
-        COMMAND rm -rf o_files
-        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-        DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
-                paddle_cuda paddle_function paddle_gserver
-                paddle_proto paddle_pserver paddle_network
-        )
-set_target_properties(paddle_capi_whole
-  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
+# Link the static library for inference
+cc_library(paddle_capi_engine DEPS paddle_capi ${PADDLE_CAPI_ENGINE_LIBS})
+cc_library(paddle_capi_layers DEPS ${PADDLE_CAPI_LAYERS_LIBS})
 
-add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
-target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
-link_paddle_exe(paddle_capi_shared)
+# Link the shared library for inference
+if(NOT IOS)
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
+  add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+  set_target_properties(paddle_capi_shared PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
+  target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+  link_paddle_exe(paddle_capi_shared)
+endif()
 
 # install library & headers.
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES paddle_capi.map DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
 if(ANDROID)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library}
-          DESTINATION lib/${ANDROID_ABI})
-  install(TARGETS paddle_capi_shared DESTINATION lib/${ANDROID_ABI})
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
+          ARCHIVE DESTINATION lib/${ANDROID_ABI}
+          LIBRARY DESTINATION lib/${ANDROID_ABI})
+  execute_process(
+    COMMAND ${GIT_EXECUTABLE} log --pretty=oneline -1
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+    OUTPUT_VARIABLE GIT_COMMITS_LIST
+    RESULT_VARIABLE GIT_COMMITS_LIST_RESULT
+    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+  if(${GIT_COMMITS_LIST_RESULT})
+    set(GIT_COMMITS_LIST "No commits.")
+  endif()
+  install(CODE "FILE(WRITE ${CMAKE_INSTALL_PREFIX}/lib/${ANDROID_ABI}/BUILD.txt
+          \"Compiler:\n\"
+          \"\\t${CMAKE_C_COMPILER}\\n\"
+          \"\\t${CMAKE_CXX_COMPILER}\\n\"
+          \"Compiler Flags:\\n\"
+          \"\\t${CMAKE_F_FLAGS}\\n\"
+          \"\\t${CMAKE_CXX_FLAGS}\\n\"
+          \"Android API: ${CMAKE_SYSTEM_VERSION}\\n\"
+          \"Lastest commit:\\n\"
+          \"\\t${GIT_COMMITS_LIST}\\n\"
+      )"
+  )
 else(ANDROID)
-  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
-  install(TARGETS paddle_capi_shared DESTINATION lib)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
+  if(NOT IOS)
+    install(TARGETS paddle_capi_shared DESTINATION lib)
+  endif()
 endif(ANDROID)
 
 # this variable used for unittest
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
index 78c43949df..c038789340 100644
--- a/paddle/capi/Main.cpp
+++ b/paddle/capi/Main.cpp
@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
 
 extern "C" {
 paddle_error paddle_init(int argc, char** argv) {
+  static bool isInit = false;
+  if (isInit) return kPD_NO_ERROR;
+
   std::vector<char*> realArgv;
   realArgv.reserve(argc + 1);
   realArgv.push_back(strdup(""));
@@ -37,6 +40,14 @@ paddle_error paddle_init(int argc, char** argv) {
   }
   initPaddle(argc + 1, realArgv.data());
   free(realArgv[0]);
+  isInit = true;
+  return kPD_NO_ERROR;
+}
+
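+// paddle_init_thread initializes the GPU runtime for the calling thread; it
+// is a no-op when running with --use_gpu=False.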
+paddle_error paddle_init_thread() {
+  if (FLAGS_use_gpu) {
+    hl_init(FLAGS_gpu_id);
+  }
   return kPD_NO_ERROR;
 }
 }
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d898ebe261..cbacd1fb71 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -40,13 +40,13 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) {
 paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real* rowArray) {
-  if (mat == nullptr) return kPD_NULLPTR;
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (ptr->mat == nullptr) return kPD_NULLPTR;
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
@@ -54,6 +54,46 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   return kPD_NO_ERROR;
 }
 
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                            paddle_real* value) {
+  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(buf, value, sizeof(paddle::real) * width * height);
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(value, value + width * height, buf);
+  }
+  return kPD_NO_ERROR;
+}
+
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                            paddle_real* result) {
+  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  paddle::real* buf = ptr->mat->getRowBuf(0);
+  size_t width = ptr->mat->getWidth();
+  size_t height = ptr->mat->getHeight();
+  if (ptr->mat->useGpu()) {
+#ifdef PADDLE_WITH_CUDA
+    hl_memcpy(result, buf, width * height * sizeof(paddle::real));
+#else
+    return kPD_NOT_SUPPORTED;
+#endif
+  } else {
+    std::copy(buf, buf + width * height, result);
+  }
+  return kPD_NO_ERROR;
+}
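+
+// Illustrative round trip (src and dst are assumed to be caller-allocated
+// buffers of height * width paddle_real values):
+//
+//   paddle_matrix_set_value(mat, src);  // copy src into the matrix
+//   paddle_matrix_get_value(mat, dst);  // copy the matrix back out into dst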
+
 paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                    uint64_t rowID,
                                    paddle_real** rawRowBuffer) {
@@ -81,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
 
 paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+#ifndef PADDLE_MOBILE_INFERENCE
   auto ptr = new paddle::capi::CMatrix();
   ptr->mat = paddle::Matrix::createSparseMatrix(
       height,
@@ -91,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
       false,
       useGpu);
   return ptr;
+#else
+  return nullptr;
+#endif
 }
 
 paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@@ -100,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                             uint64_t colSize,
                                             float* valueArray,
                                             uint64_t valueSize) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (mat == nullptr) return kPD_NULLPTR;
   auto ptr = cast(mat);
   if (rowArray == nullptr || colArray == nullptr ||
@@ -120,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
   } else {
     return kPD_NOT_SUPPORTED;
   }
+#else
+  return kPD_NOT_SUPPORTED;
+#endif
 }
diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h
index d71ea26a5d..7c32524a00 100644
--- a/paddle/capi/arguments.h
+++ b/paddle/capi/arguments.h
@@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
                                              uint64_t ID,
                                              paddle_ivector ids);
 
+/**
+ * @brief paddle_arguments_set_frame_shape Set the frame shape of one
+ *        argument in the array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] frameHeight maximum height of input images
+ * @param [in] frameWidth maximum width of input images
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint64_t frameHeight,
+                                                     uint64_t frameWidth);
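+
+/*
+ * Illustrative call site (the surrounding setup is assumed): create the
+ * arguments, size them for one input, then record a 224 x 224 frame shape
+ * for input 0:
+ *
+ *   paddle_arguments args = paddle_arguments_create_none();
+ *   CHECK(paddle_arguments_resize(args, 1));
+ *   CHECK(paddle_arguments_set_frame_shape(args, 0, 224, 224));
+ */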
+
 /**
  * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
  *        argument in array, which index is `ID`.
diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in
index d205307588..0ddbd8c753 100644
--- a/paddle/capi/config.h.in
+++ b/paddle/capi/config.h.in
@@ -3,6 +3,9 @@
 
 typedef @PADDLE_FLOAT_TYPE@ paddle_real;
 
+#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
+#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
+
 // Since we only support linux and macos in compile, always use clang or
 // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
 #define PD_API __attribute__((visibility("default")))
diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp
new file mode 100644
index 0000000000..96ce31b45f
--- /dev/null
+++ b/paddle/capi/error.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "error.h"
+
+extern "C" const char* paddle_error_string(paddle_error err) {
+  switch (err) {
+    case kPD_NULLPTR:
+      return "nullptr error";
+    case kPD_OUT_OF_RANGE:
+      return "out of range error";
+    case kPD_PROTOBUF_ERROR:
+      return "protobuf error";
+    case kPD_NOT_SUPPORTED:
+      return "not supported error";
+    case kPD_UNDEFINED_ERROR:
+      return "undefined error";
+    default:
+      return "";
+  }
+}
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
index 44d8c2040d..2da9e0a3ef 100644
--- a/paddle/capi/error.h
+++ b/paddle/capi/error.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef __PADDLE_CAPI_ERROR_H__
 #define __PADDLE_CAPI_ERROR_H__
 
+#include "config.h"
+
 /**
  * Error Type for Paddle API.
  */
@@ -27,4 +29,17 @@ typedef enum {
   kPD_UNDEFINED_ERROR = -1,
 } paddle_error;
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Error string for Paddle API.
+ */
+PD_API const char* paddle_error_string(paddle_error err);
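+
+/*
+ * Illustrative use:
+ *   fprintf(stderr, "%s\n", paddle_error_string(kPD_OUT_OF_RANGE));
+ * prints "out of range error".
+ */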
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
index a78522e4a7..9efcbc387e 100644
--- a/paddle/capi/examples/model_inference/common/common.h
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -1,20 +1,36 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #ifndef __CAPI_EXAMPLE_COMMON_H__
 #define __CAPI_EXAMPLE_COMMON_H__
 #include <stdio.h>
 #include <stdlib.h>
 
-#define CHECK(stmt)                                                \
-  do {                                                             \
-    paddle_error __err__ = stmt;                                   \
-    if (__err__ != kPD_NO_ERROR) {                                 \
-      fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \
-      exit(__err__);                                               \
-    }                                                              \
+#define CHECK(stmt)                                                      \
+  do {                                                                   \
+    paddle_error __err__ = stmt;                                         \
+    if (__err__ != kPD_NO_ERROR) {                                       \
+      fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
+      exit(__err__);                                                     \
+    }                                                                    \
   } while (0)
 
 void* read_config(const char* filename, long* size) {
   FILE* file = fopen(filename, "r");
-  if (file == NULL) return NULL;
+  if (file == NULL) {
+    fprintf(stderr, "Open %s error\n", filename);
+    return NULL;
+  }
   fseek(file, 0L, SEEK_END);
   *size = ftell(file);
   fseek(file, 0L, SEEK_SET);
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 3e6bd52850..f795bfe11d 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -1,64 +1,111 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"
 
+// Modify this path as needed.
 #define CONFIG_BIN "./trainer_config.bin"
+// Modify this path as needed.
+// This demo assumes a merged model is not used, in which case this path is
+// the directory storing all the trained parameters.
+// If the model is trained by the PaddlePaddle V2 API, it is saved as a
+// compressed file that must be uncompressed first.
+#define MODEL_PATH "models/pass_4"
 
 int main() {
-  // Initalize Paddle
+  // Initialize the PaddlePaddle runtime environment.
   char* argv[] = {"--use_gpu=False"};
   CHECK(paddle_init(1, (char**)argv));
 
-  // Reading config binary file. It is generated by `convert_protobin.sh`
+  // Read the binary configuration file generated by `convert_protobin.sh`
   long size;
   void* buf = read_config(CONFIG_BIN, &size);
 
-  // Create a gradient machine for inference.
+  // Create the gradient machine for inference.
   paddle_gradient_machine machine;
   CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
 
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
+  // Load the trained model. Modify the parameter MODEL_PATH to set the correct
+  // path of the trained model.
+  CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH));
+
+  // Inputs and outputs of the network are organized as paddle_arguments object
+  // in C-API. In the comments below, "argument" specifically means one input of
+  // the neural network in PaddlePaddle C-API.
   paddle_arguments in_args = paddle_arguments_create_none();
 
-  // There is only one input of this network.
+  // There is only one data layer in this demo MNIST network; resize the
+  // arguments object so it holds that single input.
   CHECK(paddle_arguments_resize(in_args, 1));
 
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ false);
-  srand(time(0));
-  paddle_real* array;
+  // Each argument needs one matrix or one ivector (an integer vector, used
+  // for sparse index input, common in NLP tasks) to hold the real input
+  // data. In the comments below, "matrix" specifically means the object an
+  // argument needs to hold its data. Here we create the matrix for the
+  // argument created above to store the testing samples.
+  paddle_matrix mat =
+      paddle_matrix_create(/* height = batch size */ 1,
+                           /* width = dimensionality of the data layer */ 784,
+                           /* whether to use GPU */ false);
 
-  // Get First row.
+  paddle_real* array;
+  // Get the pointer pointing to the start address of the first row of the
+  // created matrix.
   CHECK(paddle_matrix_get_row(mat, 0, &array));
 
+  // Fill the matrix with a randomly generated test sample.
+  srand(time(0));
   for (int i = 0; i < 784; ++i) {
     array[i] = rand() / ((float)RAND_MAX);
   }
 
+  // Assign the matrix to the argument.
   CHECK(paddle_arguments_set_value(in_args, 0, mat));
 
+  // Create the output argument.
   paddle_arguments out_args = paddle_arguments_create_none();
+
+  // Invoke the forward computation.
   CHECK(paddle_gradient_machine_forward(machine,
                                         in_args,
                                         out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
+                                        /* is the train task or not */ false));
 
+  // Create the matrix to hold the forward result of the neural network.
+  paddle_matrix prob = paddle_matrix_create_none();
+  // Access the matrix of the output argument, in which the predicted result
+  // is stored.
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
+  uint64_t height;
+  uint64_t width;
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
   CHECK(paddle_matrix_get_row(prob, 0, &array));
 
-  printf("Prob: ");
-  for (int i = 0; i < 10; ++i) {
-    printf("%.2f ", array[i]);
+  printf("Prob: \n");
+  for (int i = 0; i < height * width; ++i) {
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
   }
   printf("\n");
 
+  // Clean up.
   CHECK(paddle_matrix_destroy(prob));
   CHECK(paddle_arguments_destroy(out_args));
   CHECK(paddle_matrix_destroy(mat));
diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
new file mode 100644
index 0000000000..7aeb482903
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
@@ -0,0 +1,22 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.utils.merge_model import merge_v2_model
+
+from mnist_v2 import network
+
+net = network(is_infer=True)
+param_file = "models/params_pass_4.tar"
+output_file = "output.paddle.model"
+merge_v2_model(net, param_file, output_file)
diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/capi/examples/model_inference/dense/mnist_v2.py
new file mode 100644
index 0000000000..183eecfdf2
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py
@@ -0,0 +1,131 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import gzip
+import logging
+import argparse
+from PIL import Image
+import numpy as np
+
+import paddle.v2 as paddle
+from paddle.utils.dump_v2_config import dump_v2_config
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def multilayer_perceptron(img, layer_size, lbl_dim):
+    for idx, size in enumerate(layer_size):
+        hidden = paddle.layer.fc(input=(img if not idx else hidden),
+                                 size=size,
+                                 act=paddle.activation.Relu())
+    return paddle.layer.fc(input=hidden,
+                           size=lbl_dim,
+                           act=paddle.activation.Softmax())
+
+
+def network(input_dim=784, lbl_dim=10, is_infer=False):
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(input_dim))
+
+    predict = multilayer_perceptron(
+        images, layer_size=[128, 64], lbl_dim=lbl_dim)
+
+    if is_infer:
+        return predict
+    else:
+        label = paddle.layer.data(
+            name='label', type=paddle.data_type.integer_value(lbl_dim))
+        return paddle.layer.classification_cost(input=predict, label=label)
+
+
+def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"):
+    if task == "train":
+        if not os.path.exists(save_dir):
+            os.mkdir(save_dir)
+
+        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
+        cost = network()
+        parameters = paddle.parameters.create(cost)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=0.1 / 128.0,
+            momentum=0.9,
+            regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 100 == 0:
+                    logger.info("Pass %d, Batch %d, Cost %f, %s" %
+                                (event.pass_id, event.batch_id, event.cost,
+                                 event.metrics))
+            if isinstance(event, paddle.event.EndPass):
+                with gzip.open(
+                        os.path.join(save_dir, "params_pass_%d.tar" %
+                                     event.pass_id), "w") as f:
+                    trainer.save_parameter_to_tar(f)
+
+        trainer.train(
+            reader=paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.mnist.train(), buf_size=8192),
+                batch_size=128),
+            event_handler=event_handler,
+            num_passes=5)
+    elif task == "dump_config":
+        predict = network(is_infer=True)
+        dump_v2_config(predict, "trainer_config.bin", True)
+    else:
+        raise RuntimeError(("Invalid value for parameter task. "
+                            "Available options are: train and dump_config."))
+
+
+def parse_cmd():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle MNIST demo for CAPI.")
+    parser.add_argument(
+        "--task",
+        type=str,
+        required=False,
+        help=("A string indicating the taks type. "
+              "Available options are: \"train\", \"dump_config\"."),
+        default="train")
+    parser.add_argument(
+        "--use_gpu",
+        action="store_true",
+        help=("A flag indicating whether to use the GPU device or not."),
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help=("This parameter is only used in training task. It indicates "
+              "how many computing threads are created in training."),
+        default=1)
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        help=("This parameter is only used in training task. It indicates "
+              "path of the directory to save the trained models."),
+        default="models")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_cmd()
+    main(args.task, args.use_gpu, args.trainer_count, args.save_dir)
diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/capi/examples/model_inference/dense/trainer_config.py
index 873ec119e7..b94a21a7e4 100644
--- a/paddle/capi/examples/model_inference/dense/trainer_config.py
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
@@ -1,18 +1,13 @@
-from paddle.trainer_config_helpers import *
-
-img = data_layer(name='pixel', size=784)
-
-hidden = fc_layer(
-    input=img,
-    size=200,
-    param_attr=ParamAttr(name='hidden.w'),
-    bias_attr=ParamAttr(name='hidden.b'))
-
-prob = fc_layer(
-    input=hidden,
-    size=10,
-    act=SoftmaxActivation(),
-    param_attr=ParamAttr(name='prob.w'),
-    bias_attr=ParamAttr(name='prob.b'))
-
-outputs(prob)
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
index 98e411ddc0..2fc8debdde 100644
--- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -1,8 +1,29 @@
 project(multi_thread)
 cmake_minimum_required(VERSION 2.8)
-aux_source_directory(. SRC_LIST)
-add_executable(${PROJECT_NAME} ${SRC_LIST})
+
 find_package (Threads)
+
+if(NOT PADDLE_ROOT)
+  set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path")
+endif()
+if(PADDLE_ROOT)
+  include_directories(${PADDLE_ROOT}/include)
+  link_directories(${PADDLE_ROOT}/lib)
+endif()
+
+set(CPU_SRCS main.c)
+add_executable(${PROJECT_NAME} ${CPU_SRCS})
 set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)
-target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
-  ${CMAKE_THREAD_LIBS_INIT})
+target_link_libraries(${PROJECT_NAME}
+                      -lpaddle_capi_shared
+                      ${CMAKE_THREAD_LIBS_INIT})
+
+find_package(CUDA QUIET)
+if(CUDA_FOUND)
+  set(GPU_SRCS main_gpu.c)
+  cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS})
+  set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99)
+  target_link_libraries(${PROJECT_NAME}_gpu
+                        -lpaddle_capi_shared
+                        ${CMAKE_THREAD_LIBS_INIT})
+endif(CUDA_FOUND)
diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c
index d7675cd80a..eecb9138e7 100644
--- a/paddle/capi/examples/model_inference/multi_thread/main.c
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
@@ -1,3 +1,17 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <paddle/capi.h>
 #include <pthread.h>
 #include <time.h>
diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
new file mode 100644
index 0000000000..85bb456584
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c
@@ -0,0 +1,127 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+/*
+ * @brief A simple inference example that runs multiple threads on a GPU.
+ *        Each thread holds its own local gradient_machine but shares the same
+ *        parameters.
+ *        If you want to run on different GPUs, you need to launch
+ *        multiple processes or set trainer_count > 1.
+ */
+void* thread_main(void* gm_ptr) {
+  // Initialize the thread environment of Paddle.
+  CHECK(paddle_init_thread());
+
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  // Create input arguments.
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ true);
+  // Create output arguments.
+  paddle_arguments out_args = paddle_arguments_create_none();
+  // Create output matrix.
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  // CPU buffer to cache the input and output.
+  paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real));
+  paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real));
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input layer of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    for (int i = 0; i < 784; ++i) {
+      cpu_input[i] = rand() / ((float)RAND_MAX);
+    }
+    CHECK(paddle_matrix_set_value(mat, cpu_input));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+    CHECK(paddle_matrix_get_value(prob, cpu_output));
+
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int i = 0; i < 10; ++i) {
+      printf("%.2f ", cpu_output[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+
+  free(cpu_input);
+  free(cpu_output);
+
+  return NULL;
+}
+
+int main() {
+  // Initialize Paddle.
+  char* argv[] = {"--use_gpu=True"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary configuration file generated by `convert_protobin.sh`.
+  long size;
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Load the trained parameters. Uncomment the following line and change the
+  // directory as needed.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, size, &thread_local_machine));
+    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+
+  pthread_mutex_destroy(&mutex);
+
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/capi/examples/model_inference/sequence/main.c
index 50bc0c9201..80937c830d 100644
--- a/paddle/capi/examples/model_inference/sequence/main.c
+++ b/paddle/capi/examples/model_inference/sequence/main.c
@@ -1,3 +1,17 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <paddle/capi.h>
 #include <time.h>
 #include "../common/common.h"
diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/capi/examples/model_inference/sequence/trainer_config.py
index 6bbc7a909a..889f8acdfd 100644
--- a/paddle/capi/examples/model_inference/sequence/trainer_config.py
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 WORD_DIM = 3000
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c
index 8ba67aee56..efec010a91 100644
--- a/paddle/capi/examples/model_inference/sparse_binary/main.c
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
@@ -1,5 +1,20 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"
 
 #define CONFIG_BIN "./trainer_config.bin"
@@ -9,16 +24,18 @@ int main() {
   char* argv[] = {"--use_gpu=False"};
   CHECK(paddle_init(1, (char**)argv));
 
-  // Reading config binary file. It is generated by `convert_protobin.sh`
+  // Read the binary configuration file which is generated by
+  // `convert_protobin.sh`
   long size;
   void* buf = read_config(CONFIG_BIN, &size);
 
-  // Create a gradient machine for inference.
+  // Create the gradient machine for inference.
   paddle_gradient_machine machine;
   CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
   CHECK(paddle_gradient_machine_randomize_param(machine));
 
-  // Loading parameter. Uncomment the following line and change the directory.
+  // Load the trained parameters. Uncomment the following line and change the
+  // directory as needed.
   // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
   //                                                "./some_where_to_params"));
   paddle_arguments in_args = paddle_arguments_create_none();
@@ -26,7 +43,7 @@ int main() {
   // There is only one input of this network.
   CHECK(paddle_arguments_resize(in_args, 1));
 
-  // Create input matrix.
+  // Create the input matrix.
   paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
   srand(time(0));
   paddle_real* array;
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index 00f76e0152..1f0e033c5b 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -54,6 +54,37 @@ paddle_error paddle_gradient_machine_create_for_inference(
   return kPD_NO_ERROR;
 }
 
+paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
+  if (mergedModel == nullptr) return kPD_NULLPTR;
+  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
+  int64_t modelConfigSize = 0;
+  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
+  std::string modelConfigProtobuf;
+  modelConfigProtobuf.resize(modelConfigSize);
+  is.read(&modelConfigProtobuf[0], modelConfigSize);
+  paddle::TrainerConfig config;
+  paddle::ModelConfig modelConfig;
+  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
+    if (!modelConfig.ParseFromString(modelConfigProtobuf) ||
+        !modelConfig.IsInitialized()) {
+      return kPD_PROTOBUF_ERROR;
+    }
+  } else {
+    modelConfig = config.model_config();
+  }
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      modelConfig, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
+  for (auto& para : parameters) {
+    para->load(is);
+  }
+
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
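For reference, a sketch of the merged-model layout implied by the parsing code above; the buffer is consumed strictly in this order:

```c
/* Merged model buffer, as read by the function above:
 *
 *   int64_t modelConfigSize;                       // raw bytes, host byte order
 *   char    modelConfigProtobuf[modelConfigSize];  // TrainerConfig or ModelConfig
 *   ...                                            // parameter blobs, loaded
 *                                                  // sequentially via para->load(is)
 */
```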
 paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
   delete cast(machine);
   return kPD_NO_ERROR;
@@ -121,3 +152,29 @@ paddle_error paddle_gradient_machine_randomize_param(
   m->machine->randParameters();
   return kPD_NO_ERROR;
 }
+
+paddle_error paddle_gradient_machine_get_layer_output(
+    paddle_gradient_machine machine,
+    const char* layerName,
+    paddle_arguments args) {
+  auto m = cast(machine);
+  auto out = paddle::capi::cast<paddle::capi::CArguments>(args);
+  if (m == nullptr || layerName == nullptr || out == nullptr ||
+      m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+
+  auto layerOutput = m->machine->getLayerOutput(layerName);
+  out->args.push_back(layerOutput);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_release_layer_output(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) {
+    return kPD_NULLPTR;
+  }
+  m->machine->releaseOutput();
+  return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
index d7e2dd9bf8..7e37dea00b 100644
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -36,6 +36,22 @@ typedef void* paddle_gradient_machine;
 PD_API paddle_error paddle_gradient_machine_create_for_inference(
     paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
 
+/**
+ * @brief Create a gradient machine used for model inference from a merged
+ *        model (config plus parameters) generated by `paddle merge_model`.
+ *        Example:
+ *          paddle merge_model \
+ *                 --model_dir="pass-00000" \
+ *                 --model_file="merged_model.paddle"
+ * @param [out] machine the gradient machine used for model inference
+ * @param [in] mergedModel buffer holding the merged model
+ * @param [in] size size of the merged model buffer in bytes
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
+
 /**
  * @brief Load parameter from disk.
  * @param machine Gradient Machine.
@@ -85,6 +101,26 @@ paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
 PD_API paddle_error
 paddle_gradient_machine_destroy(paddle_gradient_machine machine);
 
+/**
+ * @brief Get the output of the layer named `layerName`.
+ * @param [in] machine the gradient machine that has run an inference
+ * @param [in] layerName name of the specified layer
+ * @param [out] args output of the specified layer
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
+                                         const char* layerName,
+                                         paddle_arguments args);
+
+/**
+ * @brief Release the memory the gradient machine holds for the outputs of
+ *        intermediate layers.
+ * @param [in] machine the gradient machine that has run an inference
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
+
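A hedged usage sketch for the two declarations above, assuming a machine that has already completed a forward pass; the layer name is hypothetical and must match one in the user's network config, and `CHECK` is the examples' helper macro:

```c
#include <paddle/capi.h>
#include "../common/common.h"

// Assumes `machine` has already completed a forward pass.
void dump_layer_output(paddle_gradient_machine machine) {
  paddle_arguments layer_out = paddle_arguments_create_none();
  CHECK(paddle_gradient_machine_get_layer_output(
      machine, "__fc_layer_0__", layer_out));  // hypothetical layer name

  paddle_matrix value = paddle_matrix_create_none();
  CHECK(paddle_arguments_get_value(layer_out, 0, value));
  // ... inspect via paddle_matrix_get_shape / paddle_matrix_get_row ...

  CHECK(paddle_matrix_destroy(value));
  CHECK(paddle_arguments_destroy(layer_out));
  // Free the memory retained for intermediate layer outputs.
  CHECK(paddle_gradient_machine_release_layer_output(machine));
}
```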
 #ifdef __cplusplus
 }
 #endif
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
index 893ebcbd58..99c4e8428d 100644
--- a/paddle/capi/main.h
+++ b/paddle/capi/main.h
@@ -26,6 +26,13 @@ extern "C" {
  */
 PD_API paddle_error paddle_init(int argc, char** argv);
 
+/**
+ * Initialize the thread environment of Paddle.
+ * @note It is required for GPU runs but optional for CPU runs.
+ *       For GPU runs, all threads will run on the same GPU device.
+ */
+PD_API paddle_error paddle_init_thread();
+
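A minimal sketch of the intended call pattern, mirroring `main_gpu.c` earlier in this patch: call `paddle_init` once in the main thread, then `paddle_init_thread` at the top of every worker thread:

```c
#include <paddle/capi.h>
#include "../common/common.h"  /* for the CHECK macro used by the examples */

void* worker(void* gm_ptr) {
  /* Required for GPU runs, optional for CPU runs. */
  CHECK(paddle_init_thread());
  paddle_gradient_machine machine = (paddle_gradient_machine)gm_ptr;
  /* ... run forward passes with this thread-local machine ... */
  return NULL;
}
```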
 #ifdef __cplusplus
 }
 #endif
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
index f15f7f3bbb..8cc3e0034e 100644
--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
  * @param isBinary is binary (either 1 or 0 in matrix) or not.
  * @param useGpu is using GPU or not.
  * @return paddle_matrix.
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_matrix paddle_matrix_create_sparse(
     uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@@ -70,6 +71,16 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real* rowArray);
 
+/**
+ * @brief paddle_matrix_set_value Set values of the matrix.
+ * @param mat Target matrix.
+ * @param value Row-major data buffer.
+ * @return paddle_error
+ * @note  value should contain enough elements to initialize the whole matrix.
+ */
+PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
+                                            paddle_real* value);
+
 /**
  * @brief PDMatGetRow Get raw row buffer from matrix
  * @param [in] mat Target matrix
@@ -81,6 +92,15 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                           uint64_t rowID,
                                           paddle_real** rawRowBuffer);
 
+/**
+ * @brief Copy data from the matrix.
+ * @param [in] mat Target matrix.
+ * @param [out] result pointer to the buffer that receives the matrix data
+ * @return paddle_error
+ * @note the buffer behind result should be allocated before invoking this API
+ */
+PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
+                                            paddle_real* result);
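A round-trip sketch for the two new accessors; the unit test added later in this patch (`CAPIMatrix.cpu_get_set_value`) exercises the same pair. `CHECK` is the examples' helper macro:

```c
#include <paddle/capi.h>
#include "../common/common.h"

void round_trip(void) {
  paddle_matrix mat = paddle_matrix_create(/* height */ 2, /* width */ 3,
                                           /* useGpu */ false);
  paddle_real in[6] = {0, 1, 2, 3, 4, 5};   /* one value per element, row-major */
  paddle_real out[6];                       /* caller-allocated, per the note */
  CHECK(paddle_matrix_set_value(mat, in));
  CHECK(paddle_matrix_get_value(mat, out)); /* copies back all height*width values */
  CHECK(paddle_matrix_destroy(mat));
}
```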
 /**
  * @brief PDMatCreateNone Create None Matrix
  * @return
@@ -110,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
  * NULL if the matrix is binary.
  * @param [in] valueSize length of value array. Zero if the matrix is binary.
  * @return paddle_error
+ * @note Mobile inference does not support this interface.
  */
 PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
                                                    int* rowArray,
diff --git a/paddle/capi/paddle_capi.map b/paddle/capi/paddle_capi.map
new file mode 100644
index 0000000000..8d673f675d
--- /dev/null
+++ b/paddle/capi/paddle_capi.map
@@ -0,0 +1,6 @@
+{
+	global:
+		paddle_*;
+	local:
+		*;
+};
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
index d73f6b7733..bb38ace628 100644
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -4,11 +4,12 @@ add_unittest(capi_test_mats test_Vector.cpp
 target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
 target_link_libraries(capi_test_mats paddle_capi)
 
-
-add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
-target_include_directories(capi_test_gradientMachine PUBLIC
-  ${PADDLE_CAPI_INC_PATH})
-target_link_libraries(capi_test_gradientMachine paddle_capi)
-add_test(NAME capi_test_gradientMachine
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-  WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests)
+if(NOT MOBILE_INFERENCE)
+    add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+    target_include_directories(capi_test_gradientMachine PUBLIC
+      ${PADDLE_CAPI_INC_PATH})
+    target_link_libraries(capi_test_gradientMachine paddle_capi)
+    add_test(NAME capi_test_gradientMachine
+      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
+endif()
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
index 4bf9a9d6a9..6940c28448 100644
--- a/paddle/capi/tests/test_Matrix.cpp
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -45,3 +45,49 @@ TEST(CAPIMatrix, createNone) {
   paddle_matrix mat = paddle_matrix_create_none();
   ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
 }
+
+TEST(CAPIMatrix, cpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(CAPIMatrix, gpu_get_set_value) {
+  paddle_matrix mat = paddle_matrix_create(128, 32, true);
+  std::vector<paddle_real> sample;
+  std::vector<paddle_real> result;
+  sample.resize(128 * 32);
+  result.resize(128 * 32);
+  for (size_t i = 0; i < sample.size(); ++i) {
+    sample[i] = 1.0 / (i + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_value(mat, sample.data()));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_value(mat, result.data()));
+  for (size_t i = 0; i < sample.size(); ++i) {
+    ASSERT_NEAR(sample[i], result[i], 1e-5);
+  }
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(128UL, height);
+  ASSERT_EQ(32UL, width);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
+#endif
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/capi/tests/test_predict_network.py
index 82ef5cb1a7..6560417b2a 100644
--- a/paddle/capi/tests/test_predict_network.py
+++ b/paddle/capi/tests/test_predict_network.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=100)
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index f9061e96de..efd1b7a73e 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -27,7 +27,9 @@ if(WITH_GPU)
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
+    if (NOT MOBILE_INFERENCE)
     set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
+    endif()
 endif()
 
 set(CUDA_CU_SOURCES
@@ -39,6 +41,7 @@ set(CUDA_CU_SOURCES
     src/hl_cuda_lstm.cu
     src/hl_top_k.cu
     src/hl_batch_transpose.cu
+    src/hl_batch_norm.cu
     src/hl_cuda_sequence.cu
     src/hl_table_apply.cu)
 
@@ -83,7 +86,7 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
-add_dependencies(paddle_cuda ${external_project_dependencies})
+add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies})
 
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
diff --git a/paddle/cuda/include/hl_batch_norm.h b/paddle/cuda/include/hl_batch_norm.h
new file mode 100644
index 0000000000..afc5e0b2de
--- /dev/null
+++ b/paddle/cuda/include/hl_batch_norm.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_BATCH_NORM_H_
+#define HL_BATCH_NORM_H_
+
+#include "hl_base.h"
+
+/**
+ * @brief   batch norm inference.
+ *
+ * @param[in]   input         input data.
+ * @param[out]  output        output data.
+ * @param[in]   scale         batch normalization scale parameter (in original
+ *                            paper scale is referred to as gamma).
+ * @param[in]   bias          batch normalization bias parameter (in original
+ *                            paper bias is referred to as beta).
+ * @param[in]   estimatedMean the moving mean accumulated during the training
+ *                            phase.
+ * @param[in]   estimatedVar  the moving variance accumulated during the
+ *                            training phase.
+ * @param[in]   epsilon       Epsilon value used in the batch
+ *                            normalization formula.
+ */
+extern void hl_batch_norm_cuda_inference(const real* input,
+                                         real* output,
+                                         const real* scale,
+                                         const real* bias,
+                                         const real* estimatedMean,
+                                         const real* estimatedVar,
+                                         const double epsilon,
+                                         size_t batchSize,
+                                         size_t channel,
+                                         size_t height,
+                                         size_t width);
+
+#endif  // HL_BATCH_NORM_H_
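For reference, the per-element transform this kernel computes is the standard batch-normalization inference formula:

```latex
y = \mathrm{scale} \cdot
    \frac{x - \mathrm{estimatedMean}}{\sqrt{\mathrm{estimatedVar} + \epsilon}}
  + \mathrm{bias}
```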
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index f55197c8c9..8841806292 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -18,74 +18,7 @@ limitations under the License. */
 #include "hl_base.h"
 
 /**
- * @brief   Shrink column to feature.
- *
- * @param[in]   dataCol     expand data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataIm      output image data.
- * @param[in]   alpha
- * @param[in]   beta
- */
-extern void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha = 1.0f,
-                                  real beta = 0.0f);
-
-/**
- * @brief   Expand feature to column.
- *
- * @param[in]   dataIm      input image data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataCol     expand data.
- *
- */
-extern void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol);
-
-/**
- * @brief   Maximum pool forward.
+ * @brief   Maximum pool forward with Mask output.
  *
  * @param[in]   frameCnt    batch size of input image.
  * @param[in]   inputData   input data.
@@ -102,7 +35,7 @@ extern void hl_expand_feature2col(const real* dataIm,
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
- *
+ * @param[out]  maskData    the location indices of the selected max data.
  */
 extern void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
@@ -118,7 +51,8 @@ extern void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               real* maskData = NULL);
 
 /**
  * @brief   Maximum pool backward.
@@ -182,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt,
  * @param[in]   paddingW    padding width.
  * @param[out]  tgtData     output data.
  * @param[in]   tgtStride   stride between output data samples.
+ * @param[in]   excludeMode whether to exclude padded values from the average.
  *
  */
 extern void hl_avgpool_forward(const int frameCnt,
@@ -198,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               bool excludeMode);
 
 /**
  * @brief   Maximum pool backward.
@@ -220,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt,
  * @param[in]   scaleB      scale.
  * @param[out]  backGrad    output grad.
  * @param[in]   outStride   stride between output data samples.
+ * @param[in]   excludeMode whether to exclude padded values from the average.
  *
  */
 extern void hl_avgpool_backward(const int frameCnt,
@@ -238,7 +175,98 @@ extern void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride);
+                                const int outStride,
+                                bool excludeMode);
+
+extern void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride);
+
+extern void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride);
+
+extern void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride);
+
+extern void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  int paddingD,
+                                  int paddingH,
+                                  int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride);
 
 /**
  * @brief   Bilinear interpolation forward.
@@ -342,4 +370,4 @@ extern void hl_maxout_backward(real* inGrad,
                                size_t featLen,
                                size_t groups);
 
-#endif /* HL_CNN_H_ */
+#endif  // HL_CNN_H_
diff --git a/paddle/cuda/include/hl_cpu_gru.cuh b/paddle/cuda/include/hl_cpu_gru.cuh
index c0a37ced2a..e4f6bf42c6 100644
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -18,14 +18,6 @@ limitations under the License. */
 
 #ifndef __NVCC__
 
-#include "paddle/math/MathFunctions.h"
-
-#ifndef PADDLE_TYPE_DOUBLE
-#define     CBLAS_GEMM     paddle::gemm<float>
-#else
-#define     CBLAS_GEMM     paddle::gemm<double>
-#endif
-
 template<class OpResetOutput>
 void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
                                        real *gateValue,
@@ -210,51 +202,6 @@ inline void forward_final_output(OpFinalOutput opFinalOutput,
   }
 }
 
-template<class OpResetOutput, class OpFinalOutput>
-void hl_cpu_gru_forward(OpResetOutput opResetOutput,
-                        OpFinalOutput opFinalOutput,
-                        hl_gru_value value,
-                        int frameSize,
-                        int batchSize,
-                        hl_activation_mode_t active_node,
-                        hl_activation_mode_t active_gate) {
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               2 * frameSize,
-               frameSize,
-               1,
-               value.prevOutValue,
-               frameSize,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               value.gateValue,
-               frameSize * 3);
-  }
-
-  forward_reset_output(opResetOutput, value, frameSize, batchSize, active_gate);
-
-  if (value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasNoTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               value.resetOutputValue,
-               frameSize,
-               value.stateWeight,
-               frameSize,
-               1,
-               value.gateValue + frameSize * 2,
-               frameSize * 3);
-  }
-
-  forward_final_output(opFinalOutput, value, frameSize, batchSize, active_node);
-}
-
 template<class OpStateGrad>
 void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad,
                                       real *gateValue,
@@ -525,86 +472,6 @@ inline void backward_reset_grad(OpResetGrad opResetGrad,
   }
 }
 
-template<class OpStateGrad, class OpResetGrad>
-void hl_cpu_gru_backward(OpStateGrad opStateGrad,
-                         OpResetGrad opResetGrad,
-                         hl_gru_value value,
-                         hl_gru_grad  grad,
-                         int frameSize,
-                         int batchSize,
-                         hl_activation_mode_t active_node,
-                         hl_activation_mode_t active_gate) {
-  backward_state_grad(opStateGrad, value, grad,
-    frameSize, batchSize, active_node);
-
-  if (value.prevOutValue && grad.prevOutGrad) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize,
-               1,
-               grad.gateGrad + frameSize * 2,
-               frameSize * 3,
-               value.stateWeight,
-               frameSize,
-               0,
-               grad.resetOutputGrad,
-               frameSize);
-
-    if (grad.stateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize,
-                 batchSize,
-                 1,
-                 value.resetOutputValue,
-                 frameSize,
-                 grad.gateGrad + frameSize * 2,
-                 frameSize * 3,
-                 1,
-                 grad.stateWeightGrad,
-                 frameSize);
-    }
-  }
-
-  backward_reset_grad(opResetGrad, value, grad,
-    frameSize, batchSize, active_gate);
-
-  if (grad.prevOutGrad && value.prevOutValue) {
-    CBLAS_GEMM(CblasNoTrans,
-               CblasTrans,
-               batchSize,
-               frameSize,
-               frameSize * 2,
-               1,
-               grad.gateGrad,
-               frameSize * 3,
-               value.gateWeight,
-               frameSize * 2,
-               1,
-               grad.prevOutGrad,
-               frameSize);
-
-    if (grad.gateWeightGrad) {
-      CBLAS_GEMM(CblasTrans,
-                 CblasNoTrans,
-                 frameSize,
-                 frameSize * 2,
-                 batchSize,
-                 1,
-                 value.prevOutValue,
-                 frameSize,
-                 grad.gateGrad,
-                 frameSize * 3,
-                 1,
-                 grad.gateWeightGrad,
-                 frameSize * 2);
-    }
-  }
-}
-
 #endif
 
 #endif  // HL_CPU_GRU_CUH_
diff --git a/paddle/cuda/include/hl_cuda_cudnn.h b/paddle/cuda/include/hl_cuda_cudnn.h
index db18e4912b..b44b071bd1 100644
--- a/paddle/cuda/include/hl_cuda_cudnn.h
+++ b/paddle/cuda/include/hl_cuda_cudnn.h
@@ -22,10 +22,10 @@ limitations under the License. */
  */
 typedef enum {
   HL_POOLING_MAX = 0,
-  // average includes padded values
-  HL_POOLING_AVERAGE = 1,
   // average does not include padded values
-  HL_POOLING_AVERAGE_EXCLUDE_PADDING = 2,
+  HL_POOLING_AVERAGE = 1,
+  // average includes padded values
+  HL_POOLING_AVERAGE_INCLUDE_PADDING = 2,
   HL_POOLING_END
 } hl_pooling_mode_t;
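This hunk flips the default: `HL_POOLING_AVERAGE` now excludes padded values, and the include-padding behavior moves to the new `HL_POOLING_AVERAGE_INCLUDE_PADDING`. A hedged sketch of how such modes typically map onto cuDNN's pooling enum (the actual wrapper code is not part of this diff):

```c
#include <cudnn.h>
#include "hl_cuda_cudnn.h"

/* Hypothetical helper; the real translation lives in the cuDNN wrap sources. */
static cudnnPoolingMode_t to_cudnn_pooling_mode(hl_pooling_mode_t mode) {
  switch (mode) {
    case HL_POOLING_MAX:
      return CUDNN_POOLING_MAX;
    case HL_POOLING_AVERAGE:
      return CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
      return CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
    default: /* HL_POOLING_END is a sentinel, not a real mode */
      return CUDNN_POOLING_MAX;
  }
}
```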
 
@@ -214,7 +214,8 @@ extern void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes);
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation);
 
 /**
  * @brief   destroy filter descriptor.
@@ -242,7 +243,9 @@ extern void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width);
+                                             int stride_width,
+                                             int dilation_h = 1,
+                                             int dilation_w = 1);
 
 /**
  * @brief   reset convolution descriptor.
@@ -262,7 +265,9 @@ extern void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width);
+                                            int stride_width,
+                                            int dilation_h = 1,
+                                            int dilation_w = 1);
 
 /**
  * @brief   destroy convolution descriptor.
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index ede2670882..4ab8de80d1 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif
 
 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index eb454c59c1..7daca18761 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -224,4 +224,88 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
 extern void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise);
 
+/**
+ * @brief  Matrix vol2Col: Convert 3D volume into col matrix
+ *
+ * @param[in]   dataSrc    input matrix.
+ * @param[in]   channels   number of channels of dataSrc.
+ * @param[in]   depth      depth of dataSrc.
+ * @param[in]   height     height of dataSrc.
+ * @param[in]   width      width of dataSrc.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[out]  dataDst    output matrix.
+ *
+ */
+extern void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst);
+
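The column matrix has one column per filter position; under the usual convolution arithmetic (an assumption, since the output shape is not spelled out here), the output volume dimensions are:

```latex
D_{out} = \left\lfloor \frac{\mathrm{depth} + 2\,\mathrm{paddingD} - \mathrm{filterD}}{\mathrm{strideD}} \right\rfloor + 1,
\qquad
H_{out} = \left\lfloor \frac{\mathrm{height} + 2\,\mathrm{paddingH} - \mathrm{filterH}}{\mathrm{strideH}} \right\rfloor + 1,
\qquad
W_{out} = \left\lfloor \frac{\mathrm{width} + 2\,\mathrm{paddingW} - \mathrm{filterW}}{\mathrm{strideW}} \right\rfloor + 1
```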
+/**
+ * @brief  Matrix col2Vol: Convert col matrix into 3D volume
+ *
+ * @param[out]  dataDst    output matrix.
+ * @param[in]   channels   number of channels of dataDst.
+ * @param[in]   depth      depth of dataDst.
+ * @param[in]   height     height of dataDst.
+ * @param[in]   width      width of dataDst.
+ * @param[in]   filterD    depth of filter.
+ * @param[in]   filterH    height of filter.
+ * @param[in]   filterW    width of filter.
+ * @param[in]   strideD    stride in the depth.
+ * @param[in]   strideH    stride in the height.
+ * @param[in]   strideW    stride in the width.
+ * @param[in]   paddingD   padding in the depth.
+ * @param[in]   paddingH   padding in the height.
+ * @param[in]   paddingW   padding in the width.
+ * @param[in]   dataSrc    input matrix.
+ * @param[in]   alpha      scaling factor applied to the converted data.
+ * @param[in]   beta       scaling factor applied to the existing dataDst values.
+ *
+ */
+extern void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta);
+
+/**
+ * @brief  Cast the elements of a real (float) vector to int.
+ * @param[out]  out     output int vector.
+ * @param[in]   vec     input float vector.
+ * @param[in]   size    size of the vector.
+ */
+extern void hl_vector_cast2int(int* out, real* vec, int size);
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/hl_tensor_ops.h b/paddle/cuda/include/hl_tensor_ops.h
index 93d38b7d22..b2bf334dab 100644
--- a/paddle/cuda/include/hl_tensor_ops.h
+++ b/paddle/cuda/include/hl_tensor_ops.h
@@ -461,7 +461,7 @@ class add<float32x4_t> {
 public:
   INLINE float32x4_t operator()(const float32x4_t a,
                                 const float32x4_t b) const {
-    return vmulq_f32(a, b);
+    return vaddq_f32(a, b);
   }
 };
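The one-line hunk above fixes a real bug: the NEON specialization of the `add` functor was multiplying (`vmulq_f32`) instead of adding. A standalone check of the corrected intrinsic (assumes an ARM target with NEON):

```c
#include <arm_neon.h>
#include <stdio.h>

int main(void) {
  float32x4_t a = vdupq_n_f32(1.0f);
  float32x4_t b = vdupq_n_f32(2.0f);
  float32x4_t sum = vaddq_f32(a, b);       /* the corrected element-wise add */
  printf("%f\n", vgetq_lane_f32(sum, 0));  /* prints 3.0; vmulq_f32 gave 2.0 */
  return 0;
}
```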
 
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 039551c6cc..706cc59a8e 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -17,36 +17,6 @@ limitations under the License. */
 
 #include "hl_cnn.h"
 
-inline void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha,
-                                  real beta) {}
-
-inline void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol) {}
-
 inline void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
                                const int channels,
@@ -61,7 +31,8 @@ inline void hl_maxpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               real* maskData) {}
 
 inline void hl_maxpool_backward(const int frameCnt,
                                 const real* inputData,
@@ -97,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt,
                                const int paddingH,
                                const int paddingW,
                                real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               const bool excludeMode) {}
 
 inline void hl_avgpool_backward(const int frameCnt,
                                 const real* outGrad,
@@ -115,7 +87,98 @@ inline void hl_avgpool_backward(const int frameCnt,
                                 real scaleA,
                                 real scaleB,
                                 real* backGrad,
-                                const int outStride) {}
+                                const int outStride,
+                                const bool excludeMode) {}
+
+inline void hl_maxpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 real* maxPoolIdxData,
+                                 const int tgtStride) {}
+
+inline void hl_maxpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  real* maxPoolIdxData,
+                                  const int outStride) {}
+
+inline void hl_avgpool3D_forward(const int frameCnt,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int depth,
+                                 const int height,
+                                 const int width,
+                                 const int pooledD,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeZ,
+                                 const int sizeY,
+                                 const int sizeX,
+                                 const int strideD,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int paddingD,
+                                 const int paddingH,
+                                 const int paddingW,
+                                 real* tgtData,
+                                 const int tgtStride) {}
+
+inline void hl_avgpool3D_backward(const int frameCnt,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int depth,
+                                  const int height,
+                                  const int width,
+                                  const int pooledD,
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeZ,
+                                  const int sizeY,
+                                  const int sizeX,
+                                  const int strideD,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int paddingD,
+                                  const int paddingH,
+                                  const int paddingW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* backGrad,
+                                  const int outStride) {}
 
 inline void hl_bilinear_forward(const real* inData,
                                 const size_t inImgH,
diff --git a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
index abd0d6b099..3afcc6fa85 100644
--- a/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cuda_cudnn_stub.h
@@ -78,7 +78,9 @@ inline void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                              int padding_height,
                                              int padding_width,
                                              int stride_height,
-                                             int stride_width) {}
+                                             int stride_width,
+                                             int dilation_h,
+                                             int dilation_w) {}
 
 inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             hl_tensor_descriptor image,
@@ -86,7 +88,9 @@ inline void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                             int padding_height,
                                             int padding_width,
                                             int stride_height,
-                                            int stride_width) {}
+                                            int stride_width,
+                                            int dilation_h,
+                                            int dilation_w) {}
 
 inline void hl_destroy_convolution_descriptor(hl_convolution_descriptor conv) {}
 
@@ -99,7 +103,8 @@ inline void hl_conv_workspace(hl_tensor_descriptor input,
                               int* convBwdDataAlgo,
                               size_t* bwdDataLimitBytes,
                               int* convBwdFilterAlgo,
-                              size_t* bwdFilterLimitBytes) {}
+                              size_t* bwdFilterLimitBytes,
+                              bool useDilation) {}
 
 inline void hl_convolution_forward(hl_tensor_descriptor input,
                                    real* input_data,
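
The new dilation_h/dilation_w arguments mirror cuDNN's convolution descriptor: a dilated filter samples inputs dilation elements apart, so a k-tap filter spans a wider receptive field without extra weights. A one-line helper stating that relation (the function name is ours, for illustration):

int effective_filter_extent(int k, int dilation) {
  return dilation * (k - 1) + 1;  // e.g. k = 3, dilation = 2 -> extent 5
}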
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 127cb7e279..46e77e1407 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -99,4 +99,40 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
 inline void hl_matrix_rotate(
     real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
 
+inline void hl_matrix_vol2Col(const real* dataSrc,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              real* dataDst) {}
+
+inline void hl_matrix_col2Vol(real* dataDst,
+                              int channels,
+                              int depth,
+                              int height,
+                              int width,
+                              int filterD,
+                              int filterH,
+                              int filterW,
+                              int strideD,
+                              int strideH,
+                              int strideW,
+                              int paddingD,
+                              int paddingH,
+                              int paddingW,
+                              const real* dataSrc,
+                              real alpha,
+                              real beta) {}
+
+inline void hl_vector_cast2int(int* out, real* vec, int size) {}
+
 #endif  // HL_MATRIX_STUB_H_
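
hl_matrix_vol2Col / col2Vol extend the familiar im2col transform to 3D volumes: each (channel, kd, kh, kw) filter offset becomes one row of the column matrix and each output position (od, oh, ow) one column, so 3D convolution reduces to a GEMM. A hedged CPU sketch of the layout implied by the parameter list above (the indexing is our reconstruction, not lifted from the CUDA kernels):

void vol2col_ref(const float* src, int channels,
                 int depth, int height, int width,
                 int filterD, int filterH, int filterW,
                 int strideD, int strideH, int strideW,
                 int padD, int padH, int padW, float* dst) {
  int outD = (depth + 2 * padD - filterD) / strideD + 1;
  int outH = (height + 2 * padH - filterH) / strideH + 1;
  int outW = (width + 2 * padW - filterW) / strideW + 1;
  int rows = channels * filterD * filterH * filterW;
  for (int row = 0; row < rows; ++row) {
    int wOff = row % filterW;
    int hOff = (row / filterW) % filterH;
    int dOff = (row / filterW / filterH) % filterD;
    int c = row / filterW / filterH / filterD;
    for (int od = 0; od < outD; ++od)
      for (int oh = 0; oh < outH; ++oh)
        for (int ow = 0; ow < outW; ++ow) {
          int d = od * strideD - padD + dOff;
          int h = oh * strideH - padH + hOff;
          int w = ow * strideW - padW + wOff;
          int dstIdx = ((row * outD + od) * outH + oh) * outW + ow;
          dst[dstIdx] = (d < 0 || d >= depth || h < 0 || h >= height ||
                         w < 0 || w >= width)
                            ? 0  // padding region contributes zeros
                            : src[((c * depth + d) * height + h) * width + w];
        }
  }
}

col2Vol is the adjoint: it walks the same index map in reverse and accumulates alpha * column + beta * volume into the destination, which is why it takes the extra alpha and beta arguments.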
diff --git a/paddle/cuda/src/avx_mathfun.h b/paddle/cuda/src/avx_mathfun.h
index 2412ed5abc..a0ba71faba 100644
--- a/paddle/cuda/src/avx_mathfun.h
+++ b/paddle/cuda/src/avx_mathfun.h
@@ -1,3 +1,16 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 /*
    AVX implementation of sin, cos, sincos, exp and log
 
diff --git a/paddle/cuda/src/hl_batch_norm.cu b/paddle/cuda/src/hl_batch_norm.cu
new file mode 100644
index 0000000000..5828ecb8e0
--- /dev/null
+++ b/paddle/cuda/src/hl_batch_norm.cu
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_batch_norm.h"
+
+__global__ void batchNormInference(real* output,
+                                   const real* input,
+                                   const real* scale,
+                                   const real* bias,
+                                   const real* estimatedMean,
+                                   const real* estimatedVar,
+                                   const double epsilon,
+                                   size_t batchSize,
+                                   size_t channel,
+                                   size_t height,
+                                   size_t width) {
+  const int tid = threadIdx.x;
+  const int num = channel * height * width;
+  const int batch = blockIdx.x;
+  for (int i = tid; i < num; i += blockDim.x) {
+    const int c = i / (height * width);
+    const int id = batch * num + i;
+    real val = input[id] - estimatedMean[c];
+    val /= sqrt(estimatedVar[c] + epsilon);
+    val *= scale[c];
+    val += bias[c];
+    output[id] = val;
+  }
+}
+
+void hl_batch_norm_cuda_inference(const real* input,
+                                  real* output,
+                                  const real* scale,
+                                  const real* bias,
+                                  const real* estimatedMean,
+                                  const real* estimatedVar,
+                                  const double epsilon,
+                                  size_t batchSize,
+                                  size_t channel,
+                                  size_t height,
+                                  size_t width) {
+  batchNormInference<<<batchSize, 256, 0, STREAM_DEFAULT>>>(output,
+                                                            input,
+                                                            scale,
+                                                            bias,
+                                                            estimatedMean,
+                                                            estimatedVar,
+                                                            epsilon,
+                                                            batchSize,
+                                                            channel,
+                                                            height,
+                                                            width);
+
+  CHECK_SYNC("hl_batch_norm_cuda_inference failed!");
+}
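
The kernel computes the standard inference-time batch-norm transform, y = scale[c] * (x - mean[c]) / sqrt(var[c] + epsilon) + bias[c], launching one block per sample with threads striding over channel * height * width. A CPU reference of the same arithmetic, useful for validating the kernel (function name and the NCHW layout assumption are ours):

#include <math.h>

void batch_norm_infer_ref(float* output, const float* input,
                          const float* scale, const float* bias,
                          const float* mean, const float* var,
                          double epsilon, int batch, int channel,
                          int height, int width) {
  int plane = height * width;
  for (int n = 0; n < batch; ++n) {
    for (int c = 0; c < channel; ++c) {
      float inv = 1.0f / sqrtf((float)(var[c] + epsilon));
      for (int i = 0; i < plane; ++i) {
        int id = (n * channel + c) * plane + i;
        output[id] = (input[id] - mean[c]) * inv * scale[c] + bias[c];
      }
    }
  }
}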
diff --git a/paddle/cuda/src/hl_batch_transpose.cu b/paddle/cuda/src/hl_batch_transpose.cu
index f047403da1..f4c253df7b 100644
--- a/paddle/cuda/src/hl_batch_transpose.cu
+++ b/paddle/cuda/src/hl_batch_transpose.cu
@@ -12,17 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_batch_transpose.h"
 #include "hl_base.h"
+#include "hl_batch_transpose.h"
 
 const int TILE_DIM = 64;
 const int BLOCK_ROWS = 16;
 
 // No bank-conflict transpose for a batch of data.
-__global__ void batchTransposeNoBankConflicts(real* odata,
-                                              const real* idata,
-                                              int numSamples, int width,
-                                              int height) {
+__global__ void batchTransposeNoBankConflicts(
+    real* odata, const real* idata, int numSamples, int width, int height) {
   __shared__ float tile[TILE_DIM][TILE_DIM + 1];
 
   const int x = blockIdx.x * TILE_DIM + threadIdx.x;
@@ -50,12 +48,12 @@ __global__ void batchTransposeNoBankConflicts(real* odata,
           newX] = tile[threadIdx.x][j];
 }
 
-void batchTranspose(const real* input, real* output, int width, int height,
-                    int batchSize) {
+void batchTranspose(
+    const real* input, real* output, int width, int height, int batchSize) {
   dim3 dimBlock(TILE_DIM, BLOCK_ROWS, 1);
   dim3 dimGrid(DIVUP(width, TILE_DIM), DIVUP(height, TILE_DIM), batchSize);
-  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-      (output, input, batchSize, width, height);
+  batchTransposeNoBankConflicts<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+      output, input, batchSize, width, height);
 
   CHECK_SYNC("batchTranspose failed!");
 }
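
A note on the code this hunk reflows: the +1 in tile[TILE_DIM][TILE_DIM + 1] is what makes the transpose bank-conflict-free. Shared memory is split into 32 four-byte banks; with an unpadded 64-float row, every element of a column lands in the same bank and the column-wise reads serialize. Padding the row to 65 floats (coprime with 32) shifts successive rows across banks. The pattern in isolation (a generic 32x32 sketch, not this file's kernel):

__global__ void transposeTile32(const float* in, float* out, int n) {
  __shared__ float tile[32][32 + 1];  // +1: each row starts in a new bank
  int x = blockIdx.x * 32 + threadIdx.x;
  int y = blockIdx.y * 32 + threadIdx.y;
  if (x < n && y < n) tile[threadIdx.y][threadIdx.x] = in[y * n + x];
  __syncthreads();
  x = blockIdx.y * 32 + threadIdx.x;  // swapped block coordinates
  y = blockIdx.x * 32 + threadIdx.y;
  if (x < n && y < n) out[y * n + x] = tile[threadIdx.x][threadIdx.y];
}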
diff --git a/paddle/cuda/src/hl_cuda_aggregate.cu b/paddle/cuda/src/hl_cuda_aggregate.cu
index 97034a9177..16a54ad343 100644
--- a/paddle/cuda/src/hl_cuda_aggregate.cu
+++ b/paddle/cuda/src/hl_cuda_aggregate.cu
@@ -12,27 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
+#include "hl_aggregate.h"
 #include "hl_base.h"
 #include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_aggregate.h"
-#include "hl_thread.ph"
 #include "hl_matrix_base.cuh"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 /**
  * @brief   matrix row operator.
  */
-template<class Agg, int blockSize>
-__global__ void KeMatrixRowOp(Agg agg,
-                              real *E,
-                              real *Sum,
-                              int dimN) {
+template <class Agg, int blockSize>
+__global__ void KeMatrixRowOp(Agg agg, real *E, real *Sum, int dimN) {
   __shared__ real sum_s[blockSize];
-  int cnt = (dimN + blockSize -1) / blockSize;
-  int rowId = blockIdx.x + blockIdx.y*gridDim.x;
-  int index = rowId*dimN;
+  int cnt = (dimN + blockSize - 1) / blockSize;
+  int rowId = blockIdx.x + blockIdx.y * gridDim.x;
+  int index = rowId * dimN;
   int tid = threadIdx.x;
   int lmt = tid;
 
@@ -44,7 +40,7 @@ __global__ void KeMatrixRowOp(Agg agg,
   sum_s[tid] = tmp;
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] = agg(sum_s[tid], sum_s[tid + stride]);
     }
@@ -58,29 +54,21 @@ __global__ void KeMatrixRowOp(Agg agg,
 }
 
 template <class Agg>
-void hl_matrix_row_op(Agg agg,
-                      real *A_d,
-                      real *C_d,
-                      int dimM,
-                      int dimN) {
+void hl_matrix_row_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   int blocksX = dimM;
   int blocksY = 1;
   dim3 threads(128, 1);
   dim3 grid(blocksX, blocksY);
 
-  KeMatrixRowOp<Agg, 128><<< grid, threads, 0, STREAM_DEFAULT >>>
-           (agg, A_d, C_d, dimN);
+  KeMatrixRowOp<Agg, 128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      agg, A_d, C_d, dimN);
 }
 
 void hl_matrix_row_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::sum(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::sum(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_sum failed");
 }
 
@@ -88,11 +76,7 @@ void hl_matrix_row_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::max(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::max(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_max failed");
 }
 
@@ -100,23 +84,16 @@ void hl_matrix_row_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_row_op(aggregate::min(),
-                   A_d,
-                   C_d,
-                   dimM,
-                   dimN);
+  hl_matrix_row_op(aggregate::min(), A_d, C_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_row_min failed");
 }
 
 /**
  * @brief   matrix column operator.
  */
-template<class Agg>
-__global__ void KeMatrixColumnOp(Agg agg,
-                                 real *E,
-                                 real *Sum,
-                                 int dimM,
-                                 int dimN) {
+template <class Agg>
+__global__ void KeMatrixColumnOp(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
   int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -127,15 +104,12 @@ __global__ void KeMatrixColumnOp(Agg agg,
   }
 }
 
-template<class Agg, int blockDimX, int blockDimY>
-__global__ void KeMatrixColumnOp_S(Agg agg,
-                                   real *E,
-                                   real *Sum,
-                                   int dimM,
-                                   int dimN) {
-    __shared__ real _sum[blockDimX*blockDimY];
-    int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
-    int index = threadIdx.y;
+template <class Agg, int blockDimX, int blockDimY>
+__global__ void KeMatrixColumnOp_S(
+    Agg agg, real *E, real *Sum, int dimM, int dimN) {
+  __shared__ real _sum[blockDimX * blockDimY];
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int index = threadIdx.y;
 
   real tmp = agg.init();
   if (rowIdx < dimN) {
@@ -144,14 +118,14 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
       index += blockDimY;
     }
   }
-  _sum[threadIdx.x + threadIdx.y*blockDimX] = tmp;
+  _sum[threadIdx.x + threadIdx.y * blockDimX] = tmp;
   __syncthreads();
 
   if (rowIdx < dimN) {
-    if (threadIdx.y ==0) {
+    if (threadIdx.y == 0) {
       real tmp = agg.init();
-      for (int i=0; i < blockDimY; i++) {
-        tmp = agg(tmp, _sum[threadIdx.x + i*blockDimX]);
+      for (int i = 0; i < blockDimY; i++) {
+        tmp = agg(tmp, _sum[threadIdx.x + i * blockDimX]);
       }
       Sum[rowIdx] = tmp;
     }
@@ -159,25 +133,21 @@ __global__ void KeMatrixColumnOp_S(Agg agg,
 }
 
 template <class Agg>
-void hl_matrix_column_op(Agg agg,
-                         real *A_d,
-                         real *C_d,
-                         int dimM,
-                         int dimN) {
+void hl_matrix_column_op(Agg agg, real *A_d, real *C_d, int dimM, int dimN) {
   if (dimN >= 8192) {
-    int blocksX = (dimN + 128 -1) / 128;
+    int blocksX = (dimN + 128 - 1) / 128;
     int blocksY = 1;
     dim3 threads(128, 1);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp<Agg><<< grid, threads, 0, STREAM_DEFAULT >>>
-             (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp<Agg><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   } else {
-    int blocksX = (dimN + 32 -1) / 32;
+    int blocksX = (dimN + 32 - 1) / 32;
     int blocksY = 1;
     dim3 threads(32, 32);
     dim3 grid(blocksX, blocksY);
-    KeMatrixColumnOp_S<Agg, 32, 32><<< grid, threads, 0, STREAM_DEFAULT>>>
-             (agg, A_d, C_d, dimM, dimN);
+    KeMatrixColumnOp_S<Agg, 32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        agg, A_d, C_d, dimM, dimN);
   }
 
   return;
@@ -187,11 +157,7 @@ void hl_matrix_column_sum(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::sum(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::sum(), A_d, C_d, dimM, dimN);
 
   CHECK_SYNC("hl_matrix_column_sum failed");
 }
@@ -200,11 +166,7 @@ void hl_matrix_column_max(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::max(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::max(), A_d, C_d, dimM, dimN);
 
   CHECK_SYNC("hl_matrix_column_max failed");
 }
@@ -213,11 +175,7 @@ void hl_matrix_column_min(real *A_d, real *C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
-  hl_matrix_column_op(aggregate::min(),
-                      A_d,
-                      C_d,
-                      dimM,
-                      dimN);
+  hl_matrix_column_op(aggregate::min(), A_d, C_d, dimM, dimN);
 
   CHECK_SYNC("hl_matrix_column_min failed");
 }
@@ -226,16 +184,16 @@ template <int blockSize>
 __global__ void KeVectorSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;
 
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += E[index];
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -259,38 +217,39 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
-  struct _hl_event_st hl_event_st  = {.cu_event = t_resource.event};
+  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
-  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-           (A_d, t_resource.gpu_mem, dimM);
-  KeVectorSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-           (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
 
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
 
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-    << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
 
 template <int blockSize>
 __global__ void KeVectorAbsSum(real *E, real *Sum, int dimM) {
   __shared__ double sum_s[blockSize];
   int tid = threadIdx.x;
-  int index = blockIdx.y*blockDim.x+threadIdx.x;
+  int index = blockIdx.y * blockDim.x + threadIdx.x;
 
   sum_s[tid] = 0.0f;
   while (index < dimM) {
     sum_s[tid] += abs(E[index]);
-    index += blockDim.x*gridDim.y;
+    index += blockDim.x * gridDim.y;
   }
   __syncthreads();
 
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
+  for (int stride = blockSize / 2; stride > 0; stride = stride / 2) {
     if (tid < stride) {
       sum_s[tid] += sum_s[tid + stride];
     }
@@ -314,20 +273,21 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
   dim3 threads(blockSize, 1);
   dim3 grid(blocksX, blocksY);
 
-  struct _hl_event_st hl_event_st  = {.cu_event = t_resource.event};
+  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
   hl_event_t hl_event = &hl_event_st;
-  while (!hl_cuda_event_is_ready(hl_event)) {}
+  while (!hl_cuda_event_is_ready(hl_event)) {
+  }
 
-  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
-           (A_d, t_resource.gpu_mem, dimM);
-  KeVectorAbsSum<128><<< 1, threads, 0, STREAM_DEFAULT >>>
-           (t_resource.gpu_mem, t_resource.cpu_mem, 128);
+  KeVectorAbsSum<128><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, t_resource.gpu_mem, dimM);
+  KeVectorAbsSum<128><<<1, threads, 0, STREAM_DEFAULT>>>(
+      t_resource.gpu_mem, t_resource.cpu_mem, 128);
 
   hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
   hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);
 
   hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   cudaError_t err = (cudaError_t)hl_get_device_last_error();
-  CHECK_EQ(cudaSuccess, err)
-    << "CUDA error: " << hl_get_device_error_string((size_t)err);
+  CHECK_EQ(cudaSuccess, err) << "CUDA error: "
+                             << hl_get_device_error_string((size_t)err);
 }
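
Every aggregate in this file follows the same two-phase pattern: each thread accumulates a strided slice of the input into shared memory, then a halving tree combines the partials in log2(blockSize) steps, which is why blockSize must be a power of two. Distilled into a standalone single-block kernel (names are ours):

template <int blockSize>
__global__ void KeReduceSumSketch(const real *in, real *out, int n) {
  __shared__ real sm[blockSize];
  int tid = threadIdx.x;
  real acc = 0;
  for (int i = tid; i < n; i += blockSize) acc += in[i];  // strided load
  sm[tid] = acc;
  __syncthreads();
  for (int stride = blockSize / 2; stride > 0; stride /= 2) {  // tree phase
    if (tid < stride) sm[tid] += sm[tid + stride];
    __syncthreads();
  }
  if (tid == 0) *out = sm[0];  // assumes a single-block launch
}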
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index b94f4d8fe4..2d1bc4f6d5 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -12,149 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include <float.h>
 #include "hl_base.h"
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
-__global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
-                              size_t blockH, size_t blockW, size_t width,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_col) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    size_t w_out = index % width_col;
-    index /= width_col;
-    size_t h_out = index % height_col;
-    size_t channel_in = index / height_col;
-    size_t channel_out = channel_in * blockH * blockW;
-    size_t h_in = h_out * strideH;
-    size_t w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (size_t i = 0; i < blockH; ++i) {
-      for (size_t j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-void hl_expand_feature2col(const real* dataIm, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataCol) {
-  size_t numKernels = channels * outputH * outputW;
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-  KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, height, dataIm, blockH, blockW, width,
-           strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataCol);
-  CHECK_SYNC("hl_expand_feature2col failed");
-}
-
-__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height,
-                              size_t width, size_t channels,
-                              size_t blockH, size_t blockW,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_im, real alpha, real beta) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    real val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-                          h*(width-2*paddingW) + w];
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] = alpha * val + beta*tD;
-    }
-  }
-}
-
-void hl_shrink_col2feature(const real * dataCol, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataIm, real alpha, real beta) {
-  size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW);
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW,
-           channels, blockH, blockW, strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataIm, alpha, beta);
-  CHECK_SYNC("hl_shrink_col2feature failed");
-}
-
-__global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
-                                 const int channels, const int height,
+__global__ void KeMaxPoolForward(const int nthreads,
+                                 const real* inputData,
+                                 const int channels,
+                                 const int height,
                                  const int width,
-                                 const int pooledH, const int pooledW,
-                                 const int ksizeW, const int ksizeH,
-                                 const int strideH, const int strideW,
-                                 const int offsetH, const int offsetW,
-                                 real* tgtData, const int tgtStride) {
-  int index =  blockIdx.x * blockDim.x + threadIdx.x;
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int ksizeW,
+                                 const int ksizeH,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int offsetH,
+                                 const int offsetW,
+                                 real* tgtData,
+                                 const int tgtStride,
+                                 real* maskData) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
     int ph = (index / pooledW) % pooledH;
@@ -167,51 +46,85 @@ __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
     real maxval = -FLT_MAX;
+    int max_index = -1;
     inputData += (frameNum * channels + c) * height * width;
     for (int h = hstart; h < hend; ++h) {
       for (int w = wstart; w < wend; ++w) {
-        if (maxval < inputData[h * width + w])
-          maxval = inputData[h * width + w];
+        if (maxval < inputData[h * width + w]) {
+          max_index = h * width + w;
+          maxval = inputData[max_index];
+        }
       }
     }
-    int tgtIndex = index % (pooledW * pooledH * channels) +
-        frameNum * tgtStride;
+    int tgtIndex =
+        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
     tgtData[tgtIndex] = maxval;
+    if (maskData != NULL) {
+      maskData[tgtIndex] = max_index;
+    }
   }
 }
 
-void hl_maxpool_forward(const int frameCnt, const real* inputData,
+void hl_maxpool_forward(const int frameCnt,
+                        const real* inputData,
                         const int channels,
-                        const int height, const int width,
-                        const int pooledH, const int pooledW,
-                        const int sizeX, const int sizeY,
-                        const int strideH, const int strideW,
-                        const int paddingH, const int paddingW,
-                        real* tgtData, const int tgtStride) {
-
+                        const int height,
+                        const int width,
+                        const int pooledH,
+                        const int pooledW,
+                        const int sizeX,
+                        const int sizeY,
+                        const int strideH,
+                        const int strideW,
+                        const int paddingH,
+                        const int paddingW,
+                        real* tgtData,
+                        const int tgtStride,
+                        real* maskData) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
 
-  KeMaxPoolForward<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (num_kernels, inputData, channels, height, width,
-           pooledH, pooledW, sizeX, sizeY, strideH, strideW,
-           paddingH, paddingW, tgtData, tgtStride);
+  KeMaxPoolForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         inputData,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         tgtData,
+                                                         tgtStride,
+                                                         maskData);
   CHECK_SYNC("hl_maxpool_forward failed");
 }
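
The new maskData argument records, for every pooled output, the flat offset h * width + w of the winning input within its (frame, channel) plane; an unpooling or gradient-routing layer can then scatter values back without rescanning the window. The contract as a scalar check (helper name is ours, for illustration; exact float equality is intended because maxval is copied straight from the input):

bool mask_consistent(const real* input, const real* tgtData,
                     const real* maskData, int frameNum, int c, int channels,
                     int height, int width, int tgtIndex) {
  const real* plane = input + (frameNum * channels + c) * height * width;
  return plane[(int)maskData[tgtIndex]] == tgtData[tgtIndex];
}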
 
-__global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
-                                  const real* outData, const real* outGrad,
-                                  const int channels, const int height,
+__global__ void KeMaxPoolBackward(const int nthreads,
+                                  const real* inputData,
+                                  const real* outData,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int height,
                                   const int width,
-                                  const int pooledH, const int pooledW,
-                                  const int sizeX, const int sizeY,
-                                  const int strideH, const int strideW,
-                                  const int padH, const int padW,
-                                  real scaleA, real scaleB,
-                                  real* targetGrad, const int outStride) {
-  int index = blockIdx.x  * blockDim.x + threadIdx.x;
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeX,
+                                  const int sizeY,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int padH,
+                                  const int padW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* targetGrad,
+                                  const int outStride) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     // find out the local index
     // find out the local offset
@@ -235,43 +148,70 @@ __global__ void KeMaxPoolBackward(const int nthreads, const real* inputData,
         }
       }
     }
-    targetGrad[index] =
-      scaleB * targetGrad[index] + scaleA * gradient;
+    targetGrad[index] = scaleB * targetGrad[index] + scaleA * gradient;
   }
 }
 
-void hl_maxpool_backward(const int frameCnt, const real* inputData,
-                        const real* outData, const real* outGrad,
-                        const int channels, const int height,
-                        const int width,
-                        const int pooledH, const int pooledW,
-                        const int sizeX, const int sizeY,
-                        const int strideH, const int strideW,
-                        const int paddingH, const int paddingW,
-                        real scaleA, real scaleB,
-                        real* targetGrad, const int outStride) {
-
+void hl_maxpool_backward(const int frameCnt,
+                         const real* inputData,
+                         const real* outData,
+                         const real* outGrad,
+                         const int channels,
+                         const int height,
+                         const int width,
+                         const int pooledH,
+                         const int pooledW,
+                         const int sizeX,
+                         const int sizeY,
+                         const int strideH,
+                         const int strideW,
+                         const int paddingH,
+                         const int paddingW,
+                         real scaleA,
+                         real scaleB,
+                         real* targetGrad,
+                         const int outStride) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
-  KeMaxPoolBackward<<< blocks, 1024, 0, STREAM_DEFAULT >>>
-           (num_kernels, inputData, outData, outGrad, channels,
-           height, width, pooledH, pooledW, sizeX, sizeY,
-           strideH, strideW,
-           paddingH, paddingW,
-           scaleA, scaleB,
-           targetGrad, outStride);
+  KeMaxPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         inputData,
+                                                         outData,
+                                                         outGrad,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         scaleA,
+                                                         scaleB,
+                                                         targetGrad,
+                                                         outStride);
   CHECK_SYNC("hl_maxpool_backward");
 }
 
-__global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
+__global__ void KeAvgPoolForward(const int nthreads,
+                                 const real* inputData,
                                  const int channels,
-                                 const int height, const int width,
-                                 const int pooledH, const int pooledW,
-                                 const int sizeX, const int sizeY,
-                                 const int strideH, const int strideW,
-                                 const int padH, const int padW,
-                                 real* tgtData, const int tgtStride) {
+                                 const int height,
+                                 const int width,
+                                 const int pooledH,
+                                 const int pooledW,
+                                 const int sizeX,
+                                 const int sizeY,
+                                 const int strideH,
+                                 const int strideW,
+                                 const int padH,
+                                 const int padW,
+                                 real* tgtData,
+                                 const int tgtStride,
+                                 const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int pw = index % pooledW;
@@ -281,13 +221,12 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
 
     int hstart = ph * strideH - padH;
     int wstart = pw * strideW - padW;
-    int hend = min(hstart + sizeY, height + padH);
-    int wend = min(wstart + sizeX, width + padW);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
     hstart = max(hstart, 0);
     wstart = max(wstart, 0);
-    hend = min(hend, height);
-    wend = min(wend, width);
+    int poolSize =
+        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
 
     real aveval = 0;
     inputData += (frameNum * channels + c) * height * width;
@@ -296,39 +235,67 @@ __global__ void KeAvgPoolForward(const int nthreads, const real* inputData,
         aveval += inputData[h * width + w];
       }
     }
-    int tgtIndex = index % (pooledW * pooledH * channels) +
-        frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
+    int tgtIndex =
+        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / poolSize;
   }
 }
 
-void hl_avgpool_forward(const int frameCnt, const real* inputData,
+void hl_avgpool_forward(const int frameCnt,
+                        const real* inputData,
                         const int channels,
-                        const int height, const int width,
-                        const int pooledH, const int pooledW,
-                        const int sizeX, const int sizeY,
-                        const int strideH, const int strideW,
-                        const int paddingH, const int paddingW, 
-                        real* tgtData, const int tgtStride) {
+                        const int height,
+                        const int width,
+                        const int pooledH,
+                        const int pooledW,
+                        const int sizeX,
+                        const int sizeY,
+                        const int strideH,
+                        const int strideW,
+                        const int paddingH,
+                        const int paddingW,
+                        real* tgtData,
+                        const int tgtStride,
+                        const bool excludeMode) {
   int num_kernels = pooledH * pooledW * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
-  KeAvgPoolForward<<< blocks, 1024, 0, STREAM_DEFAULT >>>
-           (num_kernels, inputData, channels,
-           height, width, pooledH, pooledW,
-           sizeX, sizeY, strideH, strideW,
-           paddingH, paddingW, tgtData, tgtStride);
+  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                        inputData,
+                                                        channels,
+                                                        height,
+                                                        width,
+                                                        pooledH,
+                                                        pooledW,
+                                                        sizeX,
+                                                        sizeY,
+                                                        strideH,
+                                                        strideW,
+                                                        paddingH,
+                                                        paddingW,
+                                                        tgtData,
+                                                        tgtStride,
+                                                        excludeMode);
   CHECK_SYNC("hl_avgpool_forward failed");
 }
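
The new excludeMode flag selects the average-pool divisor: when true (exclude padding) each window divides by the number of in-bounds elements it actually summed, and when false it always divides by the full sizeY * sizeX window, so padded zeros drag the mean down. Note the rewrite also clips hend/wend to the image before computing the divisor, replacing the old clip to height + padH / width + padW. The divisor choice as a scalar helper (name is ours):

int pool_divisor(int hstart, int hend, int wstart, int wend,
                 int sizeY, int sizeX, bool excludeMode) {
  // hstart/wstart already clamped to 0; hend/wend to height/width
  return excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
}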
 
-__global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
-                                  const int channels, const int height,
+__global__ void KeAvgPoolBackward(const int nthreads,
+                                  const real* outGrad,
+                                  const int channels,
+                                  const int height,
                                   const int width,
-                                  const int pooledH, const int pooledW,
-                                  const int sizeX, const int sizeY,
-                                  const int strideH, const int strideW,
-                                  const int padH, const int padW,
-                                  real scaleA, real scaleB,
-                                  real* tgtGrad, const int outStride) {
+                                  const int pooledH,
+                                  const int pooledW,
+                                  const int sizeX,
+                                  const int sizeY,
+                                  const int strideH,
+                                  const int strideW,
+                                  const int padH,
+                                  const int padW,
+                                  real scaleA,
+                                  real scaleB,
+                                  real* tgtGrad,
+                                  const int outStride,
+                                  const bool excludeMode) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < nthreads) {
     int offsetW = index % width + padW;
@@ -343,44 +310,493 @@ __global__ void KeAvgPoolBackward(const int nthreads, const real* outGrad,
     real gradient = 0;
     outGrad += (frameNum * outStride + offsetC * pooledH * pooledW);
 
-
     for (int ph = phstart; ph < phend; ++ph) {
+      int hstart = ph * strideH - padH;
+      int hend = min(hstart + sizeY, height);
+      hstart = max(hstart, 0);
       for (int pw = pwstart; pw < pwend; ++pw) {
         // figure out the pooling size
-        int hstart = ph * strideH - padH;
         int wstart = pw * strideW - padW;
-        int hend = min(hstart + sizeY, height + padH);
-        int wend = min(wstart + sizeX, width + padW);
-        int poolsize = (hend - hstart) * (wend - wstart);
-        gradient += outGrad[ph * pooledW + pw]/poolsize;
+        int wend = min(wstart + sizeX, width);
+        wstart = max(wstart, 0);
+        int poolSize =
+            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+        gradient += outGrad[ph * pooledW + pw] / poolSize;
       }
     }
     tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
   }
 }
 
-void hl_avgpool_backward(const int frameCnt, const real* outGrad,
+void hl_avgpool_backward(const int frameCnt,
+                         const real* outGrad,
                          const int channels,
-                         const int height, const int width,
-                         const int pooledH, const int pooledW,
-                         const int sizeX, const int sizeY,
-                         const int strideH, const int strideW,
-                         const int paddingH, const int paddingW,
-                         real scaleA, real scaleB,
-                         real* backGrad, const int outStride) {
+                         const int height,
+                         const int width,
+                         const int pooledH,
+                         const int pooledW,
+                         const int sizeX,
+                         const int sizeY,
+                         const int strideH,
+                         const int strideW,
+                         const int paddingH,
+                         const int paddingW,
+                         real scaleA,
+                         real scaleB,
+                         real* backGrad,
+                         const int outStride,
+                         const bool excludeMode) {
   int num_kernels = height * width * channels * frameCnt;
   int blocks = (num_kernels + 1024 - 1) / 1024;
 
-  KeAvgPoolBackward <<< blocks, 1024, 0, STREAM_DEFAULT >>>
-           (num_kernels, outGrad, channels, height, width,
-           pooledH, pooledW, sizeX, sizeY,
-           strideH, strideW,
-           paddingH, paddingW,
-           scaleA, scaleB,
-           backGrad, outStride);
+  KeAvgPoolBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                         outGrad,
+                                                         channels,
+                                                         height,
+                                                         width,
+                                                         pooledH,
+                                                         pooledW,
+                                                         sizeX,
+                                                         sizeY,
+                                                         strideH,
+                                                         strideW,
+                                                         paddingH,
+                                                         paddingW,
+                                                         scaleA,
+                                                         scaleB,
+                                                         backGrad,
+                                                         outStride,
+                                                         excludeMode);
   CHECK_SYNC("hl_avgpool_backward failed");
 }
 
+__global__ void KeMaxPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int ksizeD,
+                                   const int ksizeH,
+                                   const int ksizeW,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   real* maxPoolIdxData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + ksizeD, depth);
+    int hend = min(hstart + ksizeH, height);
+    int wend = min(wstart + ksizeW, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    real maxval = -FLT_MAX;
+    int maxIdx = -1;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (maxval < inputData[(d * height + h) * width + w]) {
+            maxval = inputData[(d * height + h) * width + w];
+            maxIdx = (d * height + h) * width + w;
+          }
+        }
+      }
+    }
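+    // tgtStride lets the per-frame outputs be stored non-contiguously.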
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = maxval;
+    maxPoolIdxData[tgtIndex] = maxIdx;
+  }
+}
+
+void hl_maxpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int padD,
+                          const int padH,
+                          const int padW,
+                          real* tgtData,
+                          real* maxPoolIdxData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KeMaxPool3DForward<<<grid, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           inputData,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           pooledD,
+                                                           pooledH,
+                                                           pooledW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           padD,
+                                                           padH,
+                                                           padW,
+                                                           tgtData,
+                                                           maxPoolIdxData,
+                                                           tgtStride);
+  CHECK_SYNC("hl_maxpool3D_forward failed");
+}
+
+__global__ void KeMaxPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* targetGrad,
+                                    real* maxPoolIdxData,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width;
+    int offsetH = (index / width) % height;
+    int offsetD = (index / width / height) % depth;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
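+    // Range of pooled outputs whose (padded) windows cover this input voxel.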
+    int pdstart =
+        (offsetD + padD < sizeZ) ? 0 : (offsetD + padD - sizeZ) / strideD + 1;
+    int phstart =
+        (offsetH + padH < sizeY) ? 0 : (offsetH + padH - sizeY) / strideH + 1;
+    int pwstart =
+        (offsetW + padW < sizeX) ? 0 : (offsetW + padW - sizeX) / strideW + 1;
+    int pdend = min((offsetD + padD) / strideD + 1, pooledD);
+    int phend = min((offsetH + padH) / strideH + 1, pooledH);
+    int pwend = min((offsetW + padW) / strideW + 1, pooledW);
+
+    real gradient = 0;
+    outGrad += ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    maxPoolIdxData +=
+        ((frameNum * channels + offsetC) * pooledD * pooledH * pooledW);
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          if (((offsetD * height + offsetH) * width + offsetW) ==
+              maxPoolIdxData[(pd * pooledH + ph) * pooledW + pw])
+            gradient += outGrad[(pd * pooledH + ph) * pooledW + pw];
+        }
+      }
+    }
+    targetGrad[index] = scaleA * gradient + scaleB * targetGrad[index];
+  }
+}
+
+void hl_maxpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           const int paddingD,
+                           const int paddingH,
+                           const int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* targetGrad,
+                           real* maxPoolIdxData,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeMaxPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           targetGrad,
+                                                           maxPoolIdxData,
+                                                           outStride);
+  CHECK_SYNC("hl_maxpool3D_backward");
+}
+
+__global__ void KeAvgPool3DForward(const int nthreads,
+                                   const real* inputData,
+                                   const int channels,
+                                   const int depth,
+                                   const int height,
+                                   const int width,
+                                   const int pooledD,
+                                   const int pooledH,
+                                   const int pooledW,
+                                   const int sizeZ,
+                                   const int sizeY,
+                                   const int sizeX,
+                                   const int strideD,
+                                   const int strideH,
+                                   const int strideW,
+                                   const int padD,
+                                   const int padH,
+                                   const int padW,
+                                   real* tgtData,
+                                   const int tgtStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int pw = index % pooledW;
+    int ph = (index / pooledW) % pooledH;
+    int pd = (index / pooledW / pooledH) % pooledD;
+    int c = (index / pooledW / pooledH / pooledD) % channels;
+    int frameNum = index / pooledW / pooledH / pooledD / channels;
+    int dstart = pd * strideD - padD;
+    int hstart = ph * strideH - padH;
+    int wstart = pw * strideW - padW;
+    int dend = min(dstart + sizeZ, depth);
+    int hend = min(hstart + sizeY, height);
+    int wend = min(wstart + sizeX, width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
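+    // pool_size counts only in-bounds voxels, so padding is excluded from
+    // the average.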
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+
+    real aveval = 0;
+    inputData += (frameNum * channels + c) * depth * height * width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          aveval += inputData[(d * height + h) * width + w];
+        }
+      }
+    }
+    int tgtIndex =
+        index % (pooledW * pooledH * pooledD * channels) + frameNum * tgtStride;
+    tgtData[tgtIndex] = aveval / pool_size;
+  }
+}
+
+void hl_avgpool3D_forward(const int frameCnt,
+                          const real* inputData,
+                          const int channels,
+                          const int depth,
+                          const int height,
+                          const int width,
+                          const int pooledD,
+                          const int pooledH,
+                          const int pooledW,
+                          const int sizeZ,
+                          const int sizeY,
+                          const int sizeX,
+                          const int strideD,
+                          const int strideH,
+                          const int strideW,
+                          const int paddingD,
+                          const int paddingH,
+                          const int paddingW,
+                          real* tgtData,
+                          const int tgtStride) {
+  int num_kernels = pooledD * pooledH * pooledW * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  KeAvgPool3DForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          inputData,
+                                                          channels,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          pooledD,
+                                                          pooledH,
+                                                          pooledW,
+                                                          sizeZ,
+                                                          sizeY,
+                                                          sizeX,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          tgtData,
+                                                          tgtStride);
+  CHECK_SYNC("hl_avgpool3D_forward failed");
+}
+
+__global__ void KeAvgPool3DBackward(const int nthreads,
+                                    const real* outGrad,
+                                    const int channels,
+                                    const int depth,
+                                    const int height,
+                                    const int width,
+                                    const int pooledD,
+                                    const int pooledH,
+                                    const int pooledW,
+                                    const int sizeZ,
+                                    const int sizeY,
+                                    const int sizeX,
+                                    const int strideD,
+                                    const int strideH,
+                                    const int strideW,
+                                    const int padD,
+                                    const int padH,
+                                    const int padW,
+                                    real scaleA,
+                                    real scaleB,
+                                    real* tgtGrad,
+                                    const int outStride) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < (nthreads);
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % width + padW;
+    int offsetH = (index / width) % height + padH;
+    int offsetD = (index / width / height) % depth + padD;
+    int offsetC = (index / width / height / depth) % channels;
+    int frameNum = index / width / height / depth / channels;
+
+    int pdstart = (offsetD < sizeZ) ? 0 : (offsetD - sizeZ) / strideD + 1;
+    int phstart = (offsetH < sizeY) ? 0 : (offsetH - sizeY) / strideH + 1;
+    int pwstart = (offsetW < sizeX) ? 0 : (offsetW - sizeX) / strideW + 1;
+    int pdend = min(offsetD / strideD + 1, pooledD);
+    int phend = min(offsetH / strideH + 1, pooledH);
+    int pwend = min(offsetW / strideW + 1, pooledW);
+
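+    // Every pooling window covering this voxel contributes outGrad / poolsize;
+    // poolsize is recomputed per window (excluding padding) to mirror the
+    // forward average.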
+    real gradient = 0;
+    outGrad += (frameNum * channels + offsetC) * pooledD * pooledH * pooledW;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      int dstart = pd * strideD - padD;
+      int dend = min(dstart + sizeZ, depth);
+      dstart = max(dstart, 0);
+      for (int ph = phstart; ph < phend; ++ph) {
+        int hstart = ph * strideH - padH;
+        int hend = min(hstart + sizeY, height);
+        hstart = max(hstart, 0);
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int wstart = pw * strideW - padW;
+          int wend = min(wstart + sizeX, width);
+          wstart = max(wstart, 0);
+          int poolsize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          gradient += outGrad[(pd * pooledH + ph) * pooledW + pw] / poolsize;
+        }
+      }
+    }
+    tgtGrad[index] = scaleA * gradient + scaleB * tgtGrad[index];
+  }
+}
+
+void hl_avgpool3D_backward(const int frameCnt,
+                           const real* outGrad,
+                           const int channels,
+                           const int depth,
+                           const int height,
+                           const int width,
+                           const int outputD,
+                           const int outputH,
+                           const int outputW,
+                           const int sizeZ,
+                           const int sizeY,
+                           const int sizeX,
+                           const int strideD,
+                           const int strideH,
+                           const int strideW,
+                           const int paddingD,
+                           const int paddingH,
+                           const int paddingW,
+                           real scaleA,
+                           real scaleB,
+                           real* backGrad,
+                           const int outStride) {
+  int num_kernels = depth * height * width * channels * frameCnt;
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+
+  KeAvgPool3DBackward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                           outGrad,
+                                                           channels,
+                                                           depth,
+                                                           height,
+                                                           width,
+                                                           outputD,
+                                                           outputH,
+                                                           outputW,
+                                                           sizeZ,
+                                                           sizeY,
+                                                           sizeX,
+                                                           strideD,
+                                                           strideH,
+                                                           strideW,
+                                                           paddingD,
+                                                           paddingH,
+                                                           paddingW,
+                                                           scaleA,
+                                                           scaleB,
+                                                           backGrad,
+                                                           outStride);
+  CHECK_SYNC("hl_avgpool3D_backward failed");
+}
+
 __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t inImgH,
                                    const size_t inImgW,
@@ -394,7 +810,7 @@ __global__ void KeBilinearInterpFw(const real* in,
                                    const size_t numChannels,
                                    const real ratioH,
                                    const real ratioW) {
-  int nthreads = outputH * outputW;                      
+  int nthreads = outputH * outputW;
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   if (tid < nthreads) {
     int outIdH = tid / outputW;
@@ -415,13 +831,14 @@ __global__ void KeBilinearInterpFw(const real* in,
     real w1lambda = ratioW * outImgIdx - inImgIdx;
     real w2lambda = 1.f - w1lambda;
 
-    const real* inPos =
-      &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx];
+    const real* inPos = &in[outIdH * inputW + channelId * inImgSize +
+                            inImgIdy * inImgW + inImgIdx];
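+    // inPos is the top-left pixel of the 2x2 input neighborhood interpolated
+    // below; wId and hId step to its right and bottom neighbors.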
 
     // bilinear interpolation
     out[outIdH * outputW + outIdW] =
-      h2lambda * (w2lambda * inPos[0]            + w1lambda * inPos[wId]) + 
-      h1lambda * (w2lambda * inPos[hId * inImgW] + w1lambda * inPos[hId * inImgW + wId]);
+        h2lambda * (w2lambda * inPos[0] + w1lambda * inPos[wId]) +
+        h1lambda * (w2lambda * inPos[hId * inImgW] +
+                    w1lambda * inPos[hId * inImgW + wId]);
   }
 }
 
@@ -441,9 +858,19 @@ void hl_bilinear_forward(const real* inData,
   int threadNum = outputH * outputW;
   int blocks = (threadNum + 1024 - 1) / 1024;
 
-  KeBilinearInterpFw<<< blocks, 1024, 0, STREAM_DEFAULT>>>(
-    inData, inImgH, inImgW, inputH, inputW, outData, outImgH,
-    outImgW, outputH, outputW, numChannels, ratioH, ratioW);
+  KeBilinearInterpFw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inData,
+                                                          inImgH,
+                                                          inImgW,
+                                                          inputH,
+                                                          inputW,
+                                                          outData,
+                                                          outImgH,
+                                                          outImgW,
+                                                          outputH,
+                                                          outputW,
+                                                          numChannels,
+                                                          ratioH,
+                                                          ratioW);
   CHECK_SYNC("hl_bilinear_forward failed");
 }
 
@@ -481,13 +908,15 @@ __global__ void KeBilinearInterpBw(real* in,
     real w1lambda = ratioW * outImgIdx - inImgIdx;
     real w2lambda = 1.f - w1lambda;
 
-    real* inPos =
-      &in[outIdH * inputW + channelId * inImgSize + inImgIdy * inImgW + inImgIdx];
+    real* inPos = &in[outIdH * inputW + channelId * inImgSize +
+                      inImgIdy * inImgW + inImgIdx];
     const real* outPos = &out[outIdH * outputW + outIdW];
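+    // Scatter the gradient to the four neighboring input pixels atomically;
+    // several output pixels may map onto the same input pixel.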
     paddle::paddleAtomicAdd(&inPos[0], h2lambda * w2lambda * outPos[0]);
     paddle::paddleAtomicAdd(&inPos[wId], h2lambda * w1lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW], h1lambda * w2lambda * outPos[0]);
-    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId], h1lambda * w1lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW],
+                            h1lambda * w2lambda * outPos[0]);
+    paddle::paddleAtomicAdd(&inPos[hId * inImgW + wId],
+                            h1lambda * w1lambda * outPos[0]);
   }
 }
 
@@ -507,22 +936,37 @@ void hl_bilinear_backward(real* inGrad,
   int threadNum = outputH * outputW;
   int blocks = (threadNum + 1024 - 1) / 1024;
 
-  KeBilinearInterpBw<<< blocks, 1024, 0, STREAM_DEFAULT>>>(
-    inGrad, inImgH, inImgW, inputH, inputW, outGrad, outImgH,
-    outImgW, outputH, outputW, numChannels, ratioH, ratioW);
+  KeBilinearInterpBw<<<blocks, 1024, 0, STREAM_DEFAULT>>>(inGrad,
+                                                          inImgH,
+                                                          inImgW,
+                                                          inputH,
+                                                          inputW,
+                                                          outGrad,
+                                                          outImgH,
+                                                          outImgW,
+                                                          outputH,
+                                                          outputW,
+                                                          numChannels,
+                                                          ratioH,
+                                                          ratioW);
   CHECK_SYNC("hl_bilinear_backward failed");
 }
 
-__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
-                                real * outData, int* idData, 
-                                size_t size, size_t featLen, size_t groups) {
+__global__ void maxoutFpCompute(size_t nthreads,
+                                const real* inData,
+                                real* outData,
+                                int* idData,
+                                size_t size,
+                                size_t featLen,
+                                size_t groups) {
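+  // Maxout: each output feature is the max over `groups` consecutive input
+  // feature maps; idData records the argmax group for the backward pass.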
   int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if(index < nthreads) {
+  if (index < nthreads) {
     size_t batch_idx = index / size;
     size_t i = index % size;
     size_t channel_idx = i / featLen;
     size_t feat_idx = i % featLen;
-    size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
+    size_t data_idx =
+        (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
     real max = inData[data_idx];
     int maxId = 0;
     for (size_t g = 1; g < groups; ++g) {
@@ -537,37 +981,50 @@ __global__ void maxoutFpCompute(size_t nthreads, const real * inData,
   }
 }
 
-void hl_maxout_forward(const real* inData, real* outData,
-                       int* idData, size_t batchSize, size_t size,
-                       size_t featLen, size_t groups) {
+void hl_maxout_forward(const real* inData,
+                       real* outData,
+                       int* idData,
+                       size_t batchSize,
+                       size_t size,
+                       size_t featLen,
+                       size_t groups) {
   int num_kernels = size * batchSize;
   int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>(
-    num_kernels, inData, outData, idData, size, featLen, groups);
+  maxoutFpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
+      num_kernels, inData, outData, idData, size, featLen, groups);
   CHECK_SYNC("hl_maxout_forward failed");
 }
 
-__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
-                                const real* outGrad, const int* idData,
-                                size_t size, size_t featLen, size_t groups) {
+__global__ void maxoutBpCompute(size_t nthreads,
+                                real* inGrad,
+                                const real* outGrad,
+                                const int* idData,
+                                size_t size,
+                                size_t featLen,
+                                size_t groups) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
-  if(index < nthreads) {
+  if (index < nthreads) {
     size_t batch_idx = index / size;
     size_t i = index % size;
     size_t channel_idx = i / featLen;
     size_t feat_idx = i % featLen;
     size_t newIndex = batch_idx * size;
-    size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
+    size_t gradIdx =
+        (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
     (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
   }
 }
 
-void hl_maxout_backward(real* inGrad, const real* outGrad,
-                        const int* idData, size_t batchSize, size_t size,
-                        size_t featLen, size_t groups) {
+void hl_maxout_backward(real* inGrad,
+                        const real* outGrad,
+                        const int* idData,
+                        size_t batchSize,
+                        size_t size,
+                        size_t featLen,
+                        size_t groups) {
   int num_kernels = size * batchSize;
   int blocks = (num_kernels + 1024 - 1) / 1024;
-  maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>(
-    num_kernels, inGrad, outGrad, idData, size, featLen, groups);
+  maxoutBpCompute<<<blocks, 1024, 0, STREAM_DEFAULT>>>(
+      num_kernels, inGrad, outGrad, idData, size, featLen, groups);
   CHECK_SYNC("hl_maxout_backward failed");
 }
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index c53a563682..b8caf48f9c 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -201,7 +201,8 @@ void hl_conv_workspace(hl_tensor_descriptor input,
                        int* convBwdDataAlgo,
                        size_t* bwdDataLimitBytes,
                        int* convBwdFilterAlgo,
-                       size_t* bwdFilterLimitBytes) {
+                       size_t* bwdFilterLimitBytes,
+                       bool useDilation) {
 #if CUDNN_VERSION >= 4000
 
   CHECK_NOTNULL(input);
@@ -213,21 +214,60 @@ void hl_conv_workspace(hl_tensor_descriptor input,
   size_t memoryLimitBytes =
       (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
 
+  // For dilated convolution, skip the cuDNN algorithm-selection heuristics
+  // below and fall back to the default algorithm (0), which supports dilation.
+  int algo = 0;
+
   // cudnn convolution forward configuration
   cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
   cudnnTensorDescriptor_t fwd_dest_desc = GET_TENSOR_DESCRIPTOR(output);
   cudnnFilterDescriptor_t fwd_filter_desc = GET_FILTER_DESCRIPTOR(filter);
   cudnnConvolutionDescriptor_t fwd_conv_desc = GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward data configuration
+  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
+  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  // cudnn convolution backward filter configuration
+  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
+  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
+  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
+      GET_CONVOLUTION_DESCRIPTOR(conv);
+  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
 
-  CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
-      t_resource.cudnn_handle,
-      fwd_src_desc,
-      fwd_filter_desc,
-      fwd_conv_desc,
-      fwd_dest_desc,
-      CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+  if (useDilation) {
+    convFwdAlgo = &algo;
+    convBwdDataAlgo = &algo;
+    convBwdFilterAlgo = &algo;
+  } else {
+    CHECK_CUDNN(dynload::cudnnGetConvolutionForwardAlgorithm(
+        t_resource.cudnn_handle,
+        fwd_src_desc,
+        fwd_filter_desc,
+        fwd_conv_desc,
+        fwd_dest_desc,
+        CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionFwdAlgo_t*>(convFwdAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_data_filter_desc,
+        bwd_data_diff_desc,
+        bwd_data_conv_desc,
+        bwd_data_grad_desc,
+        CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
+    CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+        t_resource.cudnn_handle,
+        bwd_filter_src_desc,
+        bwd_filter_diff_desc,
+        bwd_filter_conv_desc,
+        bwd_filter_grad_desc,
+        CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+        memoryLimitBytes,
+        reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
+  }
 
   CHECK_CUDNN(dynload::cudnnGetConvolutionForwardWorkspaceSize(
       t_resource.cudnn_handle,
@@ -238,23 +278,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionFwdAlgo_t>(*convFwdAlgo),
       fwdLimitBytes));
 
-  // cudnn convolution backward data configuration
-  cudnnFilterDescriptor_t bwd_data_filter_desc = GET_FILTER_DESCRIPTOR(filter);
-  cudnnTensorDescriptor_t bwd_data_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnTensorDescriptor_t bwd_data_grad_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnConvolutionDescriptor_t bwd_data_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_data_filter_desc,
-      bwd_data_diff_desc,
-      bwd_data_conv_desc,
-      bwd_data_grad_desc,
-      CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdDataAlgo_t*>(convBwdDataAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_data_filter_desc,
@@ -264,23 +287,6 @@ void hl_conv_workspace(hl_tensor_descriptor input,
       static_cast<cudnnConvolutionBwdDataAlgo_t>(*convBwdDataAlgo),
       bwdDataLimitBytes));
 
-  // cudnn convolution backward filter configuration
-  cudnnTensorDescriptor_t bwd_filter_src_desc = GET_TENSOR_DESCRIPTOR(input);
-  cudnnTensorDescriptor_t bwd_filter_diff_desc = GET_TENSOR_DESCRIPTOR(output);
-  cudnnConvolutionDescriptor_t bwd_filter_conv_desc =
-      GET_CONVOLUTION_DESCRIPTOR(conv);
-  cudnnFilterDescriptor_t bwd_filter_grad_desc = GET_FILTER_DESCRIPTOR(filter);
-
-  CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
-      t_resource.cudnn_handle,
-      bwd_filter_src_desc,
-      bwd_filter_diff_desc,
-      bwd_filter_conv_desc,
-      bwd_filter_grad_desc,
-      CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
-      memoryLimitBytes,
-      reinterpret_cast<cudnnConvolutionBwdFilterAlgo_t*>(convBwdFilterAlgo)));
-
   CHECK_CUDNN(dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
       t_resource.cudnn_handle,
       bwd_filter_src_desc,
@@ -426,11 +432,11 @@ void hl_create_pooling_descriptor(hl_pooling_descriptor* pooling_desc,
       cudnn_mode = CUDNN_POOLING_MAX;
       break;
     case HL_POOLING_AVERAGE:
-      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
-      break;
-    case HL_POOLING_AVERAGE_EXCLUDE_PADDING:
       cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING;
       break;
+    case HL_POOLING_AVERAGE_INCLUDE_PADDING:
+      cudnn_mode = CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING;
+      break;
     default:
       LOG(FATAL) << "parameter mode error";
   }
@@ -603,7 +609,9 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                       int padding_height,
                                       int padding_width,
                                       int stride_height,
-                                      int stride_width) {
+                                      int stride_width,
+                                      int dilation_h,
+                                      int dilation_w) {
   CHECK_NOTNULL(conv);
 
   cudnn_convolution_descriptor hl_conv = (cudnn_convolution_descriptor)malloc(
@@ -625,18 +633,24 @@ void hl_create_convolution_descriptor(hl_convolution_descriptor* conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
+  if (dilation_h > 1 || dilation_w > 1) {
+    LOG(FATAL)
+        << "The current cuDNN version does not support dilated convolution. "
+        << "Dilated convolution requires cuDNN >= v6.0.";
+  }
+
   CHECK_CUDNN(dynload::cudnnSetConvolution2dDescriptor(hl_conv->desc,
                                                        padding_height,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
@@ -659,7 +673,9 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                      int padding_height,
                                      int padding_width,
                                      int stride_height,
-                                     int stride_width) {
+                                     int stride_width,
+                                     int dilation_h,
+                                     int dilation_w) {
   CHECK_NOTNULL(conv);
   CHECK_NOTNULL(image);
   CHECK_NOTNULL(filter);
@@ -678,8 +694,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode,
                                                        data_type));
 #else
@@ -688,8 +704,8 @@ void hl_reset_convolution_descriptor(hl_convolution_descriptor conv,
                                                        padding_width,
                                                        stride_height,
                                                        stride_width,
-                                                       1,
-                                                       1,
+                                                       dilation_h,
+                                                       dilation_w,
                                                        mode));
 #endif
 
@@ -1022,6 +1038,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+
   CHECK_CUDNN(
       dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
                                                        mode,
diff --git a/paddle/cuda/src/hl_cuda_lstm.cu b/paddle/cuda/src/hl_cuda_lstm.cu
index b869d903ba..a5ce81a904 100644
--- a/paddle/cuda/src/hl_cuda_lstm.cu
+++ b/paddle/cuda/src/hl_cuda_lstm.cu
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
+#include "hl_activation_functions.h"
 #include "hl_base.h"
 #include "hl_cuda_cublas.h"
 #include "hl_device_functions.cuh"
-#include "hl_activation_functions.h"
 #include "paddle/utils/Logging.h"
 
-typedef hppl::Active<real>::forward  t_forward;
+typedef hppl::Active<real>::forward t_forward;
 typedef hppl::Active<real>::backward t_backward;
 
 bool hl_lstm_sequence_parallel(int frameSize) {
@@ -42,9 +41,9 @@ public:
       value_ += (start + length - 1) * frameSize + idx;
     }
   }
-  __device__ inline real *getPtr() const {return value_;}
-  __device__ inline real getValue() {return *value_;}
-  __device__ inline void setValue(real value) {*value_ = value;}
+  __device__ inline real *getPtr() const { return value_; }
+  __device__ inline real getValue() { return *value_; }
+  __device__ inline void setValue(real value) { *value_ = value; }
   template <int reversed, int frameSize>
   __device__ inline void nextFrame() {
     if (reversed == 0) {
@@ -55,28 +54,25 @@ public:
   }
 };
 
-__device__ __forceinline__
-void ptx_sync(const int id, const int barriers) {
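+// Block-scoped named barriers: all `barriers` threads naming barrier `id`
+// either wait at it (bar.sync) or signal arrival without waiting (bar.arrive).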
+__device__ __forceinline__ void ptx_sync(const int id, const int barriers) {
   asm volatile("bar.sync %0, %1;" : : "r"(id), "r"(barriers) : "memory");
 }
 
-__device__ __forceinline__
-void ptx_arrive(const int id, const int barriers) {
+__device__ __forceinline__ void ptx_arrive(const int id, const int barriers) {
   asm volatile("bar.arrive %0, %1;" : : "r"(id), "r"(barriers) : "memory");
 }
 
-template<int valueSize, int frameSize>
-__device__ __forceinline__ real
-forward_sequence(real value,
-                 real *shValue,
-                 real *state,
-                 real *preOutput,
-                 real *output,
-                 real check,
-                 int index,
-                 t_forward activeNode,
-                 t_forward activeGate,
-                 t_forward activeState) {
+template <int valueSize, int frameSize>
+__device__ __forceinline__ real forward_sequence(real value,
+                                                 real *shValue,
+                                                 real *state,
+                                                 real *preOutput,
+                                                 real *output,
+                                                 real check,
+                                                 int index,
+                                                 t_forward activeNode,
+                                                 t_forward activeGate,
+                                                 t_forward activeState) {
   real out;
   real prevOut;
   real state_r;
@@ -112,17 +108,20 @@ forward_sequence(real value,
   if (idy == 0) {
     ptx_sync(2, frameSize * 2);
     prevOut = state[idx];
-     prevOut = activeState(prevOut);
+    prevOut = activeState(prevOut);
     preOutput[idx] = prevOut;
     ptx_arrive(3, frameSize * 2);
   }
   return value;
 }
 
-#define     OUTPUT_BARRIER_ID               10
-#define     OUTPUT_BARRIER_ID2              11
-template<int valueSize, int frameSize, int reversed,
-         int computeThreads, int blockSize>
+#define OUTPUT_BARRIER_ID 10
+#define OUTPUT_BARRIER_ID2 11
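+// Named-barrier ids used by KeLstmForward to coordinate its output stage.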
+template <int valueSize,
+          int frameSize,
+          int reversed,
+          int computeThreads,
+          int blockSize>
 __global__ void KeLstmForward(real *gateValue,
                               real *state,
                               real *output,
@@ -184,10 +183,16 @@ __global__ void KeLstmForward(real *gateValue,
         }
       }
       value = forward_sequence<valueSize, frameSize>(
-        value, shValue, shState, shPrevOutput, shOutput, check, index,
-        hppl::gpu::forward[active_node],
-        hppl::gpu::forward[active_gate],
-        hppl::gpu::forward[active_state]);
+          value,
+          shValue,
+          shState,
+          shPrevOutput,
+          shOutput,
+          check,
+          index,
+          hppl::gpu::forward[active_node],
+          hppl::gpu::forward[active_gate],
+          hppl::gpu::forward[active_state]);
       const int idx = index % frameSize;
       const int idy = index / frameSize;
       if (valueSize == 128) {
@@ -218,7 +223,7 @@ __global__ void KeLstmForward(real *gateValue,
           real B_r[frameSize];
           const int computeIdx = index - valueSize;
           if (i == 0) {
-            #pragma unroll
+#pragma unroll
             for (int n = 0; n < frameSize; n++) {
               B_r[n] = weight[n * valueSize + computeIdx];
             }
@@ -230,7 +235,7 @@ __global__ void KeLstmForward(real *gateValue,
           }
           real sum = 0.0f;
           for (int n = 0; n < frameSize; n++) {
-            sum += A_r[n]*B_r[n];
+            sum += A_r[n] * B_r[n];
           }
           shValue[computeIdx] = sum;
           ptx_arrive(OUTPUT_BARRIER_ID2, blockSize);
@@ -239,14 +244,14 @@ __global__ void KeLstmForward(real *gateValue,
       if (valueSize == 256) {
         real B_r[frameSize];
         if (i == 0) {
-          #pragma unroll
+#pragma unroll
           for (int n = 0; n < frameSize; n++) {
             B_r[n] = weight[n * valueSize + index];
           }
         }
         real sum = 0.0f;
         for (int n = 0; n < frameSize; n++) {
-          sum += shOutput[n]*B_r[n];
+          sum += shOutput[n] * B_r[n];
         }
         value += sum;
       }
@@ -273,50 +278,81 @@ void hl_lstm_parallel_forward(real *gateValue,
   dim3 grid(numSequences, 1);
   if (!reversed) {
     if (frameSize == 32) {
-      KeLstmForward<128, 32, 0, 128, 256>
-               <<<grid, 256, 0, STREAM_DEFAULT>>>
-               (gateValue, stateValue, outputValue, preOutputValue,
-               checkIg, checkFg, checkOg, weight, sequence,
-               active_node, active_gate, active_state);
+      KeLstmForward<128, 32, 0, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 0, 256, 256>
-               <<<grid, 256, 0, STREAM_DEFAULT>>>
-               (gateValue, stateValue, outputValue, preOutputValue,
-               checkIg, checkFg, checkOg, weight, sequence,
-               active_node, active_gate, active_state);
+      KeLstmForward<256, 64, 0, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     }
   } else {
     if (frameSize == 32) {
-      KeLstmForward<128, 32, 1, 128, 256>
-               <<<grid, 256, 0, STREAM_DEFAULT>>>
-               (gateValue, stateValue, outputValue, preOutputValue,
-               checkIg, checkFg, checkOg, weight, sequence,
-               active_node, active_gate, active_state);
+      KeLstmForward<128, 32, 1, 128, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 64) {
-      KeLstmForward<256, 64, 1, 256, 256>
-               <<<grid, 256, 0, STREAM_DEFAULT>>>
-               (gateValue, stateValue, outputValue, preOutputValue,
-               checkIg, checkFg, checkOg, weight, sequence,
-               active_node, active_gate, active_state);
+      KeLstmForward<256, 64, 1, 256, 256><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          stateValue,
+          outputValue,
+          preOutputValue,
+          checkIg,
+          checkFg,
+          checkOg,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     }
   }
   CHECK_SYNC("hl_lstm_parallel_forward failed");
 }
 
-__device__ __forceinline__
-void transpose_32x32(real a[], const int idx) {
+__device__ __forceinline__ void transpose_32x32(real a[], const int idx) {
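+  // Transpose a 32x32 tile held in registers (one row per warp lane) using
+  // warp shuffles only, i.e. without shared memory.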
   int addr = idx % 32;
-  #pragma unroll
+#pragma unroll
   for (int k = 1; k < 32; k++) {
     // rSrc[k] = __shfl(rSrc[k], (threadIdx.x + k) % 32, 32);
     addr = __shfl(addr, (idx + 1) % 32, 32);
     a[k] = __shfl(a[k], addr, 32);
   }
 
-  #pragma unroll
+#pragma unroll
   for (int tid = 0; tid < 31; tid++) {
     real tmp = (idx > tid) ? a[0] : a[1];
-    #pragma unroll
+#pragma unroll
     for (int k = 31; k > 0; k--) {
       a[(k + 1) % 32] = (idx > tid) ? a[k] : a[(k + 1) % 32];
     }
@@ -324,29 +360,28 @@ void transpose_32x32(real a[], const int idx) {
   }
 
   addr = (32 - idx) % 32;
-  #pragma unroll
+#pragma unroll
   for (int k = 0; k < 32; k++) {
     a[k] = __shfl(a[k], addr, 32);
     addr = __shfl(addr, (idx + 31) % 32, 32);
   }
 }
 
-template<int valueSize, int frameSize>
-__device__ void
-backward_sequence(real rGateValue,
-                  real rOutputGrad,
-                  real rPreOutputValue,
-                  real &rGateGrad,
-                  real &rStateGrad,
-                  real *shStateGrad,
-                  real *shStateValue,
-                  real *shGateValue,
-                  real rCheck,
-                  real &rGateValuePrev,
-                  int index,
-                  t_backward activeNode,
-                  t_backward activeGate,
-                  t_backward activeState) {
+template <int valueSize, int frameSize>
+__device__ void backward_sequence(real rGateValue,
+                                  real rOutputGrad,
+                                  real rPreOutputValue,
+                                  real &rGateGrad,
+                                  real &rStateGrad,
+                                  real *shStateGrad,
+                                  real *shStateValue,
+                                  real *shGateValue,
+                                  real rCheck,
+                                  real &rGateValuePrev,
+                                  int index,
+                                  t_backward activeNode,
+                                  t_backward activeGate,
+                                  t_backward activeState) {
   const int frameIdx = index % frameSize;
   const int frameIdy = index / frameSize;
   if (frameIdy == 3) {
@@ -363,8 +398,8 @@ backward_sequence(real rGateValue,
     rStateGrad = rGateGrad * rCheck;
     shStateGrad[index] = rStateGrad;
     ptx_sync(3, valueSize);
-    rStateGrad += shStateGrad[frameIdx + frameSize *2];
-    rStateGrad += shStateGrad[frameIdx + frameSize *3];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
     rGateGrad = rStateGrad * shGateValue[frameIdx];
     rGateGrad = activeGate(rGateGrad, rGateValue);
   } else if (frameIdy == 2) {
@@ -373,7 +408,7 @@ backward_sequence(real rGateValue,
     shStateGrad[index] = rStateGrad;
     ptx_sync(3, valueSize);
     rStateGrad += shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize *3];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
     rGateValuePrev = rGateValue;
     rGateGrad = rStateGrad * shStateValue[frameIdx];
     rGateGrad = activeGate(rGateGrad, rGateValue);
@@ -381,43 +416,43 @@ backward_sequence(real rGateValue,
     shGateValue[frameIdx] = rGateValue;
     ptx_sync(3, valueSize);
     rStateGrad = shStateGrad[frameIdx + frameSize];
-    rStateGrad += shStateGrad[frameIdx + frameSize *2];
-    rStateGrad += shStateGrad[frameIdx + frameSize *3];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 2];
+    rStateGrad += shStateGrad[frameIdx + frameSize * 3];
     rGateGrad = rStateGrad * shGateValue[frameIdx + frameSize];
     rGateGrad = activeNode(rGateGrad, rGateValue);
   }
 }
 
-template<int valueSize, int frameSize>
+template <int valueSize, int frameSize>
 __device__ void load_weight(real rWeight[], real *weight, const int index) {
   if (valueSize == 128) {
     weight += index;
-    #pragma unroll
+#pragma unroll
     for (int n = 0; n < frameSize; n++) {
-      rWeight[n] = weight[n*valueSize];
+      rWeight[n] = weight[n * valueSize];
     }
     transpose_32x32(rWeight, index % 32);
   }
   if (valueSize == 256) {
     int id = (index / 32) % 2;
     weight += index - id * 32 + id * 32 * valueSize;
-    #pragma unroll
+#pragma unroll
     for (int n = 0; n < 32; n++) {
-      rWeight[n] = weight[n*valueSize];
-      rWeight[n + 32] = weight[n*valueSize + 32];
+      rWeight[n] = weight[n * valueSize];
+      rWeight[n + 32] = weight[n * valueSize + 32];
     }
     transpose_32x32(rWeight, index % 32);
     transpose_32x32(&rWeight[32], index % 32);
   }
 }
 
-template<int valueSize, int frameSize, int reversed>
+template <int valueSize, int frameSize, int reversed>
 __global__ void KeLstmBackward(real *gateValue,
                                real *gateGrad,
                                real *stateValue,
-                               real *stateGrad,       /* do not need save */
+                               real *stateGrad, /* no need to save */
                                real *preOutputValue,
-                               real *preOutputGrad,   /* do not need save */
+                               real *preOutputGrad, /* no need to save */
                                real *checkIg,
                                real *checkIgGrad,
                                real *checkFg,
@@ -484,20 +519,27 @@ __global__ void KeLstmBackward(real *gateValue,
 
   for (int i = 0; i < length; ++i) {
     if (frameIdy == 3) {
-      if (i != length -1) {
+      if (i != length - 1) {
         frameStateValue.nextFrame<!reversed, frameSize>();
         shStateValue[frameIdx] = frameStateValue.getValue();
       } else {
         shStateValue[frameIdx] = 0.0;
       }
     }
-    backward_sequence<valueSize, frameSize>(
-        rGateValue, rOutputGrad, rPreOutputValue, rGateGrad,
-        rStateGrad, shStateGrad, shStateValue, shGateValue,
-        rCheck, rGateValuePrev, index,
-        hppl::gpu::backward[active_node],
-        hppl::gpu::backward[active_gate],
-        hppl::gpu::backward[active_state]);
+    backward_sequence<valueSize, frameSize>(rGateValue,
+                                            rOutputGrad,
+                                            rPreOutputValue,
+                                            rGateGrad,
+                                            rStateGrad,
+                                            shStateGrad,
+                                            shStateValue,
+                                            shGateValue,
+                                            rCheck,
+                                            rGateValuePrev,
+                                            index,
+                                            hppl::gpu::backward[active_node],
+                                            hppl::gpu::backward[active_gate],
+                                            hppl::gpu::backward[active_state]);
     if (frameIdy == 3) {
       rCheckGrad += rGateGrad * rStateValue;
       rStateValue = shStateValue[frameIdx];
@@ -523,9 +565,9 @@ __global__ void KeLstmBackward(real *gateValue,
       shGateGrad[frameIdy][frameIdx] = rGateGrad;
       if (valueSize == 128) {
         real sum = 0.0f;
-        #pragma unroll
+#pragma unroll
         for (int n = 0; n < frameSize; n++) {
-          sum += shGateGrad[frameIdy][n]*B_r[n];
+          sum += shGateGrad[frameIdy][n] * B_r[n];
         }
         if (frameIdy == 3) {
           rOutputGrad += sum;
@@ -541,7 +583,7 @@ __global__ void KeLstmBackward(real *gateValue,
         }
         real sum = 0.0f;
         for (int n = 0; n < frameSize; n++) {
-          sum += A_r[n]*B_r[n];
+          sum += A_r[n] * B_r[n];
         }
         if (frameIdy == 3) {
           rOutputGrad += sum;
@@ -552,8 +594,8 @@ __global__ void KeLstmBackward(real *gateValue,
 
       if (frameIdy == 3) {
         ptx_sync(6, valueSize);
-        #pragma unroll
-        for (int i = 0; i < 3; i ++) {
+#pragma unroll
+        for (int i = 0; i < 3; i++) {
           rOutputGrad += shOutputGrad[i][frameIdx];
         }
       } else {
@@ -564,11 +606,14 @@ __global__ void KeLstmBackward(real *gateValue,
 
   /* TODO: Temporary save & merger in another kernel */
   if (frameIdy == 1) {
-    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
+    if (checkIgGrad)
+      paddle::paddleAtomicAdd(checkIgGrad + frameIdx, rCheckGrad);
   } else if (frameIdy == 2) {
-    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
+    if (checkFgGrad)
+      paddle::paddleAtomicAdd(checkFgGrad + frameIdx, rCheckGrad);
   } else if (frameIdy == 3) {
-    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
+    if (checkOgGrad)
+      paddle::paddleAtomicAdd(checkOgGrad + frameIdx, rCheckGrad);
   }
 }
 
@@ -593,68 +638,183 @@ void hl_lstm_parallel_backward_data(real *gateValue,
                                     hl_activation_mode_t active_node,
                                     hl_activation_mode_t active_gate,
                                     hl_activation_mode_t active_state) {
-  CHECK(frameSize == 32 || frameSize == 64 ||
-        frameSize == 128 || frameSize == 256);
+  CHECK(frameSize == 32 || frameSize == 64 || frameSize == 128 ||
+        frameSize == 256);
   dim3 grid(numSequences, 1);
   if (!reversed) {
     if (frameSize == 32) {
-      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<128, 32, 0><<<grid, 128, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<256, 64, 0><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<512, 128, 0><<<grid, 512, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<1024, 256, 0><<<grid, 1024, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     }
   } else {
     if (frameSize == 32) {
-      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<128, 32, 1><<<grid, 128, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 64) {
-      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<256, 64, 1><<<grid, 256, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 128) {
-      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<512, 128, 1><<<grid, 512, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     } else if (frameSize == 256) {
-      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>
-          (gateValue, gateGrad, stateValue, stateGrad, preOutputValue,
-          preOutputGrad, checkIg, checkIgGrad, checkFg, checkFgGrad, checkOg,
-          checkOgGrad, outputGrad, weight, sequence,
-          active_node, active_gate, active_state);
+      KeLstmBackward<1024, 256, 1><<<grid, 1024, 0, STREAM_DEFAULT>>>(
+          gateValue,
+          gateGrad,
+          stateValue,
+          stateGrad,
+          preOutputValue,
+          preOutputGrad,
+          checkIg,
+          checkIgGrad,
+          checkFg,
+          checkFgGrad,
+          checkOg,
+          checkOgGrad,
+          outputGrad,
+          weight,
+          sequence,
+          active_node,
+          active_gate,
+          active_state);
     }
   }
   CHECK_SYNC("hl_lstm_parallel_backward_data");
 }
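
Each supported frameSize in the ladder above launches a block of valueSize = 4 * frameSize threads — one thread per gate value (three gates plus the candidate state) — and valueSize is also the first template parameter. A host-side sketch of the rule, for illustration only:

  int frameSize = 64;             // must be 32, 64, 128, or 256 (the CHECK above)
  int valueSize = 4 * frameSize;  // block size and template parameter: 256 here
  // 32 -> <128, 32>, 64 -> <256, 64>, 128 -> <512, 128>, 256 -> <1024, 256>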
 
-template<int B_X, int B_Y>
+template <int B_X, int B_Y>
 __global__ void KeSetGradZero(real *gateGrad,
-    const int *starts, int valueSize, int numSequences, bool reversed) {
+                              const int *starts,
+                              int valueSize,
+                              int numSequences,
+                              bool reversed) {
   // const int tid = threadIdx.x;
 
   const int frameIdx = blockIdx.x * B_X + threadIdx.x;
@@ -682,19 +842,31 @@ void hl_lstm_parallel_backward_weight(real *weightGrad,
   int valueSize = 4 * frameSize;
   dim3 threads(32, 32);
   dim3 grid((valueSize + 32 - 1) / 32, (numSequences + 32 - 1) / 32);
-  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>
-           (gateGrad, sequence, valueSize, numSequences, reversed);
+  KeSetGradZero<32, 32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      gateGrad, sequence, valueSize, numSequences, reversed);
 
   if (!reversed) {
     hl_matrix_mul(outputValue,
-      HPPL_OP_T, gateGrad + valueSize, HPPL_OP_N, weightGrad,
-      frameSize, valueSize, batchSize - 1,
-      1.0, 1.0);
+                  HPPL_OP_T,
+                  gateGrad + valueSize,
+                  HPPL_OP_N,
+                  weightGrad,
+                  frameSize,
+                  valueSize,
+                  batchSize - 1,
+                  1.0,
+                  1.0);
   } else {
     hl_matrix_mul(outputValue + frameSize,
-      HPPL_OP_T, gateGrad, HPPL_OP_N, weightGrad,
-      frameSize, valueSize, batchSize - 1,
-      1.0, 1.0);
+                  HPPL_OP_T,
+                  gateGrad,
+                  HPPL_OP_N,
+                  weightGrad,
+                  frameSize,
+                  valueSize,
+                  batchSize - 1,
+                  1.0,
+                  1.0);
   }
   CHECK_SYNC("hl_lstm_parallel_backward_weight");
 }
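
Assuming hl_matrix_mul follows the usual GEMM convention (C = alpha * op(A) * op(B) + beta * C, with M x N output and inner dimension K), the non-reversed call above accumulates weightGrad += outputValue^T * gateGrad shifted by one time step. A plain-C++ reference sketch of that contraction, over hypothetical row-major buffers:

  for (int i = 0; i < frameSize; ++i) {
    for (int j = 0; j < valueSize; ++j) {
      float sum = 0.f;
      for (int b = 0; b < batchSize - 1; ++b) {
        // gateGrad + valueSize skips the first step, pairing step b with b + 1
        sum += outputValue[b * frameSize + i] * gateGrad[(b + 1) * valueSize + j];
      }
      weightGrad[i * valueSize + j] += sum;  // alpha = beta = 1.0
    }
  }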
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 9bcc7fb7de..607efb4f6b 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "hl_gpu_matrix_kernel.cuh"
 #include "hl_matrix.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 #include "hl_sequence.h"
 #include "hl_sparse.ph"
 #include "paddle/utils/Logging.h"
-#include "hl_device_functions.cuh"
-#include "hl_gpu_matrix_kernel.cuh"
 
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
-void hl_matrix_add(real *A_d,
-                   real *B_d,
-                   real *C_d,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1 * a + p2 * b);
+void hl_matrix_add(real* A_d,
+                   real* B_d,
+                   real* C_d,
                    int dimM,
                    int dimN,
                    real alpha,
@@ -36,33 +35,32 @@ void hl_matrix_add(real *A_d,
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
 
-  hl_gpu_apply_ternary_op
-    <real, ternary::_add<real>, 0, 0>(ternary::_add<real>(alpha, beta),
-                                      A_d,
-                                      B_d,
-                                      C_d,
-                                      dimM,
-                                      dimN,
-                                      dimN,
-                                      dimN,
-                                      dimN);
+  hl_gpu_apply_ternary_op<real, ternary::_add<real>, 0, 0>(
+      ternary::_add<real>(alpha, beta),
+      A_d,
+      B_d,
+      C_d,
+      dimM,
+      dimN,
+      dimN,
+      dimN,
+      dimN);
   CHECK_SYNC("hl_matrix_add failed");
 }
 
 #ifdef PADDLE_TYPE_DOUBLE
-    #define THRESHOLD   128
+#define THRESHOLD 128
 #else
-    #define THRESHOLD   64
+#define THRESHOLD 64
 #endif
-__device__ __forceinline__
-void findMax(real* I,
-             real* dfMax_s,
-             int blockSize,
-             int base,
-             int curIdx,
-             int nextIdx,
-             int dimN,
-             real* max) {
+__device__ __forceinline__ void findMax(real* I,
+                                        real* dfMax_s,
+                                        int blockSize,
+                                        int base,
+                                        int curIdx,
+                                        int nextIdx,
+                                        int dimN,
+                                        real* max) {
   dfMax_s[base] = -1.0e20;
   while (curIdx < dimN) {
     if (dfMax_s[base] < I[nextIdx]) {
@@ -78,25 +76,24 @@ void findMax(real* I,
     if (base < stride) {
       nextIdx = base + stride;
       if (dfMax_s[base] < dfMax_s[nextIdx]) {
-          dfMax_s[base] = dfMax_s[nextIdx];
+        dfMax_s[base] = dfMax_s[nextIdx];
       }
     }
   }
 
-  if (0 == base)  {
+  if (0 == base) {
     max[0] = dfMax_s[0];
   }
   __syncthreads();
 }
 
-__device__ __forceinline__
-void subMaxAndExp(real* I,
-                  real* O,
-                  int curIdx,
-                  int nextIdx,
-                  int blockSize,
-                  int dimN,
-                  real max) {
+__device__ __forceinline__ void subMaxAndExp(real* I,
+                                             real* O,
+                                             int curIdx,
+                                             int nextIdx,
+                                             int blockSize,
+                                             int dimN,
+                                             real max) {
   real val;
   while (curIdx < dimN) {
     val = I[nextIdx] - max;
@@ -115,14 +112,13 @@ void subMaxAndExp(real* I,
   __syncthreads();
 }
 
-__device__ __forceinline__
-void valueSum(real* O,
-              real* dfMax_s,
-              int blockSize,
-              int base,
-              int curIdx,
-              int nextIdx,
-              int dimN) {
+__device__ __forceinline__ void valueSum(real* O,
+                                         real* dfMax_s,
+                                         int blockSize,
+                                         int base,
+                                         int curIdx,
+                                         int nextIdx,
+                                         int dimN) {
   dfMax_s[base] = 0;
   while (curIdx < dimN) {
     dfMax_s[base] += O[nextIdx];
@@ -141,13 +137,8 @@ void valueSum(real* O,
   __syncthreads();
 }
 
-__device__ __forceinline__
-void divSum(real* O,
-            real sum,
-            int curIdx,
-            int nextIdx,
-            int blockSize,
-            int dimN) {
+__device__ __forceinline__ void divSum(
+    real* O, real sum, int curIdx, int nextIdx, int blockSize, int dimN) {
   while (curIdx < dimN) {
     O[nextIdx] /= sum;
     nextIdx += blockSize;
@@ -155,20 +146,18 @@ void divSum(real* O,
   }
 }
 
-__device__ __forceinline__
-void softmax(real* I,
-             real* O,
-             real* dfMax_s,
-             int blockSize,
-             int base,
-             int curIdx,
-             int nextIdx,
-             int dimN) {
+__device__ __forceinline__ void softmax(real* I,
+                                        real* O,
+                                        real* dfMax_s,
+                                        int blockSize,
+                                        int base,
+                                        int curIdx,
+                                        int nextIdx,
+                                        int dimN) {
   __shared__ real max;
 
   // find the max number
-  findMax(I, dfMax_s, blockSize, base, curIdx,
-          nextIdx, dimN, &max);
+  findMax(I, dfMax_s, blockSize, base, curIdx, nextIdx, dimN, &max);
 
   // subtract the max value and apply exp
   subMaxAndExp(I, O, base, nextIdx, blockSize, dimN, max);
@@ -181,8 +170,8 @@ void softmax(real* I,
   divSum(O, dfMax_s[0], curIdx, nextIdx, blockSize, dimN);
 }
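
findMax / subMaxAndExp / valueSum / divSum together implement the numerically stable softmax: subtracting the row max before exponentiating keeps exp() from overflowing. A single-threaded reference of the same pipeline (a sketch, not Paddle API):

#include <algorithm>
#include <cmath>

void softmaxRow(const float* in, float* out, int n) {
  float m = in[0];
  for (int i = 1; i < n; ++i) m = std::max(m, in[i]);  // findMax
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {                        // subMaxAndExp + valueSum
    out[i] = std::exp(in[i] - m);
    sum += out[i];
  }
  for (int i = 0; i < n; ++i) out[i] /= sum;           // divSum
}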
 
-template<int blockSize>
-__global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
+template <int blockSize>
+__global__ void KeMatrixSoftMax(real* O, real* I, int dimN) {
   int base = threadIdx.x;
   __shared__ real dfMax_s[blockSize];
   int nextIdx = blockIdx.x * dimN + base;
@@ -191,19 +180,18 @@ __global__ void KeMatrixSoftMax(real *O, real *I, int dimN) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }
 
-void hl_matrix_softmax(real *A_d, real *C_d, int dimM, int dimN) {
+void hl_matrix_softmax(real* A_d, real* C_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
   dim3 block(512, 1);
   dim3 grid(dimM, 1);
-  KeMatrixSoftMax<512>
-           <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
+  KeMatrixSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, dimN);
   CHECK_SYNC("hl_matrix_softmax failed");
 }
 
-template<int blockSize>
-__global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
+template <int blockSize>
+__global__ void KeSequenceSoftMax(real* O, real* I, const int* index) {
   int base = threadIdx.x;
   int bid = blockIdx.x;
   __shared__ real dfMax_s[blockSize];
@@ -217,8 +205,8 @@ __global__ void KeSequenceSoftMax(real *O, real *I, const int* index) {
   softmax(I, O, dfMax_s, blockSize, base, curIdx, nextIdx, dimN);
 }
 
-void hl_sequence_softmax_forward(real *A_d,
-                                 real *C_d,
+void hl_sequence_softmax_forward(real* A_d,
+                                 real* C_d,
                                  const int* index,
                                  int numSequence) {
   CHECK_NOTNULL(A_d);
@@ -226,59 +214,48 @@ void hl_sequence_softmax_forward(real *A_d,
 
   dim3 block(512, 1);
   dim3 grid(numSequence, 1);
-  KeSequenceSoftMax<512>
-           <<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
+  KeSequenceSoftMax<512><<<grid, block, 0, STREAM_DEFAULT>>>(C_d, A_d, index);
   CHECK_SYNC("hl_sequence_softmax_forward failed");
 }
 
-__global__ void KeMatrixDerivative(real *grad_d,
-                                   real *output_d,
-                                   real *sftmaxSum_d,
-                                   int dimM,
-                                   int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixDerivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;
 
   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     grad_d[index] = output_d[index] * (grad_d[index] - sftmaxSum_d[rowIdx]);
   }
 }
 
-void hl_matrix_softmax_derivative(real *grad_d,
-                                  real *output_d,
-                                  real *sftmaxSum_d,
-                                  int dimM,
-                                  int dimN) {
+void hl_matrix_softmax_derivative(
+    real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(sftmaxSum_d);
 
   int blocksX = (dimM + 0) / 1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
 
-  KeMatrixDerivative<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (grad_d, output_d, sftmaxSum_d, dimM, dimN);
+  KeMatrixDerivative<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, sftmaxSum_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }
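
The kernel applies the softmax backward rule grad_i = y_i * (grad_i - sum_j grad_j * y_j), where the per-row dot product has been precomputed into sftmaxSum_d. A scalar sketch of one row, over hypothetical buffers:

void softmaxGradRow(float* grad, const float* output, float sftmaxSum, int dimN) {
  // sftmaxSum is assumed to hold dot(grad, output) for this row
  for (int j = 0; j < dimN; ++j) {
    grad[j] = output[j] * (grad[j] - sftmaxSum);
  }
}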
 
-__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
-                                                real* entropy,
-                                                int* row,
-                                                int* col,
-                                                int dimM,
-                                                int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropy(
+    real* output, real* entropy, int* row, int* col, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   if (index < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       entropy[index] -= log(1 - output[index * dimN + i]);
     }
-    int *row_col = col + row[index];
+    int* row_col = col + row[index];
     int col_num = row[index + 1] - row[index];
-    for (int i = 0; i < col_num; i ++) {
+    for (int i = 0; i < col_num; i++) {
       real o = output[index * dimN + row_col[i]];
       entropy[index] -= log(o / (1 - o));
     }
@@ -299,37 +276,30 @@ void hl_matrix_multi_binary_cross_entropy(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-          (output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, entropy, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy failed");
 }
 
-__global__ void KeMatrixMultiBinaryCrossEntropyBp(real* output,
-                                                  real* grad,
-                                                  int* row,
-                                                  int* col,
-                                                  int dimM,
-                                                  int dimN) {
+__global__ void KeMatrixMultiBinaryCrossEntropyBp(
+    real* output, real* grad, int* row, int* col, int dimM, int dimN) {
   int row_idx = blockIdx.x * blockDim.x + threadIdx.x;
   if (row_idx < dimM) {
-    for (int i = 0; i < dimN; i ++) {
+    for (int i = 0; i < dimN; i++) {
       int index = row_idx * dimN + i;
       grad[index] += 1.0 / (1 - output[index]);
     }
     int col_num = row[row_idx + 1] - row[row_idx];
-    int *row_col = col + row[row_idx];
-    for (int i = 0; i < col_num; i ++) {
+    int* row_col = col + row[row_idx];
+    for (int i = 0; i < col_num; i++) {
       int index = row_idx * dimN + row_col[i];
       grad[index] -= 1.0 / (output[index] * (1 - output[index]));
     }
   }
 }
 
-void hl_matrix_multi_binary_cross_entropy_bp(real* output,
-                                             real* grad,
-                                             hl_sparse_matrix_s csr_mat,
-                                             int dimM,
-                                             int dimN) {
+void hl_matrix_multi_binary_cross_entropy_bp(
+    real* output, real* grad, hl_sparse_matrix_s csr_mat, int dimM, int dimN) {
   CHECK_NOTNULL(output);
   CHECK_NOTNULL(grad);
   CHECK_NOTNULL(csr_mat);
@@ -339,16 +309,13 @@ void hl_matrix_multi_binary_cross_entropy_bp(real* output,
   dim3 threads(n_threads);
   dim3 grid(blocks);
   hl_csr_matrix mat = (hl_csr_matrix)(csr_mat->matrix);
-  KeMatrixMultiBinaryCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-          (output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
+  KeMatrixMultiBinaryCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, grad, mat->csr_row, mat->csr_col, dimM, dimN);
   CHECK_SYNC("hl_matrix_multi_binary_cross_entropy_bp failed");
 }
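
The forward kernel first charges every column with -log(1 - o), then for the sparse positive labels adds -log(o / (1 - o)), which algebraically turns those terms into -log(o); the net result is the usual multi-label binary cross entropy -Σ_pos log(o) - Σ_neg log(1 - o). A CPU sketch of one row over CSR labels (hypothetical helper):

#include <cmath>

float multiBinaryCrossEntropyRow(const float* output, const int* row,
                                 const int* col, int i, int dimN) {
  float e = 0.f;
  for (int j = 0; j < dimN; ++j) e -= std::log(1.f - output[i * dimN + j]);
  for (int k = row[i]; k < row[i + 1]; ++k) {
    float o = output[i * dimN + col[k]];
    e -= std::log(o / (1.f - o));  // converts -log(1-o) into -log(o) for positives
  }
  return e;
}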
 
-__global__ void KeMatrixCrossEntropy(real* O,
-                                     real* E,
-                                     int* label,
-                                     int dimM,
-                                     int dimN) {
+__global__ void KeMatrixCrossEntropy(
+    real* O, real* E, int* label, int dimM, int dimN) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int newBase;
   if (index < dimM) {
@@ -358,59 +325,49 @@ __global__ void KeMatrixCrossEntropy(real* O,
   }
 }
 
-void hl_matrix_cross_entropy(real* A_d,
-                             real* C_d,
-                             int* label_d,
-                             int dimM,
-                             int dimN) {
+void hl_matrix_cross_entropy(
+    real* A_d, real* C_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(C_d);
 
   int blocks = (dimM + 1024 - 1) / 1024;
   dim3 threads(1024, 1);
   dim3 grid(blocks, 1);
-  KeMatrixCrossEntropy<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (A_d, C_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropy<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      A_d, C_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy failed");
 }
 
-__global__ void KeMatrixCrossEntropyBp(real* grad_d,
-                                       real* output_d,
-                                       int* label_d,
-                                       int dimM,
-                                       int dimN) {
-  int rowIdx = blockIdx.x*blockDim.x + threadIdx.x;
-  int colIdx = blockIdx.y*blockDim.y + threadIdx.y;
+__global__ void KeMatrixCrossEntropyBp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
+  int rowIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  int colIdx = blockIdx.y * blockDim.y + threadIdx.y;
   int index;
   if (rowIdx < dimM && colIdx < dimN) {
-    index = rowIdx*dimN + colIdx;
+    index = rowIdx * dimN + colIdx;
     if (label_d[rowIdx] == colIdx) {
       grad_d[index] -= 1.0f / output_d[index];
     }
   }
 }
 
-void hl_matrix_cross_entropy_bp(real* grad_d,
-                                real* output_d,
-                                int* label_d,
-                                int dimM,
-                                int dimN) {
+void hl_matrix_cross_entropy_bp(
+    real* grad_d, real* output_d, int* label_d, int dimM, int dimN) {
   CHECK_NOTNULL(grad_d);
   CHECK_NOTNULL(output_d);
   CHECK_NOTNULL(label_d);
 
-  int blocksX = (dimM + 0)/1;
-  int blocksY = (dimN + 1024 -1) / 1024;
+  int blocksX = (dimM + 0) / 1;
+  int blocksY = (dimN + 1024 - 1) / 1024;
   dim3 threads(1, 1024);
   dim3 grid(blocksX, blocksY);
-  KeMatrixCrossEntropyBp<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (grad_d, output_d, label_d, dimM, dimN);
+  KeMatrixCrossEntropyBp<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_d, output_d, label_d, dimM, dimN);
   CHECK_SYNC("hl_matrix_cross_entropy_bp failed");
 }
 
 void hl_matrix_zero_mem(real* data, int num) {
-  hl_gpu_apply_unary_op(
-        unary::Zero<real>(), data, 1, num, num);
+  hl_gpu_apply_unary_op(unary::Zero<real>(), data, 1, num, num);
 }
 
 __global__ void KeParamReluForward(real* output,
@@ -423,8 +380,8 @@ __global__ void KeParamReluForward(real* output,
   int ty = blockIdx.y * blockDim.y + threadIdx.y;
   if (tx < width && ty < height) {
     int index = ty * width + tx;
-    output[index] = input[index] > 0 ? input[index] :
-        input[index] * w[tx / partial_sum];
+    output[index] =
+        input[index] > 0 ? input[index] : input[index] * w[tx / partial_sum];
   }
 }
 
@@ -439,14 +396,14 @@ void hl_param_relu_forward(real* output,
   CHECK_NOTNULL(w);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input, w, width, height, partial_sum);
+  KeParamReluForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input, w, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_forward failed");
 }
 
-template<int blockSize>
+template <int blockSize>
 __global__ void KeParamReluBackWardW(real* grad_w,
                                      real* grad_o,
                                      real* input,
@@ -491,8 +448,8 @@ void hl_param_relu_backward_w(real* grad_w,
   int grid_num = width / partial_sum;
   dim3 threads(blockSize, 1);
   dim3 grid(grid_num, 1);
-  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad_w, grad_o, input, width, height, partial_sum);
+  KeParamReluBackWardW<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_w, grad_o, input, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_w failed");
 }
 
@@ -524,19 +481,15 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_NOTNULL(diff);
   dim3 threads(16, 16);
   int blockX = (width + 16 - 1) / 16;
-  int blockY = (height + 16 -1) / 16;
+  int blockY = (height + 16 - 1) / 16;
   dim3 grid(blockX, blockY);
-  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>
-      (grad_o, data, w, diff, width, height, partial_sum);
+  KeParamReluBackwardDiff<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad_o, data, w, diff, width, height, partial_sum);
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-__global__ void KeMatrixAddSharedBias(real* A,
-                                      real* B,
-                                      const int channel,
-                                      const int M,
-                                      const int N,
-                                      real scale) {
+__global__ void KeMatrixAddSharedBias(
+    real* A, real* B, const int channel, const int M, const int N, real scale) {
   int index = blockIdx.x * blockDim.x + threadIdx.x;
   int dim = N / channel;
   if (index < M * N) {
@@ -554,15 +507,14 @@ void hl_matrix_add_shared_bias(real* A_d,
                                real scale) {
   const int blocks = 512;
   const int grids = DIVUP(dimM * dimN, blocks);
-  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
-    (A_d, B_d, channel, dimM, dimN, scale);
+  KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      A_d, B_d, channel, dimM, dimN, scale);
   CHECK_SYNC("hl_matrix_add_shared_bias failed");
 }
 
-
 template <int blockSize>
-__global__ void KeMatrixCollectSharedBias(real *B,
-                                          real *A,
+__global__ void KeMatrixCollectSharedBias(real* B,
+                                          real* A,
                                           const int channel,
                                           const int M,
                                           const int N,
@@ -589,7 +541,7 @@ __global__ void KeMatrixCollectSharedBias(real *B,
       int n = j * blockSize + tid;
       int m = n / dim;
       int w = n % dim;
-      smem[tid] =  (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
+      smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
       __syncthreads();
       simpleReduce(smem, tid, blockSize);
       sum += smem[0];
@@ -611,33 +563,244 @@ void hl_matrix_collect_shared_bias(real* B_d,
   const int limit = 64;
   int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
 
-  KeMatrixCollectSharedBias<blocks>
-      <<< grids, blocks, 0, STREAM_DEFAULT>>>
-      (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
+  KeMatrixCollectSharedBias<blocks><<<grids, blocks, 0, STREAM_DEFAULT>>>(
+      B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
 
-__global__ void keMatrixRotate(real* mat, real* matRot,
-                               int dimM, int dimN, bool clockWise) {
-    int idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (idx < dimM * dimN) {
-        int i = idx / dimN;
-        int j = idx % dimN;
-        if (clockWise) {
-            matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
-        } else {
-            matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
+__global__ void keMatrixRotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < dimM * dimN) {
+    int i = idx / dimN;
+    int j = idx % dimN;
+    if (clockWise) {
+      matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
+    } else {
+      matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
+    }
+  }
+}
+
+void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {
+  CHECK_NOTNULL(mat);
+  CHECK_NOTNULL(matRot);
+  const int threads = 512;
+  const int blocks = DIVUP(dimM * dimN, threads);
+  keMatrixRotate<<<blocks, threads, 0, STREAM_DEFAULT>>>(
+      mat, matRot, dimM, dimN, clockWise);
+  CHECK_SYNC("hl_matrix_rotate failed");
+}
+
+__global__ void keMatrixVol2Col(int num_kernels,
+                                const real* dataSrc,
+                                real* dataDst,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % width_col;
+    int h_out = (index / width_col) % height_col;
+    int d_out = (index / width_col / height_col) % depth_col;
+    int channel_in = index / width_col / height_col / depth_col;
+    int channel_out = channel_in * filterD * filterH * filterW;
+    int w_in = w_out * strideW - paddingW;
+    int h_in = h_out * strideH - paddingH;
+    int d_in = d_out * strideD - paddingD;
+
+    dataDst +=
+        ((channel_out * depth_col + d_out) * height_col + h_out) * width_col +
+        w_out;
+    dataSrc += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filterD; ++k) {
+      for (int i = 0; i < filterH; ++i) {
+        for (int j = 0; j < filterW; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          *dataDst = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                      w < width)
+                         ? dataSrc[(k * height + i) * width + j]
+                         : 0;
+          dataDst += depth_col * height_col * width_col;
         }
+      }
     }
+  }
+}
+
+void hl_matrix_vol2Col(const real* dataSrc,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real* dataDst) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth_col * height_col * width_col;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixVol2Col<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataSrc,
+                                                          dataDst,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col);
+  CHECK_SYNC("hl_matrix_vol2Col failed");
+}
+
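vol2col unfolds each filterD x filterH x filterW receptive field into one column, so dataDst has channels * filterD * filterH * filterW rows and depth_col * height_col * width_col columns, with each *_col extent following the standard convolution output-size formula. A host-side sketch with hypothetical sizes:

  int depth = 8, paddingD = 1, filterD = 3, strideD = 1;
  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;  // = 8
  // dataDst layout: [channels * filterD * filterH * filterW] rows x
  //                 [depth_col * height_col * width_col] columns
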
+__global__ void keMatrixCol2Vol(int num_kernels,
+                                real* dataDst,
+                                const real* dataSrc,
+                                int depth,
+                                int height,
+                                int width,
+                                int filterD,
+                                int filterH,
+                                int filterW,
+                                int strideD,
+                                int strideH,
+                                int strideW,
+                                int paddingD,
+                                int paddingH,
+                                int paddingW,
+                                int depth_col,
+                                int height_col,
+                                int width_col,
+                                real alpha,
+                                real beta) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    real srcVal = 0;
+    real dstVal = dataDst[index];
+    int w = index % width + paddingW;
+    int h = (index / width) % height + paddingH;
+    int d = (index / width / height) % depth + paddingD;
+    int c = index / width / height / depth;
+    // compute the range of output columns whose receptive fields cover this voxel
+    int w_col_start = (w < filterW) ? 0 : (w - filterW) / strideW + 1;
+    int w_col_end = min(w / strideW + 1, width_col);
+    int h_col_start = (h < filterH) ? 0 : (h - filterH) / strideH + 1;
+    int h_col_end = min(h / strideH + 1, height_col);
+    int d_col_start = (d < filterD) ? 0 : (d - filterD) / strideD + 1;
+    int d_col_end = min(d / strideD + 1, depth_col);
+
+    int offset = (c * filterD * filterW * filterH + d * filterW * filterH +
+                  h * filterW + w) *
+                 depth_col * height_col * width_col;
+
+    int coeff_d_col =
+        (1 - strideD * filterW * filterH * depth_col) * height_col * width_col;
+    int coeff_h_col =
+        (1 - strideH * filterW * depth_col * height_col) * width_col;
+    int coeff_w_col = (1 - strideW * depth_col * height_col * width_col);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          srcVal += dataSrc[offset + d_col * coeff_d_col + h_col * coeff_h_col +
+                            w_col * coeff_w_col];
+        }
+      }
+    }
+    dataDst[index] = alpha * srcVal + beta * dstVal;
+  }
+}
+
+void hl_matrix_col2Vol(real* dataDst,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       const real* dataSrc,
+                       real alpha,
+                       real beta) {
+  int depth_col = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int height_col = (height + 2 * paddingH - filterH) / strideH + 1;
+  int width_col = (width + 2 * paddingW - filterW) / strideW + 1;
+  int num_kernels = channels * depth * height * width;
+
+  const int threads = 512;
+  const int blocks = DIVUP(num_kernels, threads);
+
+  keMatrixCol2Vol<<<blocks, threads, 0, STREAM_DEFAULT>>>(num_kernels,
+                                                          dataDst,
+                                                          dataSrc,
+                                                          depth,
+                                                          height,
+                                                          width,
+                                                          filterD,
+                                                          filterH,
+                                                          filterW,
+                                                          strideD,
+                                                          strideH,
+                                                          strideW,
+                                                          paddingD,
+                                                          paddingH,
+                                                          paddingW,
+                                                          depth_col,
+                                                          height_col,
+                                                          width_col,
+                                                          alpha,
+                                                          beta);
+
+  CHECK_SYNC("hl_matrix_col2Vol failed");
+}
+
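For each input voxel, keMatrixCol2Vol sums over every output column whose receptive field covers it. The offset / coeff_*_col arithmetic folds the naive index — kernel offset ((c*filterD + k)*filterH + i)*filterW + j times the column count, plus the column id — into a single affine function of (d_col, h_col, w_col), using k = d - d_col*strideD and likewise for i and j. A small self-checking sketch of that equivalence, with hypothetical sizes:

#include <cassert>

int main() {
  int fD = 3, fH = 3, fW = 3;           // filter
  int sD = 1, sH = 1, sW = 1;           // stride
  int Dc = 6, Hc = 6, Wc = 6;           // depth_col, height_col, width_col
  int c = 0, d = 4, h = 4, w = 4;       // padded input coordinate
  int d_col = 3, h_col = 3, w_col = 3;  // one contributing output column
  // folded form, exactly as in the kernel
  int offset = (c * fD * fW * fH + d * fW * fH + h * fW + w) * Dc * Hc * Wc;
  int coeff_d = (1 - sD * fW * fH * Dc) * Hc * Wc;
  int coeff_h = (1 - sH * fW * Dc * Hc) * Wc;
  int coeff_w = 1 - sW * Dc * Hc * Wc;
  int folded = offset + d_col * coeff_d + h_col * coeff_h + w_col * coeff_w;
  // naive form: kernel offset within the column, then the column id
  int k = d - d_col * sD, i = h - h_col * sH, j = w - w_col * sW;
  int naive = (((c * fD + k) * fH + i) * fW + j) * Dc * Hc * Wc +
              (d_col * Hc + h_col) * Wc + w_col;
  assert(folded == naive);  // both index the same dataSrc element
  return 0;
}
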
+__global__ void keVectorCast2Int(int* out, real* vec, int size) {
+  for (int i = threadIdx.x; i < (size); i += blockDim.x) {
+    out[i] = int(vec[i]);
+  }
 }
 
-void hl_matrix_rotate(real *mat, real* matRot,
-                      int dimM, int dimN, bool clockWise) {
-    CHECK_NOTNULL(mat);
-    CHECK_NOTNULL(matRot);
-    const int threads = 512;
-    const int blocks = DIVUP(dimM * dimN, threads);
-    keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
-            (mat, matRot, dimM, dimN, clockWise);
-    CHECK_SYNC("hl_matrix_rotate failed");
+void hl_vector_cast2int(int* out, real* vec, int size) {
+  keVectorCast2Int<<<1, 512, 0, STREAM_DEFAULT>>>(out, vec, size);
+  CHECK_SYNC("hl_vector_cast2int failed");
 }
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 0fe2877f89..c52780dfca 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -16,36 +16,36 @@ limitations under the License. */
 #include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"
 
-__global__ void KeMaxSequenceForward(real *input,
-                                     const int *sequence,
+__global__ void KeMaxSequenceForward(real* input,
+                                     const int* sequence,
                                      real* output,
-                                     int *index,
+                                     int* index,
                                      int numSequences,
                                      int dim) {
   int dimIdx = threadIdx.x;
   int sequenceId = blockIdx.x;
   if (sequenceId >= numSequences) return;
   int start = sequence[sequenceId];
-  int end = sequence[sequenceId+1];
+  int end = sequence[sequenceId + 1];
 
   for (int i = dimIdx; i < dim; i += blockDim.x) {
     real tmp = -HL_FLOAT_MAX;
     int tmpId = -1;
     for (int insId = start; insId < end; insId++) {
-      if (tmp < input[insId*dim + i]) {
-        tmp = input[insId*dim + i];
+      if (tmp < input[insId * dim + i]) {
+        tmp = input[insId * dim + i];
         tmpId = insId;
       }
     }
-    output[sequenceId*dim + i] = tmp;
-    index[sequenceId*dim + i] = tmpId;
+    output[sequenceId * dim + i] = tmp;
+    index[sequenceId * dim + i] = tmpId;
   }
 }
 
 void hl_max_sequence_forward(real* input,
                              const int* sequence,
                              real* output,
-                             int *index,
+                             int* index,
                              int numSequences,
                              int dim) {
   CHECK_NOTNULL(input);
@@ -55,29 +55,23 @@ void hl_max_sequence_forward(real* input,
 
   dim3 threads(256, 1);
   dim3 grid(numSequences, 1);
-  KeMaxSequenceForward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, output, index, numSequences, dim);
+  KeMaxSequenceForward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, sequence, output, index, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_forward failed");
 }
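
Per sequence and per dimension, the kernel keeps both the max activation and the row index that produced it, so the backward kernel can scatter the gradient straight back to the winning row. A single-threaded sketch of the forward pass over hypothetical buffers:

#include <cfloat>

void maxSequenceForwardRef(const float* input, const int* sequence,
                           float* output, int* index,
                           int numSequences, int dim) {
  for (int s = 0; s < numSequences; ++s) {
    for (int i = 0; i < dim; ++i) {
      float best = -FLT_MAX;
      int bestId = -1;
      for (int ins = sequence[s]; ins < sequence[s + 1]; ++ins) {
        if (input[ins * dim + i] > best) {
          best = input[ins * dim + i];
          bestId = ins;
        }
      }
      output[s * dim + i] = best;   // max value
      index[s * dim + i] = bestId;  // argmax row, reused by the backward pass
    }
  }
}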
 
-__global__ void KeMaxSequenceBackward(real *outputGrad,
-                                      int *index,
-                                      real* inputGrad,
-                                      int numSequences,
-                                      int dim) {
+__global__ void KeMaxSequenceBackward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   int idx = threadIdx.x + blockIdx.x * blockDim.x;
   int colIdx = idx % dim;
-  if (idx < numSequences*dim) {
+  if (idx < numSequences * dim) {
     int insId = index[idx];
     inputGrad[insId * dim + colIdx] += outputGrad[idx];
   }
 }
 
-void hl_max_sequence_backward(real* outputGrad,
-                              int *index,
-                              real* inputGrad,
-                              int numSequences,
-                              int dim) {
+void hl_max_sequence_backward(
+    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {
   CHECK_NOTNULL(outputGrad);
   CHECK_NOTNULL(index);
   CHECK_NOTNULL(inputGrad);
@@ -85,12 +79,12 @@ void hl_max_sequence_backward(real* outputGrad,
   unsigned int blocks = (numSequences * dim + 128 - 1) / 128;
   dim3 threads(128, 1);
   dim3 grid(blocks, 1);
-  KeMaxSequenceBackward<<< grid, threads, 0, STREAM_DEFAULT >>>
-      (outputGrad, index, inputGrad, numSequences, dim);
+  KeMaxSequenceBackward<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      outputGrad, index, inputGrad, numSequences, dim);
   CHECK_SYNC("hl_max_sequence_backward failed");
 }
 
-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
                                 int* ids,
@@ -104,8 +98,8 @@ __global__ void KeMatrixAddRows(real* output,
   while (sampleId < numSamples) {
     int tableId = ids[sampleId];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *outputData = output + sampleId * dim;
-      real *tableData = table + tableId * dim;
+      real* outputData = output + sampleId * dim;
+      real* tableData = table + tableId * dim;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow == 0) {
           outputData[i] += tableData[i];
@@ -114,24 +108,27 @@ __global__ void KeMatrixAddRows(real* output,
         }
       }
     }
-    sampleId += blockDimY*gridDimX;
+    sampleId += blockDimY * gridDimX;
   }
 }
 
-template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
-__global__
-void KeSequence2Batch(real *batch,
-                      real *sequence,
-                      const int *batchIndex,
-                      int seqWidth,
-                      int batchCount) {
+template <int blockDimX,
+          int blockDimY,
+          int gridDimX,
+          bool seq2batch,
+          bool isAdd>
+__global__ void KeSequence2Batch(real* batch,
+                                 real* sequence,
+                                 const int* batchIndex,
+                                 int seqWidth,
+                                 int batchCount) {
   int idx = threadIdx.x;
   int idy = threadIdx.y;
   int id = blockIdx.x + idy * gridDimX;
   while (id < batchCount) {
     int seqId = batchIndex[id];
-    real* batchData = batch + id*seqWidth;
-    real* seqData = sequence + seqId*seqWidth;
+    real* batchData = batch + id * seqWidth;
+    real* seqData = sequence + seqId * seqWidth;
     for (int i = idx; i < seqWidth; i += blockDimX) {
       if (seq2batch) {
         if (isAdd) {
@@ -147,13 +144,13 @@ void KeSequence2Batch(real *batch,
         }
       }
     }
-    id += blockDimY*gridDimX;
+    id += blockDimY * gridDimX;
   }
 }
 
-void hl_sequence2batch_copy(real *batch,
-                            real *sequence,
-                            const int *batchIndex,
+void hl_sequence2batch_copy(real* batch,
+                            real* sequence,
+                            const int* batchIndex,
                             int seqWidth,
                             int batchCount,
                             bool seq2batch) {
@@ -164,18 +161,18 @@ void hl_sequence2batch_copy(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_copy failed");
 }
 
-void hl_sequence2batch_add(real *batch,
-                           real *sequence,
-                           int *batchIndex,
+void hl_sequence2batch_add(real* batch,
+                           real* sequence,
+                           int* batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {
@@ -186,23 +183,22 @@ void hl_sequence2batch_add(real *batch,
   dim3 threads(128, 8);
   dim3 grid(8, 1);
   if (seq2batch) {
-    KeSequence2Batch<128, 8, 8, 1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   } else {
-    KeSequence2Batch<128, 8, 8, 0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (batch, sequence, batchIndex, seqWidth, batchCount);
+    KeSequence2Batch<128, 8, 8, 0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        batch, sequence, batchIndex, seqWidth, batchCount);
   }
   CHECK_SYNC("hl_sequence2batch_add failed");
 }
 
-template<bool normByTimes, bool seq2batch>
-__global__
-void KeSequence2BatchPadding(real* batch,
-                             real* sequence,
-                             const int* sequenceStartPositions,
-                             const size_t sequenceWidth,
-                             const size_t maxSequenceLength,
-                             const size_t numSequences) {
+template <bool normByTimes, bool seq2batch>
+__global__ void KeSequence2BatchPadding(real* batch,
+                                        real* sequence,
+                                        const int* sequenceStartPositions,
+                                        const size_t sequenceWidth,
+                                        const size_t maxSequenceLength,
+                                        const size_t numSequences) {
   int batchIdx = blockIdx.y;
   int sequenceStart = sequenceStartPositions[batchIdx];
   int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;
@@ -269,45 +265,56 @@ void hl_sequence2batch_copy_padding(real* batch,
   int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
   dim3 threads(blockDimX, blockDimY);
 
-  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
-      CUDA_BLOCK_SIZE;
+  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
   int gridDimY = numSequences;
   dim3 grid(gridDimX, gridDimY);
 
   if (seq2batch) {
     /* sequence -> batch */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-              batch, sequence, sequenceStartPositions,
-              sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
-              batch, sequence, sequenceStartPositions,
-              sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   } else {
     /* batch -> sequence */
     if (normByTimes) {
-      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-              batch, sequence, sequenceStartPositions,
-              sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<1, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     } else {
-      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
-              batch, sequence, sequenceStartPositions,
-              sequenceWidth, maxSequenceLength, numSequences);
+      KeSequence2BatchPadding<0, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          batch,
+          sequence,
+          sequenceStartPositions,
+          sequenceWidth,
+          maxSequenceLength,
+          numSequences);
     }
   }
 
   CHECK_SYNC("hl_sequence2batch_copy_padding failed");
 }
 
-__device__ inline float my_rsqrt(float x) {
-  return rsqrtf(x);
-}
+__device__ inline float my_rsqrt(float x) { return rsqrtf(x); }
 
-__device__ inline double my_rsqrt(double x) {
-  return rsqrt(x);
-}
+__device__ inline double my_rsqrt(double x) { return rsqrt(x); }
 
 __global__ void KeSequenceAvgForward(real* dst,
                                      real* src,
@@ -328,9 +335,9 @@ __global__ void KeSequenceAvgForward(real* dst,
     for (int i = start; i < end; i++) {
       sum += src[i * width + col];
     }
-    sum = mode == 1 ? sum :
-        (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[gid] = sum;
+    sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength
+                                       : sum * my_rsqrt((real)seqLength));
+    dst[gid] += sum;
   }
 }
 
@@ -348,10 +355,10 @@ void hl_sequence_avg_forward(real* dst,
   int grid = DIVUP(width * height, 512);
 
   CHECK(mode == 0 || mode == 1 || mode == 2)
-    << "mode error in hl_sequence_avg_forward!";
+      << "mode error in hl_sequence_avg_forward!";
 
-  KeSequenceAvgForward<<< grid, block, 0, STREAM_DEFAULT >>>
-           (dst, src, starts, height, width, mode);
+  KeSequenceAvgForward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_forward failed");
 }
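
Mode 0 averages over the sequence, mode 1 takes the plain sum, and mode 2 scales the sum by 1/sqrt(length) (the my_rsqrt path above). A host-side sketch of the dispatch:

#include <cmath>

float sequenceAvg(float sum, int seqLength, int mode) {
  if (mode == 1) return sum;                 // plain sum
  if (mode == 0) return sum / seqLength;     // mean
  return sum / std::sqrt((float)seqLength);  // mode 2: sqrt-normalized
}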
 
@@ -371,8 +378,8 @@ __global__ void KeSequenceAvgBackward(real* dst,
     int seqLength = end - start;
     if (seqLength == 0) return;
     real grad = src[gid];
-    grad = mode == 1 ? grad :
-        (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+    grad = mode == 1 ? grad : (mode == 0 ? grad / seqLength
+                                         : grad * my_rsqrt((real)seqLength));
     for (int i = start; i < end; i++) {
       dst[i * width + col] += grad;
     }
@@ -393,9 +400,9 @@ void hl_sequence_avg_backward(real* dst,
   int grid = DIVUP(width * height, 512);
 
   CHECK(mode == 0 || mode == 1 || mode == 2)
-    << "mode error in hl_sequence_avg_backward!";
+      << "mode error in hl_sequence_avg_backward!";
 
-  KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
-           (dst, src, starts, height, width, mode);
+  KeSequenceAvgBackward<<<grid, block, 0, STREAM_DEFAULT>>>(
+      dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_backward failed");
 }
diff --git a/paddle/cuda/src/hl_cuda_sparse.cu b/paddle/cuda/src/hl_cuda_sparse.cu
index ab9ab57c88..6351e7e01e 100644
--- a/paddle/cuda/src/hl_cuda_sparse.cu
+++ b/paddle/cuda/src/hl_cuda_sparse.cu
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "hl_cuda.h"
+#include "hl_cuda_sparse.cuh"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 #include "hl_sparse.h"
 #include "hl_sparse.ph"
-#include "hl_matrix_ops.cuh"
-#include "hl_matrix_apply.cuh"
-#include "hl_cuda_sparse.cuh"
 #include "paddle/utils/Logging.h"
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(mul_scalar, ONE_PARAMETER, a = a * p);
@@ -34,15 +33,15 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
   CHECK(A_d->format == HL_SPARSE_CSR) << "matrix format error!";
 
   if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(
-        unary::Zero<real>(), C_d, dimM, dimN, dimN);
+    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
     return;
   }
 
   /* nnz != 0 */
   hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
-  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) &&
-        A_d2->csr_row && A_d2->csr_col) << "parameter transa error!";
+  CHECK((A_d2->csr_val || A_d->type == HL_NO_VALUE) && A_d2->csr_row &&
+        A_d2->csr_col)
+      << "parameter transa error!";
 
   int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
   int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
@@ -50,21 +49,11 @@ void hl_matrix_csr2dense(hl_sparse_matrix_s A_d,
   dim3 grid(blocksX, blocksY);
 
   if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsr2Dense<0>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
-                                             A_d2->csr_row,
-                                             A_d2->csr_col,
-                                             C_d,
-                                             dimM,
-                                             dimN);
+    KeSMatrixCsr2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
   } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsr2Dense<1>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
-                                             A_d2->csr_row,
-                                             A_d2->csr_col,
-                                             C_d,
-                                             dimM,
-                                             dimN);
+    KeSMatrixCsr2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, C_d, dimM, dimN);
   } else {
   }
   CHECK_SYNC("hl_matrix_csr2dense failed");
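
Note: the `<0>`/`<1>` template argument mirrors the value type: `HL_NO_VALUE` matrices store only the pattern, so a stored position presumably expands to 1.0, while the `<1>` instantiation reads `csr_val`. A serial reference for the expansion, a sketch under that assumption rather than the kernel itself:

    /* Serial csr2dense: zero-fill, then scatter the stored entries. */
    void csr2dense_ref(const real* val, const int* row, const int* col,
                       real* C, int dimM, int dimN, bool has_values) {
      for (int i = 0; i < dimM * dimN; ++i) C[i] = 0;
      for (int r = 0; r < dimM; ++r) {
        for (int p = row[r]; p < row[r + 1]; ++p) {
          C[r * dimN + col[p]] = has_values ? val[p] : (real)1.0;
        }
      }
    }
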
@@ -80,15 +69,15 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
   CHECK(A_d->format == HL_SPARSE_CSC) << "matrix format error!";
 
   if (A_d->nnz == 0) {
-    hl_gpu_apply_unary_op(
-        unary::Zero<real>(), C_d, dimM, dimN, dimN);
+    hl_gpu_apply_unary_op(unary::Zero<real>(), C_d, dimM, dimN, dimN);
     return;
   }
 
   /* nnz != 0 */
   hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
-  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) &&
-        A_d2->csc_row && A_d2->csc_col) << "parameter transa error!";
+  CHECK((A_d2->csc_val || A_d->type == HL_NO_VALUE) && A_d2->csc_row &&
+        A_d2->csc_col)
+      << "parameter transa error!";
 
   int blocksX = (dimN + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
   int blocksY = (dimM + CU_CSR2DENSE_THREAD_X - 1) / CU_CSR2DENSE_THREAD_X;
@@ -96,21 +85,11 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
   dim3 grid(blocksX, blocksY);
 
   if (A_d->type == HL_NO_VALUE) {
-    KeSMatrixCsc2Dense<0>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csc_val,
-                                             A_d2->csc_row,
-                                             A_d2->csc_col,
-                                             C_d,
-                                             dimM,
-                                             dimN);
+    KeSMatrixCsc2Dense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
   } else if (A_d->type == HL_FLOAT_VALUE) {
-    KeSMatrixCsc2Dense<1>
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(A_d2->csc_val,
-                                             A_d2->csc_row,
-                                             A_d2->csc_col,
-                                             C_d,
-                                             dimM,
-                                             dimN);
+    KeSMatrixCsc2Dense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        A_d2->csc_val, A_d2->csc_row, A_d2->csc_col, C_d, dimM, dimN);
   } else {
   }
   CHECK_SYNC("hl_matrix_csc2dense failed");
@@ -118,43 +97,43 @@ void hl_matrix_csc2dense(hl_sparse_matrix_s A_d,
 
 void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
                              hl_matrix_format_t format,
-                             hl_matrix_value_t  value_type,
+                             hl_matrix_value_t value_type,
                              int dimM,
                              int dimN,
                              int nnz) {
   CHECK_NOTNULL(A_d);
   CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-    << "sparse matrix format error!";
+      << "sparse matrix format error!";
   CHECK(value_type == HL_FLOAT_VALUE || value_type == HL_NO_VALUE)
-    << "sparse matrix value type error!";
+      << "sparse matrix value type error!";
   /* avoid malloc 0 bytes */
   int nnz_s = (nnz == 0 ? 1 : nnz);
 
   if (format == HL_SPARSE_CSR) {
     CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
 
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csr_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
     CHECK_NOTNULL(tmp);
 
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s));
+    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
     csr->sparsity = -1.0;
 
     if (value_type == HL_NO_VALUE) {
       csr->csr_val = NULL;
       csr->nnz_s = nnz_s;
-      csr->row_s = dimM+1;
-      csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int));
-      csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int));
+      csr->row_s = dimM + 1;
+      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
+      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
 
       *A_d = (hl_sparse_matrix_s)tmp;
       (*A_d)->matrix = (hl_matrix_s)csr;
     } else if (value_type == HL_FLOAT_VALUE) {
       csr->nnz_s = nnz_s;
-      csr->row_s = dimM+1;
-      csr->csr_val = (real*)hl_malloc_device((nnz_s)*sizeof(real));
-      csr->csr_row = (int*)hl_malloc_device((dimM+1)*sizeof(int));
-      csr->csr_col = (int*)hl_malloc_device((nnz_s)*sizeof(int));
+      csr->row_s = dimM + 1;
+      csr->csr_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
+      csr->csr_row = (int *)hl_malloc_device((dimM + 1) * sizeof(int));
+      csr->csr_col = (int *)hl_malloc_device((nnz_s) * sizeof(int));
 
       *A_d = (hl_sparse_matrix_s)tmp;
       (*A_d)->matrix = (hl_matrix_s)csr;
@@ -162,28 +141,28 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
   } else if (format == HL_SPARSE_CSC) {
     CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
 
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csc_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
     CHECK_NOTNULL(tmp);
 
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s));
+    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
     csc->sparsity = -1.0f;
 
     if (value_type == HL_NO_VALUE) {
       csc->csc_val = NULL;
       csc->nnz_s = nnz_s;
-      csc->col_s = dimN+1;
-      csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int));
-      csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int));
+      csc->col_s = dimN + 1;
+      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
 
       *A_d = (hl_sparse_matrix_s)tmp;
       (*A_d)->matrix = (hl_matrix_s)csc;
     } else if (value_type == HL_FLOAT_VALUE) {
       csc->nnz_s = nnz_s;
-      csc->col_s = dimN+1;
-      csc->csc_val = (real*)hl_malloc_device((nnz_s)*sizeof(real));
-      csc->csc_row = (int*)hl_malloc_device((nnz_s)*sizeof(int));
-      csc->csc_col = (int*)hl_malloc_device((dimN+1)*sizeof(int));
+      csc->col_s = dimN + 1;
+      csc->csc_val = (real *)hl_malloc_device((nnz_s) * sizeof(real));
+      csc->csc_row = (int *)hl_malloc_device((nnz_s) * sizeof(int));
+      csc->csc_col = (int *)hl_malloc_device((dimN + 1) * sizeof(int));
 
       *A_d = (hl_sparse_matrix_s)tmp;
       (*A_d)->matrix = (hl_matrix_s)csc;
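
Note: in both branches the handle is a single host malloc holding the `_hl_sparse_matrix_s` header immediately followed by the format descriptor, while the index/value arrays live in device memory; `nnz_s = (nnz == 0 ? 1 : nnz)` keeps every device allocation at least one element long. The resulting CSR device footprint as a sketch (assuming `real` is `float`):

    #include <stddef.h>

    /* Device bytes reserved by the CSR branch above. */
    size_t csr_device_bytes(int dimM, int nnz, bool with_values) {
      int nnz_s = (nnz == 0 ? 1 : nnz); /* avoid zero-byte mallocs */
      size_t bytes = (size_t)(dimM + 1) * sizeof(int) /* csr_row */
                     + (size_t)nnz_s * sizeof(int);   /* csr_col */
      if (with_values) bytes += (size_t)nnz_s * sizeof(real); /* csr_val */
      return bytes;
    }
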
@@ -200,7 +179,7 @@ void hl_malloc_sparse_matrix(hl_sparse_matrix_s *A_d,
 void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
   CHECK_NOTNULL(A_d);
   CHECK(A_d->format == HL_SPARSE_CSR || A_d->format == HL_SPARSE_CSC)
-    << "sparse matrix format error!";
+      << "sparse matrix format error!";
 
   if (A_d->matrix == NULL) {
     free(A_d);
@@ -249,77 +228,77 @@ void hl_free_sparse_matrix(hl_sparse_matrix_s A_d) {
 }
 
 void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                void * dest_d,
+                                void *dest_d,
                                 size_t size,
                                 hl_matrix_format_t format,
-                                hl_matrix_value_t  value_type,
+                                hl_matrix_value_t value_type,
                                 int dimM,
                                 int dimN,
                                 int nnz) {
   CHECK_NOTNULL(A_d);
   CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-    << "sparse matrix format error!";
+      << "sparse matrix format error!";
 
   if (format == HL_SPARSE_CSR) {
     CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
 
-    size_t size_ = (dimM+1)*sizeof(int) + nnz*sizeof(int);
+    size_t size_ = (dimM + 1) * sizeof(int) + nnz * sizeof(int);
     if (value_type != HL_NO_VALUE) {
-      size_ += nnz*sizeof(real);
+      size_ += nnz * sizeof(real);
     }
     CHECK_LE(size_, size) << "dest_d size(" << size
-      << ") too small, should bigger than(" << size_ << ")!";
+                          << ") too small, should be bigger than(" << size_
+                          << ")!";
 
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csr_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
     CHECK_NOTNULL(tmp);
 
-    hl_csr_matrix csr = (hl_csr_matrix)(tmp+sizeof(_hl_sparse_matrix_s));
+    hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
 
     if (value_type == HL_NO_VALUE) {
       csr->csr_val = NULL;
-      csr->csr_row = (int*)dest_d;
-      csr->csr_col = (int*)((char*)dest_d + (dimM+1)*sizeof(int));
+      csr->csr_row = (int *)dest_d;
+      csr->csr_col = (int *)((char *)dest_d + (dimM + 1) * sizeof(int));
     } else {
-      csr->csr_val = (real*)dest_d;
-      csr->csr_row = (int*)((char*)dest_d + nnz*sizeof(real));
-      csr->csr_col = (int*)((char*)dest_d +
-                            nnz*sizeof(real) +
-                            (dimM+1)*sizeof(int));
+      csr->csr_val = (real *)dest_d;
+      csr->csr_row = (int *)((char *)dest_d + nnz * sizeof(real));
+      csr->csr_col = (int *)((char *)dest_d + nnz * sizeof(real) +
+                             (dimM + 1) * sizeof(int));
     }
     csr->nnz_s = nnz;
-    csr->row_s = dimM+1;
+    csr->row_s = dimM + 1;
     csr->sparsity = -1.0;
     *A_d = (hl_sparse_matrix_s)tmp;
     (*A_d)->matrix = (hl_matrix_s)csr;
   } else if (format == HL_SPARSE_CSC) {
     CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
 
-    size_t size_ = (dimN+1)*sizeof(int) + nnz*sizeof(int);
+    size_t size_ = (dimN + 1) * sizeof(int) + nnz * sizeof(int);
     if (value_type != HL_NO_VALUE) {
-      size_ += nnz*sizeof(real);
+      size_ += nnz * sizeof(real);
     }
     CHECK_LE(size_, size) << "dest_d size(" << size
-      << ") too small, should bigger than(" << size_ << ")!";
+                          << ") too small, should be bigger than(" << size_
+                          << ")!";
 
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csc_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
     CHECK_NOTNULL(tmp);
 
-    hl_csc_matrix csc = (hl_csc_matrix)(tmp+sizeof(_hl_sparse_matrix_s));
+    hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
     if (value_type == HL_NO_VALUE) {
       csc->csc_val = NULL;
-      csc->csc_col = (int*)dest_d;
-      csc->csc_row = (int*)((char*)dest_d + (dimN+1)*sizeof(int));
+      csc->csc_col = (int *)dest_d;
+      csc->csc_row = (int *)((char *)dest_d + (dimN + 1) * sizeof(int));
     } else {
-      csc->csc_val = (real*)dest_d;
-      csc->csc_col = (int*)((char*)dest_d + nnz*sizeof(real));
-      csc->csc_row = (int*)((char*)dest_d +
-                            nnz*sizeof(real) +
-                            (dimN+1)*sizeof(int));
+      csc->csc_val = (real *)dest_d;
+      csc->csc_col = (int *)((char *)dest_d + nnz * sizeof(real));
+      csc->csc_row = (int *)((char *)dest_d + nnz * sizeof(real) +
+                             (dimN + 1) * sizeof(int));
     }
     csc->nnz_s = nnz;
-    csc->col_s = dimN+1;
+    csc->col_s = dimN + 1;
     csc->sparsity = -1.0f;
     *A_d = (hl_sparse_matrix_s)tmp;
     (*A_d)->matrix = (hl_matrix_s)csc;
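
Note: in this overload the caller provides one device buffer and the function carves the arrays out of it at fixed offsets: values first (when present), then the `dimM + 1` row offsets, then the `nnz` column indices, which is exactly what the preceding `size_` check guards. The same arithmetic as a standalone sketch:

    /* Carve csr_val / csr_row / csr_col out of a single buffer. */
    void csr_carve(void* dest_d, int dimM, int nnz, bool has_values,
                   real** val, int** row, int** col) {
      char* p = (char*)dest_d;
      *val = has_values ? (real*)p : NULL;
      if (has_values) p += (size_t)nnz * sizeof(real);
      *row = (int*)p;
      p += (size_t)(dimM + 1) * sizeof(int);
      *col = (int*)p;
    }
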
@@ -333,11 +312,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
 }
 
 void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
-                                real* value_d,
-                                int* rows_d,
-                                int* cols_d,
+                                real *value_d,
+                                int *rows_d,
+                                int *cols_d,
                                 hl_matrix_format_t format,
-                                hl_matrix_value_t  value_type,
+                                hl_matrix_value_t value_type,
                                 int dimM,
                                 int dimN,
                                 int nnz) {
@@ -345,11 +324,11 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
   CHECK(dimM > 0 && nnz >= 0) << "sparse matrix size error!";
 
   CHECK(format == HL_SPARSE_CSR || format == HL_SPARSE_CSC)
-    << "sparse matrix format error!";
+      << "sparse matrix format error!";
 
   if (format == HL_SPARSE_CSR) {
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csr_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csr_matrix));
     CHECK_NOTNULL(tmp);
 
     hl_csr_matrix csr = (hl_csr_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
@@ -362,8 +341,8 @@ void hl_construct_sparse_matrix(hl_sparse_matrix_s *A_d,
     *A_d = (hl_sparse_matrix_s)tmp;
     (*A_d)->matrix = (hl_matrix_s)csr;
   } else if (format == HL_SPARSE_CSC) {
-    char* tmp = (char*)malloc(sizeof(_hl_sparse_matrix_s)
-                              + sizeof(_hl_csc_matrix));
+    char *tmp =
+        (char *)malloc(sizeof(_hl_sparse_matrix_s) + sizeof(_hl_csc_matrix));
     CHECK_NOTNULL(tmp);
 
     hl_csc_matrix csc = (hl_csc_matrix)(tmp + sizeof(_hl_sparse_matrix_s));
@@ -396,35 +375,30 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
                           hl_stream_t stream) {
   CHECK_NOTNULL(csr_matrix);
   CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-    << "csr_matrix is not csr format!";
+      << "csr_matrix is not in csr format!";
   CHECK_NOTNULL(csr_matrix->matrix);
 
   hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  CHECK_LE(csr_matrix->nnz, csr->nnz_s)
-    << "copy size " << csr_matrix->nnz
-    << " is big than alloc size " << csr->nnz_s;
+  CHECK_LE(csr_matrix->nnz, csr->nnz_s) << "copy size " << csr_matrix->nnz
+                                        << " is bigger than alloc size "
+                                        << csr->nnz_s;
 
-  CHECK_LE((csr_matrix->rows+1), csr->row_s)
-    << "copy size " << (csr_matrix->rows + 1)
-    << " is big than alloc size " << csr->row_s;
+  CHECK_LE((csr_matrix->rows + 1), csr->row_s)
+      << "copy size " << (csr_matrix->rows + 1) << " is bigger than alloc size "
+      << csr->row_s;
 
-  CHECK(csr_matrix->type == HL_FLOAT_VALUE ||
-        csr_matrix->type == HL_NO_VALUE)
-        << "sparse matrix value type error!";
+  CHECK(csr_matrix->type == HL_FLOAT_VALUE || csr_matrix->type == HL_NO_VALUE)
+      << "sparse matrix value type error!";
 
   if (csr_matrix->type == HL_NO_VALUE) {
     if (csr_row == NULL && csr_col == NULL) {
       return;
     } else if (csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(csr->csr_row,
-                      csr_row,
-                      (csr_matrix->rows+1)*sizeof(int),
-                      stream);
+      hl_memcpy_async(
+          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
 
-      hl_memcpy_async(csr->csr_col,
-                      csr_col,
-                      (csr_matrix->nnz)*sizeof(int),
-                      stream);
+      hl_memcpy_async(
+          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
     } else {
       LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
     }
@@ -432,30 +406,21 @@ void hl_memcpy_csr_matrix(hl_sparse_matrix_s csr_matrix,
     if (csr_val == NULL && csr_row == NULL && csr_col == NULL) {
       return;
     } else if (csr_val != NULL && csr_row == NULL && csr_col == NULL) {
-      hl_memcpy_async(csr->csr_val,
-                      csr_val,
-                      (csr_matrix->nnz)*sizeof(real),
-                      stream);
+      hl_memcpy_async(
+          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
     } else if (csr_val != NULL && csr_row != NULL && csr_col != NULL) {
-      hl_memcpy_async(csr->csr_val,
-                      csr_val,
-                      (csr_matrix->nnz)*sizeof(real),
-                      stream);
-      hl_memcpy_async(csr->csr_row,
-                      csr_row,
-                      (csr_matrix->rows+1)*sizeof(int),
-                      stream);
-      hl_memcpy_async(csr->csr_col,
-                      csr_col,
-                      (csr_matrix->nnz)*sizeof(int),
-                      stream);
+      hl_memcpy_async(
+          csr->csr_val, csr_val, (csr_matrix->nnz) * sizeof(real), stream);
+      hl_memcpy_async(
+          csr->csr_row, csr_row, (csr_matrix->rows + 1) * sizeof(int), stream);
+      hl_memcpy_async(
+          csr->csr_col, csr_col, (csr_matrix->nnz) * sizeof(int), stream);
     } else {
       LOG(FATAL) << "parameter csr_row or csr_col is null pointer!";
     }
   }
 
-  csr->sparsity = ((float)csr_matrix->nnz) /
-                  ((float)csr_matrix->rows) /
+  csr->sparsity = ((float)csr_matrix->nnz) / ((float)csr_matrix->rows) /
                   ((float)csr_matrix->cols);
 }
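
Note: the cached `sparsity` is just the fill ratio nnz / (rows * cols), written as two successive divisions; for example, 50000 stored entries in a 1000 x 1000 matrix give 0.05. The scalar equivalent:

    /* Fill ratio, using the same two-division form as above. */
    float fill_ratio(int nnz, int rows, int cols) {
      return ((float)nnz) / ((float)rows) / ((float)cols);
    }
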
 
@@ -466,33 +431,28 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
                           hl_stream_t stream) {
   CHECK_NOTNULL(csc_matrix);
   CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-    << "csc_matrix is not csc format error!";
+      << "csc_matrix is not in csc format!";
 
   hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  CHECK_LE(csc_matrix->nnz, csc->nnz_s)
-    << "copy size " << csc_matrix->nnz
-    << " is big than alloc size " << csc->nnz_s;
+  CHECK_LE(csc_matrix->nnz, csc->nnz_s) << "copy size " << csc_matrix->nnz
+                                        << " is bigger than alloc size "
+                                        << csc->nnz_s;
 
-  CHECK_LE((csc_matrix->cols+1), csc->col_s)
-    << "copy size " <<(csc_matrix->cols + 1)
-    << " is big than alloc size " << csc->col_s;
+  CHECK_LE((csc_matrix->cols + 1), csc->col_s)
+      << "copy size " << (csc_matrix->cols + 1) << " is bigger than alloc size "
+      << csc->col_s;
 
-  CHECK(csc_matrix->type == HL_FLOAT_VALUE ||
-        csc_matrix->type == HL_NO_VALUE)
-        << "sparse matrix value type error!";
+  CHECK(csc_matrix->type == HL_FLOAT_VALUE || csc_matrix->type == HL_NO_VALUE)
+      << "sparse matrix value type error!";
 
   if (csc_matrix->type == HL_NO_VALUE) {
     if (csc_row == NULL && csc_col == NULL) {
       return;
     } else if (csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(csc->csc_row,
-                      csc_row,
-                      (csc_matrix->nnz)*sizeof(int),
-                      stream);
-      hl_memcpy_async(csc->csc_col,
-                      csc_col,
-                      (csc_matrix->cols+1)*sizeof(int),
-                      stream);
+      hl_memcpy_async(
+          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
+      hl_memcpy_async(
+          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
     } else {
       LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
     }
@@ -500,30 +460,21 @@ void hl_memcpy_csc_matrix(hl_sparse_matrix_s csc_matrix,
     if (csc_val == NULL && csc_row == NULL && csc_col == NULL) {
       return;
     } else if (csc_val != NULL && csc_row == NULL && csc_col == NULL) {
-      hl_memcpy_async(csc->csc_val,
-                      csc_val,
-                      (csc_matrix->nnz)*sizeof(real),
-                      stream);
+      hl_memcpy_async(
+          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
     } else if (csc_val != NULL && csc_row != NULL && csc_col != NULL) {
-      hl_memcpy_async(csc->csc_val,
-                      csc_val,
-                      (csc_matrix->nnz)*sizeof(real),
-                      stream);
-      hl_memcpy_async(csc->csc_row,
-                      csc_row,
-                      (csc_matrix->nnz)*sizeof(int),
-                      stream);
-      hl_memcpy_async(csc->csc_col,
-                      csc_col,
-                      (csc_matrix->cols+1)*sizeof(int),
-                      stream);
+      hl_memcpy_async(
+          csc->csc_val, csc_val, (csc_matrix->nnz) * sizeof(real), stream);
+      hl_memcpy_async(
+          csc->csc_row, csc_row, (csc_matrix->nnz) * sizeof(int), stream);
+      hl_memcpy_async(
+          csc->csc_col, csc_col, (csc_matrix->cols + 1) * sizeof(int), stream);
     } else {
       LOG(FATAL) << "parameter csc_row or csc_col is null pointer!";
     }
   }
 
-  csc->sparsity = ((float)csc_matrix->nnz) /
-                  ((float)csc_matrix->rows) /
+  csc->sparsity = ((float)csc_matrix->nnz) / ((float)csc_matrix->rows) /
                   ((float)csc_matrix->cols);
 }
 
@@ -531,32 +482,23 @@ void hl_memcpy_sparse_matrix(hl_sparse_matrix_s dst,
                              hl_sparse_matrix_s src,
                              hl_stream_t stream) {
   CHECK(dst && src && dst->matrix && src->matrix)
-    << "parameter dst or src is null pointer!";
-  CHECK_EQ(dst->format, src->format)
-    << "sparse matrix format does not match!";
+      << "parameter dst or src is null pointer!";
+  CHECK_EQ(dst->format, src->format) << "sparse matrix format does not match!";
   CHECK(dst->type != HL_FLOAT_VALUE || src->type != HL_NO_VALUE)
-    << "src sparse matrix is no value, dst sparse matrix has value!";
+      << "src sparse matrix has no value, but dst sparse matrix has value!";
 
   if (dst->format == HL_SPARSE_CSR) {
     dst->rows = src->rows;
     dst->cols = src->cols;
-    dst->nnz  = src->nnz;
+    dst->nnz = src->nnz;
     hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-    hl_memcpy_csr_matrix(dst,
-                         csr->csr_val,
-                         csr->csr_row,
-                         csr->csr_col,
-                         stream);
+    hl_memcpy_csr_matrix(dst, csr->csr_val, csr->csr_row, csr->csr_col, stream);
   } else if (dst->format == HL_SPARSE_CSC) {
     dst->rows = src->rows;
     dst->cols = src->cols;
-    dst->nnz  = src->nnz;
+    dst->nnz = src->nnz;
     hl_csc_matrix csc = (hl_csc_matrix)src->matrix;
-    hl_memcpy_csc_matrix(dst,
-                         csc->csc_val,
-                         csc->csc_row,
-                         csc->csc_col,
-                         stream);
+    hl_memcpy_csc_matrix(dst, csc->csc_val, csc->csc_row, csc->csc_col, stream);
   } else {
     LOG(FATAL) << "sparse matrix format error!";
   }
@@ -569,20 +511,24 @@ static void _beta_mul_c(real *c, int dimM, int dimN, real beta) {
   if (beta == 0.0) {
     hl_gpu_apply_unary_op(unary::Zero<real>(), c, dimM, dimN, dimN);
   } else {
-    if (beta != 1.0){
-      hl_gpu_apply_unary_op(
-        unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
+    if (beta != 1.0) {
+      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta), c, dimM, dimN, dimN);
     }
   }
 
   return;
 }
 
-void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
-                             real *B_d, hl_trans_op_t transb,
+void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d,
+                             hl_trans_op_t transa,
+                             real *B_d,
+                             hl_trans_op_t transb,
                              real *C_d,
-                             int dimM, int dimN, int dimK,
-                             real alpha, real beta) {
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
   CHECK_EQ(transb, HPPL_OP_N);
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
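
Note: `_beta_mul_c`, reformatted in the same hunk, implements C := beta * C with two fast paths: beta == 0 zero-fills without reading the (possibly uninitialized) output, and beta == 1 leaves C untouched. The scalar equivalent as a sketch:

    void beta_mul_c_ref(real* c, int n, real beta) {
      if (beta == 0.0) {
        for (int i = 0; i < n; ++i) c[i] = 0; /* old contents ignored */
      } else if (beta != 1.0) {
        for (int i = 0; i < n; ++i) c[i] *= beta;
      } /* beta == 1.0: no-op */
    }
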
@@ -592,7 +538,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
 
   if ((HPPL_OP_N == transa && (A_d->rows != dimM || A_d->cols != dimK)) ||
       (HPPL_OP_T == transa && (A_d->rows != dimK || A_d->cols != dimM))) {
-      LOG(FATAL) << "parameter error!";
+    LOG(FATAL) << "parameter error!";
   }
 
   if (A_d->nnz == 0) {
@@ -603,8 +549,7 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   /* nnz != 0 */
   hl_csr_matrix A_d2 = (hl_csr_matrix)(A_d->matrix);
   if ((A_d2->csr_val == NULL && A_d->type != HL_NO_VALUE) ||
-       A_d2->csr_row == NULL ||
-       A_d2->csr_col == NULL) {
+      A_d2->csr_row == NULL || A_d2->csr_col == NULL) {
     LOG(FATAL) << "parameter error!";
   }
 
@@ -617,63 +562,63 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
     /* sparsity pattern */
     // A_d->sparsity;
     if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csr_val,
-                                               A_d2->csr_col,
-                                               A_d2->csr_row,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixCsrMulDense<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csr_val,
-                                               A_d2->csr_col,
-                                               A_d2->csr_row,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else if (HPPL_OP_T == transa) {
     _beta_mul_c(C_d, dimM, dimN, beta);
 
-    int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) /
-                  CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) /
-                  CU_CSC_MUL_DENSE_BLOCK_K;
+    int blocksX =
+        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
+    int blocksY =
+        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
     dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
     dim3 grid(blocksX, blocksY);
     if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csr_val,
-                                               A_d2->csr_col,
-                                               A_d2->csr_row,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixCscMulDense<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csr_val,
-                                               A_d2->csr_col,
-                                               A_d2->csr_row,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csr_val,
+          A_d2->csr_col,
+          A_d2->csr_row,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else {
     LOG(FATAL) << "parameter transa error!";
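
Note: the `HPPL_OP_T` branch above launches `KeSMatrixCscMulDense` but passes the CSR arrays in swapped slots (`csr_col` where a row index is expected, `csr_row` where a column pointer is expected). That works because CSR storage of A is bit-identical to CSC storage of its transpose; only the interpretation of the arrays changes. A sketch of the reinterpretation (`sp_view` is illustrative):

    /* CSR(A) and CSC(transpose(A)) share the same three arrays. */
    struct sp_view {
      real* val;
      int* major_ptr; /* row_ptr in CSR; col_ptr in CSC */
      int* minor_idx; /* col_idx in CSR; row_idx in CSC */
    };

    struct sp_view csc_view_of_transpose(real* csr_val, int* csr_row,
                                         int* csr_col) {
      struct sp_view t = {csr_val, csr_row, csr_col}; /* no data moved */
      return t;
    }
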
@@ -682,11 +627,16 @@ void hl_matrix_csr_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   CHECK_SYNC("hl_matrix_csr_mul_dense failed");
 }
 
-void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d, hl_trans_op_t transb,
+void hl_matrix_dense_mul_csc(real *A_d,
+                             hl_trans_op_t transa,
+                             hl_sparse_matrix_s B_d,
+                             hl_trans_op_t transb,
                              real *C_d,
-                             int dimM, int dimN, int dimK,
-                             real alpha, real beta) {
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
   CHECK_EQ(transa, HPPL_OP_N);
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
@@ -698,8 +648,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
     LOG(FATAL) << "parameter dims error!";
   }
 
-  CHECK_EQ(B_d->format, HL_SPARSE_CSC)
-    << "matrix format error!";
+  CHECK_EQ(B_d->format, HL_SPARSE_CSC) << "matrix format error!";
 
   if (B_d->nnz == 0) {
     _beta_mul_c(C_d, dimM, dimN, beta);
@@ -709,8 +658,7 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
   /* nnz != 0 */
   hl_csc_matrix B_d2 = (hl_csc_matrix)(B_d->matrix);
   if ((B_d2->csc_val == NULL && B_d->type != HL_NO_VALUE) ||
-       B_d2->csc_row == NULL ||
-       B_d2->csc_col == NULL) {
+      B_d2->csc_row == NULL || B_d2->csc_col == NULL) {
     LOG(FATAL) << "parameter B is null!";
   }
 
@@ -721,60 +669,60 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
     dim3 grid(blocksX, blocksY);
 
     if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csc_val,
-                                               B_d2->csc_row,
-                                               B_d2->csc_col,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_row,
+          B_d2->csc_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixDenseMulCsc<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csc_val,
-                                               B_d2->csc_row,
-                                               B_d2->csc_col,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_row,
+          B_d2->csc_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else if (transb == HPPL_OP_T) {
     _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
+    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
+    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
     dim3 grid(blocksX, blocksY);
     if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csc_val,
-                                               B_d2->csc_col,
-                                               B_d2->csc_row,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_col,
+          B_d2->csc_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixDenseMulCsr<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csc_val,
-                                               B_d2->csc_col,
-                                               B_d2->csc_row,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csc_val,
+          B_d2->csc_col,
+          B_d2->csc_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else {
     LOG(FATAL) << "parameter transb error!";
@@ -783,24 +731,28 @@ void hl_matrix_dense_mul_csc(real *A_d, hl_trans_op_t transa,
   CHECK_SYNC("hl_matrix_dense_mul_csc failed");
 }
 
-void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
-                             hl_sparse_matrix_s B_d, hl_trans_op_t transb,
+void hl_matrix_dense_mul_csr(real *A_d,
+                             hl_trans_op_t transa,
+                             hl_sparse_matrix_s B_d,
+                             hl_trans_op_t transb,
                              real *C_d,
-                             int dimM, int dimN, int dimK,
-                             real alpha, real beta) {
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
   CHECK_EQ(transa, HPPL_OP_N);
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
 
-  if (dimM <= 0 || dimN <= 0 || dimK <= 0
-      || (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN))
-      || (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
+  if (dimM <= 0 || dimN <= 0 || dimK <= 0 ||
+      (transb == HPPL_OP_N && (B_d->rows != dimK || B_d->cols != dimN)) ||
+      (transb == HPPL_OP_T && (B_d->rows != dimN || B_d->cols != dimK))) {
     LOG(FATAL) << "parameter dims error!";
   }
 
-  CHECK_EQ(B_d->format, HL_SPARSE_CSR)
-    << "matrix format error!";
+  CHECK_EQ(B_d->format, HL_SPARSE_CSR) << "matrix format error!";
 
   if (B_d->nnz == 0) {
     _beta_mul_c(C_d, dimM, dimN, beta);
@@ -810,41 +762,40 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
   /* nnz != 0 */
   hl_csr_matrix B_d2 = (hl_csr_matrix)(B_d->matrix);
   if ((B_d2->csr_val == NULL && B_d->type != HL_NO_VALUE) ||
-       B_d2->csr_row == NULL ||
-       B_d2->csr_col == NULL) {
+      B_d2->csr_row == NULL || B_d2->csr_col == NULL) {
     LOG(FATAL) << "parameter transa error!";
   }
 
   if (transb == HPPL_OP_N) {
     _beta_mul_c(C_d, dimM, dimN, beta);
-    int blocksX = 1 + (dimK-1)/CU_DM_CSR_THREAD_X;
-    int blocksY = 1 + (dimM-1)/CU_DM_CSR_BLOCK_M;
+    int blocksX = 1 + (dimK - 1) / CU_DM_CSR_THREAD_X;
+    int blocksY = 1 + (dimM - 1) / CU_DM_CSR_BLOCK_M;
     dim3 threads(CU_DM_CSR_THREAD_X, CU_DM_CSR_THREAD_Y);
     dim3 grid(blocksX, blocksY);
     if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsr<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csr_val,
-                                               B_d2->csr_row,
-                                               B_d2->csr_col,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsr<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_row,
+          B_d2->csr_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixDenseMulCsr<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csr_val,
-                                               B_d2->csr_row,
-                                               B_d2->csr_col,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsr<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_row,
+          B_d2->csr_col,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else if (transb == HPPL_OP_T) {
     int blocksX = (dimM + CU_CSCMM_BLOCK_M_BEST - 1) / CU_CSCMM_BLOCK_M_BEST;
@@ -852,29 +803,29 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
     dim3 threads(CU_CSCMM_THREAD_X_BEST, CU_CSCMM_THREAD_Y_BEST);
     dim3 grid(blocksX, blocksY);
     if (B_d->type == HL_NO_VALUE) {
-      KeSMatrixDenseMulCsc<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csr_val,
-                                               B_d2->csr_col,
-                                               B_d2->csr_row,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsc<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_col,
+          B_d2->csr_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixDenseMulCsc<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d,
-                                               B_d2->csr_val,
-                                               B_d2->csr_col,
-                                               B_d2->csr_row,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixDenseMulCsc<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d,
+          B_d2->csr_val,
+          B_d2->csr_col,
+          B_d2->csr_row,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else {
     LOG(FATAL) << "parameter transb error!";
@@ -883,11 +834,16 @@ void hl_matrix_dense_mul_csr(real *A_d, hl_trans_op_t transa,
   CHECK_SYNC("hl_matrix_dense_mul_csr failed");
 }
 
-void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
-                             real *B_d, hl_trans_op_t transb,
+void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d,
+                             hl_trans_op_t transa,
+                             real *B_d,
+                             hl_trans_op_t transb,
                              real *C_d,
-                             int dimM, int dimN, int dimK,
-                             real alpha, real beta) {
+                             int dimM,
+                             int dimN,
+                             int dimK,
+                             real alpha,
+                             real beta) {
   CHECK_EQ(transb, HPPL_OP_N);
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
@@ -908,42 +864,43 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   /* nnz != 0 */
   hl_csc_matrix A_d2 = (hl_csc_matrix)(A_d->matrix);
   if ((A_d2->csc_val == NULL && A_d->type != HL_NO_VALUE) ||
-       A_d2->csc_row == NULL ||
-       A_d2->csc_col == NULL) {
+      A_d2->csc_row == NULL || A_d2->csc_col == NULL) {
     LOG(FATAL) << "parameter error!";
   }
 
   if (HPPL_OP_N == transa) {
     _beta_mul_c(C_d, dimM, dimN, beta);
 
-    int blocksX = (dimN + CU_CSC_MUL_DENSE_BLOCK_N -1)/CU_CSC_MUL_DENSE_BLOCK_N;
-    int blocksY = (dimK + CU_CSC_MUL_DENSE_BLOCK_K -1)/CU_CSC_MUL_DENSE_BLOCK_K;
+    int blocksX =
+        (dimN + CU_CSC_MUL_DENSE_BLOCK_N - 1) / CU_CSC_MUL_DENSE_BLOCK_N;
+    int blocksY =
+        (dimK + CU_CSC_MUL_DENSE_BLOCK_K - 1) / CU_CSC_MUL_DENSE_BLOCK_K;
     dim3 threads(CU_CSC_MUL_DENSE_THREAD_X, CU_CSC_MUL_DENSE_THREAD_Y);
     dim3 grid(blocksX, blocksY);
     if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCscMulDense<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csc_val,
-                                               A_d2->csc_row,
-                                               A_d2->csc_col,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCscMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixCscMulDense<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csc_val,
-                                               A_d2->csc_row,
-                                               A_d2->csc_col,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCscMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else if (HPPL_OP_T == transa) {
     int blocksX = (dimN + CU_CSRMM_BLOCK_N - 1) / CU_CSRMM_BLOCK_N;
@@ -954,29 +911,29 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
     /* sparsity pattern */
     // A_d->sparsity;
     if (A_d->type == HL_NO_VALUE) {
-      KeSMatrixCsrMulDense<0>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csc_val,
-                                               A_d2->csc_row,
-                                               A_d2->csc_col,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCsrMulDense<0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     } else {
-      KeSMatrixCsrMulDense<1>
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d,
-                                               A_d2->csc_val,
-                                               A_d2->csc_row,
-                                               A_d2->csc_col,
-                                               B_d,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
+      KeSMatrixCsrMulDense<1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d,
+          A_d2->csc_val,
+          A_d2->csc_row,
+          A_d2->csc_col,
+          B_d,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
     }
   } else {
     LOG(FATAL) << "parameter transa error!";
@@ -985,11 +942,16 @@ void hl_matrix_csc_mul_dense(hl_sparse_matrix_s A_d, hl_trans_op_t transa,
   CHECK_SYNC("hl_matrix_csc_mul_dense failed");
 }
 
-void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
-                          real *B_d, hl_trans_op_t transb,
-                          hl_sparse_matrix_s  C_d,
-                          int dimM, int dimN, int dimK,
-                          real alpha, real beta) {
+void hl_sparse_matrix_mul(real *A_d,
+                          hl_trans_op_t transa,
+                          real *B_d,
+                          hl_trans_op_t transb,
+                          hl_sparse_matrix_s C_d,
+                          int dimM,
+                          int dimN,
+                          int dimK,
+                          real alpha,
+                          real beta) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
   CHECK_NOTNULL(C_d);
@@ -1000,18 +962,14 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
 
   if (C_d->format == HL_SPARSE_CSC) {
     hl_csc_matrix C_d2 = (hl_csc_matrix)(C_d->matrix);
-    if (C_d2->csc_val == NULL ||
-        C_d2->csc_row == NULL ||
+    if (C_d2->csc_val == NULL || C_d2->csc_row == NULL ||
         C_d2->csc_col == NULL) {
       LOG(FATAL) << "parameter error!";
     }
 
     if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d2->csc_val,
-                            1,
-                            C_d->nnz,
-                            C_d->nnz);
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), C_d2->csc_val, 1, C_d->nnz, C_d->nnz);
     }
 
     int blocksX = dimN;
@@ -1020,34 +978,30 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
     dim3 grid(blocksX, blocksY);
     bool transA = transa == HPPL_OP_T ? 1 : 0;
     bool transB = transb == HPPL_OP_T ? 1 : 0;
-    KeSMatrixDenseMulDense2CSC
-      <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d2->csc_val,
-                                             C_d2->csc_row,
-                                             C_d2->csc_col,
-                                             A_d,
-                                             B_d,
-                                             transA,
-                                             transB,
-                                             dimM,
-                                             dimN,
-                                             dimK,
-                                             alpha,
-                                             beta);
+    KeSMatrixDenseMulDense2CSC<<<grid, threads, 0, STREAM_DEFAULT>>>(
+        C_d2->csc_val,
+        C_d2->csc_row,
+        C_d2->csc_col,
+        A_d,
+        B_d,
+        transA,
+        transB,
+        dimM,
+        dimN,
+        dimK,
+        alpha,
+        beta);
     CHECK_SYNC("hl_sparse_matrix_mul failed");
   } else {
     hl_csr_matrix C_d2 = (hl_csr_matrix)(C_d->matrix);
     if ((C_d2->csr_val == NULL && C_d->type != HL_NO_VALUE) ||
-         C_d2->csr_row == NULL ||
-         C_d2->csr_col == NULL) {
+        C_d2->csr_row == NULL || C_d2->csr_col == NULL) {
       LOG(FATAL) << "parameter error!";
     }
 
     if (beta != 1.0) {
-      hl_gpu_apply_unary_op(unary::mul_scalar<real>(beta),
-                            C_d2->csr_val,
-                            1,
-                            C_d->nnz,
-                            C_d->nnz);
+      hl_gpu_apply_unary_op(
+          unary::mul_scalar<real>(beta), C_d2->csr_val, 1, C_d->nnz, C_d->nnz);
     }
 
     bool transA = transa == HPPL_OP_T ? 1 : 0;
@@ -1058,20 +1012,20 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
       dim3 threads(CU_CSCMM_DMD2CSR_THREAD_X, 1);
       dim3 grid(blocksX, blocksY);
 
-      KeSMatrixDenseMulDense2CSR
-        <<<grid, threads, 0, STREAM_DEFAULT>>>(C_d2->csr_val,
-                                               C_d2->csr_row,
-                                               C_d2->csr_col,
-                                               A_d,
-                                               B_d,
-                                               transA,
-                                               transB,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
-     CHECK_SYNC("hl_sparse_matrix_mul failed");
+      KeSMatrixDenseMulDense2CSR<<<grid, threads, 0, STREAM_DEFAULT>>>(
+          C_d2->csr_val,
+          C_d2->csr_row,
+          C_d2->csr_col,
+          A_d,
+          B_d,
+          transA,
+          transB,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+      CHECK_SYNC("hl_sparse_matrix_mul failed");
     } else {
       CHECK(!transA) << "Not supported A is trans and B is not trans!";
 
@@ -1080,21 +1034,21 @@ void hl_sparse_matrix_mul(real *A_d, hl_trans_op_t transa,
       avgNnzPerRow = avgNnzPerRow > 0 ? avgNnzPerRow : 1;
       int gridx = DIVUP(avgNnzPerRow, CU_BLOCK_SIZE);
       dim3 grid(gridx, dimM);
-      KeSMatrixDenseMulDenseTrans2CSR
-         <<<grid, block, 0, STREAM_DEFAULT>>>(C_d2->csr_val,
-                                               C_d2->csr_row,
-                                               C_d2->csr_col,
-                                               A_d,
-                                               B_d,
-                                               transA,
-                                               transB,
-                                               dimM,
-                                               dimN,
-                                               dimK,
-                                               alpha,
-                                               beta);
-     CHECK_SYNC("hl_sparse_matrix_mul failed");
-   }
+      KeSMatrixDenseMulDenseTrans2CSR<<<grid, block, 0, STREAM_DEFAULT>>>(
+          C_d2->csr_val,
+          C_d2->csr_row,
+          C_d2->csr_col,
+          A_d,
+          B_d,
+          transA,
+          transB,
+          dimM,
+          dimN,
+          dimK,
+          alpha,
+          beta);
+      CHECK_SYNC("hl_sparse_matrix_mul failed");
+    }
   }
 }
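
Note: `hl_sparse_matrix_mul` evaluates a dense x dense product only at positions already present in C's sparse pattern; the pattern itself is never recomputed. A serial reference for the CSR case, under the assumptions that A and B are row-major and that, after the `mul_scalar(beta)` pre-pass, the kernel accumulates `alpha * dot` into each stored value:

    /* Sampled dense*dense -> CSR values (serial sketch). */
    void sampled_mm_csr(real* c_val, const int* c_row, const int* c_col,
                        const real* A, const real* B,
                        int dimM, int dimN, int dimK, real alpha) {
      for (int i = 0; i < dimM; ++i) {
        for (int p = c_row[i]; p < c_row[i + 1]; ++p) {
          int j = c_col[p];
          real dot = 0;
          for (int k = 0; k < dimK; ++k) {
            dot += A[i * dimK + k] * B[k * dimN + j];
          }
          c_val[p] += alpha * dot; /* beta handled by the pre-pass */
        }
      }
    }
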
 
@@ -1111,7 +1065,7 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
   CHECK_NOTNULL(csc_col);
 
   CHECK_EQ(csc_matrix->format, HL_SPARSE_CSC)
-     << "csc_matrix is not csc format error!";
+      << "csc_matrix is not in csc format!";
 
   if (csc_matrix->nnz > row_size ||
       csc_matrix->cols + 1 > static_cast<int>(col_size)) {
@@ -1119,20 +1073,20 @@ void hl_memcpy_from_csc_matrix(real *csc_val,
   }
 
   hl_csc_matrix csc = (hl_csc_matrix)(csc_matrix->matrix);
-  hl_memcpy_async((void*)csc_row,
-                  (void*)csc->csc_row,
+  hl_memcpy_async((void *)csc_row,
+                  (void *)csc->csc_row,
                   (csc_matrix->nnz) * sizeof(int),
                   stream);
-  hl_memcpy_async((void*)csc_col,
-                  (void*)csc->csc_col,
+  hl_memcpy_async((void *)csc_col,
+                  (void *)csc->csc_col,
                   (csc_matrix->cols + 1) * sizeof(int),
                   stream);
   if (csc_matrix->type == HL_FLOAT_VALUE) {
     if (csc_val != NULL) {
       CHECK_LE(csc_matrix->nnz, val_size) << "size not match!";
-      hl_memcpy_async((void*)csc_val,
-                      (void*)csc->csc_val,
-                      (csc_matrix->nnz)*sizeof(real),
+      hl_memcpy_async((void *)csc_val,
+                      (void *)csc->csc_val,
+                      (csc_matrix->nnz) * sizeof(real),
                       stream);
     } else {
       LOG(FATAL) << "parameter csr_val is null pointer!";
@@ -1152,7 +1106,7 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
   CHECK_NOTNULL(csr_row);
   CHECK_NOTNULL(csr_col);
   CHECK_EQ(csr_matrix->format, HL_SPARSE_CSR)
-    << "csr_matrix is not csr format error!";
+      << "csr_matrix is not in csr format!";
 
   if (csr_matrix->nnz > col_size ||
       csr_matrix->rows + 1 > static_cast<int>(row_size)) {
@@ -1160,20 +1114,20 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
   }
 
   hl_csr_matrix csr = (hl_csr_matrix)(csr_matrix->matrix);
-  hl_memcpy_async((void*)csr_row,
-                  (void*)csr->csr_row,
-                  (csr_matrix->rows+1)*sizeof(int),
+  hl_memcpy_async((void *)csr_row,
+                  (void *)csr->csr_row,
+                  (csr_matrix->rows + 1) * sizeof(int),
                   stream);
-  hl_memcpy_async((void*)csr_col,
-                  (void*)csr->csr_col,
-                  (csr_matrix->nnz)*sizeof(int),
+  hl_memcpy_async((void *)csr_col,
+                  (void *)csr->csr_col,
+                  (csr_matrix->nnz) * sizeof(int),
                   stream);
   if (csr_matrix->type == HL_FLOAT_VALUE) {
     if (csr_val != NULL) {
      CHECK_LE(csr_matrix->nnz, val_size) << "sizes do not match!";
-      hl_memcpy_async((void*)csr_val,
-                      (void*)csr->csr_val,
-                      (csr_matrix->nnz)*sizeof(real),
+      hl_memcpy_async((void *)csr_val,
+                      (void *)csr->csr_val,
+                      (csr_matrix->nnz) * sizeof(real),
                       stream);
     } else {
       LOG(FATAL) << "parameter csr_val is null pointer!";
@@ -1181,8 +1135,8 @@ void hl_memcpy_from_csr_matrix(real *csr_val,
   }
 }
 
-void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
-                                 int dimN, real scale) {
+void hl_sparse_matrix_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
   if (B_d->format == HL_SPARSE_CSR) {
     hl_matrix_csr_column_sum(A_d, B_d, dimM, dimN, scale);
   } else {
@@ -1190,8 +1144,8 @@ void hl_sparse_matrix_column_sum(real* A_d, hl_sparse_matrix_s B_d, int dimM,
   }
 }
 
-void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
-                              int dimM, int dimN, real scale) {
+void hl_matrix_csr_column_sum(
+    real *A_d, hl_sparse_matrix_s B_d, int dimM, int dimN, real scale) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
 
@@ -1216,8 +1170,7 @@ void hl_matrix_csr_column_sum(real* A_d, hl_sparse_matrix_s B_d,
   CHECK_SYNC("hl_matrix_csr_column_sum failed");
 }
 
-void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
-                               real* B_d, real scale) {
+void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
   if (A_d->format == HL_SPARSE_CSR) {
     hl_matrix_csr_add_bias(A_d, B_d, scale);
   } else {
@@ -1225,8 +1178,7 @@ void hl_sparse_matrix_add_bias(hl_sparse_matrix_s A_d,
   }
 }
 
-void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
-                            real scale) {
+void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real *B_d, real scale) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
 
@@ -1247,8 +1199,12 @@ void hl_matrix_csr_add_bias(hl_sparse_matrix_s A_d, real* B_d,
   CHECK_SYNC("hl_sparse_matrix_add_bias failed");
 }
 
-void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
-                                int dimN, real alpha, real beta) {
+void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d,
+                                real *B_d,
+                                int dimM,
+                                int dimN,
+                                real alpha,
+                                real beta) {
   if (A_d->format == HL_SPARSE_CSR) {
     hl_matrix_csr_add_dense(A_d, B_d, dimM, dimN, alpha, beta);
   } else {
@@ -1256,8 +1212,12 @@ void hl_sparse_matrix_add_dense(hl_sparse_matrix_s A_d, real *B_d, int dimM,
   }
 }
 
-void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
-                             int dimN, real alpha, real beta) {
+void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d,
+                             real *B_d,
+                             int dimM,
+                             int dimN,
+                             real alpha,
+                             real beta) {
   CHECK_NOTNULL(A_d);
   CHECK_NOTNULL(B_d);
 
@@ -1277,20 +1237,26 @@ void hl_matrix_csr_add_dense(hl_sparse_matrix_s A_d, real* B_d, int dimM,
   gridX = gridX > 0 ? gridX : 1;
   dim3 block(512, 1);
   dim3 grid(gridX, dimM);
-  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(
-    A_d2->csr_val, A_d2->csr_row, A_d2->csr_col, B_d, alpha, beta, dimM, dimN);
+  KeSMatrixCsrAddDense<<<grid, block, 0, STREAM_DEFAULT>>>(A_d2->csr_val,
+                                                           A_d2->csr_row,
+                                                           A_d2->csr_col,
+                                                           B_d,
+                                                           alpha,
+                                                           beta,
+                                                           dimM,
+                                                           dimN);
 
   CHECK_SYNC("hl_sparse_matrix_add_dense failed");
 }
 
-int* hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
+int *hl_sparse_matrix_get_rows(hl_sparse_matrix_s sMat) {
   __sparse_get_return__(sMat, row);
 }
 
-int* hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
+int *hl_sparse_matrix_get_cols(hl_sparse_matrix_s sMat) {
   __sparse_get_return__(sMat, col);
 }
 
-real* hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
+real *hl_sparse_matrix_get_value(hl_sparse_matrix_s sMat) {
   __sparse_get_return__(sMat, val);
 }
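
The hl_sparse.cu hunks above are formatting-only: clang-format keeps the <<<grid, block, shared, stream>>> launch configuration on the kernel-name line and, once the call overflows, gives each argument its own line. A minimal sketch of that launch shape, assuming a toy CSR column-sum kernel (KeCsrColumnSumSketch, csrColumnSumSketch, and the explicit cudaStream_t parameter standing in for STREAM_DEFAULT are illustrative, not part of this patch):

    #include <cuda_runtime.h>

    // Each nonzero of a CSR matrix contributes scale * value to its column sum.
    __global__ void KeCsrColumnSumSketch(
        float* sums, const float* csr_val, const int* csr_col, int nnz,
        float scale) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < nnz) {
        atomicAdd(&sums[csr_col[i]], scale * csr_val[i]);
      }
    }

    void csrColumnSumSketch(float* sums, const float* csr_val,
                            const int* csr_col, int nnz, float scale,
                            cudaStream_t stream) {
      dim3 block(256);
      dim3 grid((nnz + block.x - 1) / block.x);
      // Argument list broken one per line, matching the style above.
      KeCsrColumnSumSketch<<<grid, block, 0, stream>>>(sums,
                                                       csr_val,
                                                       csr_col,
                                                       nnz,
                                                       scale);
    }
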
diff --git a/paddle/cuda/src/hl_perturbation_util.cu b/paddle/cuda/src/hl_perturbation_util.cu
index 2a945bcdb8..d01a91561e 100644
--- a/paddle/cuda/src/hl_perturbation_util.cu
+++ b/paddle/cuda/src/hl_perturbation_util.cu
@@ -12,13 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
-#include <cmath>
 #include <stdlib.h>
-#include "hl_cuda.h"
-#include "hl_time.h"
+#include <cmath>
 #include "hl_base.h"
+#include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
+#include "hl_time.h"
 
 #define _USE_MATH_DEFINES
 
@@ -30,10 +29,16 @@ limitations under the License. */
  * centerX, centerY: translation.
  * sourceX, sourceY: output coordinates in the original image.
  */
-__device__ void getTranformCoord(int x, int y, real theta, real scale,
-                                 real tgtCenter, real imgCenter,
-                                 real centerR, real centerC,
-                                 int* sourceX, int* sourceY) {
+__device__ void getTranformCoord(int x,
+                                 int y,
+                                 real theta,
+                                 real scale,
+                                 real tgtCenter,
+                                 real imgCenter,
+                                 real centerR,
+                                 real centerC,
+                                 int* sourceX,
+                                 int* sourceY) {
   real H[4] = {cosf(-theta), -sinf(-theta), sinf(-theta), cosf(-theta)};
 
   // compute coordinates in the rotated and scaled image
@@ -57,11 +62,17 @@ __device__ void getTranformCoord(int x, int y, real theta, real scale,
  * created by Wei Xu (genome), converted by Jiang Wang
  */
 
-__global__ void kSamplingPatches(const real* imgs, real* targets,
-                                 int imgSize, int tgtSize, const int channels,
-                                 int samplingRate, const real* thetas,
-                                 const real* scales, const int* centerRs,
-                                 const int* centerCs, const real padValue,
+__global__ void kSamplingPatches(const real* imgs,
+                                 real* targets,
+                                 int imgSize,
+                                 int tgtSize,
+                                 const int channels,
+                                 int samplingRate,
+                                 const real* thetas,
+                                 const real* scales,
+                                 const int* centerRs,
+                                 const int* centerCs,
+                                 const real padValue,
                                  const int numImages) {
   const int caseIdx = blockIdx.x * 4 + threadIdx.x;
   const int pxIdx = blockIdx.y * 128 + threadIdx.y;
@@ -80,8 +91,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
     const int pxY = pxIdx / tgtSize;
 
     int srcPxX, srcPxY;
-    getTranformCoord(pxX, pxY, thetas[imgIdx], scales[imgIdx], tgtCenter,
-                     imgCenter, centerCs[caseIdx], centerRs[caseIdx], &srcPxX,
+    getTranformCoord(pxX,
+                     pxY,
+                     thetas[imgIdx],
+                     scales[imgIdx],
+                     tgtCenter,
+                     imgCenter,
+                     centerCs[caseIdx],
+                     centerRs[caseIdx],
+                     &srcPxX,
                      &srcPxY);
 
     imgs += (imgIdx * imgPixels + srcPxY * imgSize + srcPxX) * channels;
@@ -100,10 +118,15 @@ __global__ void kSamplingPatches(const real* imgs, real* targets,
  *
  * created by Wei Xu
  */
-void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
-                                int*& gpuCenterR, int*& gpuCenterC,
-                                int numImages, int imgSize, real rotateAngle,
-                                real scaleRatio, int samplingRate,
+void hl_generate_disturb_params(real*& gpuAngle,
+                                real*& gpuScaleRatio,
+                                int*& gpuCenterR,
+                                int*& gpuCenterC,
+                                int numImages,
+                                int imgSize,
+                                real rotateAngle,
+                                real scaleRatio,
+                                int samplingRate,
                                 bool isTrain) {
   // The number of output samples.
   int numPatches = numImages * samplingRate;
@@ -123,7 +146,8 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
     for (int i = 0; i < numImages; i++) {
       r_angle[i] =
           (rotateAngle * M_PI / 180.0) * (rand() / (RAND_MAX + 1.0)  // NOLINT
-                                          - 0.5);
+                                          -
+                                          0.5);
       s_ratio[i] =
           1 + (rand() / (RAND_MAX + 1.0) - 0.5) * scaleRatio;  // NOLINT
     }
@@ -140,8 +164,10 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
         int pxY =
             (int)(real(imgSize - 1) * rand() / (RAND_MAX + 1.0));  // NOLINT
 
-        const real H[4] = {cos(-r_angle[i]), -sin(-r_angle[i]),
-                           sin(-r_angle[i]), cos(-r_angle[i])};
+        const real H[4] = {cos(-r_angle[i]),
+                           -sin(-r_angle[i]),
+                           sin(-r_angle[i]),
+                           cos(-r_angle[i])};
         real x = pxX - imgCenter;
         real y = pxY - imgCenter;
         real xx = H[0] * x + H[1] * y;
@@ -185,9 +211,12 @@ void hl_generate_disturb_params(real*& gpuAngle, real*& gpuScaleRatio,
   delete[] center_c;
 }
 
-void hl_conv_random_disturb_with_params(const real* images, int imgSize,
-                                        int tgtSize, int channels,
-                                        int numImages, int samplingRate,
+void hl_conv_random_disturb_with_params(const real* images,
+                                        int imgSize,
+                                        int tgtSize,
+                                        int channels,
+                                        int numImages,
+                                        int samplingRate,
                                         const real* gpuRotationAngle,
                                         const real* gpuScaleRatio,
                                         const int* gpuCenterR,
@@ -202,29 +231,59 @@ void hl_conv_random_disturb_with_params(const real* images, int imgSize,
   dim3 threadsPerBlock(4, 128);
   dim3 numBlocks(DIVUP(numPatches, 4), DIVUP(targetSize, 128));
 
-  kSamplingPatches <<<numBlocks, threadsPerBlock>>>
-      (images, target, imgSize, tgtSize, channels, samplingRate,
-      gpuRotationAngle, gpuScaleRatio, gpuCenterR, gpuCenterC,
-      paddingValue, numImages);
+  kSamplingPatches<<<numBlocks, threadsPerBlock>>>(images,
+                                                   target,
+                                                   imgSize,
+                                                   tgtSize,
+                                                   channels,
+                                                   samplingRate,
+                                                   gpuRotationAngle,
+                                                   gpuScaleRatio,
+                                                   gpuCenterR,
+                                                   gpuCenterC,
+                                                   paddingValue,
+                                                   numImages);
 
   hl_device_synchronize();
 }
 
-void hl_conv_random_disturb(const real* images, int imgSize,
-                            int tgtSize, int channels, int numImages,
-                            real scaleRatio, real rotateAngle,
-                            int samplingRate, real* gpu_r_angle,
-                            real* gpu_s_ratio, int* gpu_center_r,
-                            int* gpu_center_c, int paddingValue,
-                            bool isTrain, real* targets) {
+void hl_conv_random_disturb(const real* images,
+                            int imgSize,
+                            int tgtSize,
+                            int channels,
+                            int numImages,
+                            real scaleRatio,
+                            real rotateAngle,
+                            int samplingRate,
+                            real* gpu_r_angle,
+                            real* gpu_s_ratio,
+                            int* gpu_center_r,
+                            int* gpu_center_c,
+                            int paddingValue,
+                            bool isTrain,
+                            real* targets) {
   // generate the random disturbance sequence and the sampling locations
-  hl_generate_disturb_params(gpu_r_angle, gpu_s_ratio, gpu_center_r,
-                  gpu_center_c, numImages, imgSize, rotateAngle,
-                  scaleRatio, samplingRate, isTrain);
-
-  hl_conv_random_disturb_with_params(
-                  images, imgSize, tgtSize, channels, numImages,
-                  samplingRate, gpu_r_angle, gpu_s_ratio,
-                  gpu_center_r, gpu_center_r, paddingValue,
-                  targets);
+  hl_generate_disturb_params(gpu_r_angle,
+                             gpu_s_ratio,
+                             gpu_center_r,
+                             gpu_center_c,
+                             numImages,
+                             imgSize,
+                             rotateAngle,
+                             scaleRatio,
+                             samplingRate,
+                             isTrain);
+
+  hl_conv_random_disturb_with_params(images,
+                                     imgSize,
+                                     tgtSize,
+                                     channels,
+                                     numImages,
+                                     samplingRate,
+                                     gpu_r_angle,
+                                     gpu_s_ratio,
+                                     gpu_center_r,
+                                     gpu_center_c,
+                                     paddingValue,
+                                     targets);
 }
diff --git a/paddle/cuda/src/hl_table_apply.cu b/paddle/cuda/src/hl_table_apply.cu
index 61edbe3ccc..d3b71c75e6 100644
--- a/paddle/cuda/src/hl_table_apply.cu
+++ b/paddle/cuda/src/hl_table_apply.cu
@@ -12,15 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "hl_base.h"
-#include "hl_device_functions.cuh"
 #include "hl_cuda.h"
+#include "hl_device_functions.cuh"
 #include "paddle/utils/Logging.h"
 
-template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
-__global__ void KeMatrixAddRows(real* output, int ldo,
-                                real* table, int ldt,
+template <int blockDimX, int blockDimY, int gridDimX, bool AddRow>
+__global__ void KeMatrixAddRows(real* output,
+                                int ldo,
+                                real* table,
+                                int ldt,
                                 int* ids,
                                 int numSamples,
                                 int tableSize,
@@ -31,8 +32,8 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   while (idy < numSamples) {
     int tableId = ids[idy];
     if ((0 <= tableId) && (tableId < tableSize)) {
-      real *out = output + idy * ldo;
-      real *tab = table + tableId * ldt;
+      real* out = output + idy * ldo;
+      real* tab = table + tableId * ldt;
       for (int i = idx; i < dim; i += blockDimX) {
         if (AddRow) {
           paddle::paddleAtomicAdd(&tab[i], out[i]);
@@ -45,8 +46,10 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
   }
 }
 
-void hl_matrix_select_rows(real* output, int ldo,
-                           real* table, int ldt,
+void hl_matrix_select_rows(real* output,
+                           int ldo,
+                           real* table,
+                           int ldt,
                            int* ids,
                            int numSamples,
                            int tableSize,
@@ -57,14 +60,16 @@ void hl_matrix_select_rows(real* output, int ldo,
 
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 0><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (output, ldo, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 0><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, ldo, table, ldt, ids, numSamples, tableSize, dim);
 
   CHECK_SYNC("hl_matrix_select_rows failed");
 }
 
-void hl_matrix_add_to_rows(real* table, int ldt,
-                           real* input, int ldi,
+void hl_matrix_add_to_rows(real* table,
+                           int ldt,
+                           real* input,
+                           int ldi,
                            int* ids,
                            int numSamples,
                            int tableSize,
@@ -75,16 +80,15 @@ void hl_matrix_add_to_rows(real* table, int ldt,
 
   dim3 threads(128, 8);
   dim3 grid(8, 1);
-  KeMatrixAddRows<128, 8, 8, 1><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (input, ldi, table, ldt, ids, numSamples, tableSize, dim);
+  KeMatrixAddRows<128, 8, 8, 1><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      input, ldi, table, ldt, ids, numSamples, tableSize, dim);
 
   CHECK_SYNC("hl_matrix_add_to_rows failed");
 }
 
-template<class T, int blockDimX, int gridDimX>
-__global__ void KeVectorSelect(T* dst, int sized,
-                               const T* src, int sizes,
-                               const int* ids, int sizei) {
+template <class T, int blockDimX, int gridDimX>
+__global__ void KeVectorSelect(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   int idx = threadIdx.x + blockDimX * blockIdx.x;
   while (idx < sizei) {
     int index = ids[idx];
@@ -95,9 +99,8 @@ __global__ void KeVectorSelect(T* dst, int sized,
 }
 
 template <class T>
-void hl_vector_select_from(T* dst, int sized,
-                           const T* src, int sizes,
-                           const int* ids, int sizei) {
+void hl_vector_select_from(
+    T* dst, int sized, const T* src, int sizes, const int* ids, int sizei) {
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
   CHECK_NOTNULL(ids);
@@ -105,18 +108,17 @@ void hl_vector_select_from(T* dst, int sized,
 
   dim3 threads(512, 1);
   dim3 grid(8, 1);
-  KeVectorSelect<T, 512, 8><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (dst, sized, src, sizes, ids, sizei);
+  KeVectorSelect<T, 512, 8><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      dst, sized, src, sizes, ids, sizei);
 
   CHECK_SYNC("hl_vector_select_from failed");
 }
 
-template
-void hl_vector_select_from(real* dst, int sized,
-                           const real* src, int sizes,
-                           const int* ids, int sizei);
-template
-void hl_vector_select_from(int* dst, int sized,
-                           const int* src, int sizes,
-                           const int* ids, int sizei);
-
+template void hl_vector_select_from(real* dst,
+                                    int sized,
+                                    const real* src,
+                                    int sizes,
+                                    const int* ids,
+                                    int sizei);
+template void hl_vector_select_from(
+    int* dst, int sized, const int* src, int sizes, const int* ids, int sizei);
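
hl_table_apply.cu drives one templated kernel in two directions: with AddRow = 1 it scatter-adds rows into the shared table (atomically, since ids may repeat), and with AddRow = 0 it gathers table rows into the output. A stripped-down sketch of that pattern, assuming the grid covers all samples (the real kernel grid-strides over idy) and with plain atomicAdd standing in for paddle::paddleAtomicAdd:

    template <bool AddRow>
    __global__ void KeRowsSketch(float* mat, int ldm,    // per-sample rows
                                 float* table, int ldt,  // shared row table
                                 const int* ids, int numSamples, int dim) {
      int row = blockIdx.y * blockDim.y + threadIdx.y;
      if (row >= numSamples) return;
      float* m = mat + row * ldm;
      float* t = table + ids[row] * ldt;
      for (int i = threadIdx.x; i < dim; i += blockDim.x) {
        if (AddRow) {
          atomicAdd(&t[i], m[i]);  // scatter-add: ids may collide across rows
        } else {
          m[i] = t[i];             // gather: copy the selected table row
        }
      }
    }

hl_matrix_select_rows and hl_matrix_add_to_rows above instantiate the real kernel in exactly this way, differing only in the compile-time flag.
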
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
index 4f0bbfcf4e..1896a56634 100644
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@@ -12,45 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "hl_base.h"
-#include "hl_top_k.h"
 #include "hl_sparse.ph"
+#include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
 // using namespace hppl;
 
 struct Pair {
-  __device__ __forceinline__
-  Pair() {}
+  __device__ __forceinline__ Pair() {}
 
-  __device__ __forceinline__
-  Pair(real value, int id) : v_(value), id_(id) {}
+  __device__ __forceinline__ Pair(real value, int id) : v_(value), id_(id) {}
 
-  __device__ __forceinline__
-  void set(real value, int id) {
+  __device__ __forceinline__ void set(real value, int id) {
     v_ = value;
     id_ = id;
   }
 
-  __device__ __forceinline__
-  void operator=(const Pair& in) {
+  __device__ __forceinline__ void operator=(const Pair& in) {
     v_ = in.v_;
     id_ = in.id_;
   }
 
-  __device__ __forceinline__
-  bool operator<(const real value) const {
+  __device__ __forceinline__ bool operator<(const real value) const {
     return (v_ < value);
   }
 
-  __device__ __forceinline__
-  bool operator<(const Pair& in) const {
+  __device__ __forceinline__ bool operator<(const Pair& in) const {
     return (v_ < in.v_) || ((v_ == in.v_) && (id_ > in.id_));
   }
 
-  __device__ __forceinline__
-  bool operator>(const Pair& in) const {
+  __device__ __forceinline__ bool operator>(const Pair& in) const {
     return (v_ > in.v_) || ((v_ == in.v_) && (id_ < in.id_));
   }
 
@@ -58,8 +50,9 @@ struct Pair {
   int id_;
 };
 
-__device__ __forceinline__
-void addTo(Pair topK[], const Pair &p, int beamSize) {
+__device__ __forceinline__ void addTo(Pair topK[],
+                                      const Pair& p,
+                                      int beamSize) {
   for (int k = beamSize - 2; k >= 0; k--) {
     if (topK[k] < p) {
       topK[k + 1] = topK[k];
@@ -71,9 +64,8 @@ void addTo(Pair topK[], const Pair &p, int beamSize) {
   topK[0] = p;
 }
 
-template<int beamSize>
-__device__ __forceinline__
-void addTo(Pair topK[], const Pair &p) {
+template <int beamSize>
+__device__ __forceinline__ void addTo(Pair topK[], const Pair& p) {
   for (int k = beamSize - 2; k >= 0; k--) {
     if (topK[k] < p) {
       topK[k + 1] = topK[k];
@@ -85,9 +77,9 @@ void addTo(Pair topK[], const Pair &p) {
   topK[0] = p;
 }
 
-template<int blockSize>
-__device__ __forceinline__
-void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* src, int idx, int dim, int beamSize) {
   while (idx < dim) {
     if (topK[beamSize - 1] < src[idx]) {
       Pair tmp(src[idx], idx);
@@ -97,10 +89,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim, int beamSize) {
   }
 }
 
-template<int blockSize>
-__device__ __forceinline__
-void getTopK(Pair topK[], real *src, int idx, int dim,
-             const Pair& max, int beamSize) {
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* src, int idx, int dim, const Pair& max, int beamSize) {
   while (idx < dim) {
     if (topK[beamSize - 1] < src[idx]) {
       Pair tmp(src[idx], idx);
@@ -112,10 +103,9 @@ void getTopK(Pair topK[], real *src, int idx, int dim,
   }
 }
 
-template<int blockSize>
-__device__ __forceinline__
-void getTopK(Pair topK[], real *val, int *col,
-             int idx, int dim, int beamSize) {
+template <int blockSize>
+__device__ __forceinline__ void getTopK(
+    Pair topK[], real* val, int* col, int idx, int dim, int beamSize) {
   while (idx < dim) {
     if (topK[beamSize - 1] < val[idx]) {
       Pair tmp(val[idx], col[idx]);
@@ -125,10 +115,14 @@ void getTopK(Pair topK[], real *val, int *col,
   }
 }
 
-template<int blockSize>
-__device__ __forceinline__
-void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
-             const Pair& max, int beamSize) {
+template <int blockSize>
+__device__ __forceinline__ void getTopK(Pair topK[],
+                                        real* val,
+                                        int* col,
+                                        int idx,
+                                        int dim,
+                                        const Pair& max,
+                                        int beamSize) {
   while (idx < dim) {
     if (topK[beamSize - 1] < val[idx]) {
       Pair tmp(val[idx], col[idx]);
@@ -140,12 +134,16 @@ void getTopK(Pair topK[], real *val, int *col, int idx, int dim,
   }
 }
 
-template<int maxLength, int blockSize>
-__device__ __forceinline__
-void threadGetTopK(Pair topK[], int& beam, int beamSize,
-                   real* src,
-                   bool& firstStep, bool& isEmpty, Pair& max,
-                   int dim, const int tid) {
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void threadGetTopK(Pair topK[],
+                                              int& beam,
+                                              int beamSize,
+                                              real* src,
+                                              bool& firstStep,
+                                              bool& isEmpty,
+                                              Pair& max,
+                                              int dim,
+                                              const int tid) {
   if (beam > 0) {
     int length = beam < beamSize ? beam : beamSize;
     if (firstStep) {
@@ -160,8 +158,7 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
         }
       }
       if (!isEmpty) {
-        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim,
-                           max, length);
+        getTopK<blockSize>(topK + maxLength - beam, src, tid, dim, max, length);
       }
     }
 
@@ -171,12 +168,17 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
   }
 }
 
-template<int maxLength, int blockSize>
-__device__ __forceinline__
-void threadGetTopK(Pair topK[], int& beam, int beamSize,
-                   real* val, int* col,
-                   bool& firstStep, bool& isEmpty, Pair& max,
-                   int dim, const int tid) {
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void threadGetTopK(Pair topK[],
+                                              int& beam,
+                                              int beamSize,
+                                              real* val,
+                                              int* col,
+                                              bool& firstStep,
+                                              bool& isEmpty,
+                                              Pair& max,
+                                              int dim,
+                                              const int tid) {
   if (beam > 0) {
     int length = beam < beamSize ? beam : beamSize;
     if (firstStep) {
@@ -191,8 +193,8 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
         }
       }
       if (!isEmpty) {
-        getTopK<blockSize>(topK + maxLength - beam, val, col, tid, dim,
-                           max, length);
+        getTopK<blockSize>(
+            topK + maxLength - beam, val, col, tid, dim, max, length);
       }
     }
 
@@ -202,12 +204,16 @@ void threadGetTopK(Pair topK[], int& beam, int beamSize,
   }
 }
 
-template<int maxLength, int blockSize>
-__device__ __forceinline__
-void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
-                 real** topVal, int** topIds,
-                 int& beam, int& beamSize,
-                 const int tid, const int warp) {
+template <int maxLength, int blockSize>
+__device__ __forceinline__ void blockReduce(Pair* shTopK,
+                                            int* maxId,
+                                            Pair topK[],
+                                            real** topVal,
+                                            int** topIds,
+                                            int& beam,
+                                            int& beamSize,
+                                            const int tid,
+                                            const int warp) {
   while (true) {
     __syncthreads();
     if (tid < blockSize / 2) {
@@ -218,7 +224,7 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
       }
     }
     __syncthreads();
-    for (int stride = blockSize / 4; stride > 0; stride = stride/2) {
+    for (int stride = blockSize / 4; stride > 0; stride = stride / 2) {
       if (tid < stride) {
         if (shTopK[maxId[tid]] < shTopK[maxId[tid + stride]]) {
           maxId[tid] = maxId[tid + stride];
@@ -257,10 +263,12 @@ void blockReduce(Pair* shTopK, int* maxId, Pair topK[],
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go back to the first step, until all topK values are obtained.
  */
-template<int maxLength, int blockSize>
-__global__ void KeMatrixTopK(real* topVal, int ldv,
-                             int * topIds,
-                             real* src, int lds,
+template <int maxLength, int blockSize>
+__global__ void KeMatrixTopK(real* topVal,
+                             int ldv,
+                             int* topIds,
+                             real* src,
+                             int lds,
                              int dim,
                              int beamSize) {
   __shared__ Pair shTopK[blockSize];
@@ -271,7 +279,7 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
   topVal += blockIdx.x * ldv;
   topIds += blockIdx.x * beamSize;
 
-  Pair topK[maxLength]; // NOLINT
+  Pair topK[maxLength];  // NOLINT
   int beam = maxLength;
   Pair max;
   bool isEmpty = false;
@@ -281,18 +289,19 @@ __global__ void KeMatrixTopK(real* topVal, int ldv,
     topK[k].set(-HL_FLOAT_MAX, -1);
   }
   while (beamSize) {
-    threadGetTopK<maxLength, blockSize>
-      (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
 
     shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>
-      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
   }
 }
 
-template<int maxLength, int blockSize>
-__global__ void KeSMatrixTopK(real* topVal, int ldv,
-                              int * topIds,
+template <int maxLength, int blockSize>
+__global__ void KeSMatrixTopK(real* topVal,
+                              int ldv,
+                              int* topIds,
                               real* val,
                               int* row,
                               int* col,
@@ -304,7 +313,7 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
   topVal += blockIdx.x * ldv;
   topIds += blockIdx.x * beamSize;
 
-  Pair topK[maxLength]; // NOLINT
+  Pair topK[maxLength];  // NOLINT
   int beam = maxLength;
   Pair max;
   bool isEmpty = false;
@@ -330,18 +339,20 @@ __global__ void KeSMatrixTopK(real* topVal, int ldv,
     topK[k].set(-HL_FLOAT_MAX, -1);
   }
   while (beamSize) {
-    threadGetTopK<maxLength, blockSize>
-      (topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, val, col, firstStep, isEmpty, max, dim, tid);
 
     shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>
-      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
   }
 }
 
-void hl_matrix_top_k(real* topVal, int ldv,
-                     int * topIds,
-                     real* src, int lds,
+void hl_matrix_top_k(real* topVal,
+                     int ldv,
+                     int* topIds,
+                     real* src,
+                     int lds,
                      int dim,
                      int beamSize,
                      int numSamples) {
@@ -353,33 +364,32 @@ void hl_matrix_top_k(real* topVal, int ldv,
 
   dim3 threads(256, 1);
   dim3 grid(numSamples, 1);
-  KeMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (topVal, ldv, topIds, src, lds, dim, beamSize);
+  KeMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, src, lds, dim, beamSize);
 
   CHECK_SYNC("hl_matrix_top_k failed");
 }
 
-void hl_sparse_matrix_top_k(real* topVal, int ldv,
-                            int * topIds,
+void hl_sparse_matrix_top_k(real* topVal,
+                            int ldv,
+                            int* topIds,
                             hl_sparse_matrix_s src,
                             int beamSize,
                             int numSamples) {
   CHECK_NOTNULL(topVal);
   CHECK_NOTNULL(topIds);
   CHECK_NOTNULL(src);
-  CHECK_EQ(src->format, HL_SPARSE_CSR)
-    <<"sparse matrix format error!";
+  CHECK_EQ(src->format, HL_SPARSE_CSR) << "sparse matrix format error!";
 
   hl_csr_matrix csr = (hl_csr_matrix)src->matrix;
-  if (csr->csr_val == NULL || csr->csr_row == NULL ||
-      csr->csr_col == NULL) {
+  if (csr->csr_val == NULL || csr->csr_row == NULL || csr->csr_col == NULL) {
     LOG(FATAL) << "parameter src is null!";
   }
 
   dim3 threads(256, 1);
   dim3 grid(numSamples, 1);
-  KeSMatrixTopK<5, 256><<< grid, threads, 0, STREAM_DEFAULT >>>
-    (topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
+  KeSMatrixTopK<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, csr->csr_val, csr->csr_row, csr->csr_col, beamSize);
 
   CHECK_SYNC("hl_sparse_matrix_top_k failed");
 }
@@ -392,10 +402,12 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
 * 3. go to the second step, until one thread's topK value is null;
 * 4. go back to the first step, until all topK values are obtained.
  */
-template<int maxLength, int blockSize>
-__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
-                                                int * topIds,
-                                                real* src, int lds,
+template <int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal,
+                                                int ldv,
+                                                int* topIds,
+                                                real* src,
+                                                int lds,
                                                 int dim,
                                                 int beamSize,
                                                 int* label,
@@ -408,7 +420,7 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
   topVal += blockIdx.x * ldv;
   topIds += blockIdx.x * beamSize;
 
-  Pair topK[maxLength]; // NOLINT
+  Pair topK[maxLength];  // NOLINT
   int beam = maxLength;
   Pair max;
   bool isEmpty = false;
@@ -420,34 +432,36 @@ __global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
   }
 
   while (beamSize) {
-    threadGetTopK<maxLength, blockSize>
-      (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+    threadGetTopK<maxLength, blockSize>(
+        topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
 
     shTopK[tid] = topK[0];
-    blockReduce<maxLength, blockSize>
-      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+    blockReduce<maxLength, blockSize>(
+        shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
   }
 
   __syncthreads();
   if (tid == 0) {
     for (int i = 0; i < topkSize; i++) {
-        if (*--topIds == label[blockIdx.x]) {
-            recResult[blockIdx.x] = 0;
-            break;
-        }
-        recResult[blockIdx.x] = 1.0f;
+      if (*--topIds == label[blockIdx.x]) {
+        recResult[blockIdx.x] = 0;
+        break;
+      }
+      recResult[blockIdx.x] = 1.0f;
     }
   }
 }
 
-void hl_matrix_classification_error(real* topVal, int ldv,
-                                   int* topIds,
-                                   real* src, int lds,
-                                   int dim,
-                                   int topkSize,
-                                   int numSamples,
-                                   int* label,
-                                   real* recResult) {
+void hl_matrix_classification_error(real* topVal,
+                                    int ldv,
+                                    int* topIds,
+                                    real* src,
+                                    int lds,
+                                    int dim,
+                                    int topkSize,
+                                    int numSamples,
+                                    int* label,
+                                    real* recResult) {
   CHECK_NOTNULL(topVal);
   CHECK_NOTNULL(topIds);
   CHECK_NOTNULL(src);
@@ -456,9 +470,8 @@ void hl_matrix_classification_error(real* topVal, int ldv,
 
   dim3 threads(256, 1);
   dim3 grid(numSamples, 1);
-  KeMatrixTopKClassificationError<5, 256>
-  <<< grid, threads, 0, STREAM_DEFAULT >>>
-  (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
 
   CHECK_SYNC("hl_matrix_top_k classification error failed");
 }
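
The top-k kernels keep a per-thread beam topK[maxLength] sorted in descending order: addTo is one insertion-sort step, and blockReduce repeatedly pops the block-wide maximum into topVal/topIds. A host-side reference of the beam insertion, with the id tie-break omitted for brevity and illustrative names:

    #include <utility>

    typedef std::pair<float, int> PairSketch;  // (value, id), kept descending

    // Insert p into the descending beam topK[0..beamSize-1], dropping the min.
    void addToSketch(PairSketch topK[], const PairSketch& p, int beamSize) {
      for (int k = beamSize - 2; k >= 0; k--) {
        if (topK[k].first < p.first) {
          topK[k + 1] = topK[k];  // shift the smaller entry one slot down
        } else {
          topK[k + 1] = p;        // slot found just below topK[k]
          return;
        }
      }
      topK[0] = p;                // p is the new maximum
    }
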
diff --git a/paddle/framework/.clang-format b/paddle/framework/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/framework/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11
+...
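
This per-directory config formats paddle/framework with plain Google style (C++11). Illustratively, Google style bin-packs and aligns wrapped parameters rather than forcing one per line as in the .cu hunks above; a sketch with made-up declarations:

    #include <string>
    #include <vector>

    // Fits within the column limit: left on one line.
    void ShortSketch(int a, int b);

    // Overflows: continuation parameters align with the first parameter.
    void LongSignatureSketch(const std::string& some_longer_parameter_name,
                             const std::vector<int>& another_parameter,
                             bool flag);
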
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
new file mode 100644
index 0000000000..8c28709a68
--- /dev/null
+++ b/paddle/framework/CMakeLists.txt
@@ -0,0 +1,102 @@
+# ddim lib
+proto_library(framework_proto SRCS framework.proto)
+
+cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+
+if (WITH_GPU)
+  nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
+else()
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
+endif ()
+
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+if (WITH_GPU)
+  nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
+else()
+  cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
+endif()
+
+cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
+nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
+
+cc_test(variable_test SRCS variable_test.cc)
+
+cc_library(threadpool SRCS threadpool.cc DEPS enforce)
+cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
+
+cc_library(scope SRCS scope.cc DEPS glog threadpool)
+cc_test(scope_test SRCS scope_test.cc DEPS scope)
+
+cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
+nv_test(data_device_transform_test SRCS data_device_transform_test.cu
+        DEPS operator op_registry init math_function)
+
+cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
+cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
+
+cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
+cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
+
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
+        framework_proto selected_rows data_device_transform data_type_transform data_layout_transform)
+
+cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
+cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
+device_context)
+cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
+cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
+cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
+cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
+    shape_inference data_transform lod_tensor)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
+
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
+nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+
+py_proto_compile(framework_py_proto SRCS framework.proto)
+# Generate an empty __init__.py to make framework_py_proto a valid python module.
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+add_dependencies(framework_py_proto framework_py_proto_init)
+add_custom_command(TARGET framework_py_proto POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
+    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
+    COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+
+cc_library(backward SRCS backward.cc DEPS net_op)
+cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context fill_constant_op)
+cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
+
+cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
+
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
+framework_proto backward glog lod_rank_table profiler feed_fetch_method)
+
+cc_library(prune SRCS prune.cc DEPS framework_proto)
+cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
+cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
+        proto_desc)
+cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
+cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
+
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
+cc_test(init_test SRCS init_test.cc DEPS init)
+
+cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
+cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
+
+if(NOT WITH_C_API AND WITH_FLUID)
+  file(GLOB FRAMEWORK_HEADERS *.h)
+  install(FILES ${FRAMEWORK_HEADERS} DESTINATION include/paddle/framework)
+  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/framework.pb.h DESTINATION include/paddle/framework)
+  install(FILES details/cow_ptr.h details/op_registry.h DESTINATION include/paddle/framework/details)
+endif()
+
+cc_test(channel_test SRCS channel_test.cc)
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
new file mode 100644
index 0000000000..5074e8f5a0
--- /dev/null
+++ b/paddle/framework/attribute.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/attribute.h"
+
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc) {
+  switch (attr_desc.type()) {
+    case proto::AttrType::BOOLEAN: {
+      return attr_desc.b();
+    }
+    case proto::AttrType::INT: {
+      return attr_desc.i();
+    }
+    case proto::AttrType::FLOAT: {
+      return attr_desc.f();
+    }
+    case proto::AttrType::STRING: {
+      return attr_desc.s();
+    }
+    case proto::AttrType::BOOLEANS: {
+      std::vector<bool> val(attr_desc.bools_size());
+      for (int i = 0; i < attr_desc.bools_size(); ++i) {
+        val[i] = attr_desc.bools(i);
+      }
+      return val;
+    }
+    case proto::AttrType::INTS: {
+      std::vector<int> val(attr_desc.ints_size());
+      for (int i = 0; i < attr_desc.ints_size(); ++i) {
+        val[i] = attr_desc.ints(i);
+      }
+      return val;
+    }
+    case proto::AttrType::FLOATS: {
+      std::vector<float> val(attr_desc.floats_size());
+      for (int i = 0; i < attr_desc.floats_size(); ++i) {
+        val[i] = attr_desc.floats(i);
+      }
+      return val;
+    }
+    case proto::AttrType::STRINGS: {
+      std::vector<std::string> val(attr_desc.strings_size());
+      for (int i = 0; i < attr_desc.strings_size(); ++i) {
+        val[i] = attr_desc.strings(i);
+      }
+      return val;
+    }
+    case proto::AttrType::LONG: {
+      return attr_desc.l();
+    }
+    default:
+      PADDLE_THROW("Unsupport attr type %d", attr_desc.type());
+  }
+  return boost::blank();
+}
+
+}  // namespace framework
+}  // namespace paddle
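
GetAttrValue is a tag-to-alternative dispatch: each proto::AttrType case builds the matching alternative of the Attribute variant, copying repeated fields element-wise into a std::vector. A compact sketch of the same shape using std::variant (C++17) instead of the boost::variant the framework uses; the tag enum and placeholder values are illustrative:

    #include <cstdint>
    #include <string>
    #include <variant>
    #include <vector>

    using AttrSketch =
        std::variant<bool, int, float, std::string, std::vector<int>, int64_t>;

    enum class TagSketch { kBool, kInt, kFloat, kString, kInts, kLong };

    AttrSketch FromTagSketch(TagSketch tag) {
      switch (tag) {
        case TagSketch::kBool:
          return true;
        case TagSketch::kInt:
          return 42;
        case TagSketch::kFloat:
          return 1.5f;
        case TagSketch::kString:
          return std::string("attr");
        case TagSketch::kInts:
          // Mirrors the repeated-field cases: copied element-wise.
          return std::vector<int>{1, 2, 3};
        case TagSketch::kLong:
          return int64_t{1} << 40;
      }
      return {};  // like the boost::blank() fallback after PADDLE_THROW
    }
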
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
new file mode 100644
index 0000000000..bcff9bc4c4
--- /dev/null
+++ b/paddle/framework/attribute.h
@@ -0,0 +1,284 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+template <typename T>
+inline proto::AttrType AttrTypeID() {
+  Attribute tmp = T();
+  return static_cast<proto::AttrType>(tmp.which() - 1);
+}
+
+Attribute GetAttrValue(const proto::OpDesc::Attr& attr_desc);
+
+class AttrReader {
+ public:
+  explicit AttrReader(const AttributeMap& attrs) : attrs_(attrs) {}
+
+  template <typename T>
+  inline const T& Get(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+ private:
+  const AttributeMap& attrs_;
+};
+
+// check whether a value (attribute) fits a certain limit
+template <typename T>
+class GreaterThanChecker {
+ public:
+  explicit GreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fails.");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+template <typename T>
+class EqualGreaterThanChecker {
+ public:
+  explicit EqualGreaterThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE_GE(value, lower_bound_, "equal_larger_than check fails.");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+// we can provide users more common Checker, like 'LessThanChecker',
+// 'BetweenChecker'...
+
+template <typename T>
+class DefaultValueSetter {
+ public:
+  explicit DefaultValueSetter(T default_value)
+      : default_value_(default_value) {}
+  void operator()(T& value) const { value = default_value_; }
+
+ private:
+  T default_value_;
+};
+
+template <typename T>
+class EnumInContainer {
+ public:
+  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
+  void operator()(T& val) const {
+    PADDLE_ENFORCE(container_.find(val) != container_.end(),
+                   "Value %s is not in enum container %s", val,
+                   ContainerDebugString());
+  }
+
+ private:
+  std::string ContainerDebugString() const {
+    std::ostringstream sout;
+    sout << "[";
+    size_t cnt = 0;
+    for (auto& v : container_) {
+      sout << v;
+      ++cnt;
+      if (cnt != container_.size()) {
+        sout << " ,";
+      }
+    }
+    sout << "]";
+    return sout.str();
+  }
+
+  std::unordered_set<T> container_;
+};
+
+template <typename T>
+struct ExtractAttribute {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  T* operator()(Attribute& attr) const {
+    T* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<T>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
+                   attr_name_, typeid(T).name(), attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+// special handling for bool
+// FIXME(yuyang18): Currently we cast bool into int in the python binding. It
+// is hard to change the logic there; instead, we should correctly handle the
+// case where the user sets `some_flag=1`.
+//
+// Fix me anytime if there is a better solution.
+template <>
+struct ExtractAttribute<bool> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  bool* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<bool>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      float val = boost::get<float>(attr);
+      attr = static_cast<bool>(val);
+    }
+    bool* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<bool>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
+                   attr_name_, attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+template <>
+struct ExtractAttribute<int64_t> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  int64_t* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<int64_t>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      float val = boost::get<float>(attr);
+      attr = static_cast<int64_t>(val);
+    }
+    int64_t* attr_value = nullptr;
+    try {
+      attr_value = &boost::get<int64_t>(attr);
+    } catch (boost::bad_get& bad_get) {
+      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
+                   attr_name_, attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string& attr_name_;
+};
+
+// check whether a certain attribute fits its limits
+// an attribute can have more than one limit
+template <typename T>
+class TypedAttrChecker {
+  typedef std::function<void(T&)> ValueChecker;
+
+ public:
+  explicit TypedAttrChecker(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
+    value_checkers_.push_back(EnumInContainer<T>(range));
+    return *this;
+  }
+
+  TypedAttrChecker& GreaterThan(const T& lower_bound) {
+    value_checkers_.push_back(GreaterThanChecker<T>(lower_bound));
+    return *this;
+  }
+
+  TypedAttrChecker& EqualGreaterThan(const T& lower_bound) {
+    value_checkers_.push_back(EqualGreaterThanChecker<T>(lower_bound));
+    return *this;
+  }
+
+  // we can add more common limits, like LessThan(), Between()...
+
+  TypedAttrChecker& SetDefault(const T& default_value) {
+    PADDLE_ENFORCE(default_value_setter_.empty(),
+                   "%s can't have more than one default value!", attr_name_);
+    default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
+    return *this;
+  }
+
+  // allow users to provide their own checker
+  TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
+    value_checkers_.push_back(checker);
+    return *this;
+  }
+
+  void operator()(AttributeMap& attr_map) const {
+    if (!attr_map.count(attr_name_)) {
+      // the user did not set this attr
+      PADDLE_ENFORCE(!default_value_setter_.empty(),
+                     "Attribute '%s' is required!", attr_name_);
+      // default_value_setter_ has no more than one element
+      T val;
+      (default_value_setter_[0])(val);
+      attr_map[attr_name_] = val;
+    }
+    Attribute& attr = attr_map.at(attr_name_);
+    ExtractAttribute<T> extract_attr(attr_name_);
+    T* attr_value = extract_attr(attr);
+    for (const auto& checker : value_checkers_) {
+      checker(*attr_value);
+    }
+  }
+
+ private:
+  std::string attr_name_;
+  std::vector<ValueChecker> value_checkers_;
+  std::vector<ValueChecker> default_value_setter_;
+};
+
+// check whether all of an op's attributes fit their own limits
+class OpAttrChecker {
+  typedef std::function<void(AttributeMap&)> AttrChecker;
+
+ public:
+  template <typename T>
+  TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
+    attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
+    AttrChecker& checker = attr_checkers_.back();
+    return *(checker.target<TypedAttrChecker<T>>());
+  }
+
+  void Check(AttributeMap& attr_map) const {
+    for (const auto& checker : attr_checkers_) {
+      checker(attr_map);
+    }
+  }
+
+ private:
+  std::vector<AttrChecker> attr_checkers_;
+};
+
+}  // namespace framework
+}  // namespace paddle
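
The checker classes compose into a small builder API: AddAttrChecker<T> registers a TypedAttrChecker, the fluent calls append value checkers, and Check() first injects defaults for unset attributes and then runs every checker on the stored value. A usage sketch with made-up attribute names ("beam_size", "scale"); the calls themselves are the ones declared above:

    #include "paddle/framework/attribute.h"

    void AttrCheckSketch() {
      paddle::framework::OpAttrChecker checker;
      // Hypothetical attributes: a positive int and a non-negative float.
      checker.AddAttrChecker<int>("beam_size").GreaterThan(0).SetDefault(5);
      checker.AddAttrChecker<float>("scale")
          .EqualGreaterThan(0.0f)
          .SetDefault(1.0f);

      paddle::framework::AttributeMap attrs;  // user set neither attribute
      checker.Check(attrs);  // fills beam_size=5, scale=1.0f, then validates
    }
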
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
new file mode 100644
index 0000000000..85e693434a
--- /dev/null
+++ b/paddle/framework/backward.cc
@@ -0,0 +1,585 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/backward.h"
+#include "paddle/operators/net_op.h"
+
+#include <deque>
+#include <list>
+#include <memory>
+#include <unordered_set>
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace framework {
+
+static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
+// The backward of control-flow operators is significantly different from
+// that of computational operators. Hack code here.
+// We should design a better way to generate backward ops for CtrlFlowOps.
+static std::unordered_set<std::string>& CtrlFlowOps() {
+  if (g_ctrl_flow_ops_ == nullptr) {
+    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
+        "increment", "lod_rank_table", "less_than"};
+  }
+  return *g_ctrl_flow_ops_;
+}
+
+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
+    std::unordered_map<std::string, std::string>* grad_to_var) {
+  OpDesc op_desc;
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, {});
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  std::transform(grad_descs.begin(), grad_descs.end(),
+                 std::back_inserter(grad_ops),
+                 [](const std::unique_ptr<OpDesc>& grad_desc) {
+                   return OpRegistry::CreateOp(*grad_desc);
+                 });
+  PADDLE_ENFORCE(!grad_ops.empty());
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);
+  } else {
+    auto net_op = new operators::NetOp();
+    for (auto& grad_op : grad_ops) {
+      net_op->AppendOp(std::move(grad_op));
+    }
+    net_op->CompleteAddOp();
+    return std::unique_ptr<OperatorBase>(net_op);
+  }
+}
+
+template <typename Map, typename T>
+static void ForEachVarName(const Map& names, T callback) {
+  for (auto& name : names) {
+    for (auto& n : name.second) {
+      if (callback(n)) return;
+    }
+  }
+}
+
+// return whether all the names + suffixes are in the set
+static bool AllInSet(
+    const std::map<std::string, std::vector<std::string>>& names,
+    const std::string& suffix, const std::unordered_set<std::string>& set) {
+  bool all_in_set = true;
+  ForEachVarName(names, [&all_in_set, &set, &suffix](const std::string& n) {
+    all_in_set = set.find(n + suffix) != set.end();
+    return !all_in_set;
+  });
+  return all_in_set;
+}
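+// e.g. AllInSet({{"X", {"a", "b"}}}, kGradVarSuffix, {"a@GRAD", "b@GRAD"})
+// returns true (assuming kGradVarSuffix is "@GRAD"; illustrative only).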
+
+static std::unique_ptr<OperatorBase> NOP() {
+  auto net_op = new operators::NetOp();
+  net_op->SetType("@NOP@");
+  net_op->CompleteAddOp();
+  return std::unique_ptr<OperatorBase>(net_op);
+}
+
+//  Get the backward operator from a forward operator; a recursive
+//  implementation.
+//
+//  no_grad_names is the set of variable names whose gradients should not be
+//  calculated.
+//
+//  uniq_id is a unique index used inside the recursive calls of
+//  BackwardRecursive. Use `uid = uniq_id++;` to get the unique index, and
+//  pass `uniq_id` through the recursive calls.
+//
+//  Returns the backward operator. In a simple situation it may be a single
+//  operator; in a complex situation it may be a NetOp.
+//
+//  See backward.h for details.
+static std::unique_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    size_t& uniq_id) {
+  //  If none of the input gradients of the forward operator need to be
+  //  calculated, just return a NOP. We do not return a null pointer because
+  //  a NOP costs almost nothing to run, and it simplifies the logic.
+  if (AllInSet(forwardOp.Inputs() /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
+    return NOP();
+  }
+
+  //  If none of the output gradients of the forward operator need to be
+  //  calculated, then none of its input gradients can be computed either;
+  //  put them into the `no_grad_names` set and return a NOP.
+  if (AllInSet(forwardOp.Outputs() /*names*/, kGradVarSuffix /*suffix*/,
+               no_grad_names /*set*/)) {
+    ForEachVarName(forwardOp.Inputs(),
+                   [&no_grad_names](const std::string& name) -> bool {
+                     no_grad_names.insert(GradVarName(name));
+                     return false;
+                   });
+    return NOP();
+  }
+
+  // Returned gradient network
+  auto net = std::unique_ptr<operators::NetOp>(new operators::NetOp());
+
+  if (forwardOp.IsNetOp()) {
+    // Because forwardOp is a net op, the static_cast below is safe.
+    auto& forwardNet = static_cast<const operators::NetOp&>(forwardOp);
+
+    // Map from an output gradient variable name to the indices, in the
+    // backward net's ops_, of the operators that generate that variable.
+    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
+
+    size_t local_op_id = 0;
+    // traverse forwardNet in reverse and collect all duplicate outputs.
+    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
+         ++it, ++local_op_id) {
+      auto& fwd = *it;
+      auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
+      ForEachVarName(bwd->Outputs(),
+                     [&dup_output_ops, local_op_id](const std::string& out) {
+                       dup_output_ops[out].emplace_back(local_op_id);
+                       return false;
+                     });
+      net->AppendOp(std::move(bwd));
+    }
+    // Get unique ID for this method.
+    auto uid = uniq_id++;
+    // TODO(dzh): more comment
+    // Multiple operators that share the same output (y, for example) would
+    // overwrite that y variable during the backward pass, so special steps
+    // are taken to handle this case. For each duplicate output, rename it to
+    // an alias (the original name plus an offset), then insert a `sum` op
+    // that adds all the aliases into the final output variable y.
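+    // Illustrative example (hypothetical indices): if backward ops #2 and #5
+    // both write y@GRAD and this call's uid is 0, their outputs are renamed
+    // to y@GRAD@RENAME@0@0 and y@GRAD@RENAME@0@1, and a `sum` op adding the
+    // aliases back into y@GRAD is inserted after the later of the two ops.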
+    using Pos = std::pair<size_t, std::unique_ptr<OperatorBase>>;
+    std::list<Pos> insert_position;
+    for (auto& dup_output_op : dup_output_ops) {
+      const std::string& name = dup_output_op.first;
+      // duplicates of @Empty@ do not need to be added
+      if (name == kEmptyVarName) continue;
+
+      auto& dup_op = dup_output_op.second;
+      // no duplicate output
+      if (dup_op.size() == 1) continue;
+
+      // process the duplicate outputs
+      std::vector<std::string> dup_outputs;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        // rename each duplicate output to an alias
+        auto op_offset = dup_op[i];
+        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
+                              std::to_string(i));
+        net->ops_[op_offset]->Rename(name, dup_outputs.back());
+      }
+      // record the offset of the last alias-producing op; a `sum` operator
+      // adding all the aliases into the output will be inserted after it
+      insert_position.push_back(
+          {dup_op.back(),
+           OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}},
+                                AttributeMap{})});
+    }
+
+    // make sure the inserted `sum` ops follow the BFS order.
+    insert_position.sort(
+        [](const Pos& l, const Pos& r) { return l.first > r.first; });
+
+    for (auto& pos : insert_position) {
+      net->InsertOp(pos.first + 1, std::move(pos.second));
+    }
+  } else {
+    std::unique_ptr<OperatorBase> grad_op(
+        CreateGradOp(forwardOp, no_grad_names, grad_to_var));
+
+    ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
+                                          const std::string& grad_input) {
+      if (no_grad_names.count(grad_input)) {
+        // +1 compensates for the trailing '\0' counted by sizeof
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        grad_op->Rename(grad_input, prefix + kZeroVarSuffix);
+
+        // If part of input gradient of that operator is not calculated, fill
+        // zero variables to that input gradient.
+        net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}},
+                                           {{"Out", {grad_input}}},
+                                           AttributeMap{}));
+      }
+      return false;
+    });
+
+    ForEachVarName(grad_op->Outputs(),
+                   [&no_grad_names, &grad_op](const std::string& grad_output) {
+                     if (no_grad_names.count(grad_output)) {
+                       grad_op->Rename(grad_output, kEmptyVarName);
+                     }
+                     return false;
+                   });
+
+    if (net->ops_.empty()) {  // no auxiliary op was added to the net
+      return grad_op;
+    }
+    net->AppendOp(std::move(grad_op));
+  }
+  net->SetType("@GENERATED_BACKWARD@");
+  net->CompleteAddOp();
+  return std::unique_ptr<OperatorBase>(
+      static_cast<OperatorBase*>(net.release()));
+}
+
+// See header for comments
+std::unique_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_names;
+  no_grad_names.reserve(no_grad_vars.size() + 1);
+
+  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+
+  for (auto& name : no_grad_vars) {
+    no_grad_names.insert(name + kGradVarSuffix);
+  }
+  size_t uid = 0;
+  std::unordered_map<std::string, std::string> grad_to_var;
+  return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
+}
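+// Usage sketch (illustrative only; mirrors the cases in backward_test.cc):
+//
+//   auto fwd = OpRegistry::CreateOp("rowwise_add",
+//                                   {{"X", {"x"}}, {"b", {"b"}}},
+//                                   {{"Out", {"out"}}}, AttributeMap{});
+//   auto bwd = Backward(*fwd, /*no_grad_vars=*/{});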
+
+// ====================================  //
+
+static bool AllGradInSet(const std::vector<std::string>& names,
+                         const std::unordered_set<std::string>& set) {
+  for (const std::string& name : names) {
+    if (!set.count(GradVarName(name))) {
+      return false;
+    }
+  }
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    sout << "All input {";
+    for (auto& name : names) {
+      sout << name << ",";
+    }
+    sout << "} is in {";
+    for (auto& name : set) {
+      sout << name << ",";
+    }
+    sout << "}";
+    VLOG(10) << sout.str();
+  }
+  return true;
+}
+
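+// Strip the gradient suffix to recover the forward variable name:
+// e.g. FwdName("x@GRAD") returns "x", and FwdName("x") returns "" because
+// "x" is not a gradient variable name.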
+static std::string FwdName(const std::string& grad_name) {
+  auto pos = grad_name.find("@GRAD");
+  if (pos == std::string::npos) {
+    return "";
+  } else {
+    return grad_name.substr(0, pos);
+  }
+}
+
+static void CreateGradVarInBlock(
+    size_t grad_op_start_index,
+    const std::unordered_map<std::string, std::string>& param_name_map,
+    BlockDesc* block_desc,
+    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
+  auto ops = block_desc->AllOps();
+  for (size_t op_index = grad_op_start_index; op_index < ops.size();
+       ++op_index) {
+    std::unordered_set<std::string> new_vars;
+    auto& ctrl_flow_ops = CtrlFlowOps();
+    ForEachVarName(ops[op_index]->Outputs(),
+                   [&](const std::string& grad_var_name) {
+                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
+                         ctrl_flow_ops.end()) {
+                       if (block_desc->HasVarRecursive(grad_var_name)) {
+                         return false;
+                       }
+                     } else {
+                       if (block_desc->HasVar(grad_var_name)) {
+                         return false;
+                       }
+                     }
+                     if (grad_var_name == framework::kEmptyVarName) {
+                       return false;
+                     }
+                     auto var = block_desc->Var(grad_var_name);
+                     VLOG(10) << "Creating Variable " << grad_var_name;
+                     new_vars.insert(var->Name());
+                     auto it = param_name_map.find(grad_var_name);
+                     if (it == param_name_map.end()) {
+                       return false;
+                     }
+                     auto param_var_name = it->second;
+                     auto& grad_record = (*grad_var_record)[param_var_name];
+                     grad_record.name_ = grad_var_name;
+                     grad_record.block_idx_ = block_desc->ID();
+                     grad_record.op_idx_ = static_cast<int>(op_index);
+                     return false; /* not break */
+                   });
+    ops[op_index]->InferVarType(block_desc);
+    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+      if (new_vars.find(arg) == new_vars.end()) {
+        continue;
+      }
+      auto pname = FwdName(arg);
+      auto* param = block_desc->FindVarRecursive(pname);
+      auto* grad = block_desc->FindVar(arg);
+      if (param == nullptr) {
+        grad->SetDataType(proto::DataType::FP32);
+      } else {
+        grad->SetDataType(param->GetDataType());
+      }
+    }
+    ops[op_index]->InferShape(*block_desc);
+  }
+}
+
+std::vector<std::unique_ptr<OpDesc>> MakeOpGrad(
+    const OpDesc* op_desc, std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>()) {
+  std::vector<std::unique_ptr<OpDesc>> grad_op_descs;
+  // None of the forward operator's input gradients need to be calculated.
+  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
+  if (AllGradInSet(inputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator  " << op_desc->Type();
+    return grad_op_descs;  // empty vector
+  }
+
+  // None of the forward operator's output gradients need to be calculated.
+  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+
+  if (AllGradInSet(outputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator " << op_desc->Type();
+    // FIXME: Hack code here
+    auto& ctrl_flow_ops = CtrlFlowOps();
+    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
+      // Only computational ops need to drop their inputs' gradients.
+      for (const std::string& name : inputs) {
+        no_grad_vars->insert(GradVarName(name));
+        VLOG(10) << " Also drop " << GradVarName(name);
+      }
+    }
+
+    return grad_op_descs;  // empty vector
+  }
+
+  grad_op_descs =
+      OpInfoMap::Instance()
+          .Get(op_desc->Type())
+          .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var, grad_block);
+
+  std::list<std::unique_ptr<OpDesc>> pending_fill_zeros_ops;
+  for (auto& desc : grad_op_descs) {
+    for (const std::string& in_name : desc->InputArgumentNames()) {
+      if (no_grad_vars->count(in_name)) {
+        std::string prefix = in_name.substr(
+            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        std::string new_name = prefix + kZeroVarSuffix;
+        desc->Rename(in_name, new_name);
+        std::unique_ptr<OpDesc> fill_zeros_op(
+            new OpDesc("fill_zeros_like", {{"X", {prefix}}},
+                       {{"Out", {new_name}}}, AttributeMap{}));
+        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
+      }
+    }
+  }
+
+  for (auto& p : pending_fill_zeros_ops) {
+    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
+  }
+  return grad_op_descs;
+}
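+// The fill_zeros_like rewriting above, illustrated: if "x@GRAD" is in
+// no_grad_vars, each grad op that reads it is renamed to read
+// "x" + kZeroVarSuffix instead, and a fill_zeros_like op producing that
+// variable from "x" is prepended to the returned grad op list.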
+
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx);
+
+std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
+    ProgramDesc& program_desc, int block_idx,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var) {
+  VLOG(5) << "MakeBlockBackward";
+  BlockDesc* cur_block = program_desc.MutableBlock(block_idx);
+  std::vector<OpDesc*> op_descs = cur_block->AllOps();
+  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
+  size_t grad_desc_idx = 0;
+  std::vector<std::unique_ptr<OpDesc>> backward_descs;
+
+  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    VLOG(5) << "Making backward " << (*it)->Type() << " op";
+    std::vector<std::unique_ptr<OpDesc>> op_grads;
+
+    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
+        (*it)->Type() == "parallel_do") {
+      int step_block_idx = (*it)->GetBlockAttr("sub_block");
+      BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
+                                                  grad_to_var, step_block_idx);
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else if ((*it)->Type() == "conditional_block") {
+      BlockDesc* backward_block =
+          CreateStepBlock(program_desc, no_grad_vars, grad_to_var,
+                          (*it)->GetBlockAttr("sub_block"));
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var, {backward_block});
+    } else {
+      op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
+    }
+
+    if (VLOG_IS_ON(10)) {
+      std::ostringstream sout;
+      sout << "Made ";
+      for (auto& op_grad : op_grads) {
+        sout << op_grad->Type() << " ";
+      }
+      VLOG(10) << sout.str();
+    }
+
+    for (const auto& desc : op_grads) {
+      for (const std::string& out_name : desc->OutputArgumentNames()) {
+        if (out_name.find("@GRAD") == std::string::npos) {
+          // Not every output of a backward operator is a gradient. Only
+          // gradients need to be summed; skip variables that are not
+          // gradients.
+          continue;
+        }
+        dup_out_ops[out_name].emplace_back(grad_desc_idx);
+      }
+      ++grad_desc_idx;
+    }
+    std::transform(op_grads.begin(), op_grads.end(),
+                   std::back_inserter(backward_descs),
+                   [](std::unique_ptr<OpDesc>& ptr) { return std::move(ptr); });
+  }
+
+  VLOG(5) << "Appending Sums";
+  // Check whether some variables are written more than once
+  std::list<std::pair<size_t, std::unique_ptr<OpDesc>>> pending_sum_ops;
+  for (const auto& dup : dup_out_ops) {
+    const std::string& out_name = dup.first;
+    const std::vector<size_t> dup_op = dup.second;
+    if (out_name != kEmptyVarName && dup_op.size() > 1) {
+      std::vector<std::string> sum_op_inputs;
+      std::string next_g_name = out_name;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
+                 << " duplicated";
+        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
+        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
+        sum_op_inputs.emplace_back(new_name);
+        next_g_name = sum_op_inputs.back();
+      }
+      std::unique_ptr<OpDesc> sum_op(new OpDesc("sum", {{"X", sum_op_inputs}},
+                                                {{"Out", {out_name}}},
+                                                AttributeMap{}));
+      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
+    }
+  }
+
+  pending_sum_ops.sort([](const std::pair<size_t, std::unique_ptr<OpDesc>>& a,
+                          const std::pair<size_t, std::unique_ptr<OpDesc>>& b) {
+    return a.first > b.first;
+  });
+  for (auto& p : pending_sum_ops) {
+    backward_descs.insert(backward_descs.begin() + p.first + 1,
+                          std::move(p.second));
+  }
+
+  VLOG(5) << "MakeBlockBackward Finished";
+
+  return backward_descs;
+}
+
+static BlockDesc* CreateStepBlock(
+    ProgramDesc& program_desc, std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    int step_block_idx) {
+  auto backward_block_op_descs = MakeBlockBackward(program_desc, step_block_idx,
+                                                   no_grad_vars, grad_to_var);
+  BlockDesc* backward_block =
+      program_desc.AppendBlock(*program_desc.MutableBlock(step_block_idx));
+  for (auto& ptr : backward_block_op_descs) {
+    backward_block->AppendAllocatedOp(std::move(ptr));
+  }
+  return backward_block;
+}
+
+ParamGradInfoMap AppendBackward(
+    ProgramDesc& program_desc, const VarDesc& target,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_var_names;
+  no_grad_var_names.reserve(no_grad_vars.size() + 1);
+  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+  for (auto& name : no_grad_vars) {
+    no_grad_var_names.insert(GradVarName(name));
+  }
+
+  const int root_block_idx = 0;
+  auto root_block = program_desc.MutableBlock(root_block_idx);
+
+  std::string fill_one_op_out = GradVarName(target.Name());
+  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  PADDLE_ENFORCE(is_scalar, "target should be a scalar");
+  VLOG(3) << "backward from loss=" << target.Name()
+          << " data_type=" << target.GetDataType();
+  std::unique_ptr<OpDesc> fill_one_op(
+      new OpDesc("fill_constant", {}, {{"Out", {fill_one_op_out}}},
+                 {{"shape", std::vector<int>{1}},
+                  {"value", static_cast<float>(1.0)},
+                  {"dtype", target.GetDataType()}}));
+  // infer var type of fill_one_op
+  fill_one_op->InferVarType(root_block);
+
+  root_block->AppendAllocatedOp(std::move(fill_one_op));
+  size_t forward_op_num = root_block->OpSize();
+  size_t forward_block_num = program_desc.Size();
+
+  // Insert backward operators
+  std::unordered_map<std::string, std::string> grad_to_var;
+  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
+                                             &no_grad_var_names, &grad_to_var);
+
+  for (auto& ptr : backward_op_descs) {
+    root_block->AppendAllocatedOp(std::move(ptr));
+  }
+
+  // Create the target gradient variable
+  std::unordered_map<std::string, GradVarInfo> retv;
+
+  auto var = root_block->Var(fill_one_op_out);
+  var->SetDataType(target.GetDataType());
+  var->SetShape(target.Shape());
+  auto& target_grad = retv[target.Name()];
+  target_grad.name_ = fill_one_op_out;
+  target_grad.block_idx_ = root_block_idx;
+  target_grad.op_idx_ = static_cast<int>(forward_op_num);
+
+  // create grad_var for all blocks in this program
+  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
+  for (size_t block_index = forward_block_num;
+       block_index < program_desc.Size(); ++block_index) {
+    CreateGradVarInBlock(0, grad_to_var, program_desc.MutableBlock(block_index),
+                         &retv);
+  }
+  return retv;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
new file mode 100644
index 0000000000..69ee380236
--- /dev/null
+++ b/paddle/framework/backward.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+// Create the backward operator from a forward operator.
+// TODO(yuyang18): Add more API reference comment.
+extern std::unique_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars);
+
+struct GradVarInfo {
+  GradVarInfo() {}
+  GradVarInfo(const std::string& name, int block_idx, int op_idx)
+      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
+
+  bool operator==(const GradVarInfo& b) const {
+    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
+           op_idx_ == b.op_idx_;
+  }
+
+  std::string name_;
+  int block_idx_;
+  int op_idx_;
+};
+
+using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
+                                            GradVarInfo /*grad_var_info*/>;
+
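+// Append backward operators and gradient variables for `target` to
+// `program_desc`, and return a map from each parameter name to its
+// GradVarInfo. Usage sketch (illustrative only; `target` must describe a
+// scalar of shape {1}, see backward_test.cc for complete examples):
+//
+//   ProgramDesc program;
+//   /* ... append forward ops to program.MutableBlock(0) ... */
+//   VarDesc target("out");
+//   target.SetShape({1});
+//   auto var_to_grad = AppendBackward(
+//       program, target, std::unordered_set<std::string>{});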
+ParamGradInfoMap AppendBackward(
+    ProgramDesc& program_desc, const VarDesc& target,
+    const std::unordered_set<std::string>& no_grad_vars);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
new file mode 100644
index 0000000000..72743b5fd0
--- /dev/null
+++ b/paddle/framework/backward_test.cc
@@ -0,0 +1,918 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/backward.h"
+
+#include <gtest/gtest.h>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_desc.h"
+#include "paddle/operators/net_op.h"
+
+USE_NO_KERNEL_OP(fill_constant);
+
+namespace paddle {
+namespace framework {
+
+using DeviceContext = platform::DeviceContext;
+
+class NoneOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+template <typename Place, typename T>
+class NoneKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {}
+};
+
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input X of Add");
+    AddInput("b", "Bias of Add");
+    AddOutput("Out", "Out of Add");
+    AddComment("Add Op");
+  }
+};
+
+class RowWiseAddGradMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<OpDesc> Apply() const override {
+    auto grad_op = new OpDesc();
+    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
+    grad_op->SetType("rowwise_add_grad");
+    return std::unique_ptr<OpDesc>(grad_op);
+  }
+};
+
+class MulOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "A");
+    AddInput("Y", "B");
+    AddOutput("Out", "Out");
+    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
+    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
+    AddComment("Mul");
+  }
+};
+
+class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X");
+    AddOutput("Out", "Y");
+    AddComment("Sigmoid");
+  }
+};
+
+class NoGradOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X input");
+    AddOutput("Out", "Y output");
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class FcOp : public operators::NetOp {
+ public:
+  FcOp(const std::string &type, const VariableNameMap &inputs,
+       const VariableNameMap &outputs, const AttributeMap &attrs)
+      : NetOp(type, inputs, outputs, attrs) {
+    AppendOp(OpRegistry::CreateOp(
+        "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}},
+        {{"Out", {Output("mul_result")}}}, AttributeMap{}));
+    auto input_b = Inputs("b");
+    std::string before_act = "mul_result";
+    if (input_b.size() != 0) {
+      AppendOp(OpRegistry::CreateOp(
+          "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}},
+          {{"Out", {Output("add_result")}}}, AttributeMap{}));
+      before_act = "add_result";
+    } else {
+      auto out_varname = Output("add_result");
+      if (out_varname != kEmptyVarName) {
+        this->Rename(out_varname, kEmptyVarName);
+      }
+    }
+
+    AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}},
+                                  {{"Out", {Output("Out")}}}, AttributeMap{}));
+    CompleteAddOp(false);
+  }
+};
+
+class FcOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("W", "w");
+    AddInput("b", "b");
+    AddOutput("mul_result", "").AsIntermediate();
+    AddOutput("add_result", "").AsIntermediate();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("y", "y");
+    AddOutput("z", "z");
+    AddComment("");
+  }
+};
+
+class FillZeroOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddOutput("Out", "out");
+    AddComment("");
+  }
+};
+
+class SumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of sum operator.");
+    AddComment("");
+  }
+};
+
+class MultInOutOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("H", "h");
+    AddOutput("Y", "y");
+    AddOutput("Z", "z");
+    AddComment("");
+  }
+};
+
+class MinusGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<OpDesc>> retv;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      auto *op_desc = new OpDesc();
+      op_desc->SetType("scale");
+      op_desc->SetInput("X", OutputGrad("Out"));
+      op_desc->SetOutput("Out", x_g);
+      op_desc->SetAttr("scale", 1.0f);
+      retv.emplace_back(op_desc);
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      auto *op_desc = new OpDesc();
+      op_desc->SetType("scale");
+      op_desc->SetInput("X", OutputGrad("Out"));
+      op_desc->SetOutput("Out", y_g);
+      op_desc->SetAttr("scale", -1.0f);
+      retv.emplace_back(op_desc);
+    }
+    return retv;
+  }
+};
+
+class MinusOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("Y", "");
+    AddOutput("Out", "");
+    AddComment("minus for unittest");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+namespace ops = paddle::operators;
+using EnforceNotMet = paddle::platform::EnforceNotMet;
+// rowwise_add
+REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
+                  f::RowWiseAddGradMaker);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// mul
+REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sigmoid
+REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
+// fill_zeros_like
+REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
+REGISTER_OP_CPU_KERNEL(fill_zeros_like,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sum
+REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sum_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// fc
+REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
+// many_output_op
+REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
+            many_output_op_grad, f::NoneOp);
+// mult_in_out
+REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
+            f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mult_in_out,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// minus
+REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
+REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
+// scale
+REGISTER_OPERATOR(scale, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
+
+TEST(Backward, simple_op_not_need_grad) {
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::Backward(*fwd, {"x"});
+  ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName);
+
+  auto no_input_gop = f::Backward(*fwd, {"x", "b"});
+  ASSERT_NE(no_input_gop, nullptr);
+  ASSERT_TRUE(no_input_gop->IsNetOp());
+  ASSERT_EQ(0UL, static_cast<ops::NetOp *>(no_input_gop.get())->ops_.size());
+}
+
+TEST(Backward, net_fc_backward_normal) {
+  std::shared_ptr<f::OperatorBase> fwd =
+      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
+                              {{"mul_result", {"mul_res"}},
+                               {"add_result", {"add_re"}},
+                               {"Out", {"out"}}},
+                              f::AttributeMap{});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop =
+      f::Backward(*fwd, std::unordered_set<std::string>{});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<ops::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(3UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
+
+  f::OperatorBase &d_add = *net->ops_[1];
+  ASSERT_EQ("rowwise_add_grad", d_add.Type());
+
+  f::OperatorBase &d_mul = *net->ops_[2];
+  ASSERT_EQ("mul_grad", d_mul.Type());
+}
+
+TEST(Backward, net_fc_backward_not_have_b) {
+  std::shared_ptr<f::OperatorBase> fwd =
+      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {}}},
+                              {{"mul_result", {"mul_res"}},
+                               {"add_result", {"add_res"}},
+                               {"Out", {"tmp"}}},
+                              f::AttributeMap{});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop =
+      f::Backward(*fwd, std::unordered_set<std::string>{});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<ops::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(2UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.Type());
+
+  f::OperatorBase &d_mul = *net->ops_[1];
+  ASSERT_EQ("mul_grad", d_mul.Type());
+}
+
+TEST(Backward, net_input_of_network_not_need_grad) {
+  ops::NetOp net;
+  net.AppendOp(f::OpRegistry::CreateOp(
+      "fc", {{"X", {"x"}}, {"W", {"W1"}}, {"b", {"b1"}}},
+      {{"mul_result", {"mul_tmp_0"}},
+       {"add_result", {"add_tmp_0"}},
+       {"Out", {"hidden0"}}},
+      f::AttributeMap{}));
+  net.AppendOp(f::OpRegistry::CreateOp(
+      "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}},
+      {{"mul_result", {"mul_tmp_1"}},
+       {"add_result", {"add_tmp_1"}},
+       {"Out", {"hidden1"}}},
+      f::AttributeMap{}));
+  net.CompleteAddOp();
+  auto bwd = Backward(net, {"x"});  // x@GRAD is not needed.
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
+
+  auto output_vars = bwd_net->OutputVars(true);
+  std::unordered_set<std::string> all_outputs =
+      std::unordered_set<std::string>(output_vars.begin(), output_vars.end());
+  all_outputs.erase(f::kEmptyVarName);
+
+  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
+    ASSERT_NE(all_outputs.find(f::GradVarName(out)), all_outputs.end());
+  }
+
+  // X's gradient is not generated
+  ASSERT_EQ(all_outputs.find(f::GradVarName("X")), all_outputs.end());
+
+  ASSERT_EQ(2UL, bwd_net->ops_.size());
+  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
+  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
+  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
+  ASSERT_EQ(f::kEmptyVarName,
+            first_fc_grad->ops_[2]->Output(f::GradVarName("X")));
+}
+
+TEST(Backward, net_shared_weight) {
+  ops::NetOp net;
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}},
+                                       {{"Out", {"out"}}}, f::AttributeMap{}));
+  net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}},
+                                       {{"Out", {"FinalOut"}}},
+                                       f::AttributeMap{}));
+  net.CompleteAddOp();
+
+  auto bwd = f::Backward(net, std::unordered_set<std::string>{});
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
+  ASSERT_EQ(3UL, bwd_net->ops_.size());
+  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
+}
+
+TEST(Backward, op_all_input_are_not_need) {
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
+  auto backward = f::Backward(*fwd, {"x", "b"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<ops::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_all_output_are_not_need) {
+  auto fwd =
+      f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}},
+                              {{"Out", {"out"}}}, f::AttributeMap{});
+  auto backward = f::Backward(*fwd, {"out"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<ops::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_part_of_output_are_not_need) {
+  auto fwd =
+      f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}},
+                              {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{});
+  auto backward = f::Backward(*fwd, {"Z"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<ops::NetOp *>(backward.get());
+  ASSERT_EQ(net->ops_.size(), 2UL);
+
+  auto &fill_zero = *net->ops_[0];
+  ASSERT_EQ("fill_zeros_like", fill_zero.Type());
+  ASSERT_EQ(1UL, fill_zero.Inputs("X").size());
+  ASSERT_EQ("Z", fill_zero.Input("X"));
+  ASSERT_EQ(1UL, fill_zero.Outputs("Out").size());
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.Output("Out"));
+
+  auto &d_many_out = *net->ops_[1];
+  ASSERT_EQ("many_output_op_grad", d_many_out.Type());
+  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.Inputs().size());  // I/O/OG
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
+            d_many_out.Input(f::GradVarName("z")));
+  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
+  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
+}
+
+TEST(Backward, op_part_of_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}},
+                                     {{"Out", {"out"}}}, f::AttributeMap{});
+  auto backward = f::Backward(*fwd, {"a"});
+  auto &grad_mul = *backward;
+  ASSERT_EQ(grad_mul.Type(), "mul_grad");
+  ASSERT_EQ(grad_mul.Inputs().size(), 2UL + 1UL + 1UL);
+  ASSERT_EQ(grad_mul.Outputs().size(), 2UL);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("X")), f::kEmptyVarName);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("Y")), f::GradVarName("b"));
+  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
+  ASSERT_EQ(grad_mul.Input("X"), "a");
+  ASSERT_EQ(grad_mul.Input("Y"), "b");
+  ASSERT_EQ(grad_mul.Input("Out"), "out");
+}
+
+TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
+  ops::NetOp net;
+  net.AppendOp(f::OpRegistry::CreateOp(
+      "fc", {{"X", {"x1"}}, {"W", {"w1"}}, {"b", {"b1"}}},
+      {{"mul_result", {"mul_out1"}},
+       {"add_result", {"add_out1"}},
+       {"Out", {"out1"}}},
+      f::AttributeMap{}));
+  net.AppendOp(f::OpRegistry::CreateOp(
+      "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}},
+      {{"mul_result", {"mul_out2"}},
+       {"add_result", {"tmp_out2"}},
+       {"Out", {"out2"}}},
+      f::AttributeMap{}));
+  net.AppendOp(f::OpRegistry::CreateOp(
+      "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}},
+      {{"mul_result", {"mul_out3"}},
+       {"add_result", {"tmp_out3"}},
+       {"Out", {"out3"}}},
+      f::AttributeMap{}));
+  net.CompleteAddOp();
+
+  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto bwd_net = static_cast<ops::NetOp *>(backward.get());
+  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
+  auto &grad_fc = *bwd_net->ops_[0];
+
+  const char *all = paddle::operators::NetOp::kAll;
+  EXPECT_EQ(grad_fc.Inputs(all).size(),
+            2UL       /* external input number */
+                + 1UL /* external output number*/
+                + 1UL /* number of gradient of external output*/
+                + 2UL /* internal variable number*/
+            );
+  EXPECT_EQ(grad_fc.Outputs(all).size(),
+            2UL       /* input number of mul*/
+                + 2UL /* input number of rowwise_add*/
+                + 1UL /* input number of sigmoid */
+                - 1UL /* out2 is not needed*/);
+  EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
+}
+
+TEST(Backward, simple_single_op) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  f::OpDesc *op = block->AppendOp();
+  op->SetType("rowwise_add");
+  op->SetInput("X", {"x"});
+  op->SetInput("b", {"b"});
+  op->SetOutput("Out", {"out"});
+
+  auto target = f::VarDesc("out");
+  target.SetShape({1});
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
+
+  ASSERT_EQ(block->AllOps().size(), 3UL);
+  f::OpDesc *fill_op = block->AllOps()[1];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op = block->AllOps()[2];
+  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b")}));
+
+  EXPECT_EQ(var_to_grad.size(), 3UL);
+  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
+  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
+}
+
+TEST(Backward, default_attribute) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op = block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {"x"});
+  op->SetInput("Y", {"y"});
+  op->SetOutput("Out", {"out"});
+  op->CheckAttrs();
+
+  auto target = f::VarDesc("out");
+  target.SetShape({1});
+  AppendBackward(program, target, std::unordered_set<std::string>{});
+
+  ASSERT_EQ(block->AllOps().size(), 3UL);
+  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
+  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
+
+  f::OpDesc *fill_op = block->AllOps()[1];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op = block->AllOps()[2];
+  ASSERT_EQ(grad_op->Type(), "mul_grad");
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
+}
+
+TEST(Backward, simple_mult_op) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDesc *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDesc *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  auto target = f::VarDesc("out3");
+  target.SetShape({1});
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
+
+  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op1 = block->AllOps()[6];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDesc *grad_op2 = block->AllOps()[5];
+  EXPECT_EQ(grad_op2->Type(), "mul_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+
+  f::OpDesc *grad_op3 = block->AllOps()[4];
+  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+
+  EXPECT_EQ(var_to_grad.size(), 7UL);
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("out2"),
+            f::GradVarInfo(f::GradVarName("out2"), 0, 4));
+  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
+}
+
+TEST(Backward, intermedia_var_no_grad) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDesc *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"x2"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDesc *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  f::OpDesc *op4 = block->AppendOp();
+  op4->SetType("mul");
+  op4->SetInput("X", {"out1"});
+  op4->SetInput("Y", {"out3"});
+  op4->SetOutput("Out", {"out4"});
+
+  auto target = f::VarDesc("out4");
+  target.SetShape({1});
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {"out3"});
+
+  ASSERT_EQ(block->AllOps().size(), 7UL);
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op1 = block->AllOps()[6];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDesc *grad_op4 = block->AllOps()[5];
+  EXPECT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out4")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
+
+  EXPECT_EQ(var_to_grad.size(), 4UL);
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+}
+
+TEST(Backward, var_no_grad) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
+  op1->SetType("mult_in_out");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("H", {"h1"});
+  op1->SetOutput("Y", {"y1"});
+  op1->SetOutput("Z", {"z1"});
+
+  f::OpDesc *op2 = block->AppendOp();
+  op2->SetType("mult_in_out");
+  op2->SetInput("X", {"y1"});
+  op2->SetInput("H", {"z1"});
+  op2->SetOutput("Y", {"y2"});
+  op2->SetOutput("Z", {"z2"});
+
+  auto target = f::VarDesc("z2");
+  target.SetShape({1});
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {"z1"});
+
+  ASSERT_EQ(block->AllOps().size(), 6UL);
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op2 = block->AllOps()[3];
+  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
+            std::vector<std::string>({f::GradVarName("z2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
+
+  f::OpDesc *fill_zero_op = block->AllOps()[4];
+  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
+  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
+  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(fill_zero_op->Output("Out"),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+
+  f::OpDesc *grad_op1 = block->AllOps()[5];
+  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
+  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
+  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::GradVarName("h1")}));
+
+  EXPECT_EQ(var_to_grad.size(), 4UL);
+  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
+}
+
+TEST(Backward, shared_var) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  f::OpDesc *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDesc *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDesc *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out1"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  auto target = f::VarDesc("out3");
+  target.SetShape({1});
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad =
+      AppendBackward(program, target, std::unordered_set<std::string>{});
+
+  ASSERT_EQ(block->AllOps().size(), 8UL);
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDesc *grad_op3 = block->AllOps()[4];
+  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+
+  f::OpDesc *grad_op4 = block->AllOps()[5];
+  ASSERT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+
+  f::OpDesc *sum_op = block->AllOps()[6];
+  ASSERT_EQ(sum_op->Type(), "sum");
+  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
+  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(sum_op->Input("X"),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
+                                      f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(sum_op->Output("Out"),
+            std::vector<std::string>({f::GradVarName("out1")}));
+
+  f::OpDesc *grad_op1 = block->AllOps()[7];
+  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  EXPECT_EQ(var_to_grad.size(), 6UL);
+  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
+  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+}
+
+TEST(Backward, half_backward) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  auto *op1 = block->AppendOp();
+  op1->SetType("minus");
+  op1->SetInput("X", {"a"});
+  op1->SetInput("Y", {"b"});
+  op1->SetOutput("Out", {"out"});
+
+  auto target = f::VarDesc("out");
+  target.SetShape({1});
+  size_t forward_len = block->AllOps().size();
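+  // "b" is in the no-grad set, so AppendBackward generates no gradient for it.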
+  auto var_to_grad = AppendBackward(program, target, {"b"});
+  f::OpDesc *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+  auto ops = block->AllOps();
+  ASSERT_EQ(3UL, ops.size());
+
+  EXPECT_EQ(var_to_grad.size(), 2UL);
+  EXPECT_EQ(var_to_grad.at("a"),
+            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
+}
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
new file mode 100644
index 0000000000..dd2ed87252
--- /dev/null
+++ b/paddle/framework/block_desc.cc
@@ -0,0 +1,191 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+VarDesc *BlockDesc::Var(const std::string &name) {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) {
+    return it->second.get();
+  }
+  need_update_ = true;
+  auto *var = new VarDesc(name);
+  vars_[name].reset(var);
+  return var;
+}
+
+VarDesc *BlockDesc::FindVar(const std::string &name) const {
+  auto it = vars_.find(name);
+  if (it == vars_.end()) {
+    return nullptr;
+  }
+  return it->second.get();
+}
+
+bool BlockDesc::HasVar(const std::string &name) const {
+  return vars_.find(name) != vars_.end();
+}
+
+VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
+  if (name == kEmptyVarName) return nullptr;
+
+  auto it = vars_.find(name);
+  if (it == vars_.end()) {
+    return Parent() == kNoneBlockIndex ? nullptr
+                                       : ParentBlock()->FindVarRecursive(name);
+  }
+  return it->second.get();
+}
+
+VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
+  VarDesc *res = FindVarRecursive(name_bytes);
+  if (res == nullptr) {
+    res = Var(name_bytes);
+  }
+  return *res;
+}
+
+bool BlockDesc::HasVarRecursive(const std::string &name) const {
+  return FindVarRecursive(name) != nullptr;
+}
+
+std::vector<VarDesc *> BlockDesc::AllVars() const {
+  std::vector<VarDesc *> res;
+  for (const auto &p : vars_) {
+    res.push_back(p.second.get());
+  }
+  return res;
+}
+
+OpDesc *BlockDesc::AppendOp() {
+  need_update_ = true;
+  ops_.emplace_back(new OpDesc(this));
+  return ops_.back().get();
+}
+
+void BlockDesc::AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc) {
+  need_update_ = true;
+  ops_.emplace_back(std::move(op_desc));
+}
+
+OpDesc *BlockDesc::PrependOp() {
+  need_update_ = true;
+  ops_.emplace_front(new OpDesc(this));
+  return ops_.front().get();
+}
+
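+
+// Removes ops in the half-open range [s, e); if either index lands on
+// ops_.end() the call is a no-op.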
+void BlockDesc::RemoveOp(size_t s, size_t e) {
+  if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) {
+    return;
+  }
+  need_update_ = true;
+  for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) {
+    auto names = (*it)->InputArgumentNames();
+    for (auto n : names) {
+      // TODO(typhoonzero): delete vars if no other op uses them.
+      VLOG(3) << "deleting var " << n;
+    }
+  }
+  ops_.erase(ops_.begin() + s, ops_.begin() + e);
+}
+
+std::vector<OpDesc *> BlockDesc::AllOps() const {
+  std::vector<OpDesc *> res;
+  for (const auto &op : ops_) {
+    res.push_back(op.get());
+  }
+  return res;
+}
+
+void BlockDesc::Flush() {
+  for (auto &op_desc : ops_) {
+    op_desc->Flush();
+  }
+
+  if (need_update_) {
+    auto &op_field = *this->desc_->mutable_ops();
+    this->ClearPBOps();
+    op_field.Reserve(static_cast<int>(ops_.size()));
+    for (auto &op_desc : ops_) {
+      op_field.AddAllocated(op_desc->Proto());
+    }
+    auto &var_field = *this->desc_->mutable_vars();
+    this->ClearPBVars();
+    var_field.Reserve(static_cast<int>(vars_.size()));
+    for (auto &var_desc : vars_) {
+      var_field.AddAllocated(var_desc.second->Proto());
+    }
+    need_update_ = false;
+  }
+}
+
+BlockDesc *BlockDesc::ParentBlock() const {
+  if (this->desc_->parent_idx() == kNoneBlockIndex) {
+    return nullptr;
+  }
+  return prog_->MutableBlock(static_cast<size_t>(this->desc_->parent_idx()));
+}
+
+proto::BlockDesc *BlockDesc::Proto() {
+  Flush();
+  return desc_;
+}
+
+BlockDesc::BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc)
+    : prog_(prog), desc_(desc), need_update_(false) {
+  for (const proto::VarDesc &var_desc : desc_->vars()) {
+    vars_[var_desc.name()].reset(new VarDesc(var_desc));
+  }
+  for (const proto::OpDesc &op_desc : desc_->ops()) {
+    ops_.emplace_back(new OpDesc(op_desc, prog, this));
+  }
+}
+
+BlockDesc::BlockDesc(const BlockDesc &other, proto::BlockDesc *desc,
+                     ProgramDesc *prog)
+    : prog_(prog), desc_(desc) {
+  need_update_ = true;
+  for (auto &op : other.ops_) {
+    ops_.emplace_back(new OpDesc(*op, this));
+  }
+
+  for (auto &it : other.vars_) {
+    auto *var = new VarDesc(*it.second);
+    vars_[it.first].reset(var);
+  }
+}
+
+void BlockDesc::ClearPBOps() {
+  auto ops = this->desc_->mutable_ops();
+  while (!ops->empty()) {
+    // we do not own the OpDesc, so release the ownership.
+    ops->ReleaseLast();
+  }
+}
+
+void BlockDesc::ClearPBVars() {
+  auto vars = this->desc_->mutable_vars();
+  while (!vars->empty()) {
+    // we do not own the VarDesc, so release the ownership.
+    vars->ReleaseLast();
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
new file mode 100644
index 0000000000..4b609e4bcb
--- /dev/null
+++ b/paddle/framework/block_desc.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <deque>
+#include <memory>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/framework/var_desc.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+
+class ProgramDesc;
+
+// For each protobuf message we provide a wrapper class (e.g. BlockDesc for
+// proto::BlockDesc) that optimizes read/write speed. Local changes are
+// synchronized to the protobuf message only when it is requested, via the
+// `Flush` method.
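+//
+// A minimal usage sketch (illustrative only; mirrors the unit tests):
+//
+//   ProgramDesc program;
+//   BlockDesc* block = program.MutableBlock(0);
+//   OpDesc* op = block->AppendOp();
+//   op->SetType("mul");
+//   proto::BlockDesc* pb = block->Proto();  // calls Flush() internally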
+
+class BlockDesc {
+ public:
+  BlockDesc(ProgramDesc *prog, proto::BlockDesc *desc);
+
+  BlockDesc(const BlockDesc &other, proto::BlockDesc *desc, ProgramDesc *prog);
+
+  ~BlockDesc() {
+    this->ClearPBVars();
+    this->ClearPBOps();
+  }
+
+  int32_t ID() const { return desc_->idx(); }
+
+  int32_t Parent() const { return desc_->parent_idx(); }
+
+  VarDesc *Var(const std::string &name_bytes);
+
+  VarDesc *FindVar(const std::string &name_bytes) const;
+
+  bool HasVar(const std::string &var_name) const;
+
+  VarDesc *FindVarRecursive(const std::string &name_bytes) const;
+
+  VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes);
+
+  bool HasVarRecursive(const std::string &var_name) const;
+
+  std::set<std::string> LocalVarNames() const {
+    std::set<std::string> var_names;
+    for (auto &var : vars_) {
+      var_names.insert(var.first);
+    }
+    return var_names;
+  }
+
+  std::vector<VarDesc *> AllVars() const;
+
+  BlockDesc *ParentBlock() const;
+
+  OpDesc *AppendOp();
+
+  void AppendAllocatedOp(std::unique_ptr<OpDesc> &&op_desc);
+
+  OpDesc *PrependOp();
+
+  void RemoveOp(size_t s, size_t e);
+
+  std::vector<OpDesc *> AllOps() const;
+
+  size_t OpSize() const { return ops_.size(); }
+
+  OpDesc *Op(int idx) { return ops_.at(idx).get(); }
+
+  void Flush();
+
+  proto::BlockDesc *Proto();
+
+  ProgramDesc *Program() { return this->prog_; }
+
+ private:
+  void ClearPBOps();
+  void ClearPBVars();
+
+ private:
+  ProgramDesc *prog_;       // not_own
+  proto::BlockDesc *desc_;  // not_own
+  bool need_update_;
+
+  std::deque<std::unique_ptr<OpDesc>> ops_;
+  std::unordered_map<std::string, std::unique_ptr<VarDesc>> vars_;
+
+  DISABLE_COPY_AND_ASSIGN(BlockDesc);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
new file mode 100644
index 0000000000..0570980c5a
--- /dev/null
+++ b/paddle/framework/channel.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>  // for size_t
+
+namespace paddle {
+namespace framework {
+
+// Channel is the abstract base class of buffered and un-buffered channels.
+template <typename T>
+class Channel {
+ public:
+  virtual void Send(T*) = 0;
+  virtual void Receive(T*) = 0;
+  virtual size_t Cap() = 0;
+  virtual void Close() = 0;
+  virtual ~Channel() {}
+};
+
+// Forward declaration of channel implementations.
+namespace details {
+template <typename T>
+class Buffered;
+template <typename T>
+class UnBuffered;
+}  // namespace details
+
+template <typename T>
+Channel<T>* MakeChannel(size_t buffer_size) {
+  if (buffer_size > 0) {
+    return new details::Buffered<T>(buffer_size);
+  }
+  return new details::UnBuffered<T>();
+}
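+
+// Usage sketch (illustrative): a buffered channel accepts up to buffer_size
+// sends before Send blocks.
+//
+//   Channel<int>* ch = MakeChannel<int>(4);
+//   int v = 1;
+//   ch->Send(&v);
+//   ch->Receive(&v);
+//   CloseChannel(ch);
+//   delete ch;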
+
+template <typename T>
+void CloseChannel(Channel<T>* ch) {
+  ch->Close();
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+#include "paddle/framework/details/buffered_channel.h"
+#include "paddle/framework/details/unbuffered_channel.h"
diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
new file mode 100644
index 0000000000..1510fb8abf
--- /dev/null
+++ b/paddle/framework/channel_test.cc
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/channel.h"
+
+#include <chrono>
+#include <thread>
+
+#include "gtest/gtest.h"
+
+using paddle::framework::Channel;
+using paddle::framework::MakeChannel;
+using paddle::framework::CloseChannel;
+
+TEST(Channel, MakeAndClose) {
+  using paddle::framework::details::Buffered;
+  using paddle::framework::details::UnBuffered;
+  {
+    // MakeChannel should return a buffered channel if buffer_size > 0.
+    auto ch = MakeChannel<int>(10);
+    EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+  {
+    // MakeChannel should return an un-buffered channel if buffer_size is 0.
+    auto ch = MakeChannel<int>(0);
+    EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    CloseChannel(ch);
+    delete ch;
+  }
+}
+
+TEST(Channel, SufficientBufferSizeDoesntBlock) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  for (size_t i = 0; i < buffer_size; ++i) {
+    ch->Send(&i);  // should not block
+  }
+
+  size_t out;
+  for (size_t i = 0; i < buffer_size; ++i) {
+    ch->Receive(&out);  // should not block
+    EXPECT_EQ(out, i);
+  }
+  CloseChannel(ch);
+  delete ch;
+}
+
+TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  size_t sum = 0;
+  std::thread t([&]() {
+    // Try to write more than buffer size.
+    for (size_t i = 0; i < 2 * buffer_size; ++i) {
+      ch->Send(&i);  // blocks once the buffer (buffer_size slots) is full
+      sum += i;
+    }
+  });
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  EXPECT_EQ(sum, 45U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
diff --git a/paddle/framework/data_device_transform.cc b/paddle/framework/data_device_transform.cc
new file mode 100644
index 0000000000..5daf5a4e0a
--- /dev/null
+++ b/paddle/framework/data_device_transform.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_device_transform.h"
+
+namespace paddle {
+namespace framework {
+
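+
+// Pick the context that owns the GPU side of the copy: the source context for
+// GPU->CPU transfers, the destination context for CPU->GPU transfers.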
+static const platform::DeviceContext* GetDeviceContext(
+    const platform::Place& src_place, const platform::Place& dst_place) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    return pool.Get(src_place);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    return pool.Get(dst_place);
+  } else {
+    PADDLE_THROW(
+        "Currently, model parallelism is only supported between CPU and CUDA");
+  }
+}
+
+void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
+                     Tensor* out) {
+  VLOG(3) << "DeviceTransform in, src_place " << in.place()
+          << " dst_place: " << dst_place;
+  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
+  dev_ctx->Wait();
+  Copy(in, dst_place, *dev_ctx, out);
+  dev_ctx->Wait();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_device_transform.h b/paddle/framework/data_device_transform.h
new file mode 100644
index 0000000000..39750a85f2
--- /dev/null
+++ b/paddle/framework/data_device_transform.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+void TransDataDevice(const Tensor& in, const platform::Place& dst_place,
+                     Tensor* out);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_device_transform_test.cu b/paddle/framework/data_device_transform_test.cu
new file mode 100644
index 0000000000..efc05b3106
--- /dev/null
+++ b/paddle/framework/data_device_transform_test.cu
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input1 of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
+    AddComment("This is a test op");
+  }
+};
+
+class TestOpWithKernel : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+  OpKernelType GetExpectedKernelType(
+      const ExecutionContext& ctx) const override {
+    if (Attr<bool>("use_gpu")) {
+      VLOG(3) << "force use gpu kernel";
+      return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0));
+    } else {
+      VLOG(3) << "use default kernel";
+      return OpKernelType(proto::DataType::FP32,
+                          ctx.Input<Tensor>("input")->place());
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TestKernel : public OpKernel<float> {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    std::cout << ctx.op().DebugString() << std::endl;
+
+    const Tensor* input = ctx.Input<Tensor>("input");
+
+    std::cout << "input place:" << input->place() << std::endl;
+    auto* output = ctx.Output<framework::LoDTensor>("output");
+    output->Resize(input->dims());
+    output->mutable_data<T>(ctx.GetPlace());
+
+    operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
+        input, input, output, ctx.template device_context<DeviceContext>(),
+        AddFunctor<T>());
+    functor.Run();
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(
+    test_op, paddle::framework::TestOpWithKernel,
+    paddle::framework::OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(
+    test_op,
+    paddle::framework::TestKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    test_op,
+    paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
+
+static void BuildVar(const std::string& param_name,
+                     std::initializer_list<const char*> arguments,
+                     paddle::framework::proto::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    *var->mutable_arguments()->Add() = arg_name;
+  }
+}
+
+TEST(Operator, CPUtoGPU) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  InitDevices();
+
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace cpu_place;
+
+  // create an op to run on CPU
+  paddle::framework::proto::OpDesc cpu_op_desc;
+  cpu_op_desc.set_type("test_op");
+  BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs());
+  BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs());
+
+  auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
+  // prepare input
+  auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
+  auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
+  for (int i = 0; i < 2 * 3; ++i) {
+    src_ptr[i] = static_cast<float>(i);
+  }
+
+  // get output
+  auto* output = scope.Var("OUT1");
+  cpu_op->Run(scope, cpu_place);
+
+  auto* output_ptr = output->Get<LoDTensor>().data<float>();
+  for (int i = 0; i < 2 * 3; ++i) {
+    ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
+  }
+
+  // create an op to run on GPU
+  paddle::framework::proto::OpDesc gpu_op_desc;
+  gpu_op_desc.set_type("test_op");
+  BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs());
+  BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs());
+
+  auto attr = gpu_op_desc.mutable_attrs()->Add();
+  attr->set_name("use_gpu");
+  attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+  attr->set_b(true);
+
+  auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc);
+
+  paddle::platform::CUDAPlace cuda_place(0);
+  // get output
+  auto* output2 = scope.Var("OUT2");
+  gpu_op->Run(scope, cuda_place);
+  VLOG(3) << "after gpu_op run";
+
+  // The output lives on the GPU, so copy it to the host before reading.
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(cuda_place);
+
+  paddle::framework::Tensor output_tensor;
+  Copy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
+       &output_tensor);
+
+  dev_ctx->Wait();
+  float* output2_ptr = output_tensor.data<float>();
+  for (int i = 0; i < 2 * 3; ++i) {
+    ASSERT_EQ(output2_ptr[i], static_cast<float>(i) * 4);
+  }
+}
diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h
new file mode 100644
index 0000000000..31817251ed
--- /dev/null
+++ b/paddle/framework/data_layout.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cctype>
+#include <ostream>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+enum class DataLayout {
+  kNHWC = 0,
+  kNCHW = 1,
+  kAnyLayout = 2,
+};
+
+inline DataLayout StringToDataLayout(const std::string& str) {
+  std::string s(str);
+  for (size_t i = 0; i < s.size(); ++i) {
+    s[i] = toupper(s[i]);
+  }
+
+  if (s == "NHWC") {
+    return DataLayout::kNHWC;
+  } else if (s == "NCHW") {
+    return DataLayout::kNCHW;
+  } else if (s == "ANYLAYOUT") {
+    return DataLayout::kAnyLayout;
+  } else {
+    PADDLE_THROW("Unknown storage order string: %s", s);
+  }
+}
+
+inline std::string DataLayoutToString(const DataLayout& data_layout) {
+  switch (data_layout) {
+    case DataLayout::kNHWC:
+      return "NHWC";
+    case DataLayout::kNCHW:
+      return "NCHW";
+    case DataLayout::kAnyLayout:
+      return "ANY_LAYOUT";
+    default:
+      PADDLE_THROW("unknown DataLayou %d", data_layout);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, const DataLayout& l) {
+  out << DataLayoutToString(l);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_layout_transform.cc b/paddle/framework/data_layout_transform.cc
new file mode 100644
index 0000000000..9d0a6d5ea3
--- /dev/null
+++ b/paddle/framework/data_layout_transform.cc
@@ -0,0 +1,91 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/data_layout_transform.h"
+
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to) {
+  PADDLE_ENFORCE_NE(from, to,
+                    "layout transform requires two different layouts");
+  if (from == DataLayout::kNCHW && to == DataLayout::kNHWC) {
+    return {0, 2, 3, 1};
+  } else if (from == DataLayout::kNHWC && to == DataLayout::kNCHW) {
+    return {0, 3, 1, 2};
+  } else {
+    PADDLE_THROW("unsupported transform");
+  }
+}
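+
+// For example, GetAxis(kNCHW, kNHWC) == {0, 2, 3, 1}: TransDataLayout below
+// sets dst_dim[i] = src_dim[axis[i]], so src dims {2, 3, 4, 5} become
+// {2, 4, 5, 3}.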
+
+struct CastDataLayout {
+  CastDataLayout(const platform::DeviceContext* ctx,
+                 const std::vector<int>& axis, const framework::Tensor& in,
+                 framework::Tensor* out)
+      : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+  const std::vector<int> axis_;
+
+  template <typename T>
+  void operator()() {
+    auto place = ctx_->GetPlace();
+
+    if (platform::is_cpu_place(place)) {
+      operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans4(*context, in_, out_, axis_);
+    } else {
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
+                     Tensor* out) {
+  PADDLE_ENFORCE(
+      platform::places_are_same_class(kernel_type_for_var.place_,
+                                      expected_kernel_type.place_),
+      "TransDataLayout only support DataLayout transform on same place!");
+
+  PADDLE_ENFORCE(arity(in.dims()) == 4,
+                 "TransDataLayout only supports 4-D tensors!");
+
+  auto& pool = platform::DeviceContextPool::Instance();
+
+  auto src_dim = in.dims();
+  std::vector<int64_t> dst_dim;
+
+  auto axis = GetAxis(kernel_type_for_var.data_layout_,
+                      expected_kernel_type.data_layout_);
+  dst_dim.resize(axis.size());
+  for (size_t i = 0; i < axis.size(); i++) {
+    dst_dim[i] = src_dim[axis[i]];
+  }
+
+  out->Resize(make_ddim(dst_dim));
+  out->mutable_data(expected_kernel_type.place_, in.type());
+
+  framework::VisitDataType(
+      framework::ToDataType(in.type()),
+      CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out));
+
+  out->set_layout(expected_kernel_type.data_layout_);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_layout_transform.h b/paddle/framework/data_layout_transform.h
new file mode 100644
index 0000000000..368f7fc989
--- /dev/null
+++ b/paddle/framework/data_layout_transform.h
@@ -0,0 +1,31 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<int> GetAxis(const DataLayout& from, const DataLayout& to);
+
+void TransDataLayout(const OpKernelType& kernel_type_for_var,
+                     const OpKernelType& expected_kernel_type, const Tensor& in,
+                     Tensor* out);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_layout_transform_test.cc b/paddle/framework/data_layout_transform_test.cc
new file mode 100644
index 0000000000..093e8d4d34
--- /dev/null
+++ b/paddle/framework/data_layout_transform_test.cc
@@ -0,0 +1,44 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/data_layout_transform.h"
+
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+
+TEST(DataTransform, DataLayoutFunction) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto place = CPUPlace();
+  Tensor in;
+  Tensor out;
+  in.mutable_data<double>(make_ddim({2, 3, 1, 2}), place);
+  in.set_layout(DataLayout::kNHWC);
+
+  auto kernel_nhwc = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNHWC, LibraryType::kPlain);
+  auto kernel_ncwh = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kNCHW, LibraryType::kPlain);
+
+  TransDataLayout(kernel_nhwc, kernel_ncwh, in, &out);
+
+  EXPECT_TRUE(out.layout() == DataLayout::kNCHW);
+  EXPECT_TRUE(out.dims() == make_ddim({2, 2, 3, 1}));
+
+  TransDataLayout(kernel_ncwh, kernel_nhwc, in, &out);
+
+  EXPECT_TRUE(in.layout() == DataLayout::kNHWC);
+  EXPECT_TRUE(in.dims() == make_ddim({2, 3, 1, 2}));
+}
\ No newline at end of file
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
new file mode 100644
index 0000000000..b6fd46401f
--- /dev/null
+++ b/paddle/framework/data_transform.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_transform.h"
+
+#include "paddle/framework/data_device_transform.h"
+#include "paddle/framework/data_layout_transform.h"
+#include "paddle/framework/data_type_transform.h"
+
+namespace paddle {
+namespace framework {
+
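+
+// Hand a staged result to the next transform: "to" shares "from"'s buffer and
+// "from" is reset to hold the next intermediate tensor.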
+static void PassTensorData(Tensor* from, Tensor* to) {
+  to->ShareDataWith(*from);
+  *from = Tensor();
+}
+
+void DataTransform(const OpKernelType& expected_kernel_type,
+                   const OpKernelType& kernel_type_for_var,
+                   const Tensor& input_tensor, Tensor* output_tensor) {
+  bool transformed = false;
+  Tensor in;
+  in.ShareDataWith(input_tensor);
+  Tensor out;
+
+  // do layout transform
+  if (NeedTransformLayout(expected_kernel_type.data_layout_,
+                          kernel_type_for_var.data_layout_)) {
+    TransDataLayout(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+
+  if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
+    TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+
+  // do device transform
+  if (!platform::is_same_place(kernel_type_for_var.place_,
+                               expected_kernel_type.place_)) {
+    TransDataDevice(in, expected_kernel_type.place_, &out);
+    transformed = true;
+    PassTensorData(&out, &in);
+  }
+
+  PADDLE_ENFORCE(transformed, "No transform is applied, please check!");
+  // get output data
+  output_tensor->ShareDataWith(in);
+}
+
+void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
+                            Variable& out_var) {
+  if (in_var.IsType<LoDTensor>()) {
+    auto& in_lod_tensor = in_var.Get<LoDTensor>();
+    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+    tran_lod_tensor->set_lod(in_lod_tensor.lod());
+    tran_lod_tensor->set_layout(in_lod_tensor.layout());
+    tran_lod_tensor->ShareDataWith(tensor);
+  } else if (in_var.IsType<SelectedRows>()) {
+    auto& in_selected_rows = in_var.Get<SelectedRows>();
+    auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
+    trans_selected_rows->set_height(in_selected_rows.height());
+    trans_selected_rows->set_rows(in_selected_rows.rows());
+    trans_selected_rows->mutable_value()->ShareDataWith(tensor);
+  } else {
+    PADDLE_THROW("unknown var type");
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
new file mode 100644
index 0000000000..a4b7890237
--- /dev/null
+++ b/paddle/framework/data_transform.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <utility>
+#include <vector>
+
+#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/variable.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/macros.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace framework {
+
+void DataTransform(const OpKernelType& expected_kernel_type,
+                   const OpKernelType& kernel_type_for_var,
+                   const Tensor& input_tensor, Tensor* out);
+
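+// Usage sketch (illustrative; `place` and the input tensor `in` are assumed):
+// cast FP32 data to the FP64 type a kernel expects, leaving layout and place
+// unchanged:
+//
+//   auto var_kt = OpKernelType(proto::DataType::FP32, place,
+//                              DataLayout::kAnyLayout, LibraryType::kPlain);
+//   auto expected_kt = OpKernelType(proto::DataType::FP64, place,
+//                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+//   Tensor out;
+//   DataTransform(expected_kt, var_kt, in, &out);
+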
+void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
+                            Variable& out_var);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
new file mode 100644
index 0000000000..98eb3e857d
--- /dev/null
+++ b/paddle/framework/data_type.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <typeindex>
+#include "paddle/framework/framework.pb.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+inline proto::DataType ToDataType(std::type_index type) {
+  using namespace paddle::framework::proto;
+  if (typeid(float).hash_code() == type.hash_code()) {
+    return DataType::FP32;
+  } else if (typeid(double).hash_code() == type.hash_code()) {
+    return DataType::FP64;
+  } else if (typeid(int).hash_code() == type.hash_code()) {
+    return DataType::INT32;
+  } else if (typeid(int64_t).hash_code() == type.hash_code()) {
+    return DataType::INT64;
+  } else if (typeid(bool).hash_code() == type.hash_code()) {
+    return DataType::BOOL;
+  } else {
+    PADDLE_THROW("Not supported");
+  }
+}
+
+inline std::type_index ToTypeIndex(proto::DataType type) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP32:
+      return typeid(float);
+    case DataType::FP64:
+      return typeid(double);
+    case DataType::INT32:
+      return typeid(int);
+    case DataType::INT64:
+      return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
+template <typename Visitor>
+inline void VisitDataType(proto::DataType type, Visitor visitor) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP32:
+      visitor.template operator()<float>();
+      break;
+    case DataType::FP64:
+      visitor.template operator()<double>();
+      break;
+    case DataType::INT32:
+      visitor.template operator()<int>();
+      break;
+    case DataType::INT64:
+      visitor.template operator()<int64_t>();
+      break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
+    default:
+      PADDLE_THROW("Not supported");
+  }
+}
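+
+// Example visitor (sketch): any type with a templated operator()() works,
+// e.g. printing the byte width of the visited type:
+//
+//   struct PrintSize {
+//     template <typename T>
+//     void operator()() const { std::cout << sizeof(T) << "\n"; }
+//   };
+//   VisitDataType(proto::DataType::FP32, PrintSize());  // prints 4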
+
+inline std::string DataTypeToString(const proto::DataType type) {
+  using namespace paddle::framework::proto;
+  switch (type) {
+    case DataType::FP16:
+      return "float16";
+    case DataType::FP32:
+      return "float32";
+    case DataType::FP64:
+      return "float64";
+    case DataType::INT16:
+      return "int16";
+    case DataType::INT32:
+      return "int32";
+    case DataType::INT64:
+      return "int64";
+    case DataType::BOOL:
+      return "bool";
+    default:
+      PADDLE_THROW("Not support type %d", type);
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out,
+                                const proto::DataType& type) {
+  out << DataTypeToString(type);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_type_transform.cc b/paddle/framework/data_type_transform.cc
new file mode 100644
index 0000000000..7df1cc6b75
--- /dev/null
+++ b/paddle/framework/data_type_transform.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_type_transform.h"
+
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename InType, typename OutType>
+struct CastDataTypeFunctor {
+  HOSTDEVICE inline OutType operator()(InType in) const {
+    return static_cast<OutType>(in);
+  }
+};
+
+template <typename InType>
+struct CastDataType {
+  CastDataType(const framework::Tensor& in, framework::Tensor* out,
+               const platform::DeviceContext* ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+  const framework::Tensor in_;
+  framework::Tensor* out_;
+  const platform::DeviceContext* ctx_;
+
+  template <typename OutType>
+  void operator()() {
+    auto* in_begin = in_.data<InType>();
+    auto* in_end = in_begin + in_.numel();
+    auto* out_begin = out_->mutable_data<OutType>(in_.place());
+
+    if (platform::is_cpu_place(in_.place())) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
+      trans(*context, in_begin, in_end, out_begin,
+            CastDataTypeFunctor<InType, OutType>());
+    } else {
+      // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
+      PADDLE_THROW("Unsupport CPU <-> GPU!");
+    }
+  }
+};
+
+void TransDataType(const OpKernelType& kernel_type_for_var,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
+                   Tensor* out) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+  out->Resize(in.dims());
+  auto src_type = kernel_type_for_var.data_type_;
+  auto dst_type = expected_kernel_type.data_type_;
+  auto ctx = pool.Get(in.place());
+
+  switch (src_type) {
+    case proto::DataType::FP32:
+      framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
+      break;
+    case proto::DataType::FP64:
+      framework::VisitDataType(dst_type, CastDataType<double>(in, out, ctx));
+      break;
+    case proto::DataType::INT32:
+      framework::VisitDataType(dst_type, CastDataType<int>(in, out, ctx));
+      break;
+    case proto::DataType::INT64:
+      framework::VisitDataType(dst_type, CastDataType<int64_t>(in, out, ctx));
+      break;
+    case proto::DataType::BOOL:
+      framework::VisitDataType(dst_type, CastDataType<bool>(in, out, ctx));
+      break;
+    default:
+      PADDLE_THROW("Not support type %d", src_type);
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_type_transform.h b/paddle/framework/data_type_transform.h
new file mode 100644
index 0000000000..067c0c2a5b
--- /dev/null
+++ b/paddle/framework/data_type_transform.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/variable.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
+
+void TransDataType(const OpKernelType& kernel_type_for_var,
+                   const OpKernelType& expected_kernel_type, const Tensor& in,
+                   Tensor* out);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/data_type_transform_test.cc b/paddle/framework/data_type_transform_test.cc
new file mode 100644
index 0000000000..89d32f5283
--- /dev/null
+++ b/paddle/framework/data_type_transform_test.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_type_transform.h"
+
+#include "gtest/gtest.h"
+
+TEST(DataTypeTransform, CPUTransform) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  auto place = CPUPlace();
+
+  Tensor in;
+  Tensor out;
+
+  float* ptr = in.mutable_data<float>(make_ddim({2, 3}), place);
+  int data_number = 2 * 3;
+
+  for (int i = 0; i < data_number; ++i) {
+    ptr[i] = i / 3;
+  }
+
+  auto kernel_fp32 = OpKernelType(proto::DataType::FP32, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_fp64 = OpKernelType(proto::DataType::FP64, place,
+                                  DataLayout::kAnyLayout, LibraryType::kPlain);
+  auto kernel_int32 = OpKernelType(proto::DataType::INT32, place,
+                                   DataLayout::kAnyLayout, LibraryType::kPlain);
+
+  TransDataType(kernel_fp32, kernel_fp64, in, &out);
+  double* out_data_double = out.data<double>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
+  }
+
+  TransDataType(kernel_fp32, kernel_int32, in, &out);
+  int* out_data_int = out.data<int>();
+  for (int i = 0; i < data_number; ++i) {
+    ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
+  }
+}
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
new file mode 100644
index 0000000000..8b6f42b82d
--- /dev/null
+++ b/paddle/framework/ddim.cc
@@ -0,0 +1,318 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/ddim.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+/// @cond HIDDEN
+
+template <int i>
+Dim<i> make_dim(const int64_t* d) {
+  return Dim<i>(*d, make_dim<i - 1>(d + 1));
+}
+
+template <>
+Dim<1> make_dim<1>(const int64_t* d) {
+  return Dim<1>(*d);
+}
+
+void make_ddim(DDim& ddim, const int64_t* dims, int n) {
+  switch (n) {
+    case 1:
+      ddim = make_dim<1>(dims);
+      break;
+    case 2:
+      ddim = make_dim<2>(dims);
+      break;
+    case 3:
+      ddim = make_dim<3>(dims);
+      break;
+    case 4:
+      ddim = make_dim<4>(dims);
+      break;
+    case 5:
+      ddim = make_dim<5>(dims);
+      break;
+    case 6:
+      ddim = make_dim<6>(dims);
+      break;
+    case 7:
+      ddim = make_dim<7>(dims);
+      break;
+    case 8:
+      ddim = make_dim<8>(dims);
+      break;
+    case 9:
+      ddim = make_dim<9>(dims);
+      break;
+    default:
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
+  }
+}
+
+/// @endcond
+
+DDim make_ddim(std::initializer_list<int64_t> dims) {
+  DDim result(make_dim(0));
+  make_ddim(result, dims.begin(), dims.size());
+  return result;
+}
+
+DDim make_ddim(const std::vector<int64_t>& dims) {
+  DDim result(make_dim(0));
+  make_ddim(result, &dims[0], dims.size());
+  return result;
+}
+
+DDim make_ddim(const std::vector<int>& dims) {
+  std::vector<int64_t> res(dims.size());
+  std::transform(dims.begin(), dims.end(), res.begin(),
+                 [](int d) { return static_cast<int64_t>(d); });
+  return make_ddim(res);
+}
+
+/// @cond HIDDEN
+// XXX For some reason, putting this in an anonymous namespace causes errors
+class DynamicMutableIndexer : public boost::static_visitor<int64_t&> {
+ public:
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int64_t& operator()(Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+ private:
+  int idx_;
+};
+
+class DynamicConstIndexer : public boost::static_visitor<int64_t> {
+ public:
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
+
+  template <int D>
+  int64_t operator()(const Dim<D>& dim) const {
+    return dim[idx_];
+  }
+
+ private:
+  int idx_;
+};
+
+/// @endcond
+
+int64_t& DDim::operator[](int idx) {
+  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
+}
+
+int64_t DDim::operator[](int idx) const {
+  return boost::apply_visitor(DynamicConstIndexer(idx), var);
+}
+
+int DDim::size() const { return arity(*this); }
+
+bool DDim::operator==(DDim d) const {
+  if (var.which() != d.getVar().which()) {
+    return false;
+  } else {
+    std::vector<int64_t> v1 = vectorize(*this);
+    std::vector<int64_t> v2 = vectorize(d);
+
+    for (unsigned int i = 0; i < v1.size(); i++) {
+      if (v1[i] != v2[i]) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+}
+
+bool DDim::operator!=(DDim d) const { return !(*this == d); }
+
+DDim DDim::operator+(DDim d) const {
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
+
+  std::vector<int64_t> v3;
+
+  assert(v1.size() == v2.size());
+
+  for (unsigned int i = 0; i < v1.size(); i++) {
+    v3.push_back(v1[i] + v2[i]);
+  }
+
+  return make_ddim(v3);
+}
+
+DDim DDim::operator*(DDim d) const {
+  std::vector<int64_t> v1 = vectorize(*this);
+  std::vector<int64_t> v2 = vectorize(d);
+
+  std::vector<int64_t> v3;
+
+  assert(v1.size() == v2.size());
+
+  for (unsigned int i = 0; i < v1.size(); i++) {
+    v3.push_back(v1[i] * v2[i]);
+  }
+
+  return make_ddim(v3);
+}
+
+int64_t get(const DDim& ddim, int idx) { return ddim[idx]; }
+
+void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
+
+/// @cond HIDDEN
+struct VectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int64_t>& vector;
+
+  explicit VectorizeVisitor(std::vector<int64_t>& v) : vector(v) {}
+
+  template <typename T>
+  void operator()(const T& t) {
+    vector.push_back(t.head);
+    this->operator()(t.tail);
+  }
+
+  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
+};
+/// @endcond
+
+std::vector<int64_t> vectorize(const DDim& ddim) {
+  std::vector<int64_t> result;
+  VectorizeVisitor visitor(result);
+  boost::apply_visitor(visitor, ddim);
+  return result;
+}
+
+// NOTE: framework::vectorize converts to type int64_t
+//       which does not fit cudnn inputs.
+std::vector<int> vectorize2int(const DDim& ddim) {
+  std::vector<int64_t> temp = vectorize(ddim);
+  std::vector<int> result(temp.begin(), temp.end());
+  return result;
+}
+
+struct ProductVisitor : public boost::static_visitor<int64_t> {
+  template <int D>
+  int64_t operator()(const Dim<D>& dim) {
+    return product(dim);
+  }
+};
+
+int64_t product(const DDim& ddim) {
+  ProductVisitor visitor;
+  return boost::apply_visitor(visitor, ddim);
+}
+
+struct SliceVectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int64_t>& vector;
+  int begin;
+  int end;
+
+  SliceVectorizeVisitor(std::vector<int64_t>& v, int b, int e)
+      : vector(v), begin(b), end(e) {
+    PADDLE_ENFORCE(begin < end,
+                   "Begin index must be less than end index in ddim slice.");
+    PADDLE_ENFORCE(begin >= 0,
+                   "Begin index can't be less than zero in ddim slice.");
+  }
+
+  template <int S>
+  void operator()(const Dim<S>& dim) {
+    if (begin == 0) {
+      vector.push_back(dim.head);
+    } else {
+      --begin;
+    }
+    --end;
+    if (end > 0) {
+      this->operator()(dim.tail);
+    }
+  }
+
+  void operator()(const Dim<1>& dim) {
+    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
+    vector.push_back(dim.head);
+  }
+};
+
+DDim slice_ddim(const DDim& dim, int begin, int end) {
+  std::vector<int64_t> vec;
+  vec.reserve(end - begin);
+  SliceVectorizeVisitor visitor(vec, begin, end);
+  boost::apply_visitor(visitor, dim);
+  return make_ddim(vec);
+}
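+
+// e.g. slice_ddim(make_ddim({2, 3, 4, 5}), 1, 3) yields the DDim {3, 4}.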
+
+/// \cond HIDDEN
+
+struct ArityVisitor : boost::static_visitor<int> {
+  template <int D>
+  int operator()(Dim<D>) const {
+    return D;
+  }
+};
+
+/// \endcond
+
+int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
+
+/// \cond HIDDEN
+
+struct DDimPrinter : boost::static_visitor<void> {
+  std::ostream& os;
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}
+
+  template <typename T>
+  void operator()(const T& t) {
+    os << t;
+  }
+};
+
+/// \endcond
+
+std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
+  DDimPrinter printer(os);
+  boost::apply_visitor(printer, ddim);
+  return os;
+}
+
+DDim::DDim(std::initializer_list<int64_t> init_list) {
+  *this = make_ddim(init_list);
+}
+
+DDim flatten_to_2d(const DDim& src, int num_col_dims) {
+  int rank = src.size();
+  return make_ddim({product(slice_ddim(src, 0, num_col_dims)),
+                    product(slice_ddim(src, num_col_dims, rank))});
+}
+
+DDim flatten_to_1d(const DDim& src) { return make_ddim({product(src)}); }
+
+DDim stride(const DDim& ddim) {
+  std::vector<int64_t> strides(ddim.size());
+  strides[ddim.size() - 1] = 1;
+  for (int i = ddim.size() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ddim[i + 1];
+  }
+  return framework::make_ddim(strides);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
new file mode 100644
index 0000000000..4ca5e49566
--- /dev/null
+++ b/paddle/framework/ddim.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <initializer_list>
+#include <stdexcept>
+#include <vector>
+#include "paddle/framework/dim.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * \brief A dynamically sized dimension.
+ *
+ * The number of dimensions must be in [1, 9].
+ */
+struct DDim {
+  typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>,
+                         Dim<8>, Dim<9>>
+      DDimVar;
+  DDimVar var;
+
+  DDim() : var(Dim<1>()) {}
+
+  template <int D>
+  explicit DDim(const Dim<D>& in) : var(in) {}
+
+  /*implicit*/ DDim(std::initializer_list<int64_t> init_list);
+
+  template <int D>
+  DDim& operator=(const Dim<D>& in) {
+    var = in;
+    return *this;
+  }
+
+  int64_t& operator[](int idx);
+  int64_t operator[](int idx) const;
+
+  template <typename Visitor>
+  typename Visitor::result_type apply_visitor(Visitor& visitor) {
+    return var.apply_visitor(visitor);
+  }
+
+  template <typename Visitor>
+  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
+    return var.apply_visitor(visitor);
+  }
+
+  DDimVar getVar() { return var; }
+
+  bool operator==(DDim d) const;
+
+  bool operator!=(DDim d) const;
+
+  DDim operator+(DDim d) const;
+
+  DDim operator*(DDim d) const;
+
+  int size() const;
+};
+
+/**
+ * \brief Make a DDim from std::vector<int64_t>
+ *
+ * \param dims A vector of int64_t values. Its size must be in [1, 9]
+ */
+DDim make_ddim(const std::vector<int64_t>& dims);
+
+DDim make_ddim(const std::vector<int>& dims);
+
+/**
+ * \brief Make a DDim from an initializer list
+ *
+ * \param dims An initializer list of int64_t values. Its size must be in [1, 9]
+ *
+ */
+DDim make_ddim(std::initializer_list<int64_t> dims);
+
+int64_t get(const DDim& dim, int idx);
+void set(DDim& dim, int idx, int val);
+
+std::vector<int64_t> vectorize(const DDim& ddim);
+std::vector<int> vectorize2int(const DDim& ddim);
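+//
+// A minimal usage sketch (values are illustrative):
+//   std::vector<int64_t> v64 = vectorize(make_ddim({2, 3}));  // {2, 3}
+//   std::vector<int> v32 = vectorize2int(make_ddim({2, 3}));  // int for cuDNN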
+
+int64_t product(const DDim& ddim);
+
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
+
+/**
+ * \brief The number of dimensions (arity) of a DDim
+ *
+ * \param ddim The dynamic dimension to inspect
+ */
+int arity(const DDim& ddim);
+
+std::ostream& operator<<(std::ostream&, const DDim&);
+
+// Reshape a tensor to a matrix. The matrix's first dimension (column length)
+// will be the product of the tensor's first `num_col_dims` dimensions.
+DDim flatten_to_2d(const DDim& src, int num_col_dims);
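+//
+// A minimal sketch (shapes are illustrative): with num_col_dims = 2,
+//   flatten_to_2d(make_ddim({2, 3, 4, 5}), 2)  // yields {6, 20}
+// groups the first two and the last two dimensions together.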
+
+DDim flatten_to_1d(const DDim& src);
+
+DDim stride(const DDim& ddim);
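+//
+// A minimal sketch (shape is illustrative); strides are computed so that the
+// last dimension is contiguous:
+//   stride(make_ddim({2, 3, 4}))  // yields {12, 4, 1}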
+}  // namespace framework
+}  // namespace paddle
+
+namespace boost {
+
+template <typename T>
+T get(const paddle::framework::DDim& in) {
+  return boost::get<T>(in.var);
+}
+
+}  // namespace boost
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
new file mode 100644
index 0000000000..bc259d1f60
--- /dev/null
+++ b/paddle/framework/ddim_test.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <sstream>
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/ddim.h"
+
+TEST(DDim, Equality) {
+  // construct a DDim from an initialization list
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({9, 1, 5});
+  EXPECT_EQ(ddim[0], 9);
+  EXPECT_EQ(ddim[1], 1);
+  EXPECT_EQ(ddim[2], 5);
+
+  // construct a DDim from a vector
+  std::vector<int64_t> vec({9, 1, 5});
+  paddle::framework::DDim vddim = paddle::framework::make_ddim(vec);
+  EXPECT_EQ(vddim[0], 9);
+  EXPECT_EQ(vddim[1], 1);
+  EXPECT_EQ(vddim[2], 5);
+
+  // mutate a DDim
+  ddim[1] = 2;
+  EXPECT_EQ(ddim[1], 2);
+  paddle::framework::set(ddim, 0, 6);
+  EXPECT_EQ(paddle::framework::get(ddim, 0), 6);
+
+  // vectorize a DDim
+  std::vector<int64_t> res_vec = paddle::framework::vectorize(vddim);
+  EXPECT_EQ(res_vec[0], 9);
+  EXPECT_EQ(res_vec[1], 1);
+  EXPECT_EQ(res_vec[2], 5);
+  paddle::framework::Dim<3> d(3, 2, 1);
+  res_vec = paddle::framework::vectorize(paddle::framework::DDim(d));
+  EXPECT_EQ(res_vec[0], 3);
+  EXPECT_EQ(res_vec[1], 2);
+  EXPECT_EQ(res_vec[2], 1);
+
+  // add two DDims
+  paddle::framework::DDim ddim_sum = ddim + vddim;
+  EXPECT_EQ(ddim_sum[0], 15);
+  EXPECT_EQ(ddim_sum[1], 3);
+  EXPECT_EQ(ddim_sum[2], 10);
+
+  // multiply two DDims
+  paddle::framework::DDim ddim_mul = ddim * vddim;
+  EXPECT_EQ(ddim_mul[0], 54);
+  EXPECT_EQ(ddim_mul[1], 2);
+  EXPECT_EQ(ddim_mul[2], 25);
+
+  // arity of a DDim
+  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
+
+  // product of a DDim
+  EXPECT_EQ(paddle::framework::product(vddim), 45);
+  EXPECT_EQ(
+      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
+      90);
+
+  // slice a DDim
+  paddle::framework::DDim ddim2 =
+      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
+}
+
+TEST(DDim, Print) {
+  // print a DDim
+  std::stringstream ss;
+  paddle::framework::DDim ddim = paddle::framework::make_ddim({2, 3, 4});
+  ss << ddim;
+  EXPECT_EQ("2, 3, 4", ss.str());
+}
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
new file mode 100644
index 0000000000..b093e15892
--- /dev/null
+++ b/paddle/framework/details/buffered_channel.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+#include "paddle/framework/channel.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename T>
+class Buffered : public paddle::framework::Channel<T> {
+  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+  virtual void Send(T*);
+  virtual void Receive(T*);
+  virtual size_t Cap() { return cap_; }
+  virtual void Close();
+  virtual ~Buffered();
+
+ private:
+  size_t cap_;
+  std::mutex mu_;
+  std::condition_variable empty_cond_var_;
+  std::condition_variable full_cond_var_;
+  std::deque<T> channel_;
+  bool closed_;
+
+  explicit Buffered(size_t cap) : cap_(cap), closed_(false) {
+    PADDLE_ENFORCE_GT(cap, 0);
+  }
+
+  void NotifyAllSenders(std::unique_lock<std::mutex>*);
+};
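+
+// A usage sketch (illustrative; Buffered instances are created through the
+// paddle::framework::MakeChannel<T>(capacity) factory, never directly):
+//   auto* ch = paddle::framework::MakeChannel<int>(4);
+//   int in = 1, out = 0;
+//   ch->Send(&in);      // blocks while the buffer already holds Cap() items
+//   ch->Receive(&out);  // blocks while the buffer is empty
+//   paddle::framework::CloseChannel(ch);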
+
+template <typename T>
+void Buffered<T>::Send(T* item) {
+  std::unique_lock<std::mutex> lock(mu_);
+  full_cond_var_.wait(lock,
+                      [this]() { return channel_.size() < cap_ || closed_; });
+  if (!closed_) {
+    channel_.push_back(std::move(*item));
+    lock.unlock();
+    empty_cond_var_.notify_one();
+  }
+}
+
+template <typename T>
+void Buffered<T>::Receive(T* item) {
+  std::unique_lock<std::mutex> lock(mu_);
+  empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  if (!closed_) {
+    *item = std::move(channel_.front());
+    channel_.pop_front();
+    NotifyAllSenders(&lock);
+  } else {
+    // The channel is closed; leave *item unchanged.
+  }
+}
+
+template <typename T>
+void Buffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
+  NotifyAllSenders(&lock);
+}
+
+template <typename T>
+Buffered<T>::~Buffered() {
+  std::unique_lock<std::mutex> lock(mu_);
+  closed_ = true;
+  channel_.clear();
+  NotifyAllSenders(&lock);
+}
+
+template <typename T>
+void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  full_cond_var_.notify_all();
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h
new file mode 100644
index 0000000000..7e308ffb5a
--- /dev/null
+++ b/paddle/framework/details/cow_ptr.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <memory>
+#include <thread>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Replace this with a thread-safe flags class if needed.
+class ThreadUnsafeOwnershipFlags {
+ public:
+  explicit ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+
+  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags& operator=(
+      const ThreadUnsafeOwnershipFlags& other) = delete;
+  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
+
+  void SetOwnership(bool flag) { flag_ = flag; }
+
+  // Invoke the callback if it is not owned.
+  template <typename Callback>
+  void AcquireOwnershipOnce(Callback acquire) {
+    if (!flag_) {
+      acquire();
+      flag_ = true;
+    }
+  }
+
+ private:
+  bool flag_;
+};
+
+// Copy-On-Write pointer.
+// It holds a T* and copies the payload at most once, when `MutableData` is
+// invoked on an instance that does not own its payload.
+//
+// The template parameter OwnershipFlags must provide:
+//   * a constructor taking a bool (true if the payload is owned);
+//   * SetOwnership(bool flag);
+//   * AcquireOwnershipOnce(Callback), which invokes the callback if the
+//     payload is not owned.
+//
+// https://en.wikipedia.org/wiki/Copy-on-write
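+//
+// A usage sketch (illustrative; cow_ptr_test.cc exercises the same flow):
+//   COWPtr<int> p(new int{0});
+//   COWPtr<int> q = p;       // shares the payload without owning it
+//   *q.MutableData() = 10;   // q copies the payload before mutating
+//   // p.Data() == 0, q.Data() == 10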
+template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
+class COWPtr {
+ public:
+  // Ctor from raw pointer.
+  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+
+  // Move methods. Steal ownership from the origin.
+  COWPtr(COWPtr&& other)
+      : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
+  COWPtr& operator=(COWPtr&& origin) = default;
+
+  // Copy methods. The copy does not own the payload.
+  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
+  COWPtr& operator=(const COWPtr& other) {
+    payload_ = other.payload_;
+    ownership_.SetOwnership(false);
+    return *this;
+  }
+
+  // Access read only data.
+  const T& Data() const { return *payload_; }
+
+  // Access mutable data. If the data is not owned, it is copied first.
+  T* MutableData() {
+    ownership_.AcquireOwnershipOnce(
+        [this] { payload_.reset(new T(*payload_)); });
+    return payload_.get();
+  }
+
+ private:
+  // Actual data pointer.
+  std::shared_ptr<T> payload_;
+
+  // Ownership flag.
+  OwnershipFlags ownership_;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc
new file mode 100644
index 0000000000..936954a233
--- /dev/null
+++ b/paddle/framework/details/cow_ptr_test.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/details/cow_ptr.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+TEST(COWPtr, all) {
+  COWPtr<int> ptr(new int{0});
+  ASSERT_EQ(ptr.Data(), 0);
+  COWPtr<int> ptr2 = ptr;
+  ASSERT_EQ(ptr2.Data(), 0);
+  ASSERT_EQ(&ptr2.Data(), &ptr.Data());
+  *ptr2.MutableData() = 10;
+  ASSERT_EQ(ptr.Data(), 0);
+  ASSERT_EQ(ptr2.Data(), 10);
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
new file mode 100644
index 0000000000..6d50e820b2
--- /dev/null
+++ b/paddle/framework/details/op_registry.h
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/var_type_inference.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+enum OpInfoFillType {
+  kOperator = 0,
+  kOpProtoAndCheckerMaker = 1,
+  kGradOpDescMaker = 2,
+  kVarTypeInference = 3,
+  kShapeInference = 4
+};
+
+template <typename T>
+struct OpInfoFillTypeID {
+  static constexpr OpInfoFillType ID() {
+    return std::is_base_of<OperatorBase, T>::value
+               ? kOperator
+               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
+                      ? kOpProtoAndCheckerMaker
+                      : (std::is_base_of<GradOpDescMakerBase, T>::value
+                             ? kGradOpDescMaker
+                             : (std::is_base_of<VarTypeInference, T>::value
+                                    ? kVarTypeInference
+                                    : (std::is_base_of<InferShapeBase, T>::value
+                                           ? kShapeInference
+                                           : static_cast<OpInfoFillType>(
+                                                 -1)))));
+  }
+};
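+
+// For example (illustrative; `MyOp` is a hypothetical OperatorBase subclass):
+//   static_assert(OpInfoFillTypeID<MyOp>::ID() == kOperator, "");
+// so OpInfoFiller<MyOp> picks the kOperator specialization defined below.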
+
+template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
+struct OpInfoFiller;
+
+template <size_t I, bool at_end, typename... ARGS>
+class OperatorRegistrarRecursive;
+
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, false, ARGS...> {
+ public:
+  using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
+    OpInfoFiller<T> fill;
+    fill(op_type, info);
+    constexpr auto size = sizeof...(ARGS);
+    OperatorRegistrarRecursive<I + 1, I + 1 == size, ARGS...> reg(op_type,
+                                                                  info);
+    (void)(reg);
+  }
+};
+
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, true, ARGS...> {
+ public:
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {}
+};
+
+template <typename T>
+struct OpInfoFiller<T, kOperator> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
+                        const VariableNameMap& outputs,
+                        const AttributeMap& attrs) {
+      return new T(type, inputs, outputs, attrs);
+    };
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->proto_ = new proto::OpProto;
+    info->checker_ = new OpAttrChecker();
+    auto maker = T(info->proto_, info->checker_);
+    maker.Validate();
+    info->proto_->set_type(op_type);
+    PADDLE_ENFORCE(
+        info->proto_->IsInitialized(),
+        "Fail to initialize %s's OpProto, because %s is not initialized",
+        op_type, info->proto_->InitializationErrorString());
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kGradOpDescMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->grad_op_maker_ = [](
+        const OpDesc& fwd_op,
+        const std::unordered_set<std::string>& no_grad_set,
+        std::unordered_map<std::string, std::string>* grad_to_var,
+        const std::vector<BlockDesc*>& grad_block) {
+      T maker(fwd_op, no_grad_set, grad_to_var, grad_block);
+      return maker();
+    };
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kVarTypeInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) {
+      T inference;
+      inference(fwd_op, block);
+    };
+  }
+};
+
+template <typename T>
+struct OpInfoFiller<T, kShapeInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_shape_ = [](InferShapeContext* ctx) {
+      T inference;
+      inference(ctx);
+    };
+  }
+};
+
+}  // namespace details
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
new file mode 100644
index 0000000000..cc2d2e587e
--- /dev/null
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+#include "paddle/framework/channel.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+template <typename T>
+class UnBuffered : public paddle::framework::Channel<T> {
+  friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T>*);
+
+ public:
+  virtual void Send(T*);
+  virtual void Receive(T*);
+  virtual size_t Cap() { return 0; }
+  virtual void Close();
+  virtual ~UnBuffered();
+
+ private:
+  UnBuffered() {}
+};
+
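+// NOTE: the member functions below are placeholders; the unbuffered
+// channel's rendezvous logic is not implemented yet.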
+template <typename T>
+void UnBuffered<T>::Send(T* channel_element) {}
+
+template <typename T>
+void UnBuffered<T>::Receive(T*) {}
+
+template <typename T>
+void UnBuffered<T>::Close() {}
+
+template <typename T>
+UnBuffered<T>::~UnBuffered() {}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/majel/dim.h b/paddle/framework/dim.h
similarity index 68%
rename from paddle/majel/dim.h
rename to paddle/framework/dim.h
index c4b0c6aea6..ec17d7c615 100644
--- a/paddle/majel/dim.h
+++ b/paddle/framework/dim.h
@@ -1,3 +1,16 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 #pragma once
 
 #include <iostream>
@@ -5,10 +18,11 @@
 #include <stdexcept>
 #include <type_traits>
 
-#include "paddle/majel/detail/cuda_assert.h"
-#include "paddle/majel/detail/hostdevice.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
 
-namespace majel {
+namespace paddle {
+namespace framework {
 
 // Statically sized, statically indexed dimension
 template <int i>
@@ -16,13 +30,13 @@ struct Dim {
   static constexpr int dimensions = i;
 
   template <typename... Args>
-  HOSTDEVICE Dim(int _head, Args... _tail) : head(_head), tail(_tail...) {
+  HOSTDEVICE Dim(int64_t _head, Args... _tail) : head(_head), tail(_tail...) {
     static_assert(sizeof...(_tail) == i - 1,
                   "Dim initialized with the wrong number of parameters");
   }
 
   HOSTDEVICE
-  Dim(int _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
+  Dim(int64_t _head, const Dim<i - 1>& _tail) : head(_head), tail(_tail) {}
 
   HOSTDEVICE
   Dim() : head(0), tail() {}
@@ -30,12 +44,12 @@ struct Dim {
   /** Construct a Dim from a linear index and size.  Uses Fortran order
    * indexing. */
   HOSTDEVICE
-  Dim(int idx, const Dim<i>& size)
+  Dim(int64_t idx, const Dim<i>& size)
       : head(idx % size.head), tail(idx / size.head, size.tail) {}
 
   /** Construct a Dim with each dimension set to the given index */
   HOSTDEVICE
-  Dim(int idx) : head(idx), tail(idx) {}
+  Dim(int64_t idx) : head(idx), tail(idx) {}
 
   HOSTDEVICE
   bool operator==(const Dim<i>& o) const {
@@ -46,13 +60,13 @@ struct Dim {
   bool operator!=(const Dim<i>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
   HOST std::string to_string() const;
 
-  int head;
+  int64_t head;
   Dim<i - 1> tail;
 };
 
@@ -62,7 +76,7 @@ struct Dim<1> {
   static constexpr int dimensions = 1;
 
   HOSTDEVICE
-  Dim(int _head) : head(_head) {}
+  Dim(int64_t _head) : head(_head) {}
 
   HOSTDEVICE
   Dim() : head(0) {}
@@ -74,7 +88,7 @@ struct Dim<1> {
       throw std::invalid_argument("Index out of range.");
     }
 #else
-    MAJEL_ASSERT(idx < size.head);
+    PADDLE_ASSERT(idx < size.head);
 #endif
   }
 
@@ -85,11 +99,11 @@ struct Dim<1> {
   bool operator!=(const Dim<1>& o) const { return !(*this == o); }
 
   HOSTDEVICE
-  int& operator[](int idx);
+  int64_t& operator[](int idx);
   HOSTDEVICE
-  int operator[](int idx) const;
+  int64_t operator[](int idx) const;
 
-  int head;
+  int64_t head;
 };
 
 namespace {
@@ -99,12 +113,12 @@ template <int i>
 struct DimGetter {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return DimGetter<i - 1>::impl(d.tail);
   }
 };
@@ -114,24 +128,24 @@ template <>
 struct DimGetter<0> {
   // Return a copy if Dim is const
   template <typename D>
-  HOSTDEVICE static int impl(const D& d) {
+  HOSTDEVICE static int64_t impl(const D& d) {
     return d.head;
   }
   // Return a reference if Dim is mutable
   template <typename D>
-  HOSTDEVICE static int& impl(D& d) {
+  HOSTDEVICE static int64_t& impl(D& d) {
     return d.head;
   }
 };
 
 template <int D>
-HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t& indexer(Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
   }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
   if (idx == 0) {
     return dim.head;
@@ -140,25 +154,25 @@ HOSTDEVICE int& indexer(Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int& indexer<1>(Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t& indexer<1>(Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
   }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
   return dim.head;
 }
 
 template <int D>
-HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
+HOSTDEVICE int64_t indexer(const Dim<D>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx < 0) {
     throw std::invalid_argument("Tried to access a negative dimension");
   }
 #else
-  MAJEL_ASSERT(idx >= 0);
+  PADDLE_ASSERT(idx >= 0);
 #endif
   if (idx == 0) {
     return dim.head;
@@ -167,13 +181,13 @@ HOSTDEVICE int indexer(const Dim<D>& dim, int idx) {
 }
 
 template <>
-HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
+HOSTDEVICE int64_t indexer<1>(const Dim<1>& dim, int idx) {
 #ifndef __CUDA_ARCH__
   if (idx != 0) {
     throw std::invalid_argument("Invalid index");
   }
 #else
-  MAJEL_ASSERT(idx == 0);
+  PADDLE_ASSERT(idx == 0);
 #endif
   return dim.head;
 }
@@ -181,73 +195,76 @@ HOSTDEVICE int indexer<1>(const Dim<1>& dim, int idx) {
 }  // namespace
 // Static access to constant Dim
 template <int i, int l>
-HOSTDEVICE int get(const Dim<l>& d) {
+HOSTDEVICE int64_t get(const Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Static access to mutable Dim
 template <int i, int l>
-HOSTDEVICE int& get(Dim<l>& d) {
+HOSTDEVICE int64_t& get(Dim<l>& d) {
   return DimGetter<i>::impl(d);
 }
 
 // Dynamic access to constant Dim
 template <int l>
-HOSTDEVICE int Dim<l>::operator[](int i) const {
+HOSTDEVICE int64_t Dim<l>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE int& Dim<l>::operator[](int i) {
+HOSTDEVICE int64_t& Dim<l>::operator[](int i) {
   return indexer(*this, i);
 }
 
 // Dynamic access to constant Dim
-inline HOSTDEVICE int Dim<1>::operator[](int i) const {
+inline HOSTDEVICE int64_t Dim<1>::operator[](int i) const {
   return indexer(*this, i);
 }
 
 // Dynamic access to mutable Dim
-inline HOSTDEVICE int& Dim<1>::operator[](int i) { return indexer(*this, i); }
+inline HOSTDEVICE int64_t& Dim<1>::operator[](int i) {
+  return indexer(*this, i);
+}
 
 // Dynamic access to constant Dim
 // without std::enable_if will try to instantiate this on get<0>(d)
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int>::type get(const Dim<l>& d,
-                                                           int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t>::type get(const Dim<l>& d,
+                                                               int i) {
   return d[i];
 }
 
 // Dynamic access to mutable Dim
 template <int l>
-HOSTDEVICE typename std::enable_if<(l > 0), int&>::type get(Dim<l>& d, int i) {
+HOSTDEVICE typename std::enable_if<(l > 0), int64_t&>::type get(Dim<l>& d,
+                                                                int i) {
   return d[i];
 }
 
 // Dot product of two dims
 template <int i>
-HOSTDEVICE int linearize(const Dim<i>& a, const Dim<i>& b) {
+HOSTDEVICE int64_t linearize(const Dim<i>& a, const Dim<i>& b) {
   return a.head * b.head + linearize(a.tail, b.tail);
 }
 
 // Base case dot product of two Dims
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int linearize(const Dim<1>& a, const Dim<1>& b) {
+HOSTDEVICE inline int64_t linearize(const Dim<1>& a, const Dim<1>& b) {
   return a.head * b.head;
 }
 
 // Product of a Dim
 template <int i>
-HOSTDEVICE int product(const Dim<i>& a, int prod = 1) {
+HOSTDEVICE int64_t product(const Dim<i>& a, int prod = 1) {
   return prod * a.head * product(a.tail);
 }
 
 // Base case product of a Dim
 // Notice it is inline because it is no longer a template
 template <>
-HOSTDEVICE inline int product(const Dim<1>& a, int prod) {
+HOSTDEVICE inline int64_t product(const Dim<1>& a, int prod) {
   return prod * a.head;
 }
 
@@ -265,29 +282,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
   return ((0 <= idx.head) && (idx.head < size.head));
 }
 
-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template <int i>
-HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
  */
@@ -305,31 +299,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond
 
-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use.
- * \return Dim object the same size as \p size with the strides.
- */
-template <int i>
-HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<i>(stride, contiguous_strides(size.tail, base * size.head));
-}
-
-///\cond HIDDEN
-
-// Base case of contiguous_strides
-template <>
-HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<1>(stride);
-}
-
-///\endcond
-
 /**
  * Add two dimensions together
  */
@@ -411,7 +380,7 @@ HOSTDEVICE Dim<sizeof...(Args)> make_dim(Args... idxes) {
 // XXX For some reason, overloading fails to resolve this correctly
 template <int i>
 typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
   os << d.head << ", " << d.tail;
   return os;
 }
@@ -420,7 +389,7 @@ typename std::enable_if<(i > 1), std::ostream&>::type operator<<(
 // XXX I wish this could be an overload instead of a template
 template <int i>
 typename std::enable_if<(i == 1), std::ostream&>::type operator<<(
-    std::ostream& os, const majel::Dim<i>& d) {
+    std::ostream& os, const Dim<i>& d) {
   os << d.head;
   return os;
 }
@@ -448,4 +417,5 @@ HOSTDEVICE Dim<D> linear_to_dimension(int linear_index, Dim<D> extents) {
   return result;
 }
 
-}  // namespace majel
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
new file mode 100644
index 0000000000..2bcab7c5c2
--- /dev/null
+++ b/paddle/framework/dim_test.cu
@@ -0,0 +1,114 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <thrust/device_vector.h>
+#include <sstream>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/dim.h"
+
+__global__ void test(paddle::framework::Dim<2>* o) {
+  o[0] = paddle::framework::make_dim(5, 6);
+}
+
+__global__ void dyn_idx_gpu(int64_t* o) {
+  auto d = paddle::framework::make_dim(5, 6);
+  o[0] = d[1];
+}
+
+TEST(Dim, Equality) {
+  // construct a Dim on the CPU
+  auto a = paddle::framework::make_dim(3, 4);
+  EXPECT_EQ(paddle::framework::get<0>(a), 3);
+  EXPECT_EQ(paddle::framework::get<1>(a), 4);
+
+  // construct a Dim on the GPU
+  thrust::device_vector<paddle::framework::Dim<2>> t(2);
+  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+  a = t[0];
+  EXPECT_EQ(paddle::framework::get<0>(a), 5);
+  EXPECT_EQ(paddle::framework::get<1>(a), 6);
+
+  // linearization
+  auto b = paddle::framework::make_dim(7, 8);
+  EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+
+  // product
+  EXPECT_EQ(paddle::framework::product(a), 30);
+
+  // mutate a Dim
+  paddle::framework::get<1>(b) = 10;
+  EXPECT_EQ(paddle::framework::get<0>(b), 7);
+  EXPECT_EQ(paddle::framework::get<1>(b), 10);
+
+  // dynamic access
+  paddle::framework::get(b, 0) = 8;
+  b[1] = 11;
+  EXPECT_EQ(paddle::framework::get<0>(b), 8);
+  EXPECT_EQ(paddle::framework::get<1>(b), 11);
+  EXPECT_EQ(paddle::framework::get(b, 0), 8);
+  EXPECT_EQ(b[1], 11);
+
+  // dynamic access on GPU
+  thrust::device_vector<int64_t> r(1);
+  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+  int64_t res = r[0];
+  EXPECT_EQ(res, 6);
+
+  // ex_prefix_mul
+  paddle::framework::Dim<3> c =
+      paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 12);
+
+  // generate from an index
+  auto size = paddle::framework::make_dim(4, 5, 2);
+  c = paddle::framework::Dim<3>(14, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 2);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 0);
+  c = paddle::framework::Dim<3>(25, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 1);
+  EXPECT_EQ(paddle::framework::get<2>(c), 1);
+}
+
+TEST(Dim, Bool) {
+  auto a = paddle::framework::make_dim(3, 4);
+  auto b = paddle::framework::make_dim(5, 6);
+  auto c = paddle::framework::make_dim(3, 4);
+
+  // in_bounds check
+  EXPECT_TRUE(paddle::framework::contained(a, b));
+  EXPECT_FALSE(paddle::framework::contained(b, a));
+
+  // comparison
+  EXPECT_TRUE(a == a);
+  EXPECT_FALSE(a == b);
+  EXPECT_TRUE(a == c);
+}
+
+TEST(Dim, Print) {
+  {
+    std::stringstream ss;
+    auto a = paddle::framework::make_dim(2, 3);
+    ss << a;
+    EXPECT_EQ(ss.str(), "2, 3");
+  }
+  {
+    std::stringstream ss;
+    ss << paddle::framework::make_dim(8);
+    EXPECT_EQ(ss.str(), "8");
+  }
+}
diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h
new file mode 100644
index 0000000000..54bbeafcab
--- /dev/null
+++ b/paddle/framework/eigen.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace framework {
+
+// EigenDim converts paddle::framework::DDim into Eigen::DSizes.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  static Type From(const DDim& dims) {
+    PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
+    Type ret;
+    for (int64_t d = 0; d < arity(dims); d++) {
+      ret[d] = dims[d];
+    }
+    return ret;
+  }
+};
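+
+// A minimal sketch (illustrative; mirrors eigen_test.cc):
+//   EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3}));
+//   // ed[0] == 1, ed[1] == 2, ed[2] == 3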
+
+// Interpret paddle::framework::Tensor as EigenTensor and EigenConstTensor.
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  // TODO(qijun): The default type is currently unaligned; we will benchmark
+  // the aligned and unaligned versions in the future.
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor, DDim dims) {
+    return Type(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); }
+
+  static ConstType From(const Tensor& tensor, DDim dims) {
+    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const Tensor& tensor) {
+    return From(tensor, tensor.dims_);
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {
+  static typename EigenMatrix::Type Reshape(Tensor& tensor, int num_col_dims) {
+    int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+
+  static typename EigenMatrix::ConstType Reshape(const Tensor& tensor,
+                                                 int num_col_dims) {
+    int rank = tensor.dims_.size();
+    PADDLE_ENFORCE(num_col_dims > 0 && num_col_dims < rank,
+                   "`num_col_dims` must be between (0, rank_of_tensor).");
+    return EigenMatrix::From(tensor,
+                             flatten_to_2d(tensor.dims(), num_col_dims));
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(tensor, {product(tensor.dims_)});
+  }
+
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
+    return EigenVector::From(tensor, {product(tensor.dims_)});
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor) { return Type(tensor.data<T>()); }
+
+  static ConstType From(const Tensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc
new file mode 100644
index 0000000000..9e368a522c
--- /dev/null
+++ b/paddle/framework/eigen_test.cc
@@ -0,0 +1,132 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/eigen.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+TEST(EigenDim, From) {
+  EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3}));
+  ASSERT_EQ(1, ed[0]);
+  ASSERT_EQ(2, ed[1]);
+  ASSERT_EQ(3, ed[2]);
+}
+
+TEST(Eigen, Tensor) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+
+  ASSERT_EQ(1, et.dimension(0));
+  ASSERT_EQ(2, et.dimension(1));
+  ASSERT_EQ(3, et.dimension(2));
+
+  for (int i = 0; i < 1; i++) {
+    for (int j = 0; j < 2; j++) {
+      for (int k = 0; k < 3; k++) {
+        ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f);
+      }
+    }
+  }
+}
+
+TEST(Eigen, ScalarFrom) {
+  Tensor t;
+  int* p = t.mutable_data<int>(make_ddim({1}), platform::CPUPlace());
+  *p = static_cast<int>(100);
+
+  EigenScalar<int>::Type es = EigenScalar<int>::From(t);
+
+  ASSERT_EQ(0, es.dimension(0));
+  ASSERT_EQ(100, es(0));
+}
+
+TEST(Eigen, VectorFrom) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
+  for (int i = 0; i < 6; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenVector<float>::Type ev = EigenVector<float>::From(t);
+
+  ASSERT_EQ(6, ev.dimension(0));
+
+  for (int i = 0; i < 6; i++) {
+    ASSERT_NEAR(i, ev(i), 1e-6f);
+  }
+}
+
+TEST(Eigen, VectorFlatten) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenVector<float>::Type ev = EigenVector<float>::Flatten(t);
+
+  ASSERT_EQ(1 * 2 * 3, ev.dimension(0));
+
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    ASSERT_NEAR(i, ev(i), 1e-6f);
+  }
+}
+
+TEST(Eigen, Matrix) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenMatrix<float>::Type em = EigenMatrix<float>::From(t);
+
+  ASSERT_EQ(2, em.dimension(0));
+  ASSERT_EQ(3, em.dimension(1));
+
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 3; j++) {
+      ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f);
+    }
+  }
+}
+
+TEST(Eigen, MatrixReshape) {
+  Tensor t;
+  float* p = t.mutable_data<float>({2, 3, 6, 4}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3 * 6 * 4; ++i) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenMatrix<float>::Type em = EigenMatrix<float>::Reshape(t, 2);
+
+  ASSERT_EQ(2 * 3, em.dimension(0));
+  ASSERT_EQ(6 * 4, em.dimension(1));
+
+  for (int i = 0; i < 2 * 3; i++) {
+    for (int j = 0; j < 6 * 4; j++) {
+      ASSERT_NEAR(i * 6 * 4 + j, em(i, j), 1e-6f);
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
new file mode 100644
index 0000000000..9a232b0843
--- /dev/null
+++ b/paddle/framework/executor.cc
@@ -0,0 +1,310 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/executor.h"
+
+#include <set>
+
+#include "gflags/gflags.h"
+#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
+
+DECLARE_bool(benchmark);
+DEFINE_bool(check_nan_inf, false,
+            "Checking whether operator produce NAN/INF or not. It will be "
+            "extremely slow so please use this flag wisely.");
+
+namespace paddle {
+namespace framework {
+
+Executor::Executor(const platform::Place& place) : place_(place) {}
+
+static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
+  if (var_type == proto::VarDesc::LOD_TENSOR) {
+    var->GetMutable<LoDTensor>();
+  } else if (var_type == proto::VarDesc::SELECTED_ROWS) {
+    var->GetMutable<SelectedRows>();
+  } else if (var_type == proto::VarDesc::FEED_MINIBATCH) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarDesc::FETCH_LIST) {
+    var->GetMutable<FeedFetchList>();
+  } else if (var_type == proto::VarDesc::STEP_SCOPES) {
+    var->GetMutable<std::vector<framework::Scope>>();
+  } else if (var_type == proto::VarDesc::LOD_RANK_TABLE) {
+    var->GetMutable<LoDRankTable>();
+  } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
+    var->GetMutable<LoDTensorArray>();
+  } else if (var_type == proto::VarDesc::PLACE_LIST) {
+    var->GetMutable<platform::PlaceList>();
+  } else {
+    PADDLE_THROW(
+        "Variable type %d is not in "
+        "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
+        " PLACE_LIST]",
+        var_type);
+  }
+}
+
+static void CheckTensorNANOrInf(const std::string& name,
+                                const framework::Tensor& tensor) {
+  if (tensor.memory_size() == 0) {
+    return;
+  }
+  if (tensor.type().hash_code() != typeid(float).hash_code() &&
+      tensor.type().hash_code() != typeid(double).hash_code()) {
+    return;
+  }
+  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
+  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
+}
+
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
+                   bool create_local_scope, bool create_vars) {
+  // TODO(tonyyang-svail):
+  //    - only runs on the first device (i.e. no interdevice communication)
+  //    - will change to use multiple blocks for RNN op and Cond Op
+  PADDLE_ENFORCE_LT(static_cast<size_t>(block_id), pdesc.Size());
+  auto& block = pdesc.Block(block_id);
+
+  Scope* local_scope = scope;
+  if (create_vars) {
+    if (create_local_scope) {
+      local_scope = &scope->NewScope();
+      for (auto& var : block.AllVars()) {
+        if (var->Name() == framework::kEmptyVarName) {
+          continue;
+        }
+
+        if (var->Persistable()) {
+          auto* ptr = scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " global, which pointer is " << ptr;
+        } else {
+          auto* ptr = local_scope->Var(var->Name());
+          CreateTensor(ptr, var->GetType());
+          VLOG(3) << "Create Variable " << var->Name()
+                  << " locally, which pointer is " << ptr;
+        }
+      }
+    } else {
+      for (auto& var : block.AllVars()) {
+        auto* ptr = local_scope->Var(var->Name());
+        CreateTensor(ptr, var->GetType());
+        VLOG(3) << "Create variable " << var->Name() << ", which pointer is "
+                << ptr;
+      }
+    }  // if (create_local_scope)
+  }    // if (create_vars)
+
+  for (auto& op_desc : block.AllOps()) {
+    auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+    VLOG(4) << op->DebugStringEx(local_scope);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    platform::RecordEvent record_event(op->Type(), pool.Get(place_));
+
+    op->Run(*local_scope, place_);
+    VLOG(3) << op->DebugStringEx(local_scope);
+    if (FLAGS_benchmark) {
+      VLOG(2) << "Memory used after operator " + op->Type() + " running: "
+              << memory::memory_usage(place_);
+    }
+    if (FLAGS_check_nan_inf) {
+      for (auto& vname : op->OutputVars(true)) {
+        auto* var = local_scope->FindVar(vname);
+        if (var == nullptr) continue;
+        if (var->IsType<framework::LoDTensor>()) {
+          CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
+        }
+      }
+    }
+  }
+  if (create_vars && create_local_scope) {
+    scope->DeleteScope(local_scope);
+  }
+  if (FLAGS_benchmark) {
+    VLOG(2) << "-------------------------------------------------------";
+    VLOG(2) << "Memory used after deleting local scope: "
+            << memory::memory_usage(place_);
+    VLOG(2) << "-------------------------------------------------------";
+  }
+}
+
+// Check whether the block already has feed operators and feed_holder.
+// Return false if the block does not have any feed operators.
+// If some feed operators have been prepended to the block, check that
+// the info contained in these feed operators matches the feed_targets
+// and feed_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has feed operators and a holder with matching info.
+static bool has_feed_operators(
+    BlockDesc* block, std::map<std::string, const LoDTensor*>& feed_targets,
+    const std::string& feed_holder_name) {
+  size_t feed_count = 0;
+  for (auto* op : block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      feed_count++;
+      PADDLE_ENFORCE_EQ(op->Input("X")[0], feed_holder_name,
+                        "Input to feed op should be '%s'", feed_holder_name);
+      std::string feed_target_name = op->Output("Out")[0];
+      PADDLE_ENFORCE(
+          feed_targets.find(feed_target_name) != feed_targets.end(),
+          "Feed operator output name '%s' cannot be found in 'feed_targets'",
+          feed_target_name);
+    }
+  }
+
+  if (feed_count > 0) {
+    PADDLE_ENFORCE_EQ(
+        feed_count, feed_targets.size(),
+        "The number of feed operators should match 'feed_targets'");
+
+    // When feed operators are present, the feed_holder should be present too.
+    auto var = block->FindVar(feed_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            feed_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FEED_MINIBATCH,
+                      "'%s' variable should be 'FEED_MINIBATCH' type",
+                      feed_holder_name);
+  }
+
+  return feed_count > 0;
+}
+
+// Check whether the block already has fetch operators and fetch_holder.
+// Return false if the block does not have any fetch operators.
+// If some fetch operators have been appended to the block, check that
+// the info contained in these fetch operators matches the fetch_targets
+// and fetch_holder_name. Raise exception when any mismatch is found.
+// Return true if the block has fetch operators and a holder with matching info.
+static bool has_fetch_operators(
+    BlockDesc* block, std::map<std::string, LoDTensor*>& fetch_targets,
+    const std::string& fetch_holder_name) {
+  size_t fetch_count = 0;
+  for (auto* op : block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      fetch_count++;
+      PADDLE_ENFORCE_EQ(op->Output("Out")[0], fetch_holder_name,
+                        "Output of fetch op should be '%s'", fetch_holder_name);
+      std::string fetch_target_name = op->Input("X")[0];
+      PADDLE_ENFORCE(
+          fetch_targets.find(fetch_target_name) != fetch_targets.end(),
+          "Fetch operator input name '%s' cannot be found in 'fetch_targets'",
+          fetch_target_name);
+    }
+  }
+
+  if (fetch_count > 0) {
+    PADDLE_ENFORCE_EQ(
+        fetch_count, fetch_targets.size(),
+        "The number of fetch operators should match 'fetch_targets'");
+
+    // When fetch operators are present, the fetch_holder should be present too.
+    auto var = block->FindVar(fetch_holder_name);
+    PADDLE_ENFORCE_NOT_NULL(var, "Block should already have a '%s' variable",
+                            fetch_holder_name);
+    PADDLE_ENFORCE_EQ(var->GetType(), proto::VarDesc::FETCH_LIST,
+                      "'%s' variable should be 'FETCH_LIST' type",
+                      fetch_holder_name);
+  }
+
+  return fetch_count > 0;
+}
+
+void Executor::Run(const ProgramDesc& program, Scope* scope,
+                   std::map<std::string, const LoDTensor*>& feed_targets,
+                   std::map<std::string, LoDTensor*>& fetch_targets,
+                   const std::string& feed_holder_name,
+                   const std::string& fetch_holder_name) {
+  auto* copy_program = new ProgramDesc(program);
+  auto* global_block = copy_program->MutableBlock(0);
+
+  if (!has_feed_operators(global_block, feed_targets, feed_holder_name)) {
+    // create feed_holder variable
+    auto* feed_holder = global_block->Var(feed_holder_name);
+    feed_holder->SetType(proto::VarDesc::FEED_MINIBATCH);
+    feed_holder->SetPersistable(true);
+
+    int i = 0;
+    for (auto& feed_target : feed_targets) {
+      std::string var_name = feed_target.first;
+      VLOG(3) << "feed target's name: " << var_name;
+
+      // prepend feed op
+      auto* op = global_block->PrependOp();
+      op->SetType(kFeedOpType);
+      op->SetInput("X", {feed_holder_name});
+      op->SetOutput("Out", {var_name});
+      op->SetAttr("col", {static_cast<int>(i)});
+      op->CheckAttrs();
+
+      i++;
+    }
+  }
+
+  // map the data of feed_targets to feed_holder
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      std::string feed_target_name = op->Output("Out")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      SetFeedVariable(scope, *feed_targets[feed_target_name], feed_holder_name,
+                      idx);
+    }
+  }
+
+  if (!has_fetch_operators(global_block, fetch_targets, fetch_holder_name)) {
+    // create fetch_holder variable
+    auto* fetch_holder = global_block->Var(fetch_holder_name);
+    fetch_holder->SetType(proto::VarDesc::FETCH_LIST);
+    fetch_holder->SetPersistable(true);
+
+    int i = 0;
+    for (auto& fetch_target : fetch_targets) {
+      std::string var_name = fetch_target.first;
+      VLOG(3) << "fetch target's name: " << var_name;
+
+      // append fetch op
+      auto* op = global_block->AppendOp();
+      op->SetType(kFetchOpType);
+      op->SetInput("X", {var_name});
+      op->SetOutput("Out", {fetch_holder_name});
+      op->SetAttr("col", {static_cast<int>(i)});
+      op->CheckAttrs();
+
+      i++;
+    }
+  }
+
+  Run(*copy_program, scope, 0, true, true);
+
+  // obtain the data of fetch_targets from fetch_holder
+  for (auto* op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      std::string fetch_target_name = op->Input("X")[0];
+      int idx = boost::get<int>(op->GetAttr("col"));
+      *fetch_targets[fetch_target_name] =
+          GetFetchVariable(*scope, fetch_holder_name, idx);
+    }
+  }
+
+  delete copy_program;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
new file mode 100644
index 0000000000..035ff48a52
--- /dev/null
+++ b/paddle/framework/executor.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+class Executor {
+ public:
+  // TODO(dzhwinter): Do not rely on this constructor; it will be removed.
+  explicit Executor(const platform::DeviceContext& device)
+      : Executor(device.GetPlace()) {}
+
+  explicit Executor(const platform::Place& place);
+
+  /* @brief
+   * Runtime evaluation of the given ProgramDesc under a certain Scope
+   *
+   * @param program  the ProgramDesc to be evaluated
+   * @param scope    the Scope that holds the variables
+   */
+  void Run(const ProgramDesc&, Scope*, int, bool create_local_scope = true,
+           bool create_vars = true);
+
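+  // A hypothetical usage sketch of the overload below (the variable names
+  // are illustrative, not part of the API):
+  //
+  //   Executor exe(platform::CPUPlace());
+  //   std::map<std::string, const LoDTensor*> feeds{{"x", &x_tensor}};
+  //   std::map<std::string, LoDTensor*> fetches{{"out", &out_tensor}};
+  //   exe.Run(program, &scope, feeds, fetches);
+  //
+  // Feed/fetch operators are added to a copy of `program` when it does not
+  // already contain them.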
+  void Run(const ProgramDesc& program, Scope* scope,
+           std::map<std::string, const LoDTensor*>& feed_targets,
+           std::map<std::string, LoDTensor*>& fetch_targets,
+           const std::string& feed_holder_name = "feed",
+           const std::string& fetch_holder_name = "fetch");
+
+ private:
+  const platform::Place place_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.cc b/paddle/framework/feed_fetch_method.cc
new file mode 100644
index 0000000000..21201b6755
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_method.h"
+#include "glog/logging.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                     const std::string& var_name, size_t index) {
+  // If the variable named `var_name` is not found in the scope, a new
+  // variable will be created.
+  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  Variable* g_feed_value = scope->Var(var_name);
+  auto& feed_inputs =
+      *(g_feed_value->GetMutable<std::vector<paddle::framework::LoDTensor>>());
+  if (index >= feed_inputs.size()) {
+    feed_inputs.resize(index + 1);
+  }
+  // share data with the input tensor
+  feed_inputs[index].ShareDataWith(input);
+  // set lod
+  feed_inputs[index].set_lod(input.lod());
+}
+
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+                            size_t index) {
+  // Since we want to fetch a LoDTensor from the variable, the variable must
+  // already exist.
+  Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+                 "Only %s can be invoked by GetFetchVariable",
+                 typeid(FeedFetchList).name());
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  // check the bound before indexing into the fetch list
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  auto& tensor = fetch_outputs[index];
+  VLOG(3) << "Fetch " << var_name << " with index " << index
+          << " shape= " << tensor.dims();
+  return tensor;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
new file mode 100644
index 0000000000..b71945fcc8
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                     const std::string& var_name, size_t index);
+
+LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name,
+                            size_t index);
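+
+// A minimal sketch of how the two helpers pair up (illustrative only; the
+// holder names "feed"/"fetch" match the Executor's defaults):
+//
+//   Scope scope;
+//   LoDTensor in;
+//   SetFeedVariable(&scope, in, "feed", 0);
+//   ... run a program that consumes "feed" and fills "fetch" ...
+//   LoDTensor& out = GetFetchVariable(scope, "fetch", 0);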
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
new file mode 100644
index 0000000000..168f456675
--- /dev/null
+++ b/paddle/framework/feed_fetch_type.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+using FeedFetchType = LoDTensor;
+using FeedFetchList = std::vector<FeedFetchType>;
+
+static const std::string kFeedOpType = "feed";
+static const std::string kFetchOpType = "fetch";
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
new file mode 100644
index 0000000000..5b6ef03f61
--- /dev/null
+++ b/paddle/framework/framework.proto
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package paddle.framework.proto;
+
+enum AttrType {
+  INT = 0;
+  FLOAT = 1;
+  STRING = 2;
+  INTS = 3;
+  FLOATS = 4;
+  STRINGS = 5;
+  BOOLEAN = 6;
+  BOOLEANS = 7;
+  BLOCK = 8;
+  LONG = 9;
+}
+
+// OpDesc describes an instance of a C++ framework::OperatorBase
+// derived class type.
+message OpDesc {
+
+  message Attr {
+    required string name = 1;
+    required AttrType type = 2;
+    optional int32 i = 3;
+    optional float f = 4;
+    optional string s = 5;
+    repeated int32 ints = 6;
+    repeated float floats = 7;
+    repeated string strings = 8;
+    optional bool b = 10;
+    repeated bool bools = 11;
+    optional int32 block_idx = 12;
+    optional int64 l = 13;
+  };
+
+  message Var {
+    required string parameter = 1;
+    repeated string arguments = 2;
+  };
+
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
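+
+// A hypothetical OpDesc in proto2 text format (illustrative only; the op
+// type and attribute names are not defined in this file):
+//
+//   type: "mul"
+//   inputs { parameter: "X" arguments: "x0" }
+//   inputs { parameter: "Y" arguments: "y0" }
+//   outputs { parameter: "Out" arguments: "out0" }
+//   attrs { name: "x_num_col_dims" type: INT i: 1 }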
+
+// OpProto describes a C++ framework::OperatorBase derived class.
+message OpProto {
+
+  // VarProto describes the C++ type framework::Variable.
+  message Var {
+    required string name = 1;
+    required string comment = 2;
+
+    optional bool duplicable = 3 [ default = false ];
+    optional bool intermediate = 4 [ default = false ];
+    optional bool dispensable = 5 [ default = false ];
+  }
+
+  // AttrProto describes the C++ type Attribute.
+  message Attr {
+    required string name = 1;
+    required AttrType type = 2;
+    required string comment = 3;
+    // If the attribute is generated, the Paddle third-party
+    // language binding is responsible for filling it.
+    // End users should not set it.
+    optional bool generated = 4 [ default = false ];
+  }
+
+  required string type = 1;
+  repeated Var inputs = 2;
+  repeated Var outputs = 3;
+  repeated Attr attrs = 4;
+  required string comment = 5;
+}
+
+enum DataType {
+  BOOL = 0;
+  INT16 = 1;
+  INT32 = 2;
+  INT64 = 3;
+  FP16 = 4;
+  FP32 = 5;
+  FP64 = 6;
+}
+
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
+message LoDTensorArrayDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
+}
+
+message VarDesc {
+  enum VarType {
+    LOD_TENSOR = 1;
+    SELECTED_ROWS = 2;
+    FEED_MINIBATCH = 3;
+    FETCH_LIST = 4;
+    STEP_SCOPES = 5;
+    LOD_RANK_TABLE = 6;
+    LOD_TENSOR_ARRAY = 7;
+    PLACE_LIST = 8;
+  }
+  required string name = 1;
+  required VarType type = 2;
+  optional LoDTensorDesc lod_tensor = 3;
+  optional TensorDesc selected_rows = 4;
+  optional LoDTensorArrayDesc tensor_array = 6;
+  optional bool persistable = 5 [ default = false ];
+}
+
+message BlockDesc {
+  required int32 idx = 1;
+  required int32 parent_idx = 2;
+  repeated VarDesc vars = 3;
+  repeated OpDesc ops = 4;
+}
+
+// Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
+// for more details.
+message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
new file mode 100644
index 0000000000..2082f8bb76
--- /dev/null
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -0,0 +1,190 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include <vector>
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+  This functor class is responsible for creating the gradient ops for the given
+  operator fwd_op. After it is called (through operator()), the pairs of
+  (gradient variable, corresponding input variable of fwd_op) will be added to
+  grad_to_var. If an input variable of fwd_op is contained in no_grad_set, its
+  gradient variable will either be dropped or set to kEmptyVarName, depending
+  on the template argument DropEmptyIG in the derived classes.
+ */
+class GradOpDescMakerBase {
+ public:
+  explicit GradOpDescMakerBase(
+      const OpDesc& fwd_op, const std::unordered_set<std::string>& no_grad_set,
+      std::unordered_map<std::string, std::string>* grad_to_var,
+      const std::vector<BlockDesc*>& grad_block = std::vector<BlockDesc*>())
+      : fwd_op_(fwd_op),
+        no_grad_set_(no_grad_set),
+        grad_to_var_(grad_to_var),
+        grad_block_(grad_block) {}
+
+  virtual ~GradOpDescMakerBase() = default;
+  virtual std::vector<std::unique_ptr<OpDesc>> operator()() const = 0;
+
+ protected:
+  std::vector<std::string> InputGrad(const std::string& name,
+                                     bool drop_empty_grad = true) const {
+    std::vector<std::string> ret_val;
+    auto var_names = this->Input(name);
+    ret_val.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(),
+                   std::back_inserter(ret_val),
+                   [this](const std::string& fwd_var_name) -> std::string {
+                     auto g_name = GradVarName(fwd_var_name);
+                     if (no_grad_set_.count(g_name)) {
+                       return kEmptyVarName;
+                     } else {
+                       (*this->grad_to_var_)[g_name] = fwd_var_name;
+                       return g_name;
+                     }
+                   });
+    if (!drop_empty_grad) {
+      return ret_val;
+    }
+    PADDLE_ENFORCE_LE(var_names.size(), 1UL,
+                      "BUG from operator developer:"
+                      " for input argument with a list of variables, "
+                      " drop_empty_grad is not allowed because it makes"
+                      " the correspondence between a variable and its gradient"
+                      " ambiguous. Use REGISTER_OP_EX to register the op"
+                      " or call InputGrad(?,false) in GradOpDescMaker."
+                      " Op type %s",
+                      fwd_op_.Type());
+
+    std::vector<std::string> dropped_ret_val;
+    dropped_ret_val.reserve(ret_val.size());
+    std::copy_if(ret_val.begin(), ret_val.end(),
+                 std::back_inserter(dropped_ret_val),
+                 [](const std::string& str) { return str != kEmptyVarName; });
+    return dropped_ret_val;
+  }
+
+  std::vector<std::string> OutputGrad(const std::string& name) const {
+    std::vector<std::string> ret_val;
+    auto onames = this->Output(name);
+    ret_val.reserve(onames.size());
+    std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
+                   [this](const std::string& fwd_var_name) -> std::string {
+                     auto g_name = GradVarName(fwd_var_name);
+                     (*this->grad_to_var_)[g_name] = fwd_var_name;
+                     return g_name;
+                   });
+    return ret_val;
+  }
+
+  std::vector<std::string> InputNames() const {
+    return this->fwd_op_.InputNames();
+  }
+
+  std::vector<std::string> OutputNames() const {
+    return this->fwd_op_.OutputNames();
+  }
+
+  std::vector<std::string> Input(const std::string& name) const {
+    return fwd_op_.Input(name);
+  }
+
+  std::vector<std::string> Output(const std::string& name) const {
+    return fwd_op_.Output(name);
+  }
+
+  const std::unordered_map<std::string, Attribute>& Attrs() const {
+    return fwd_op_.GetAttrMap();
+  }
+
+  const Attribute& GetAttr(const std::string& name) const {
+    auto& map = fwd_op_.GetAttrMap();
+    auto it = map.find(name);
+    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
+    return it->second;
+  }
+
+  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
+
+ private:
+  const OpDesc& fwd_op_;
+  const std::unordered_set<std::string>& no_grad_set_;
+  std::unordered_map<std::string, std::string>* grad_to_var_;
+
+ protected:
+  std::vector<BlockDesc*> grad_block_;
+};
+
+class SingleGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<OpDesc>> retv;
+    retv.emplace_back(this->Apply());
+    return retv;
+  }
+
+ protected:
+  virtual std::unique_ptr<OpDesc> Apply() const = 0;
+};
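+
+// A hypothetical custom maker (illustrative only), using the same calls as
+// DefaultGradOpDescMaker below:
+//
+//   class MyOpGradMaker : public SingleGradOpDescMaker {
+//    public:
+//     using SingleGradOpDescMaker::SingleGradOpDescMaker;
+//
+//    protected:
+//     std::unique_ptr<OpDesc> Apply() const override {
+//       auto* grad = new OpDesc();
+//       grad->SetType("my_op_grad");
+//       grad->SetInput("X", Input("X"));
+//       grad->SetInput(GradVarName("Out"), OutputGrad("Out"));
+//       grad->SetOutput(GradVarName("X"), InputGrad("X"));
+//       grad->SetAttrMap(Attrs());
+//       return std::unique_ptr<OpDesc>(grad);
+//     }
+//   };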
+
+template <bool DropEmptyIG = true>
+class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<OpDesc> Apply() const override {
+    auto* grad = new OpDesc();
+    grad->SetType(this->GradOpType());
+
+    for (auto& input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(GradVarName(input_param),
+                      this->InputGrad(input_param, DropEmptyIG));
+    }
+
+    for (auto& output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+    }
+
+    grad->SetAttrMap(this->Attrs());
+
+    return std::unique_ptr<OpDesc>(grad);
+  }
+
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+
+class EmptyGradOpMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+  std::vector<std::unique_ptr<OpDesc>> operator()() const override {
+    return {};
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
new file mode 100644
index 0000000000..3f6ea121b3
--- /dev/null
+++ b/paddle/framework/init.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <string.h>  // for strdup
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/operator.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/string/piece.h"
+
+namespace paddle {
+namespace framework {
+
+std::once_flag gflags_init_flag;
+
+void InitGflags(std::vector<std::string> &argv) {
+  std::call_once(gflags_init_flag, [&]() {
+    int argc = argv.size();
+    char **arr = new char *[argv.size()];
+    std::string line;
+    for (size_t i = 0; i < argv.size(); i++) {
+      arr[i] = &argv[i][0];
+      line += argv[i];
+      line += ' ';
+    }
+    google::ParseCommandLineFlags(&argc, &arr, true);
+    VLOG(1) << "Init commandline: " << line;
+  });
+}
+
+void InitDevices() {
+  /* Init all available devices by default */
+
+  std::vector<platform::Place> places;
+  places.emplace_back(platform::CPUPlace());
+  int count = 0;
+
+#ifdef PADDLE_WITH_CUDA
+  try {
+    count = platform::GetCUDADeviceCount();
+  } catch (const std::exception &exp) {
+    LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime.";
+  }
+#else
+  LOG(WARNING)
+      << "'CUDA' is not supported. Please re-compile with the WITH_GPU option";
+#endif
+
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(platform::CUDAPlace(i));
+  }
+
+  platform::DeviceContextPool::Init(places);
+}
+
+void InitGLOG(const std::string &prog_name) {
+  // glog does not copy argv[0] internally, so use strdup to allocate a
+  // string whose lifetime outlives this call.
+  google::InitGoogleLogging(strdup(prog_name.c_str()));
+  google::InstallFailureSignalHandler();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init.h b/paddle/framework/init.h
new file mode 100644
index 0000000000..c8fd964d00
--- /dev/null
+++ b/paddle/framework/init.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <mutex>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace framework {
+
+void InitGflags(std::vector<std::string> &argv);
+
+void InitGLOG(const std::string &prog_name);
+
+void InitDevices();
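+
+// A typical start-up sequence (a sketch; the argv contents are
+// illustrative):
+//
+//   std::vector<std::string> argv = {"my_program"};
+//   paddle::framework::InitGflags(argv);
+//   paddle::framework::InitGLOG(argv[0]);
+//   paddle::framework::InitDevices();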
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/init_test.cc b/paddle/framework/init_test.cc
new file mode 100644
index 0000000000..01e076dd8e
--- /dev/null
+++ b/paddle/framework/init_test.cc
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
+
+TEST(InitDevices, CPU) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifndef PADDLE_WITH_CUDA
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U);
+#endif
+}
+
+TEST(InitDevices, CUDA) {
+  using paddle::framework::InitDevices;
+  using paddle::platform::DeviceContextPool;
+
+#ifdef PADDLE_WITH_CUDA
+  int count = paddle::platform::GetCUDADeviceCount();
+  InitDevices();
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  ASSERT_EQ(pool.size(), 1U + static_cast<unsigned>(count));
+#endif
+}
diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h
new file mode 100644
index 0000000000..1e30848354
--- /dev/null
+++ b/paddle/framework/library_type.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cctype>
+
+namespace paddle {
+namespace framework {
+
+// For more details about the design of LibraryType, please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library
+
+enum class LibraryType {
+  kPlain = 0,
+  kMKLDNN = 1,
+  kCUDNN = 2,
+};
+
+inline std::string LibraryTypeToString(const LibraryType& library_type) {
+  switch (library_type) {
+    case LibraryType::kPlain:
+      return "PLAIN";
+    case LibraryType::kMKLDNN:
+      return "MKLDNN";
+    case LibraryType::kCUDNN:
+      return "CUDNN";
+    default:
+      PADDLE_THROW("unknown LibraryType %d", static_cast<int>(library_type));
+  }
+}
+
+inline LibraryType StringToLibraryType(const char* ctype) {
+  std::string s(ctype);
+  for (size_t i = 0; i < s.size(); ++i) {
+    s[i] = toupper(s[i]);
+  }
+  if (s == std::string("PLAIN")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("MKLDNN")) {
+    return LibraryType::kMKLDNN;
+  } else if (s == std::string("CUDNN")) {
+    return LibraryType::kCUDNN;
+    // To be compatible with the register macros,
+    // CPU, CUDA, and PLAIN are treated as the same library type.
+  } else if (s == std::string("CPU")) {
+    return LibraryType::kPlain;
+  } else if (s == std::string("CUDA")) {
+    return LibraryType::kPlain;
+  } else {
+    PADDLE_THROW("Unknown LibraryType %s", s.c_str());
+  }
+}
+
+inline std::ostream& operator<<(std::ostream& out, LibraryType l) {
+  out << LibraryTypeToString(l);
+  return out;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_rank_table.cc b/paddle/framework/lod_rank_table.cc
new file mode 100644
index 0000000000..704bce2a0e
--- /dev/null
+++ b/paddle/framework/lod_rank_table.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+
+namespace paddle {
+namespace framework {
+void LoDRankTable::Reset(const LoD& lod, size_t level) {
+  this->coarse_lod_.clear();
+  this->items_.clear();
+  PADDLE_ENFORCE(level < lod.size(),
+                 "Cannot rank lod since the level %d is not less than the lod size %d",
+                 level, lod.size());
+  coarse_lod_.reserve(level);
+  for (size_t i = 0; i < level; ++i) {
+    coarse_lod_.push_back(lod[i]);
+  }
+  auto& vec = lod[level];
+  for (size_t i = 0; i < vec.size() - 1; ++i) {
+    TableItem item;
+    item.index = i;
+    item.length = vec[i + 1] - vec[i];
+    VLOG(10) << "Add item to rank table " << item.index << " " << item.length;
+    items_.emplace_back(item);
+  }
+  // NOTE(yuyang18):
+  //
+  // The time complexity of stable_sort is O(N*log(N)) if additional memory is
+  // available. It is easy to debug and unit test when using `stable_sort`
+  // instead of `sort`. Also, the items of a rank table will not be too large.
+  std::stable_sort(items_.begin(), items_.end(),
+                   [](const TableItem& a, const TableItem& b) {
+                     return a.length > b.length;
+                   });
+}
+
+}  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table) {
+  out << "NumOfSequence " << table.items().size() << "\n";
+  for (auto& each_item : table.items()) {
+    out << "\tSeq #" << each_item.index << ", Len=" << each_item.length << "\n";
+  }
+  return out;
+}
+}  // namespace paddle
diff --git a/paddle/framework/lod_rank_table.h b/paddle/framework/lod_rank_table.h
new file mode 100644
index 0000000000..df188709e9
--- /dev/null
+++ b/paddle/framework/lod_rank_table.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <iosfwd>
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+// LoD Rank Table stores the `level` of `lod` which is ordered by sequence
+// length in descending order. It is useful when implementing dynamic RNNs and
+// is shared by the dynamic RNN memory, slice-input, and slice-output
+// operators.
+//
+// Each table item contains two elements: the length of a sequence and the
+// index of that sequence in the level.
+//
+// LoDRankTable also stores the coarse_lod, which is the lod information whose
+// level is less than input level, in order to restore the output LoD
+// information.
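+//
+// A minimal usage sketch (the values are illustrative):
+//
+//   LoD lod{{0, 2, 3}, {0, 2, 5, 7}};
+//   LoDRankTable table;
+//   table.Reset(lod, 1);  // rank level 1 by sequence length, descending
+//   for (auto& item : table.items()) {
+//     // item.index is the position in level 1; item.length is the length
+//   }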
+class LoDRankTable {
+ public:
+  struct TableItem {
+    size_t index;
+    size_t length;
+  };
+
+  LoDRankTable() {}
+
+  void Reset(const LoD& lod, size_t level);
+
+  const std::vector<TableItem>& items() const { return this->items_; }
+
+  const LoD& coarse_lod() const { return this->coarse_lod_; }
+
+  size_t level() const { return coarse_lod_.size(); }
+
+ private:
+  LoD coarse_lod_;
+  std::vector<TableItem> items_;
+};
+
+}  // namespace framework
+
+std::ostream& operator<<(std::ostream& out,
+                         const framework::LoDRankTable& table);
+
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
new file mode 100644
index 0000000000..53b0d0fe08
--- /dev/null
+++ b/paddle/framework/lod_tensor.cc
@@ -0,0 +1,380 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace framework {
+
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
+  os << "{";
+  for (auto &v : lod) {
+    os << "{";
+    for (auto &i : v) {
+      os << i << ",";
+    }
+    os << "}";
+  }
+  os << "}";
+
+  return os;
+}
+
+std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
+  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
+
+  if (!platform::is_cpu_place(t.place())) {
+    LoDTensor tt;
+    framework::Copy(t, platform::CPUPlace(), &tt);
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(t.place());
+    dev_ctx.Wait();
+
+    os << tt;
+    return os;
+  }
+
+  os << "dim: " << t.dims() << "\n";
+  os << "lod: " << t.lod() << "\n";
+
+  // only print first ten elements
+  int64_t size = t.numel() < 10 ? t.numel() : 10;
+  for (int64_t i = 0; i < size; ++i) {
+    os << t.data<float>()[i] << " ";
+  }
+
+  return os;
+}
+
+std::string LoDToString(const LoD &lod) {
+  std::ostringstream stream;
+  stream << lod;
+  return stream.str();
+}
+
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
+                 size_t elem_end) {
+  PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_end, in[level].size());
+
+  LoD res;
+  res.resize(in.size() - level);
+  // copy the first level
+  res[0].assign(in[level].begin() + elem_begin,
+                in[level].begin() + elem_end + 1);
+  for (size_t lvl = 1; lvl < res.size(); lvl++) {
+    const auto &in_level = in[level + lvl];
+    const auto &above_level = res[lvl - 1];
+    auto &out_level = res[lvl];
+    out_level.assign(in_level.begin() + above_level.front(),
+                     in_level.begin() + above_level.back() + 1);
+  }
+  for (size_t lvl = 0; lvl < res.size(); lvl++) {
+    // to make the first offset equal 0, subtract the first element from all
+    // the elements
+    size_t front = res[lvl].front();
+    for (auto &ele : res[lvl]) {
+      ele -= front;
+    }
+  }
+  return res;
+}
+
+LoD ToAbsOffset(const LoD &in) {
+  // the lowest level stores relative offsets
+  if (in.empty() || in.size() == 1) return in;
+  LoD result = in;
+  for (auto level = static_cast<int>(in.size() - 2); level >= 0; level--) {
+    for (size_t i = 0; i < in[level].size(); ++i) {
+      size_t index = in[level][i];
+      result[level][i] = result[level + 1][index];
+    }
+  }
+  return result;
+}
+
+bool operator==(const LoD &a, const LoD &b) {
+  if (a.size() != b.size()) {
+    return false;
+  }
+
+  for (size_t i = 0; i < a.size(); i++) {
+    const auto &a_level = a[i];
+    const auto &b_level = b[i];
+    if (a_level.size() != b_level.size()) {
+      return false;
+    }
+    for (size_t j = 0; j < a_level.size(); j++) {
+      if (a_level[j] != b_level[j]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+bool CheckLoD(const LoD &in, int tensor_height) {
+  if (in.empty()) return true;
+  for (const auto &level : in) {
+    // check: there should be at least 2 offsets in each level.
+    if (level.size() < 2) return false;
+    // check: the first offset (the begin offset) of each level should be 0.
+    if (level.front() != 0) return false;
+    // check: all the offsets in a level should be ascending.
+    if (!std::is_sorted(level.begin(), level.end())) {
+      LOG(INFO) << "the offsets in a LoD level are not ascending";
+      return false;
+    }
+  }
+  // check: the lowest level's last offset should equal `tensor_height` if
+  //        tensor_height > 0.
+  if (tensor_height > 0 && (size_t)tensor_height != in.back().back())
+    return false;
+
+  // check: the higher level's last offset should equal the lower level's
+  // size - 1.
+  // NOTE LoD store the levels from top to bottom, so the higher level goes
+  // first.
+  for (size_t level = 0; level < in.size() - 1; level++) {
+    if (in[level].back() != in[level + 1].size() - 1) return false;
+  }
+  return true;
+}
+
+bool CheckAbsLoD(const LoD &in, int tensor_height) {
+  if (in.empty()) return true;
+  for (const auto &level : in) {
+    // check: all the offsets in a level should be ascending.
+    if (!std::is_sorted(level.begin(), level.end())) {
+      return false;
+    }
+
+    // check: there should be at least 2 offsets in each level.
+    if (level.size() < 2) return false;
+
+    // check: the first offset of each level should be 0, and the last should
+    // be the same (the height of the underlying tensor).
+    if (level.front() != 0) return false;
+    if (tensor_height < 0) {
+      tensor_height = level.back();
+    } else if ((size_t)tensor_height != level.back()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
+                                        size_t end_idx, size_t start_level) {
+  LoD sub_lod;
+
+  for (size_t level_idx = start_level; level_idx < lod.size(); ++level_idx) {
+    PADDLE_ENFORCE_LE(start_idx, end_idx);
+    PADDLE_ENFORCE_LT(end_idx, lod[level_idx].size());
+    std::vector<size_t> level_lens;
+    for (size_t i = start_idx; i < end_idx; ++i) {
+      level_lens.push_back(lod[level_idx][i + 1] - lod[level_idx][i]);
+    }
+    sub_lod.emplace_back(level_lens);
+    start_idx = lod[level_idx][start_idx];
+    end_idx = lod[level_idx][end_idx];
+  }
+
+  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
+}
+
+void AppendLoD(LoD *lod, const LoD &lod_length) {
+  PADDLE_ENFORCE(
+      lod->empty() || lod->size() == lod_length.size(),
+      "lod_length should have the same size as the appended lod.");
+  if (lod->empty()) {
+    // initialize each level with a single zero offset
+    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
+  }
+  for (size_t i = 0; i < lod->size(); ++i) {
+    auto &level = (*lod)[i];
+    for (size_t len : lod_length[i]) {
+      level.push_back(level.back() + len);
+    }
+  }
+}
+
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  {  // the 1st field, uint32_t version for LoDTensor
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, LoD information
+    // uint64_t lod_level
+    // uint64_t lod_level_1 size in byte.
+    // int*     lod_level_1 data
+    // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  // the 3rd field, Tensor
+  SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
+}
+
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
+                           const platform::DeviceContext &dev_ctx) {
+  {
+    // the 1st field, uint32_t version for LoDTensor
+    uint32_t version;
+    is.read(reinterpret_cast<char *>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, LoD information
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+  // the 3rd field, Tensor
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
+}
+
+std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
+    const std::vector<platform::Place> places) const {
+  check_memory_size();
+  int batch_size =
+      lod().empty() ? dims()[0] : static_cast<int>(lod()[0].size()) - 1;
+  size_t result_size = std::min(static_cast<size_t>(batch_size), places.size());
+  size_t remainder = batch_size % places.size();
+
+  std::vector<LoDTensor> results;
+  results.reserve(result_size);
+
+  int step_width = static_cast<int>(batch_size / result_size);
+  for (size_t i = 0; i < result_size; ++i) {
+    int begin = static_cast<int>(i * step_width);
+    int end = static_cast<int>((i + 1) * step_width);
+    if (i + 1 == places.size()) {  // last
+      end += remainder;
+    }
+
+    LoDTensor dst;
+    if (lod().empty()) {
+      auto src = Slice(begin, end);
+      auto &dst_place = places[i];
+      framework::Copy(src, dst_place, &dst);
+    } else {
+      auto lod_and_offset = GetSubLoDAndAbsoluteOffset(lod(), begin, end, 0);
+
+      auto &offset = lod_and_offset.second;
+      auto src = Slice(offset.first, offset.second);
+      auto &dst_place = places[i];
+      framework::Copy(src, dst_place, &dst);
+
+      LoD my_lod;
+      for (auto &l : lod_and_offset.first) {
+        std::vector<size_t> v{0};
+        for (auto &ll : l) {
+          v.push_back(ll + v.back());
+        }
+        my_lod.emplace_back(v);
+      }
+      dst.set_lod(my_lod);
+    }
+    results.emplace_back(dst);
+  }
+
+  return results;
+}
+
+void LoDTensor::MergeLoDTensor(
+    const std::vector<const LoDTensor *> &lod_tensors,
+    platform::Place dst_place) {
+  PADDLE_ENFORCE(!lod_tensors.empty());
+
+  framework::DDim new_dim = lod_tensors[0]->dims();
+  std::type_index new_type = lod_tensors[0]->type();
+  framework::DataLayout new_layout = lod_tensors[0]->layout();
+  LoD new_lod = lod_tensors[0]->lod();
+  for (size_t i = 1; i < lod_tensors.size(); ++i) {
+    auto *t = lod_tensors[i];
+    PADDLE_ENFORCE_EQ(new_type.hash_code(), t->type().hash_code());
+    PADDLE_ENFORCE_EQ(new_layout, t->layout());
+
+    PADDLE_ENFORCE_EQ(framework::product(new_dim) / new_dim[0],
+                      framework::product(t->dims()) / t->dims()[0]);
+    new_dim[0] += t->dims()[0];
+
+    auto &lod = t->lod();
+    for (size_t j = 0; j < lod.size(); ++j) {
+      auto &sub_lod = new_lod[j];
+      auto &offset = sub_lod.back();
+      for (size_t k = 1; k < lod[j].size(); ++k) {
+        sub_lod.push_back(lod[j][k] + offset);
+      }
+    }
+  }
+  Resize(new_dim);
+  set_layout(new_layout);
+  set_lod(new_lod);
+  mutable_data(dst_place, new_type);
+
+  int begin = 0;
+  for (auto *src : lod_tensors) {
+    int end = begin + src->dims()[0];
+    auto dst = Slice(begin, end);
+    framework::Copy(*src, dst_place, &dst);
+    begin = end;
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
new file mode 100644
index 0000000000..9d1294fdeb
--- /dev/null
+++ b/paddle/framework/lod_tensor.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#ifdef PADDLE_WITH_CUDA
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/system/cuda/experimental/pinned_allocator.h>
+#endif
+
+#include <glog/logging.h>
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+#ifndef PADDLE_WITH_CUDA
+template <typename T>
+using Vector = std::vector<T>;
+#else
+template <typename T>
+using Vector = thrust::host_vector<
+    T, thrust::system::cuda::experimental::pinned_allocator<T>>;
+#endif
+
+/*
+ * LoD is short for Level of Details.
+ *
+ * - in a level, each element indicates a relative offset into the lower level
+ * - the first element should be 0, indicating that the sequence starts
+ * from 0
+ * - each sequence's begin and end (end non-inclusive) are level[id] and
+ * level[id + 1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
+ */
+using LoD = std::vector<Vector<size_t>>;
+
+std::ostream& operator<<(std::ostream& os, const LoD& lod);
+std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
+
+std::string LoDToString(const LoD& lod);
+
+LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+                 size_t elem_end);
+/*
+ * Transform an LoD from relative offsets to absolute offsets.
+ */
+LoD ToAbsOffset(const LoD& in);
+
+bool operator==(const LoD& a, const LoD& b);
+
+/*
+ * Check whether this lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check the following:
+ *
+ *  1. all the offsets in a level should be ascending.
+ *  2. there should be at least 2 offsets in each level.
+ *  3. the higher level's last offset should equal the lower level's size - 1.
+ *  4. the first offset (the begin offset) of each level should be 0.
+ *  5. the lowest level's last offset should equal `tensor_height` if
+ * tensor_height > 0.
+ */
+
+bool CheckLoD(const LoD& in, int tensor_height = -1);
+/*
+ * Check whether this absolute lod's format is valid.
+ *
+ * ATTENTION:
+ *   - Empty lod is treated as valid.
+ *
+ * It will check the following:
+ *  1. all the offsets in a level should be ascending.
+ *  2. there should be at least 2 offsets in each level.
+ *  3. the first offset of each level should be 0, and the last should be the
+ *     same (the height of the underlying tensor) or `tensor_height` if
+ *     tensor_height > 0.
+ */
+bool CheckAbsLoD(const LoD& in, int tensor_height = -1);
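+
+// For example (a sketch):
+//   LoD lod{{0, 2, 5, 7}};
+//   CheckLoD(lod);        // true
+//   CheckAbsLoD(lod, 7);  // true: the last offset equals tensor height 7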
+
+/*
+ * LoDTensor (Level of details Tensor)
+ * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+ */
+class LoDTensor : public Tensor {
+ public:
+  LoDTensor() {}
+
+  explicit LoDTensor(const LoD& lod) : lod_(lod) {}
+
+  void set_lod(const LoD& lod) { lod_ = lod; }
+
+  const LoD& lod() const { return lod_; }
+
+  LoD* mutable_lod() { return &lod_; }
+
+  /*
+   * Get the start offset and end offset of an element from the LoD.
+   */
+  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const {
+    PADDLE_ENFORCE_LT(level, NumLevels());
+    PADDLE_ENFORCE_LT(elem, NumElements(level));
+    return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]);
+  }
+
+  /*
+   * Number of the LoDTensor's levels. Each level holds units of data; for
+   * example, in the text view, article, paragraph, and sentence are 3 levels.
+   */
+  size_t NumLevels() const { return lod_.size(); }
+  /*
+   * Number of elements in a level.
+   */
+  size_t NumElements(size_t level = 0) const {
+    PADDLE_ENFORCE_LT(level, NumLevels());
+    // the last offset is the end of last element
+    return (lod_)[level].size() - 1;
+  }
+
+  std::vector<LoDTensor> SplitLoDTensor(
+      const std::vector<platform::Place> places) const;
+
+  void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors,
+                      platform::Place place);
+
+ private:
+  LoD lod_;
+};
+
+/*
+ * Expand the `source` to fit the LoD of `lod`. For example, a `source`
+ * LoDTensor is
+ *  - LoD: [0, 2]
+ *  - tensor: [a0, a1]
+ * a `lod` is
+ *  - LoD: [0 3 5]
+ * returns a new LoDTensor
+ *  - [a0 a0 a0 a1 a1]
+ */
+template <typename T>
+LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
+                    const platform::Place& place) {
+  LoD abs_lod = ToAbsOffset(lod);
+  const auto& lod_level = lod[level];
+  size_t num_instances = source.dims()[0];
+
+  // new tensor
+  LoDTensor tensor;
+  tensor.set_lod(lod);
+  auto dims = source.dims();
+  dims[0] = lod_level.back();
+  tensor.Resize(dims);
+  tensor.mutable_data<T>(place);
+
+  PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1);
+  for (size_t ins = 0; ins < num_instances; ins++) {
+    for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
+      auto slice = tensor.Slice(elem, elem + 1);
+      Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
+           platform::CPUDeviceContext(), &slice);
+    }
+  }
+  return tensor;
+}
+
+// Get the absolute offset of a lod[start_level][start_idx:end_idx] and
+// relative length of details for every levels(i.e., [start_level: ]).
+//
+// For example,
+//   lod = [[0, 3, 4, 8], [0, 9, 10, 11, 13, 17, 19, 22, 24]]
+//   start_level = 0
+//   start_idx = 1
+//   end_idx = 3
+//
+// Returns:
+//  LoD = [[1, 4], [2, 4, 2, 3, 2]]
+//  pair<size_t, size_t> = {11, 24}
+std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
+    const LoD& lod, size_t start_idx, size_t end_idx, size_t start_level);
+
+void AppendLoD(LoD* lod, const LoD& lod_length);
+
+/*
+ * Serialize/Deserialize a LoDTensor to/from a std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file or to
+ * an in-memory string. A GPU tensor will be copied to CPU first.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
+                           const platform::DeviceContext& dev_ctx);
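+
+// A CPU round-trip sketch (illustrative only):
+//
+//   std::ostringstream buf;
+//   platform::CPUDeviceContext ctx;
+//   SerializeToStream(buf, tensor, ctx);
+//   std::istringstream in(buf.str());
+//   LoDTensor restored;
+//   DeserializeFromStream(in, &restored, ctx);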
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
new file mode 100644
index 0000000000..10a8a7867f
--- /dev/null
+++ b/paddle/framework/lod_tensor.md
@@ -0,0 +1,165 @@
+# Design Doc: LoD (Level-of-Detail) Tensor
+
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need to pad with zeros.
+
+|                       | TensorFlow | PaddlePaddle |
+|-----------------------|------------|--------------|
+| RNN                   | Support    | Support      |
+| recursive RNN         | Support    | Support      |
+| padding zeros         | Must       | No need      |
+| blob data type        | Tensor     | LoDTensor    |
+
+PaddlePaddle achieves this flexibility by passing a new data type, the *LoD Tensor*, between operators.  A LoD Tensor is a Tensor attached with a segmentation index known as *LoD*.  The LoD index doesn't only segment a tensor; it also recursively segments sub-sequences.  This document presents the design of LoD and LoDTensor.
+
+
+## The Challenge: Variable-length Sequences
+
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is that each mini-batch contains N sentences, where each word is a D-dimensional one-hot vector.  Suppose that all sentences have the same length L; then we can represent this mini-batch by an NxLxD tensor.
+
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
+
+```
+3   1 2
+||| | ||
+```
+
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let's check another example, a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, where each sentence consists of a variable number of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+### A Mini-Batch of Videos
+
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
+
+```
+3     1  2
+口口口 口 口口
+```
+
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
+
+### A Mini-Batch of Images
+
+In traditional cases like a mini-batch with N fixed-sized images, the LoD Tensor representation is as follows:
+
+```
+1 1 1 1     1
+口口口口 ... 口
+```
+
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
+
+```
+口口口口 ... 口
+```
+
+### Model Parameters
+
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
+
+
+## The LoD Tensor
+
+Let us revisit the above example of the 2-level LoD Tensor:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
+
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article with length 3, and 2 indicates the third sentence in this article with length 4.
+
+### The LoD Index
+
+We can save the LoD index in the above example
+
+```
+3           1  2
+3   2  4    1  2  3
+```
+
+in a 2D matrix whose rows may have different lengths:
+
+```c++
+typedef std::vector<std::vector<int> > LoD;
+```
+
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
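+
+For the 2-level example above, the length-based index could be built as in
+the following sketch (the variable names are illustrative):
+
+```c++
+LoD lod;
+lod.push_back({3, 1, 2});           // level 0: sentences per article
+lod.push_back({3, 2, 4, 1, 2, 3});  // level 1: words per sentence
+// lod.size() == 2 levels; lod[1][2] == 4, the third sentence has 4 words
+```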
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
+
+```
+3 2 4 1 2 3
+```
+
+into offsets
+
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
+
+so we know that the first sentence is from word 0 to word 3, and the second sentence from word 3 to word 5.
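+
+The conversion from lengths to offsets is a running sum.  A short sketch in
+C++ (not part of the library):
+
+```c++
+#include <numeric>
+#include <vector>
+
+std::vector<int> LengthsToOffsets(const std::vector<int>& lens) {
+  std::vector<int> offsets(lens.size() + 1, 0);
+  // offsets[i + 1] = offsets[i] + lens[i]
+  std::partial_sum(lens.begin(), lens.end(), offsets.begin() + 1);
+  return offsets;
+}
+// LengthsToOffsets({3, 2, 4, 1, 2, 3}) returns {0, 3, 5, 9, 10, 12, 15}
+```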
+
+Similarly, the lengths in the top level LoD
+
+```
+3 1 2
+```
+
+are transformed into offsets of elements/words as follows:
+
+```
+0 3 4   6
+  = =   =
+  3 3+1 4+2
+```
+
+## Slicing of LoD Tensors
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
+
+For example, the <2>-slice of above example is
+
+```
+10      15
+10  12  15
+  || |||
+```
+
+and the <2,0>-slice of above slice is
+
+```
+10  12
+  ||
+```
diff --git a/paddle/framework/lod_tensor_array.h b/paddle/framework/lod_tensor_array.h
new file mode 100644
index 0000000000..4a8e7f4fa5
--- /dev/null
+++ b/paddle/framework/lod_tensor_array.h
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+using LoDTensorArray = std::vector<LoDTensor>;
+}
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
new file mode 100644
index 0000000000..4d172c43c7
--- /dev/null
+++ b/paddle/framework/lod_tensor_test.cc
@@ -0,0 +1,217 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/lod_tensor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
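+// LodExpand repeats each sequence's data to match a target LoD: here the
+// tensor [0, 1] with LoD {0, 2} expands to [0, 0, 0, 1, 1] under the target
+// LoD {0, 3, 5}.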
+TEST(LodExpand, test) {
+  LoD lod{{0, 2}};
+  LoDTensor tensor;
+  tensor.set_lod(lod);
+  tensor.Resize({2, 1});
+  tensor.mutable_data<float>(platform::CPUPlace());
+  tensor.data<float>()[0] = 0;
+  tensor.data<float>()[1] = 1;
+
+  LoD target;
+  target.emplace_back(std::vector<size_t>{0, 3, 5});
+  auto new_tensor = LodExpand<float>(tensor, target, 0UL, platform::CPUPlace());
+  std::vector<int> result{{0, 0, 0, 1, 1}};
+  for (size_t i = 0; i < 5; i++) {
+    ASSERT_EQ(new_tensor.data<float>()[i], result[i]);
+  }
+}
+
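+// GetSubLoDAndAbsoluteOffset extracts, for sequences [1, 2) at level 0, the
+// length-based sub-LoD covering that level and everything below it, plus the
+// absolute [start, end) element offsets in the underlying tensor.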
+TEST(LoD, GetFineGrainedLoDLength) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));
+  lod.push_back(
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26, 29}));
+
+  auto lod_and_offset =
+      paddle::framework::GetSubLoDAndAbsoluteOffset(lod, 1, 2, 0);
+  LoD lod_length = lod_and_offset.first;
+  size_t start_offset = lod_and_offset.second.first;
+  size_t end_offset = lod_and_offset.second.second;
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>{2});
+  expected.push_back(std::vector<size_t>{2, 2});
+  expected.push_back(std::vector<size_t>{2, 3, 4, 2});
+  EXPECT_EQ(lod_length, expected);
+  EXPECT_EQ(start_offset, 15UL);
+  EXPECT_EQ(end_offset, 26UL);
+}
+
+TEST(LoD, AppendLoD) {
+  LoD lod_lens;
+  lod_lens.push_back(std::vector<size_t>({2}));
+  lod_lens.push_back(std::vector<size_t>({2, 2}));
+  lod_lens.push_back(std::vector<size_t>({2, 3, 4, 2}));
+
+  LoD origin;
+  origin.push_back(std::vector<size_t>({0, 2}));
+  origin.push_back(std::vector<size_t>({0, 1, 6}));
+  origin.push_back(std::vector<size_t>({0, 2, 5, 7, 10, 12, 15}));
+
+  paddle::framework::AppendLoD(&origin, lod_lens);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 2, 4}));
+  expected.push_back(std::vector<size_t>({0, 1, 6, 8, 10}));
+  expected.push_back(
+      std::vector<size_t>({0, 2, 5, 7, 10, 12, 15, 17, 20, 24, 26}));
+  EXPECT_EQ(origin, expected);
+}
+
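+// ToAbsOffset rewrites every upper level in absolute bottom-level offsets,
+// e.g. the top-level {0, 2} (two level-1 sequences) becomes {0, 5} (five
+// bottom-level elements).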
+TEST(LoD, ToAbsOffset) {
+  LoD relative_lod;
+  relative_lod.push_back(std::vector<size_t>({0, 2}));
+  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod);
+
+  LoD expected;
+  expected.push_back(std::vector<size_t>({0, 5}));
+  expected.push_back(std::vector<size_t>({0, 2, 5}));
+  expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  EXPECT_EQ(abs_lod, expected);
+}
+
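+// SplitLoDTensor partitions the tensor across the given places by splitting
+// the top-level sequences evenly and renumbering each piece's offsets from 0.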
+TEST(LoD, SplitLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+
+  platform::CPUPlace place;
+  LoDTensor lod_tensor;
+  lod_tensor.Resize({20, 1});
+  float* dst_ptr = lod_tensor.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+  lod_tensor.set_lod(lod);
+
+  std::vector<platform::Place> places{platform::CPUPlace(),
+                                      platform::CPUPlace()};
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+
+  auto lods = lod_tensor.SplitLoDTensor(places);
+  EXPECT_EQ(lods[0].lod(), lod0);
+  EXPECT_EQ(lods[1].lod(), lod1);
+}
+
+TEST(LoD, MergeLoDTensor) {
+  LoD lod;
+  lod.push_back(std::vector<size_t>({0, 2, 4, 5, 6}));
+  lod.push_back(std::vector<size_t>({0, 1, 6, 8, 13, 15, 20}));
+
+  platform::CPUPlace place;
+
+  LoDTensor lod_tensor0;
+  LoD lod0;
+  lod0.push_back(std::vector<size_t>({0, 2, 4}));
+  lod0.push_back(std::vector<size_t>({0, 1, 6, 8, 13}));
+  lod_tensor0.set_lod(lod0);
+
+  lod_tensor0.Resize({13, 1});
+  float* dst_ptr = lod_tensor0.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor0.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+
+  LoDTensor lod_tensor1;
+  LoD lod1;
+  lod1.push_back(std::vector<size_t>({0, 1, 2}));
+  lod1.push_back(std::vector<size_t>({0, 2, 7}));
+  lod_tensor1.set_lod(lod1);
+  lod_tensor1.Resize({7, 1});
+  dst_ptr = lod_tensor1.mutable_data<float>(place);
+  for (int i = 0; i < lod_tensor1.numel(); ++i) {
+    dst_ptr[i] = i;
+  }
+
+  std::vector<const LoDTensor*> lods{&lod_tensor0, &lod_tensor1};
+
+  LoDTensor lod_tensor;
+  lod_tensor.MergeLoDTensor(lods, place);
+  EXPECT_EQ(lod_tensor.lod(), lod);
+}
+
+TEST(LoD, CheckLoD) {
+  LoD relative_lod;
+  relative_lod.push_back(std::vector<size_t>({0, 2}));
+  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  // check compatible
+  ASSERT_TRUE(CheckLoD(relative_lod));
+  relative_lod[1].back()++;
+  ASSERT_FALSE(CheckLoD(relative_lod));
+  relative_lod[1].back()--;  // recover it
+
+  // check empty
+  LoD empty_lod;
+  ASSERT_TRUE(CheckLoD(empty_lod));
+
+  // check less than 2 offsets in a level
+  LoD some_lod0;
+  some_lod0.push_back(std::vector<size_t>({0}));
+  ASSERT_FALSE(CheckLoD(some_lod0));
+
+  // check with underlying tensor storage.
+  ASSERT_TRUE(CheckLoD(relative_lod, 5));
+  ASSERT_FALSE(CheckLoD(relative_lod, 9));
+}
+
+TEST(LoD, CheckAbsLoD) {
+  LoD relative_lod;
+  relative_lod.push_back(std::vector<size_t>({0, 2}));
+  relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
+  relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
+
+  auto abs_lod = ToAbsOffset(relative_lod);
+
+  ASSERT_TRUE(CheckAbsLoD(abs_lod));
+
+  // check less than 2 offsets in a level.
+
+  // check the last item should be compatible with tensor height.
+  abs_lod.back().back()++;
+  ASSERT_FALSE(CheckAbsLoD(abs_lod));
+  abs_lod.back().back()--;  // restore
+
+  // check less than 2 offsets in a lod.
+  LoD abs_lod0;
+  abs_lod0.push_back(std::vector<size_t>({0}));
+  ASSERT_FALSE(CheckAbsLoD(abs_lod0));
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
new file mode 100644
index 0000000000..1e253a2f6f
--- /dev/null
+++ b/paddle/framework/lod_tensor_test.cu
@@ -0,0 +1,51 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/assert.h"
+
+#include <gtest/gtest.h>
+
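+// Grid-stride kernel that doubles every element of `a`; used below to check
+// that the LoD buffer is visible and writable from device code.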
+__global__ void test(size_t* a, int size) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    a[i] *= 2;
+  }
+}
+
+TEST(LoDTensor, LoDInGPU) {
+  paddle::framework::LoDTensor lod_tensor;
+  paddle::platform::CUDAPlace place(0);
+
+  paddle::framework::LoD src_lod;
+  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
+
+  lod_tensor.Resize({14, 16});
+  lod_tensor.mutable_data<float>(place);
+
+  lod_tensor.set_lod(src_lod);
+  EXPECT_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
+  EXPECT_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
+
+  auto lod = lod_tensor.lod();
+
+  test<<<1, 8>>>(lod[0].data(), lod[0].size());
+  cudaDeviceSynchronize();
+
+  for (size_t i = 0; i < src_lod[0].size(); ++i) {
+    EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
+  }
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
new file mode 100644
index 0000000000..f8df2cf97a
--- /dev/null
+++ b/paddle/framework/op_desc.cc
@@ -0,0 +1,504 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_desc.h"
+#include <functional>
+#include <mutex>
+#include <unordered_map>
+#include "glog/logging.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/shape_inference.h"
+
+namespace paddle {
+namespace framework {
+
+class OpDesc;
+class BlockDesc;
+class CompileTimeInferShapeContext : public InferShapeContext {
+ public:
+  CompileTimeInferShapeContext(const OpDesc &op, const BlockDesc &block);
+
+  bool HasInput(const std::string &name) const override;
+
+  bool HasOutput(const std::string &name) const override;
+
+  bool HasInputs(const std::string &name) const override;
+
+  bool HasOutputs(const std::string &name) const override;
+
+  DDim GetInputDim(const std::string &name) const override;
+
+  void SetOutputDim(const std::string &name, const DDim &dim) override;
+
+  AttrReader Attrs() const override;
+
+  const std::vector<std::string> &Inputs(
+      const std::string &name) const override;
+
+  const std::vector<std::string> &Outputs(
+      const std::string &name) const override;
+
+  void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) const override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
+    auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
+    if (in_var->GetType() != proto::VarDesc::LOD_TENSOR) {
+      VLOG(3) << "input " << in << " is not LodTensor";
+      return;
+    }
+    PADDLE_ENFORCE_EQ(out_var->GetType(), proto::VarDesc::LOD_TENSOR,
+                      "The %d-th output of Output(%s) must be LoDTensor.", j,
+                      out);
+    out_var->SetLoDLevel(in_var->GetLoDLevel());
+  }
+
+  bool IsRuntime() const override;
+
+ protected:
+  proto::VarDesc::VarType GetVarType(const std::string &name) const override;
+
+  DDim GetDim(const std::string &name) const override;
+
+  void SetDim(const std::string &name, const DDim &dim) override;
+
+  const OpDesc &op_;
+  const BlockDesc &block_;
+};
+
+OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
+               const VariableNameMap &outputs, const AttributeMap &attrs) {
+  desc_.set_type(type);
+  inputs_ = inputs;
+  outputs_ = outputs;
+  attrs_ = attrs;
+  need_update_ = true;
+}
+
+void OpDesc::CopyFrom(const OpDesc &op_desc) {
+  desc_.set_type(op_desc.Type());
+  inputs_ = op_desc.inputs_;
+  outputs_ = op_desc.outputs_;
+  attrs_ = op_desc.attrs_;
+  need_update_ = true;
+}
+
+OpDesc::OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block)
+    : desc_(desc), need_update_(false) {
+  // restore inputs_
+  int input_size = desc_.inputs_size();
+  for (int i = 0; i < input_size; ++i) {
+    const proto::OpDesc::Var &var = desc_.inputs(i);
+    std::vector<std::string> &args = inputs_[var.parameter()];
+    int argu_size = var.arguments_size();
+    args.reserve(argu_size);
+    for (int j = 0; j < argu_size; ++j) {
+      args.push_back(var.arguments(j));
+    }
+  }
+  // restore outputs_
+  int output_size = desc_.outputs_size();
+  for (int i = 0; i < output_size; ++i) {
+    const proto::OpDesc::Var &var = desc_.outputs(i);
+    std::vector<std::string> &args = outputs_[var.parameter()];
+    int argu_size = var.arguments_size();
+    args.reserve(argu_size);
+    for (int j = 0; j < argu_size; ++j) {
+      args.push_back(var.arguments(j));
+    }
+  }
+  // restore attrs_
+  for (const proto::OpDesc::Attr &attr : desc_.attrs()) {
+    std::string attr_name = attr.name();
+    if (attr.type() != proto::AttrType::BLOCK) {
+      attrs_[attr_name] = GetAttrValue(attr);
+    } else {
+      auto bid = attr.block_idx();
+      attrs_[attr_name] = prog->MutableBlock(bid);
+    }
+  }
+  this->block_ = block;
+}
+
+proto::OpDesc *OpDesc::Proto() {
+  Flush();
+  return &desc_;
+}
+
+const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
+  auto it = inputs_.find(name);
+  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
+                 Type());
+  return it->second;
+}
+
+std::vector<std::string> OpDesc::InputArgumentNames() const {
+  std::vector<std::string> retv;
+  for (auto &ipt : this->inputs_) {
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
+  }
+  return retv;
+}
+
+void OpDesc::SetInput(const std::string &param_name,
+                      const std::vector<std::string> &args) {
+  need_update_ = true;
+  inputs_[param_name] = args;
+}
+
+const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
+  auto it = outputs_.find(name);
+  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
+                 name, Type());
+  return it->second;
+}
+
+std::vector<std::string> OpDesc::OutputArgumentNames() const {
+  std::vector<std::string> retv;
+  for (auto &ipt : this->outputs_) {
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
+  }
+  return retv;
+}
+
+void OpDesc::SetOutput(const std::string &param_name,
+                       const std::vector<std::string> &args) {
+  need_update_ = true;
+  this->outputs_[param_name] = args;
+}
+
+proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return static_cast<proto::AttrType>(it->second.which() - 1);
+}
+
+std::vector<std::string> OpDesc::AttrNames() const {
+  std::vector<std::string> retv;
+  retv.reserve(attrs_.size());
+  for (auto &attr : attrs_) {
+    retv.push_back(attr.first);
+  }
+  return retv;
+}
+
+void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
+  this->attrs_[name] = v;
+  need_update_ = true;
+}
+
+void OpDesc::SetBlockAttr(const std::string &name, BlockDesc &block) {
+  this->attrs_[name] = &block;
+  need_update_ = true;
+}
+
+void OpDesc::SetAttrMap(
+    const std::unordered_map<std::string, Attribute> &attr_map) {
+  attrs_ = attr_map;
+  need_update_ = true;
+}
+
+Attribute OpDesc::GetAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return it->second;
+}
+
+int OpDesc::GetBlockAttr(const std::string &name) const {
+  auto it = attrs_.find(name);
+  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  return boost::get<BlockDesc *>(it->second)->ID();
+}
+
+const std::unordered_map<std::string, Attribute> &OpDesc::GetAttrMap() const {
+  return attrs_;
+}
+
+void OpDesc::Rename(const std::string &old_name, const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDesc::RenameOutput(const std::string &old_name,
+                          const std::string &new_name) {
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDesc::RenameInput(const std::string &old_name,
+                         const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(proto::OpDesc::Attr *attr) : attr_(attr) {}
+  mutable proto::OpDesc::Attr *attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string &v) const { attr_->set_s(v); }
+
+  // Please refer to https://github.com/PaddlePaddle/Paddle/issues/7162
+  template <class T,
+            class = typename std::enable_if<std::is_same<bool, T>::value>::type>
+  void operator()(T b) const {
+    attr_->set_b(b);
+  }
+
+  void operator()(const std::vector<int> &v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->ID()); }
+  void operator()(int64_t v) const { attr_->set_l(v); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
+void OpDesc::Flush() {
+  if (need_update_) {
+    this->desc_.mutable_inputs()->Clear();
+    for (auto &ipt : inputs_) {
+      auto *input = desc_.add_inputs();
+      input->set_parameter(ipt.first);
+      VectorToRepeated(ipt.second, input->mutable_arguments());
+    }
+
+    this->desc_.mutable_outputs()->Clear();
+    for (auto &opt : outputs_) {
+      auto *output = desc_.add_outputs();
+      output->set_parameter(opt.first);
+      VectorToRepeated(opt.second, output->mutable_arguments());
+    }
+
+    this->desc_.mutable_attrs()->Clear();
+    for (auto &attr : attrs_) {
+      auto *attr_desc = desc_.add_attrs();
+      attr_desc->set_name(attr.first);
+      attr_desc->set_type(
+          static_cast<proto::AttrType>(attr.second.which() - 1));
+      SetAttrDescVisitor visitor(attr_desc);
+      boost::apply_visitor(visitor, attr.second);
+    }
+
+    need_update_ = false;
+  }
+}
+
+static std::once_flag init_infer_shape_funcs;
+
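+// Lazily fills OpInfo::infer_shape_ for every operator that has kernels but
+// no compile-time InferShape registered, delegating to the operator's own
+// OperatorWithKernel::InferShape.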
+static void InitInferShapeFuncs() {
+  std::call_once(init_infer_shape_funcs, [] {
+    auto &map = OpInfoMap::Instance();
+    auto &info_map = *map.mutable_map();
+
+    for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) {
+      auto op_type = kern_pair.first;
+      auto &op_info = info_map.at(op_type);
+      if (op_info.infer_shape_) {  // infer_shape has been registered.
+        continue;
+      }
+      auto op = static_cast<OperatorWithKernel *>(op_info.Creator()(
+          "", VariableNameMap{}, VariableNameMap{}, AttributeMap{}));
+      op_info.infer_shape_ = [op](InferShapeContext *ctx) {
+        op->InferShape(ctx);
+      };
+    }
+  });
+}
+
+void OpDesc::CheckAttrs() {
+  PADDLE_ENFORCE(!Type().empty(),
+                 "CheckAttrs() can not be called before type is set.");
+  auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
+  if (checker == nullptr) {
+    // The checker is not configured; such an operator may have been generated
+    // by Paddle itself rather than by users.
+    return;
+  }
+  checker->Check(attrs_);
+}
+
+void OpDesc::InferShape(const BlockDesc &block) const {
+  VLOG(3) << "CompileTime infer shape on " << Type();
+  InitInferShapeFuncs();
+  auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
+  PADDLE_ENFORCE(static_cast<bool>(infer_shape),
+                 "%s's infer_shape has not been registered", this->Type());
+  CompileTimeInferShapeContext ctx(*this, block);
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    auto inames = this->InputArgumentNames();
+    sout << " From [";
+    std::copy(inames.begin(), inames.end(),
+              std::ostream_iterator<std::string>(sout, ", "));
+    sout << "] to [";
+    auto onames = this->OutputArgumentNames();
+    std::copy(onames.begin(), onames.end(),
+              std::ostream_iterator<std::string>(sout, ", "));
+    sout << "]";
+    VLOG(10) << sout.str();
+  }
+  infer_shape(&ctx);
+}
+
+void OpDesc::InferVarType(BlockDesc *block) const {
+  auto &info = OpInfoMap::Instance().Get(this->Type());
+  if (info.infer_var_type_) {
+    info.infer_var_type_(*this, block);
+  } else {
+    // all output type is LoDTensor by default
+    VLOG(10) << this->Type()
+             << " has not registered InferVarType. Set output variables to "
+                "LOD_TENSOR";
+    for (auto &out_pair : this->outputs_) {
+      for (auto &out_var_name : out_pair.second) {
+        block->FindRecursiveOrCreateVar(out_var_name)
+            .SetType(proto::VarDesc::LOD_TENSOR);
+      }
+    }
+  }
+}
+
+CompileTimeInferShapeContext::CompileTimeInferShapeContext(
+    const OpDesc &op, const BlockDesc &block)
+    : op_(op), block_(block) {}
+
+bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
+  const std::vector<std::string> &input_names = op_.Input(name);
+  auto length = input_names.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input(%s) should have only one value, "
+                    "but it has %d now",
+                    name, length);
+  return block_.HasVarRecursive(input_names[0]);
+}
+
+bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
+  const std::vector<std::string> &output_names = op_.Output(name);
+  auto length = output_names.size();
+  if (length == 0) {
+    return false;
+  }
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Output(%s) should have only one value, "
+                    "but it has %d now",
+                    name, length);
+  return block_.HasVarRecursive(output_names[0]);
+}
+
+bool CompileTimeInferShapeContext::HasInputs(const std::string &name) const {
+  const std::vector<std::string> &input_names = op_.Input(name);
+  if (input_names.empty()) {
+    return false;
+  }
+  for (auto &input : input_names) {
+    if (!block_.HasVarRecursive(input)) return false;
+  }
+  return true;
+}
+
+bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
+  const std::vector<std::string> &output_names = op_.Output(name);
+  if (output_names.empty()) {
+    return false;
+  }
+  for (auto &output : output_names) {
+    if (!block_.HasVarRecursive(output)) return false;
+  }
+  return true;
+}
+
+DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
+  std::vector<DDim> ddims = GetInputsDim(name);
+  auto length = ddims.size();
+  PADDLE_ENFORCE_EQ(length, 1UL,
+                    "Input(%s) should have 1 value, "
+                    "but it has %d now",
+                    name, length);
+  return ddims[0];
+}
+
+void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
+                                                const DDim &dim) {
+  SetOutputsDim(name, {dim});
+}
+
+AttrReader CompileTimeInferShapeContext::Attrs() const {
+  return AttrReader(op_.GetAttrMap());
+}
+
+const std::vector<std::string> &CompileTimeInferShapeContext::Inputs(
+    const std::string &name) const {
+  return op_.Input(name);
+}
+
+const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
+    const std::string &name) const {
+  return op_.Output(name);
+}
+
+DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
+  auto var = block_.FindVarRecursive(name);
+  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  try {
+    auto shape = var->Shape();
+    if (shape.empty()) {
+      return framework::make_ddim({0UL});
+    } else {
+      return framework::make_ddim(var->Shape());
+    }
+  } catch (...) {
+    VLOG(5) << "GetDim of variable " << name << " error";
+    std::rethrow_exception(std::current_exception());
+  }
+}
+
+void CompileTimeInferShapeContext::SetDim(const std::string &name,
+                                          const DDim &dim) {
+  block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+}
+bool CompileTimeInferShapeContext::IsRuntime() const { return false; }
+
+proto::VarDesc::VarType CompileTimeInferShapeContext::GetVarType(
+    const std::string &name) const {
+  return block_.FindVarRecursive(name)->GetType();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
new file mode 100644
index 0000000000..13695cff59
--- /dev/null
+++ b/paddle/framework/op_desc.h
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/framework/var_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class BlockDesc;
+class ProgramDesc;
+class OpDesc {
+ public:
+  OpDesc() {}
+
+  OpDesc(const std::string &type, const VariableNameMap &inputs,
+         const VariableNameMap &outputs, const AttributeMap &attrs);
+
+  OpDesc(const proto::OpDesc &desc, ProgramDesc *prog, BlockDesc *block);
+
+  explicit OpDesc(BlockDesc *block) : block_(block) {}
+
+  OpDesc(const OpDesc &other, BlockDesc *block) {
+    *this = other;
+    block_ = block;
+  }
+
+  void CopyFrom(const OpDesc &op_desc);
+
+  proto::OpDesc *Proto();
+
+  std::string Type() const { return desc_.type(); }
+
+  void SetType(const std::string &type) { desc_.set_type(type); }
+
+  const std::vector<std::string> &Input(const std::string &name) const;
+
+  std::vector<std::string> InputArgumentNames() const;
+
+  void SetInput(const std::string &param_name,
+                const std::vector<std::string> &args);
+
+  const std::vector<std::string> &Output(const std::string &name) const;
+
+  std::vector<std::string> OutputArgumentNames() const;
+
+  void SetOutput(const std::string &param_name,
+                 const std::vector<std::string> &args);
+
+  bool HasAttr(const std::string &name) const {
+    return attrs_.find(name) != attrs_.end();
+  }
+
+  proto::AttrType GetAttrType(const std::string &name) const;
+
+  std::vector<std::string> AttrNames() const;
+
+  void SetAttr(const std::string &name, const Attribute &v);
+
+  void SetBlockAttr(const std::string &name, BlockDesc &block);
+
+  Attribute GetAttr(const std::string &name) const;
+
+  int GetBlockAttr(const std::string &name) const;
+
+  void Rename(const std::string &old_name, const std::string &new_name);
+
+  void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+  void RenameInput(const std::string &old_name, const std::string &new_name);
+
+  // Only used in C++.
+  const AttributeMap &GetAttrMap() const;
+
+  // Only used in C++.
+  void SetAttrMap(const AttributeMap &attr_map);
+
+  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
+
+  void SetInputMap(const VariableNameMap &input) {
+    this->inputs_ = input;
+    this->need_update_ = true;
+  }
+
+  void SetOutputMap(const VariableNameMap &output) {
+    this->outputs_ = output;
+    this->need_update_ = true;
+  }
+
+  const VariableNameMap &Inputs() const { return inputs_; }
+
+  const VariableNameMap &Outputs() const { return outputs_; }
+
+  AttributeMap *MutableAttrMap() {
+    this->need_update_ = true;
+    return &this->attrs_;
+  }
+
+  void CheckAttrs();
+
+  void InferShape(const BlockDesc &block) const;
+
+  void InferVarType(BlockDesc *block) const;
+
+  void MarkAsTarget() { desc_.set_is_target(true); }
+
+  void Flush();
+
+  BlockDesc *Block() { return this->block_; }
+
+  void SetBlock(BlockDesc *block) { this->block_ = block; }
+
+ private:
+  template <typename MapType>
+  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
+    std::vector<typename MapType::key_type> ret_val;
+    ret_val.reserve(map.size());
+    std::transform(
+        map.begin(), map.end(), std::back_inserter(ret_val),
+        [](const typename MapType::value_type &pair) { return pair.first; });
+    return ret_val;
+  }
+
+  proto::OpDesc desc_;
+  BlockDesc *block_;  // not_own
+  // input arg name => input variable names
+  VariableNameMap inputs_;
+  // output arg name => output variable names
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
+
+  // need_update_ indicates that there are local changes that have not yet
+  // been synchronized into desc_. It is set to true whenever a local change
+  // is made, and cleared by Flush().
+  bool need_update_{false};
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_info.cc b/paddle/framework/op_info.cc
new file mode 100644
index 0000000000..b520108109
--- /dev/null
+++ b/paddle/framework/op_info.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_info.h"
+
+namespace paddle {
+namespace framework {
+
+static OpInfoMap* g_op_info_map = nullptr;
+
+OpInfoMap& OpInfoMap::Instance() {
+  if (g_op_info_map == nullptr) {
+    g_op_info_map = new OpInfoMap();
+  }
+  return *g_op_info_map;
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
new file mode 100644
index 0000000000..d9b89f9cac
--- /dev/null
+++ b/paddle/framework/op_info.h
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include <string>
+#include <unordered_map>
+
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+
+class InferShapeBase {
+ public:
+  virtual ~InferShapeBase() = default;
+  virtual void operator()(InferShapeContext*) const = 0;
+};
+
+struct OpInfo {
+  OpCreator creator_;
+  GradOpMakerFN grad_op_maker_;
+  proto::OpProto* proto_{nullptr};
+  OpAttrChecker* checker_{nullptr};
+  InferVarTypeFN infer_var_type_;
+  InferShapeFN infer_shape_;
+
+  bool HasOpProtoAndChecker() const {
+    return proto_ != nullptr && checker_ != nullptr;
+  }
+
+  const proto::OpProto& Proto() const {
+    PADDLE_ENFORCE_NOT_NULL(proto_, "Operator Proto has not been registered");
+    PADDLE_ENFORCE(proto_->IsInitialized(),
+                   "Operator Proto must be initialized in op info");
+    return *proto_;
+  }
+
+  const OpCreator& Creator() const {
+    PADDLE_ENFORCE_NOT_NULL(creator_,
+                            "Operator Creator has not been registered");
+    return creator_;
+  }
+
+  const GradOpMakerFN& GradOpMaker() const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
+    return grad_op_maker_;
+  }
+
+  const OpAttrChecker* Checker() const { return checker_; }
+};
+
+class OpInfoMap {
+ public:
+  static OpInfoMap& Instance();
+
+  bool Has(const std::string& op_type) const {
+    return map_.find(op_type) != map_.end();
+  }
+
+  void Insert(const std::string& type, const OpInfo& info) {
+    PADDLE_ENFORCE(!Has(type), "Operator %s has been registered", type);
+    map_.insert({type, info});
+  }
+
+  const OpInfo& Get(const std::string& type) const {
+    auto op_info_ptr = GetNullable(type);
+    PADDLE_ENFORCE_NOT_NULL(op_info_ptr, "Operator %s has not been registered",
+                            type);
+    return *op_info_ptr;
+  }
+
+  const OpInfo* GetNullable(const std::string& type) const {
+    auto it = map_.find(type);
+    if (it == map_.end()) {
+      return nullptr;
+    } else {
+      return &it->second;
+    }
+  }
+
+  const std::unordered_map<std::string, OpInfo>& map() const { return map_; }
+
+  std::unordered_map<std::string, OpInfo>* mutable_map() { return &map_; }
+
+ private:
+  OpInfoMap() = default;
+  std::unordered_map<std::string, OpInfo> map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
new file mode 100644
index 0000000000..44adb94d2a
--- /dev/null
+++ b/paddle/framework/op_kernel_type.h
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_layout.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/library_type.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+struct OpKernelType {
+  struct Hash {
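+    // Pack the four fields into disjoint byte ranges of a single int and
+    // hash the sum; each field is assumed to take fewer than 2^8 values.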
+    size_t operator()(const OpKernelType& key) const {
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
+      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
+      int library_type = static_cast<int>(key.library_type_)
+                         << (LEFT_SHIFT * 3);
+
+      std::hash<int> hasher;
+      return hasher(place + data_type + data_layout + library_type);
+    }
+  };
+
+  // The number of place, data_type, data_layout, and library_type kinds
+  // is each less than 2^8.
+  constexpr static int LEFT_SHIFT = 8;
+
+  proto::DataType data_type_;
+  DataLayout data_layout_;
+  platform::Place place_;
+  LibraryType library_type_;
+
+  OpKernelType(proto::DataType data_type, platform::Place place,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(place),
+        library_type_(library_type) {}
+
+  OpKernelType(proto::DataType data_type,
+               const platform::DeviceContext& dev_ctx,
+               DataLayout data_layout = DataLayout::kAnyLayout,
+               LibraryType library_type = LibraryType::kPlain)
+      : data_type_(data_type),
+        data_layout_(data_layout),
+        place_(dev_ctx.GetPlace()),
+        library_type_(library_type) {}
+
+  bool operator==(const OpKernelType& o) const {
+    return platform::places_are_same_class(place_, o.place_) &&
+           data_type_ == o.data_type_ && data_layout_ == o.data_layout_ &&
+           library_type_ == o.library_type_;
+  }
+
+  bool operator!=(const OpKernelType& o) const { return !(*this == o); }
+};
+
+inline std::ostream& operator<<(std::ostream& os,
+                                const OpKernelType& kernel_key) {
+  os << "data_type[" << kernel_key.data_type_ << "]:data_layout["
+     << kernel_key.data_layout_ << "]:place[" << kernel_key.place_
+     << "]:library_type[" << kernel_key.library_type_ << "]";
+  return os;
+}
+
+inline std::string KernelTypeToString(const OpKernelType& kernel_key) {
+  std::ostringstream stream;
+  stream << kernel_key;
+  return stream.str();
+}
+
+inline bool NeedTransformLayout(const DataLayout& l, const DataLayout& r) {
+  return l != DataLayout::kAnyLayout && r != DataLayout::kAnyLayout && l != r;
+}
+
+inline bool TransFromNeeded(const OpKernelType& l, const OpKernelType& r) {
+  return (!platform::places_are_same_class(l.place_, r.place_)) ||
+         (l.data_type_ != r.data_type_) ||
+         NeedTransformLayout(l.data_layout_, r.data_layout_);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc
new file mode 100644
index 0000000000..cb23bbde01
--- /dev/null
+++ b/paddle/framework/op_kernel_type_test.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_kernel_type.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+TEST(OpKernelType, ToString) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+
+  OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                              LibraryType::kCUDNN);
+
+  ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type),
+            "data_type[float32]:data_layout[NCHW]:place[CPUPlace]:library_type["
+            "CUDNN]");
+}
+
+TEST(OpKernelType, Hash) {
+  using OpKernelType = paddle::framework::OpKernelType;
+  using DataType = paddle::framework::proto::DataType;
+  using CPUPlace = paddle::platform::CPUPlace;
+  using CUDAPlace = paddle::platform::CUDAPlace;
+  using DataLayout = paddle::framework::DataLayout;
+  using LibraryType = paddle::framework::LibraryType;
+
+  OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+  OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW,
+                                LibraryType::kCUDNN);
+
+  OpKernelType::Hash hasher;
+  ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2));
+}
diff --git a/paddle/framework/op_proto_maker.cc b/paddle/framework/op_proto_maker.cc
new file mode 100644
index 0000000000..151d61d5b1
--- /dev/null
+++ b/paddle/framework/op_proto_maker.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+
+void OpProtoAndCheckerMaker::Validate() {
+  validated_ = true;
+  CheckNoDuplicatedInOutAttrs();
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddInput(
+    const std::string& name, const std::string& comment) {
+  auto* input = proto_->add_inputs();
+  input->set_name(name);
+  input->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{input};
+}
+
+OpProtoAndCheckerMaker::VariableBuilder OpProtoAndCheckerMaker::AddOutput(
+    const std::string& name, const std::string& comment) {
+  auto* output = proto_->add_outputs();
+  output->set_name(name);
+  output->set_comment(comment);
+  return OpProtoAndCheckerMaker::VariableBuilder{output};
+}
+
+void OpProtoAndCheckerMaker::CheckNoDuplicatedInOutAttrs() {
+  std::unordered_set<std::string> names;
+  auto checker = [&](const std::string& name) {
+    PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+    names.insert(name);
+  };
+  for (auto& attr : proto_->attrs()) {
+    checker(attr.name());
+  }
+  for (auto& input : proto_->inputs()) {
+    checker(input.name());
+  }
+  for (auto& output : proto_->outputs()) {
+    checker(output.name());
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h
new file mode 100644
index 0000000000..efd3a5ca53
--- /dev/null
+++ b/paddle/framework/op_proto_maker.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+// This class not only builds the op proto but also initializes the attribute checkers.
+class OpProtoAndCheckerMaker {
+ public:
+  using OpProto = proto::OpProto;
+  using OpAttrChecker = framework::OpAttrChecker;
+  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : proto_(proto), op_checker_(op_checker) {}
+
+  virtual ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+
+  void Validate();
+
+ protected:
+  struct VariableBuilder {
+    OpProto::Var* var_;
+
+    VariableBuilder& AsDuplicable() {
+      var_->set_duplicable(true);
+      return *this;
+    }
+
+    VariableBuilder& AsIntermediate() {
+      var_->set_intermediate(true);
+      return *this;
+    }
+
+    VariableBuilder& AsDispensable() {
+      var_->set_dispensable(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name, const std::string& comment);
+
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment);
+
+  template <typename T>
+  TypedAttrChecker<T>& AddAttr(const std::string& name,
+                               const std::string& comment,
+                               bool generated = false) {
+    auto* attr = proto_->add_attrs();
+    attr->set_name(name);
+    attr->set_comment(comment);
+    attr->set_generated(generated);
+    attr->set_type(AttrTypeID<T>());
+    return op_checker_->AddAttrChecker<T>(name);
+  }
+
+  void AddComment(const std::string& comment) { proto_->set_comment(comment); }
+
+ private:
+  void CheckNoDuplicatedInOutAttrs();
+
+  OpProto* proto_;
+  OpAttrChecker* op_checker_;
+  bool validated_{false};
+};
+
+class NOPMaker : public OpProtoAndCheckerMaker {
+ public:
+  NOPMaker(OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {}
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc
new file mode 100644
index 0000000000..f16cb6fa3a
--- /dev/null
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_proto_maker.h"
+
+#include "gtest/gtest.h"
+
+class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  TestAttrProtoMaker(paddle::framework::proto::OpProto* proto,
+                     paddle::framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<float>("scale", "scale of test op");
+    AddAttr<float>("scale", "scale of test op");
+  }
+};
+
+TEST(ProtoMaker, DuplicatedAttr) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+}
+
+class TestInOutProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
+ public:
+  TestInOutProtoMaker(paddle::framework::proto::OpProto* proto,
+                      paddle::framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddInput("input", "input of test op");
+  }
+};
+
+TEST(ProtoMaker, DuplicatedInOut) {
+  paddle::framework::proto::OpProto op_proto;
+  paddle::framework::OpAttrChecker op_checker;
+  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+}
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
new file mode 100644
index 0000000000..dfa151316d
--- /dev/null
+++ b/paddle/framework/op_registry.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/framework/op_registry.h>
+
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const std::string& type, const VariableNameMap& inputs,
+    const VariableNameMap& outputs, AttributeMap attrs) {
+  auto& info = OpInfoMap::Instance().Get(type);
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(attrs);
+  }
+  auto op = info.Creator()(type, inputs, outputs, attrs);
+  return std::unique_ptr<OperatorBase>(op);
+}
+
+static VariableNameMap ConvertOpDescVarsToVarNameMap(
+    const google::protobuf::RepeatedPtrField<proto::OpDesc::Var>&
+        op_desc_vars) {
+  VariableNameMap ret_val;
+  for (auto& var : op_desc_vars) {
+    auto& var_names = ret_val[var.parameter()];
+    auto& var_names_in_proto = var.arguments();
+    var_names.reserve(static_cast<size_t>(var_names_in_proto.size()));
+    std::copy(var_names_in_proto.begin(), var_names_in_proto.end(),
+              std::back_inserter(var_names));
+  }
+  return ret_val;
+}
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
+    const proto::OpDesc& op_desc) {
+  VLOG(1) << "CreateOp directly from OpDesc is deprecated. It should only be"
+             "used in unit tests. Use CreateOp(const OpDesc& op_desc) "
+             "instead.";
+  VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
+  VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
+  AttributeMap attrs;
+  for (auto& attr : op_desc.attrs()) {
+    attrs[attr.name()] = GetAttrValue(attr);
+  }
+
+  return CreateOp(op_desc.type(), inputs, outputs, attrs);
+}
+
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
+                  op_desc.GetAttrMap());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
new file mode 100644
index 0000000000..5de9ae559c
--- /dev/null
+++ b/paddle/framework/op_registry.h
@@ -0,0 +1,246 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <type_traits>
+#include <typeinfo>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "glog/logging.h"  // For VLOG()
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/details/op_registry.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/shape_inference.h"
+
+namespace paddle {
+namespace framework {
+class Registrar {
+ public:
+  // In our design, various kinds of classes, e.g., operators and kernels,
+  // have their corresponding registry and registrar. The action of
+  // registration happens in the constructor of a global registrar variable,
+  // which is not referenced by the code that uses the framework package and
+  // would therefore be stripped from the generated binary by the linker. To
+  // avoid such removal, we add Touch to all registrar classes and make the
+  // USE_OP macros call this method. So, as long as the calling code invokes
+  // USE_OP, the global registrar variable won't be removed by the linker.
+  void Touch() {}
+};
+
+template <typename... ARGS>
+struct OperatorRegistrar : public Registrar {
+  explicit OperatorRegistrar(const char* op_type) {
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
+                   "'%s' is registered more than once.", op_type);
+    static_assert(sizeof...(ARGS) != 0,
+                  "OperatorRegistrar should be invoked at least by OpClass");
+    OpInfo info;
+    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
+    OpInfoMap::Instance().Insert(op_type, info);
+  }
+};
+
+class OpRegistry {
+ public:
+  static std::unique_ptr<OperatorBase> CreateOp(const std::string& type,
+                                                const VariableNameMap& inputs,
+                                                const VariableNameMap& outputs,
+                                                AttributeMap attrs);
+
+  static std::unique_ptr<OperatorBase> CreateOp(const proto::OpDesc& op_desc);
+
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
+};
+
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor;
+
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+  using KERNEL_TYPE =
+      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
+
+  void operator()(const char* op_type, const char* library_type) const {
+    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(),
+                     DataLayout::kAnyLayout, StringToLibraryType(library_type));
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
+
+    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
+    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
+        func;
+    func(op_type, library_type);
+  }
+};
+
+template <typename PlaceType, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
+  void operator()(const char* op_type, const char* library_type) const {}
+};
+
+// Users can register many kernels in one place; their data types may differ.
+template <typename PlaceType, typename... KernelType>
+class OpKernelRegistrar : public Registrar {
+ public:
+  explicit OpKernelRegistrar(const char* op_type, const char* library_type) {
+    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
+    func(op_type, library_type);
+  }
+};
+
+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
+#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+/*
+  The variadic arguments should be class types derived from one of the
+  following classes:
+    OpProtoAndCheckerMaker
+    GradOpDescMakerBase
+    VarTypeInference
+    InferShapeBase
+*/
+#define REGISTER_OPERATOR(op_type, op_class, ...)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_op__##op_type,                                             \
+      "REGISTER_OPERATOR must be called in global namespace");         \
+  class _OpClass_##op_type##_ : public op_class {                      \
+   public:                                                             \
+    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                     \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);            \
+  };                                                                   \
+  static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
+                                                ##__VA_ARGS__>         \
+      __op_registrar_##op_type##__(#op_type);                          \
+  int TouchOpRegistrar_##op_type() {                                   \
+    __op_registrar_##op_type##__.Touch();                              \
+    return 0;                                                          \
+  }
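+
+// A minimal usage sketch (hypothetical names, for illustration only):
+//
+//   REGISTER_OPERATOR(my_op, MyOp, MyOpMaker);
+//
+// This expands to a global OperatorRegistrar for the wrapped
+// _OpClass_my_op_ class plus a TouchOpRegistrar_my_op() function that the
+// USE_OP family of macros references to keep this object file linked in.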
+
+/**
+ * Macro to register an Operator. When the input is duplicable, you should
+ * use REGISTER_OP_EX with drop_empty_grad=false instead.
+ */
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, \
+                    grad_op_class)                                   \
+  REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,    \
+                 grad_op_class, true)
+
+// When an argument is duplicable, we need to use this version.
+// Perhaps we can omit the DropEmptyIG template parameter and
+// keep only one version of REGISTER_OP.
+#define REGISTER_OP_EX(op_type, op_class, op_maker_class, grad_op_type,       \
+                       grad_op_class, drop_empty_grad)                        \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                             \
+  class _GradOpDescMaker_##grad_op_type##_                                    \
+      : public ::paddle::framework::DefaultGradOpDescMaker<drop_empty_grad> { \
+    using ::paddle::framework::DefaultGradOpDescMaker<                        \
+        drop_empty_grad>::DefaultGradOpDescMaker;                             \
+                                                                              \
+   protected:                                                                 \
+    virtual std::string GradOpType() const { return #grad_op_type; }          \
+  };                                                                          \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_,    \
+                    op_maker_class);
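+
+// Sketch of how the two macros relate (hypothetical names): registering a
+// forward op together with its gradient,
+//
+//   REGISTER_OP(my_op, MyOp, MyOpMaker, my_op_grad, MyGradOp);
+//
+// first registers my_op_grad via REGISTER_OPERATOR, then registers my_op
+// with a generated DefaultGradOpDescMaker<true> whose GradOpType() reports
+// "my_op_grad".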
+
+#define REGISTER_OP_WITH_KERNEL(op_type, ...)                         \
+  REGISTER_OPERATOR(op_type, ::paddle::framework::OperatorWithKernel, \
+                    ##__VA_ARGS__)
+
+#define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)
+
+/**
+ * Macro to register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(op_type, LIBRARY_TYPE, place_class, ...)        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                          \
+      __reg_op_kernel_##op_type##_##LIBRARY_TYPE##__,                      \
+      "REGISTER_OP_KERNEL must be called in global namespace");            \
+  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__>  \
+      __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__(#op_type,       \
+                                                           #LIBRARY_TYPE); \
+  int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE() {                \
+    __op_kernel_registrar_##op_type##_##LIBRARY_TYPE##__.Touch();          \
+    return 0;                                                              \
+  }
+
+#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__)
+
+#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
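+
+// Usage sketch (hypothetical kernel class): one statement can register
+// kernels for several data types,
+//
+//   REGISTER_OP_CPU_KERNEL(my_op,
+//                          MyOpKernel<platform::CPUDeviceContext, float>,
+//                          MyOpKernel<platform::CPUDeviceContext, double>);
+//
+// Each listed kernel becomes one (OpKernelType -> kernel) entry under
+// OperatorWithKernel::AllOpKernels()["my_op"].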
+
+/**
+ * Macros to mark which Operators and Kernels
+ * we will use, and to tell the compiler to
+ * link them into the target.
+ */
+#define USE_OP_ITSELF(op_type)                                    \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
+      __use_op_itself_##op_type,                                  \
+      "USE_OP_ITSELF must be called in global namespace");        \
+  extern int TouchOpRegistrar_##op_type();                        \
+  static int use_op_itself_##op_type##_ __attribute__((unused)) = \
+      TouchOpRegistrar_##op_type()
+
+#define USE_OP_DEVICE_KERNEL(op_type, LIBRARY_TYPE)               \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
+      __use_op_kernel_##op_type##_##LIBRARY_TYPE##__,             \
+      "USE_OP_DEVICE_KERNEL must be in global namespace");        \
+  extern int TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE(); \
+  static int use_op_kernel_##op_type##_##LIBRARY_TYPE##_          \
+      __attribute__((unused)) =                                   \
+          TouchOpKernelRegistrar_##op_type##_##LIBRARY_TYPE()
+
+// TODO(fengjiayi): The following macros
+// seem ugly; do we have a better method?
+
+#ifndef PADDLE_WITH_CUDA
+#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
+#else
+#define USE_OP_KERNEL(op_type)        \
+  USE_OP_DEVICE_KERNEL(op_type, CPU); \
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
+#endif
+
+#define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type);
+
+#define USE_CPU_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);        \
+  USE_OP_DEVICE_KERNEL(op_type, CPU);
+
+#define USE_CUDA_ONLY_OP(op_type) \
+  USE_OP_ITSELF(op_type);         \
+  USE_OP_DEVICE_KERNEL(op_type, CUDA)
+
+#define USE_OP(op_type)   \
+  USE_OP_ITSELF(op_type); \
+  USE_OP_KERNEL(op_type)
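+
+// Usage sketch (hypothetical op name): a file that creates operators by name,
+// e.g. through OpRegistry::CreateOp, pulls in their registration with
+//
+//   USE_OP(my_op);
+//
+// which declares the extern Touch* functions and initializes unused statics
+// from them, so the linker cannot drop the object file that registered the
+// operator and its kernels.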
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
new file mode 100644
index 0000000000..341da8befd
--- /dev/null
+++ b/paddle/framework/op_registry_test.cc
@@ -0,0 +1,373 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/framework/op_registry.h"
+
+namespace pd = paddle::framework;
+
+namespace paddle {
+namespace framework {
+
+class CosineOp : public OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+  void Run(const Scope& scope, const platform::Place& place) const override {}
+};
+
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .GreaterThan(0.0);
+    AddComment("This is cos op");
+  }
+};
+
+class MyTestOp : public OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+  void Run(const Scope& scope, const platform::Place& place) const override {}
+};
+
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op").AsDuplicable();
+    AddOutput("output", "output of cosine op").AsIntermediate();
+    auto my_checker = [](int i) {
+      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+    };
+    AddAttr<int>("test_attr", "a simple test attribute")
+        .AddCustomChecker(my_checker);
+    AddComment("This is my_test op");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+static void BuildVar(const std::string& param_name,
+                     std::initializer_list<const char*> arguments,
+                     paddle::framework::proto::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    var->add_arguments(arg_name);
+  }
+}
+REGISTER_OP_WITHOUT_GRADIENT(cos_sim, paddle::framework::CosineOp,
+                             paddle::framework::CosineOpProtoAndCheckerMaker);
+REGISTER_OP_WITHOUT_GRADIENT(my_test_op, paddle::framework::MyTestOp,
+                             paddle::framework::MyTestOpProtoAndCheckerMaker);
+
+TEST(OpRegistry, CreateOp) {
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());
+
+  float scale = 3.3;
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
+  attr->set_f(scale);
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
+  float scale_get = op->Attr<float>("scale");
+  ASSERT_EQ(scale_get, scale);
+}
+
+TEST(OpRegistry, IllegalAttr) {
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
+  attr->set_f(-2.0);
+
+  bool caught = false;
+  try {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (const paddle::platform::EnforceNotMet& err) {
+    caught = true;
+    std::string msg = "larger_than check fail";
+    const char* err_msg = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+}
+
+TEST(OpRegistry, DefaultValue) {
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  BuildVar("input", {"aa"}, op_desc.add_inputs());
+  BuildVar("output", {"bb"}, op_desc.add_outputs());
+
+  ASSERT_TRUE(op_desc.IsInitialized());
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace cpu_place;
+  op->Run(scope, cpu_place);
+  ASSERT_EQ(op->Attr<float>("scale"), 1.0);
+}
+
+TEST(OpRegistry, CustomChecker) {
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("my_test_op");
+  BuildVar("input", {"ii"}, op_desc.add_inputs());
+  BuildVar("output", {"oo"}, op_desc.add_outputs());
+
+  // attr 'test_attr' is not set
+  bool caught = false;
+  try {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (const paddle::platform::EnforceNotMet& err) {
+    caught = true;
+    std::string msg = "Attribute 'test_attr' is required!";
+    const char* err_msg = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+
+  // set 'test_attr' to an illegal value
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("test_attr");
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(3);
+  caught = false;
+  try {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (const paddle::platform::EnforceNotMet& err) {
+    caught = true;
+    std::string msg = "'test_attr' must be even!";
+    const char* err_msg = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+
+  // set 'test_attr' to a legal value
+  op_desc.mutable_attrs()->Clear();
+  attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("test_attr");
+  attr->set_type(paddle::framework::proto::AttrType::INT);
+  attr->set_i(4);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+  op->Run(scope, cpu_place);
+  int test_attr = op->Attr<int>("test_attr");
+  ASSERT_EQ(test_attr, 4);
+}
+
+class CosineOpComplete : public paddle::framework::CosineOp {
+ public:
+  DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp);
+  DEFINE_OP_CLONE_METHOD(CosineOpComplete);
+};
+
+TEST(OperatorRegistrar, Test) {
+  using namespace paddle::framework;
+  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+}
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
+  }
+};
+
+template <typename DeviceContext, typename T>
+class OpKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {}
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(op_with_kernel,
+                             paddle::framework::OpWithKernelTest,
+                             paddle::framework::OpKernelTestMaker);
+REGISTER_OP_CPU_KERNEL(
+    op_with_kernel,
+    paddle::framework::OpKernelTest<paddle::platform::CPUDeviceContext, float>);
+
+REGISTER_OP_CUDA_KERNEL(op_with_kernel,
+                        paddle::framework::OpKernelTest<
+                            paddle::platform::CUDADeviceContext, float>);
+
+TEST(OperatorRegistrar, CPU) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cpu_place);
+}
+
+TEST(OperatorRegistrar, CUDA) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CUDAPlace cuda_place(0);
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  op->Run(scope, cuda_place);
+}
+
+static int op_test_value = 0;
+
+using paddle::platform::DeviceContext;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CUDADeviceContext;
+
+namespace paddle {
+namespace framework {
+
+class OpWithMultiKernelTest : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(InferShapeContext* ctx) const override {}
+
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout,
+        framework::LibraryType::kCUDNN);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const;
+};
+
+template <typename T>
+class OpMultiKernelTest<CPUDeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {
+    ++op_test_value;
+  }
+};
+
+template <typename T>
+class OpMultiKernelTest<CUDADeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {
+    --op_test_value;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const;
+};
+
+template <typename T>
+class OpMultiKernelTest2<CPUDeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {
+    op_test_value += 10;
+  }
+};
+
+template <typename T>
+class OpMultiKernelTest2<CUDADeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const {
+    op_test_value -= 10;
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
+                             paddle::framework::OpWithMultiKernelTest,
+                             paddle::framework::OpKernelTestMaker);
+REGISTER_OP_KERNEL(
+    op_with_multi_kernel, CPU, paddle::platform::CPUPlace,
+    paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
+REGISTER_OP_KERNEL(
+    op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace,
+    paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
+REGISTER_OP_KERNEL(
+    op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace,
+    paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
+REGISTER_OP_KERNEL(
+    op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace,
+    paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
+
+TEST(OperatorRegistrar, OpWithMultiKernel) {
+  paddle::framework::proto::OpDesc op_desc;
+  paddle::platform::CUDAPlace cuda_place(0);
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  op_desc.set_type("op_with_multi_kernel");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+  // TODO(qiao) add priority back
+  // use all available kernels
+  op->Run(scope, cuda_place);
+  EXPECT_EQ(op_test_value, -10);
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
new file mode 100644
index 0000000000..4e854f54dd
--- /dev/null
+++ b/paddle/framework/operator.cc
@@ -0,0 +1,578 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gflags/gflags.h>
+#include <glog/logging.h>
+
+#include <algorithm>
+
+#include "paddle/framework/data_transform.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/shape_inference.h"
+#include "paddle/framework/var_type.h"
+
+DECLARE_bool(benchmark);
+
+namespace paddle {
+namespace framework {
+
+std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
+    std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
+    std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
+    std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN),
+    std::make_tuple(platform::CPUPlace(), LibraryType::kPlain),
+};
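+
+// Highest priority first: CUDA + cuDNN, plain CUDA, CPU + MKLDNN, plain CPU.
+// Kernel selection is meant to fall back down this list; the wiring is still
+// pending (see the TODO in OperatorWithKernel::Run).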
+
+static DDim GetDims(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  if (var == nullptr) {
+    return DDim({-1});
+  }
+
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().dims();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->Get<SelectedRows>().GetCompleteDims();
+  } else {
+    return DDim({-1});
+  }
+}
+
+static LoD GetLoD(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  auto default_lod = LoD({{}});
+
+  if (var == nullptr) {
+    return default_lod;
+  }
+
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().lod();
+  } else {
+    return default_lod;
+  }
+}
+
+std::string OperatorBase::Input(const std::string& name) const {
+  auto& ins = Inputs(name);
+  PADDLE_ENFORCE_LE(ins.size(), 1UL,
+                    "Operator %s's input %s should contain only one variable.",
+                    type_, name);
+  return ins.empty() ? kEmptyVarName : ins[0];
+}
+
+const std::vector<std::string>& OperatorBase::Inputs(
+    const std::string& name) const {
+  auto it = inputs_.find(name);
+  PADDLE_ENFORCE(it != inputs_.end(), "Operator %s does not have the input %s.",
+                 type_, name);
+  return it->second;
+}
+
+std::string OperatorBase::Output(const std::string& name) const {
+  auto& outs = Outputs(name);
+  PADDLE_ENFORCE_LE(outs.size(), 1UL,
+                    "Operator %s's output %s should contain only one variable.",
+                    type_, name);
+  return outs.empty() ? kEmptyVarName : outs[0];
+}
+
+const std::vector<std::string>& OperatorBase::Outputs(
+    const std::string& name) const {
+  auto it = outputs_.find(name);
+  PADDLE_ENFORCE(it != outputs_.end(),
+                 "Operator %s does not have an output called %s.", type_, name);
+  return it->second;
+}
+
+std::string OperatorBase::DebugStringEx(const Scope* scope) const {
+  std::stringstream ss;
+  ss << "Op(" << type_ << "), inputs:{";
+  for (auto it = inputs_.begin(); it != inputs_.end();) {
+    auto& input = *it;
+    ss << input.first << "[";
+    for (size_t i = 0; i < input.second.size(); ++i) {
+      ss << input.second[i];
+      if (scope) {
+        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
+      }
+      if (i != input.second.size() - 1) {
+        ss << ", ";
+      }
+    }
+    ss << "]";
+    ++it;
+    if (it != inputs_.end()) {
+      ss << ", ";
+    }
+  }
+  ss << "}, outputs:{";
+  for (auto it = outputs_.begin(); it != outputs_.end();) {
+    auto& output = *it;
+    ss << output.first << "[";
+    for (size_t i = 0; i < output.second.size(); ++i) {
+      ss << output.second[i];
+      if (scope) {
+        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
+      }
+      if (i != output.second.size() - 1) {
+        ss << ", ";
+      }
+    }
+    ss << "]";
+    ++it;
+    if (it != outputs_.end()) {
+      ss << ", ";
+    }
+  }
+  ss << "}.";
+  return ss.str();
+}
+
+void OperatorBase::Rename(const std::string& old_name,
+                          const std::string& new_name) {
+  for (auto& input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto& output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+}
+
+OperatorBase::OperatorBase(const std::string& type,
+                           const VariableNameMap& inputs,
+                           const VariableNameMap& outputs,
+                           const AttributeMap& attrs)
+    : type_(type), inputs_(inputs), outputs_(outputs), attrs_(attrs) {
+  GenerateTemporaryNames();
+  CheckAllInputOutputSet();
+}
+
+std::vector<std::string> OperatorBase::InputVars() const {
+  std::vector<std::string> ret_val;
+  for (auto& o : inputs_) {
+    ret_val.reserve(ret_val.size() + o.second.size());
+    ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
+  }
+  return ret_val;
+}
+
+std::vector<std::string> OperatorBase::OutputVars(bool has_intermediate) const {
+  std::vector<std::string> ret_val;
+  if (has_intermediate) {
+    // push all outputs into ret_val
+    for (auto& o : outputs_) {
+      ret_val.reserve(ret_val.size() + o.second.size());
+      ret_val.insert(ret_val.end(), o.second.begin(), o.second.end());
+    }
+    return ret_val;
+  }
+  auto& info = OpInfoMap::Instance().Get(Type());
+
+  // get all OpProto::Var for outputs
+  for (auto& o : info.Proto().outputs()) {
+    // ignore all intermediate output
+    if (o.intermediate()) continue;
+    auto out = outputs_.find(o.name());
+    if (out != outputs_.end()) {
+      ret_val.reserve(ret_val.size() + out->second.size());
+      ret_val.insert(ret_val.end(), out->second.begin(), out->second.end());
+    }
+  }
+  return ret_val;
+}
+
+void OperatorBase::CheckAllInputOutputSet() const {
+  auto& info_map = OpInfoMap::Instance();
+  auto* op_info = info_map.GetNullable(Type());
+  if (op_info == nullptr || op_info->proto_ == nullptr) return;
+
+  for (auto& in : op_info->Proto().inputs()) {
+    PADDLE_ENFORCE(inputs_.find(in.name()) != inputs_.end(),
+                   "Type %s's input %s is not set", Type(), in.name());
+  }
+
+  for (auto& out : op_info->Proto().outputs()) {
+    PADDLE_ENFORCE(outputs_.find(out.name()) != outputs_.end(),
+                   "Type %s's output %s is not set", Type(), out.name());
+  }
+}
+
+void OperatorBase::GenerateTemporaryNames() {
+  static std::atomic<size_t> gUniqId(0UL);
+  for (auto& output : outputs_) {
+    for (auto& output_name : output.second) {
+      if (output_name == kTempVarName) {
+        output_name += type_;
+        output_name += "@";
+        output_name += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+}
+
+static bool VarIsTensor(const Variable* var) {
+  return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
+}
+
+static const Tensor* GetTensorFromVar(Variable* var) {
+  if (var->IsType<LoDTensor>()) {
+    return var->GetMutable<LoDTensor>();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
+                 var->Type().name());
+  }
+}
+
+static Tensor* GetMutableTensorFromVar(Variable* var) {
+  if (var->IsType<LoDTensor>()) {
+    return var->GetMutable<LoDTensor>();
+  } else if (var->IsType<SelectedRows>()) {
+    return var->GetMutable<SelectedRows>()->mutable_value();
+  } else {
+    PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
+                 var->Type().name());
+  }
+}
+
+template <>
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
+  auto* var = InputVar(name);
+  return var == nullptr ? nullptr
+                        : GetTensorFromVar(const_cast<Variable*>(var));
+}
+
+template <>
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
+    const std::string& name) const {
+  auto names = op().Inputs(name);
+  std::vector<const Tensor*> res;
+  res.reserve(names.size());
+  std::transform(names.begin(), names.end(), std::back_inserter(res),
+                 [&](const std::string& sub_name) {
+                   auto var = scope_.FindVar(sub_name);
+                   return var == nullptr ? nullptr : GetTensorFromVar(var);
+                 });
+  return res;
+}
+
+template <>
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
+  auto var = OutputVar(name);
+  return var == nullptr ? nullptr : GetMutableTensorFromVar(var);
+}
+
+template <>
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+    const std::string& name) const {
+  auto names = op().Outputs(name);
+  std::vector<Tensor*> res;
+  res.reserve(names.size());
+  std::transform(names.begin(), names.end(), std::back_inserter(res),
+                 [&](const std::string& sub_name) {
+                   auto var = scope_.FindVar(sub_name);
+                   return var == nullptr ? nullptr
+                                         : GetMutableTensorFromVar(var);
+                 });
+  return res;
+}
+
+bool OpSupportGPU(const std::string& op_type) {
+  auto& all_kernels = OperatorWithKernel::AllOpKernels();
+  auto it = all_kernels.find(op_type);
+  if (it == all_kernels.end()) {
+    // All control operators must support GPU.
+    return true;
+  }
+  for (auto& kern_pair : it->second) {
+    if (platform::is_gpu_place(kern_pair.first.place_)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+class RuntimeInferShapeContext : public InferShapeContext {
+ public:
+  RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}
+
+  bool HasInput(const std::string& name) const override {
+    auto& ins = Inputs(name);
+    size_t length = ins.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL, "Input %s should have more than one inputs",
+                      name);
+    auto ipt = ins[0];
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  bool HasOutput(const std::string& name) const override {
+    auto& outs = Outputs(name);
+    size_t length = outs.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL, "Output %s should have more than one inputs",
+                      name);
+    auto ipt = outs[0];
+    auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+    return var != nullptr;
+  }
+
+  bool HasInputs(const std::string& name) const override {
+    auto inputs = op_.Inputs(name);
+    if (inputs.empty()) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const override {
+    auto outputs = op_.Outputs(name);
+    if (outputs.empty()) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
+    return GetDim(op_.Input(name));
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    SetDim(op_.Output(name), dim);
+  }
+
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Inputs(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Outputs(name);
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
+
+    // TODO(dzhwinter) : reuse ShareLoD in most operators.
+    // Need to call ShareLayout explicitly in sequence-related ops.
+    // Shall we have a better method to share info between in/out Tensors?
+    out_tensor->set_layout(in_tensor.layout());
+  }
+
+  void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+                   size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+    Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_layout(in_tensor.layout());
+  }
+
+  bool IsRuntime() const override { return true; }
+
+ protected:
+  DDim GetDim(const std::string& name) const override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      return var->Get<LoDTensor>().dims();
+    } else if (var->IsType<SelectedRows>()) {
+      return var->Get<SelectedRows>().GetCompleteDims();
+    } else {
+      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+                   name, var->Type().name());
+    }
+  }
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    Variable* var = scope_.FindVar(name);
+    if (var->IsType<LoDTensor>()) {
+      var->GetMutable<LoDTensor>()->Resize(dim);
+    } else if (var->IsType<SelectedRows>()) {
+      var->GetMutable<SelectedRows>()->set_height(dim[0]);
+    } else {
+      PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+                   name, var->Type().name());
+    }
+  }
+
+  proto::VarDesc::VarType GetVarType(const std::string& name) const override {
+    auto* var = scope_.FindVar(name);
+    return ToVarType(var->Type());
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+void OperatorWithKernel::Run(const Scope& scope,
+                             const platform::Place& place) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx = pool.Get(place);
+
+  // check if op[type] has kernel registered.
+  auto& all_op_kernels = AllOpKernels();
+  auto kernels_iter = all_op_kernels.find(type_);
+  if (kernels_iter == all_op_kernels.end()) {
+    PADDLE_THROW(
+        "There are no kernels registered for the %s operator.", type_);
+  }
+
+  ExecutionContext ctx(*this, scope, *dev_ctx);
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  // TODO(dzhwinter) : kernel fallback mechanism will be added when all the
+  // transform functions are ready.
+
+  // for (auto& candidate : kKernelPriority) {
+  //   Do selection
+  // }
+
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  auto kernel_iter = kernels.find(expected_kernel_key);
+  if (kernel_iter == kernels.end()) {
+    PADDLE_THROW("op %s does not have kernel for %s", type_,
+                 KernelTypeToString(expected_kernel_key));
+  }
+
+  // do data transform
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name_item : this->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope.FindVar(var_name);
+      if (var && VarIsTensor(var)) {
+        auto* tensor_in = GetTensorFromVar(var);
+        if (tensor_in->IsInitialized()) {
+          auto kernel_type_for_var = this->GetKernelTypeForVar(
+              var_name_item.first, *tensor_in, expected_kernel_key);
+          if (TransFromNeeded(kernel_type_for_var, expected_kernel_key)) {
+            auto out_var_names = OutputVars(true);
+            if (std::find(out_var_names.begin(), out_var_names.end(),
+                          var_name) != out_var_names.end()) {
+              PADDLE_THROW(
+                  "var %s is both input and output, "
+                  "does not support transform",
+                  var_name);
+            }
+            VLOG(3) << "Transform Variable " << var_name << " from "
+                    << kernel_type_for_var << " to " << expected_kernel_key;
+            auto* trans_var = new_scope.Var(var_name);
+            std::shared_ptr<Tensor> out(new Tensor);
+            DataTransform(expected_kernel_key, kernel_type_for_var, *tensor_in,
+                          out.get());
+            CopyVariableWithTensor(*var, *(out.get()), *trans_var);
+          }
+        }
+      }
+    }
+  }
+
+  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
+  kernel_iter->second->Compute(
+      ExecutionContext(*this, new_scope, *new_dev_ctx));
+
+  /* For profiling/benchmark only */
+  if (FLAGS_benchmark) {
+    new_dev_ctx->Wait();
+  }
+}
+
+proto::DataType OperatorWithKernel::IndicateDataType(
+    const ExecutionContext& ctx) const {
+  auto& scope = ctx.scope();
+  int data_type = -1;
+  for (auto& input : this->inputs_) {
+    for (auto& ipt_name : input.second) {
+      auto* var = scope.FindVar(ipt_name);
+      if (var != nullptr) {
+        const Tensor* t = nullptr;
+        if (var->IsType<Tensor>()) {
+          t = &var->Get<Tensor>();
+        } else if (var->IsType<LoDTensor>()) {
+          t = &var->Get<LoDTensor>();
+        } else if (var->IsType<SelectedRows>()) {
+          t = &(var->Get<SelectedRows>().value());
+        }
+        if (t != nullptr) {
+          int tmp = static_cast<int>(ToDataType(t->type()));
+          PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                         "DataType of Paddle Op %s must be the same.", Type());
+          data_type = tmp;
+        }
+      }
+    }
+  }
+  PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+  return static_cast<proto::DataType>(data_type);
+}
+
+OpKernelType OperatorWithKernel::GetExpectedKernelType(
+    const ExecutionContext& ctx) const {
+  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
+}
+
+OpKernelType OperatorWithKernel::GetKernelTypeForVar(
+    const std::string& var_name, const Tensor& tensor,
+    const OpKernelType& expected_kernel_type) const {
+  return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
new file mode 100644
index 0000000000..c9140f304c
--- /dev/null
+++ b/paddle/framework/operator.h
@@ -0,0 +1,401 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <string>
+#include <tuple>
+#include <unordered_map>
+#include <vector>
+
+#include "glog/logging.h"  // For VLOG
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/variant.h"
+#include "paddle/utils/Error.h"
+
+namespace paddle {
+namespace framework {
+
+/// If a variable is an empty variable, this name will be used.
+constexpr char kEmptyVarName[] = "@EMPTY@";
+
+/// If a variable is a temporary variable, this name will be set in Python,
+/// but it will be converted to a unique name in the scope after OpCreator.
+constexpr char kTempVarName[] = "@TEMP@";
+
+/// If a variable's name has a certain suffix, it means that the
+/// variable is the gradient of another variable.
+/// e.g. Variable "x@GRAD" is the gradient of variable "x".
+constexpr char kGradVarSuffix[] = "@GRAD";
+
+/// Variables with this suffix are supposed to be filled up with zeros.
+constexpr char kZeroVarSuffix[] = "@ZERO";
+
+/* Define the fallback order among multiple kernel types. */
+extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
+
+inline std::string GradVarName(const std::string& var_name) {
+  return var_name + kGradVarSuffix;
+}
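+// For example, GradVarName("x") returns "x@GRAD", the name of the gradient
+// variable of "x".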
+
+class OperatorBase;
+class ExecutionContext;
+
+/**
+ * OperatorBase defines the basic elements that Net calls to do computation.
+ * Only OpRegistry::CreateOp constructs an Operator directly. Users should
+ * always construct a proto message OpDesc and call
+ * OpRegistry::CreateOp(op_desc) to get an Operator instance.
+ */
+class OperatorBase {
+ public:
+  OperatorBase(const std::string& type, const VariableNameMap& inputs,
+               const VariableNameMap& outputs, const AttributeMap& attrs);
+
+  virtual ~OperatorBase() {}
+
+  template <typename T>
+  inline const T& Attr(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+  /// if scope is not null, also show dimensions of arguments
+  virtual std::string DebugStringEx(const Scope* scope) const;
+
+  std::string DebugString() const { return DebugStringEx(nullptr); }
+
+  /// Net will call this function to Run an op.
+  virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
+
+  // FIXME(typhoonzero): this is only used for recv_op to stop event_loop.
+  virtual void Stop() {}
+
+  virtual bool IsNetOp() const { return false; }
+
+  virtual bool SupportGPU() const { return false; }
+
+  /// Rename a variable from old_name to new_name in both inputs and outputs.
+  void Rename(const std::string& old_name, const std::string& new_name);
+
+  const VariableNameMap& Inputs() const { return inputs_; }
+  const VariableNameMap& Outputs() const { return outputs_; }
+
+  //! Get an input with the argument's name described in `op_proto`.
+  std::string Input(const std::string& name) const;
+  //! Get an input which has multiple variables.
+  const std::vector<std::string>& Inputs(const std::string& name) const;
+
+  std::vector<std::string> InputVars() const;
+
+  //! Get an output with the argument's name described in `op_proto`.
+  std::string Output(const std::string& name) const;
+  //! Get an output which has multiple variables.
+  //! TODO: add a vector_view to prevent memory copies.
+  const std::vector<std::string>& Outputs(const std::string& name) const;
+
+  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
+
+  const std::string& Type() const { return type_; }
+  void SetType(const std::string& type) { type_ = type; }
+  const AttributeMap& Attrs() const { return attrs_; }
+
+  // Return a new operator instance, identical to this one.
+  // Use unique_ptr so the caller cannot forget to delete the pointer.
+  virtual std::unique_ptr<OperatorBase> Clone() const = 0;
+
+ protected:
+  std::string type_;
+  // NOTE: in case of OpGrad, inputs_ contains:
+  // I (Inputs)
+  // O (Outputs)
+  // OG (Output Gradients)
+  VariableNameMap inputs_;
+
+  // NOTE: in case of OpGrad, outputs_ contains
+  // IG (Inputs Gradients)
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
+
+ private:
+  void GenerateTemporaryNames();
+  void CheckAllInputOutputSet() const;
+};
+
+// Macro to define a clone method.
+// If you are writing a kernel operator, `Clone` is defined when you
+// register it, i.e. you do not need to define the `Clone` method yourself.
+#define DEFINE_OP_CLONE_METHOD(cls)                                            \
+  std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final {     \
+    return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \
+  }
+
+// Macro to define a default constructor for an Operator.
+// You can also use
+//   using PARENT_CLASS::PARENT_CLASS;
+// to inherit the parent's constructors.
+#define DEFINE_OP_CONSTRUCTOR(cls, parent_cls)             \
+  cls(const std::string& type,                             \
+      const ::paddle::framework::VariableNameMap& inputs,  \
+      const ::paddle::framework::VariableNameMap& outputs, \
+      const paddle::framework::AttributeMap& attrs)        \
+      : parent_cls(type, inputs, outputs, attrs) {}
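+
+// Usage sketch (hypothetical class name): a concrete operator can combine
+// both macros instead of hand-writing the members, e.g.
+//
+//   class MyNoOp : public OperatorBase {
+//    public:
+//     DEFINE_OP_CLONE_METHOD(MyNoOp);
+//     DEFINE_OP_CONSTRUCTOR(MyNoOp, OperatorBase);
+//     void Run(const Scope& scope,
+//              const platform::Place& place) const override {}
+//   };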
+
+class NOP : public OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+  void Run(const Scope& scope, const platform::Place& place) const override {}
+  std::unique_ptr<OperatorBase> Clone() const override {
+    return std::unique_ptr<OperatorBase>(new NOP(*this));
+  }
+};
+
+class ExecutionContext {
+ public:
+  ExecutionContext(const OperatorBase& op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : op_(op), scope_(scope), device_context_(device_context) {}
+
+  const OperatorBase& op() const { return op_; }
+
+  const Scope& scope() const { return scope_; }
+
+  template <typename T>
+  inline const T& Attr(const std::string& name) const {
+    return op_.Attr<T>(name);
+  }
+
+  size_t InputSize(const std::string& name) const {
+    return op_.Inputs(name).size();
+  }
+
+  size_t OutputSize(const std::string& name) const {
+    return op_.Outputs(name).size();
+  }
+
+  const Variable* InputVar(const std::string& name) const {
+    auto ipt = op_.Input(name);
+    return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
+  }
+
+  Variable* OutputVar(const std::string& name) const {
+    auto opt = op_.Output(name);
+    return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt);
+  }
+
+  const std::vector<const Variable*> MultiInputVar(
+      const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return name == kEmptyVarName ? nullptr
+                                                  : scope_.FindVar(name);
+                   });
+    return res;
+  }
+
+  std::vector<Variable*> MultiOutputVar(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<Variable*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [this](const std::string& name) {
+                     return name == kEmptyVarName ? nullptr
+                                                  : scope_.FindVar(name);
+                   });
+    return res;
+  }
+
+  template <typename T>
+  const T* Input(const std::string& name) const {
+    auto* var = InputVar(name);
+    return var == nullptr ? nullptr : &var->Get<T>();
+  }
+
+  template <typename T>
+  T* Output(const std::string& name) const {
+    auto var = OutputVar(name);
+    return var == nullptr ? nullptr : var->GetMutable<T>();
+  }
+
+  template <typename T>
+  const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     return var == nullptr ? nullptr : &var->Get<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  std::vector<T*> MultiOutput(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     return var == nullptr ? nullptr : var->GetMutable<T>();
+                   });
+    return res;
+  }
+
+  void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) const {
+    PADDLE_ENFORCE_LT(i, InputSize(in));
+    PADDLE_ENFORCE_LT(j, OutputSize(out));
+    auto* in_var = MultiInputVar(in)[i];
+    auto* out_var = MultiOutputVar(out)[j];
+    if (!in_var->IsType<LoDTensor>()) return;
+    PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                   "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+    auto in_tensor = in_var->Get<LoDTensor>();
+    auto* out_tensor = out_var->GetMutable<LoDTensor>();
+    out_tensor->set_lod(in_tensor.lod());
+  }
+
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+
+  template <typename DeviceContextType>
+  const DeviceContextType& device_context() const {
+    return *reinterpret_cast<const DeviceContextType*>(&device_context_);
+  }
+
+  const platform::DeviceContext& device_context() const {
+    return device_context_;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  const inline platform::CUDADeviceContext& cuda_device_context() const {
+    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+    return *reinterpret_cast<const platform::CUDADeviceContext*>(
+        &device_context_);
+  }
+#endif
+
+  //! Get actual name vector for this input.
+  const std::vector<std::string>& Inputs(const std::string& name) const {
+    return op_.Inputs(name);
+  }
+
+  //! Get actual name vector for this output.
+  const std::vector<std::string>& Outputs(const std::string& name) const {
+    return op_.Outputs(name);
+  }
+
+ private:
+  const OperatorBase& op_;
+  const Scope& scope_;
+  const platform::DeviceContext& device_context_;
+};
+
+template <>
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
+
+template <>
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
+    const std::string& name) const;
+
+template <>
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
+
+template <>
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
+    const std::string& name) const;
+
+class OpKernelBase {
+ public:
+  /**
+   * ExecutionContext is the only parameter of a kernel's Compute function.
+   * Compute gets input/output variables, state such as momentum, and
+   * device resources such as the CUDA stream and cuBLAS handle from the
+   * ExecutionContext. Users should construct it before running the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
+};
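+
+// ELEMENT_TYPE is read back by OpKernelRegistrarFunctor in op_registry.h to
+// derive the data-type half of the OpKernelType key that a kernel is
+// registered under.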
+
+class OperatorWithKernel : public OperatorBase {
+ public:
+  using OpKernelMap =
+      std::unordered_map<OpKernelType, std::unique_ptr<OpKernelBase>,
+                         OpKernelType::Hash>;
+
+  OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
+                     const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const Scope& scope, const platform::Place& place) const final;
+
+  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
+  AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  }
+
+  bool SupportGPU() const override {
+    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+    return std::any_of(op_kernels.begin(), op_kernels.end(),
+                       [](OpKernelMap::const_reference kern_pair) {
+                         return platform::is_gpu_place(kern_pair.first.place_);
+                       });
+  }
+
+  virtual void InferShape(InferShapeContext* ctx) const {
+    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
+  }
+
+ protected:
+  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const OpKernelType& expected_kernel_type) const;
+
+ private:
+  // Indicate the kernel DataType by the input data. By default, all input
+  // data must have the same type.
+  proto::DataType IndicateDataType(const ExecutionContext& ctx) const;
+};
+
+extern bool OpSupportGPU(const std::string& op_type);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
new file mode 100644
index 0000000000..b69d7c7a74
--- /dev/null
+++ b/paddle/framework/operator_test.cc
@@ -0,0 +1,273 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+static int op_run_num = 0;
+
+class OpWithoutKernelTest : public OperatorBase {
+ public:
+  OpWithoutKernelTest(const std::string& type, const VariableNameMap& inputs,
+                      const VariableNameMap& outputs, const AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs), x(1) {}
+  void Run(const Scope& scope, const platform::Place& place) const override {
+    ++op_run_num;
+    ASSERT_EQ(static_cast<int>(inputs_.size()), 1);
+    ASSERT_EQ(static_cast<int>(outputs_.size()), 1);
+    ASSERT_EQ(scope.FindVar(inputs_.at("input")[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope.FindVar(outputs_.at("output")[0]), nullptr);
+  }
+
+ public:
+  int x{0};
+};
+
+class OpWithoutKernelCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpWithoutKernelCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op");
+    AddComment("This is test op");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+static void BuildVar(const std::string& param_name,
+                     std::initializer_list<const char*> arguments,
+                     paddle::framework::proto::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (auto& arg_name : arguments) {
+    *var->mutable_arguments()->Add() = arg_name;
+  }
+}
+
+REGISTER_OP_WITHOUT_GRADIENT(test_operator,
+                             paddle::framework::OpWithoutKernelTest,
+                             paddle::framework::OpWithoutKernelCheckerMaker);
+
+TEST(OperatorBase, all) {
+  paddle::framework::InitDevices();
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  BuildVar("input", {"IN1"}, op_desc.add_inputs());
+  BuildVar("output", {"OUT1"}, op_desc.add_outputs());
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  scope.Var("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->Run(scope, cpu_place);
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
+}
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "input of test op");
+    AddOutput("y", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .GreaterThan(0.0);
+    AddComment("This is test op");
+  }
+};
+
+static int cpu_kernel_run_num = 0;
+
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+  OpKernelType GetExpectedKernelType(
+      const ExecutionContext& ctx) const override {
+    return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
+  }
+};
+
+template <typename T1, typename T2>
+class CPUKernelTest : public OpKernel<float> {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    std::cout << ctx.op().DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ(ctx.op().Input("x"), "IN1");
+    ASSERT_EQ(ctx.op().Output("y"), "OUT1");
+  }
+};
+
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("xs", "inputs of test op").AsDuplicable();
+    AddInput("k", "input of test op");
+    AddOutput("ys", "outputs of test op").AsDuplicable();
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .GreaterThan(0.0);
+    AddComment("This is test op");
+  }
+};
+
+class CPUKernalMultiInputsTest : public OpKernel<float> {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    auto xs = ctx.op().Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+
+    auto inVar0 = ctx.MultiInputVar("xs");
+    ASSERT_EQ(inVar0.size(), 3U);
+
+    auto intVar1 = ctx.InputVar("k");
+    ASSERT_NE(intVar1, nullptr);
+
+    auto outVar0 = ctx.MultiOutputVar("ys");
+    ASSERT_EQ(outVar0.size(), 2U);
+
+    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
+    ASSERT_EQ(inTensor0.size(), 3U);
+
+    auto intTensor1 = ctx.Input<Tensor>("k");
+    ASSERT_NE(intTensor1, nullptr);
+
+    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
+    ASSERT_EQ(outTensor0.size(), 2U);
+
+    auto k = ctx.op().Input("k");
+    ASSERT_EQ(k, "k0");
+
+    auto ys = ctx.op().Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(
+    op_with_kernel, paddle::framework::OpWithKernelTest,
+    paddle::framework::OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+
+// test with single input
+TEST(OpKernel, all) {
+  paddle::framework::InitDevices();
+  paddle::framework::proto::OpDesc op_desc;
+  op_desc.set_type("op_with_kernel");
+  BuildVar("x", {"IN1"}, op_desc.add_inputs());
+  BuildVar("y", {"OUT1"}, op_desc.add_outputs());
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
+  op->Run(scope, cpu_place);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+}
+
+REGISTER_OP_WITHOUT_GRADIENT(
+    op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+    paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernelMultiInputsTest);
+
+// test with multi inputs
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+
+  paddle::framework::InitDevices();
+  proto::OpDesc op_desc;
+
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  BuildVar("xs", {"x0", "x1", "x2"}, op_desc.add_inputs());
+  BuildVar("k", {"k0"}, op_desc.add_inputs());
+  BuildVar("ys", {"y0", "y1"}, op_desc.add_outputs());
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::proto::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::framework::Scope scope;
+  scope.Var("x0")->GetMutable<LoDTensor>();
+  scope.Var("x1")->GetMutable<LoDTensor>();
+  scope.Var("x2")->GetMutable<LoDTensor>();
+  scope.Var("k0")->GetMutable<LoDTensor>();
+  scope.Var("y0")->GetMutable<LoDTensor>();
+  scope.Var("y1")->GetMutable<LoDTensor>();
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  op->Run(scope, cpu_place);
+}
+
+class OperatorClone : public paddle::framework::OperatorBase {
+ public:
+  DEFINE_OP_CLONE_METHOD(OperatorClone);
+  OperatorClone(const std::string& type,
+                const paddle::framework::VariableNameMap& inputs,
+                const paddle::framework::VariableNameMap& outputs,
+                const paddle::framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const paddle::framework::Scope& scope,
+           const paddle::platform::Place& place) const override {}
+};
+
+TEST(Operator, Clone) {
+  paddle::framework::InitDevices();
+  OperatorClone a("ABC", paddle::framework::VariableNameMap{},
+                  paddle::framework::VariableNameMap{},
+                  paddle::framework::AttributeMap{});
+  auto b = a.Clone();
+  ASSERT_EQ(a.Type(), b->Type());
+}
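`DEFINE_OP_CLONE_METHOD` gives each concrete operator a polymorphic deep copy. A minimal sketch of how `Clone()` is consumed, assuming the macro's conventional `std::unique_ptr<OperatorBase>` return type (the helper name is illustrative, not part of this patch):

```c++
#include <memory>
#include "paddle/framework/operator.h"

// Hypothetical helper: deep-copy any operator through the OperatorBase
// interface. Assumes Clone() returns std::unique_ptr<OperatorBase>, which is
// what `auto b = a.Clone(); b->Type();` in the test above suggests.
std::unique_ptr<paddle::framework::OperatorBase> CopyOp(
    const paddle::framework::OperatorBase& op) {
  return op.Clone();
}
```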
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
new file mode 100644
index 0000000000..15ea4035c6
--- /dev/null
+++ b/paddle/framework/program_desc.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace framework {
+
+BlockDesc *ProgramDesc::AppendBlock(const BlockDesc &parent) {
+  auto *b = desc_.add_blocks();
+  b->set_parent_idx(parent.ID());
+  b->set_idx(desc_.blocks_size() - 1);
+  blocks_.emplace_back(new BlockDesc(this, b));
+  return blocks_.back().get();
+}
+
+proto::ProgramDesc *ProgramDesc::Proto() {
+  for (auto &block : blocks_) {
+    block->Flush();
+  }
+  return &desc_;
+}
+
+ProgramDesc::ProgramDesc() {
+  auto *block = desc_.mutable_blocks()->Add();
+  block->set_idx(kRootBlockIndex);
+  block->set_parent_idx(kNoneBlockIndex);
+  blocks_.emplace_back(new BlockDesc(this, block));
+}
+
+ProgramDesc::ProgramDesc(const ProgramDesc &o) {
+  desc_ = o.desc_;
+
+  for (int i = 0; i < desc_.blocks_size(); ++i) {
+    auto *block = desc_.mutable_blocks(i);
+    blocks_.emplace_back(new BlockDesc(*o.blocks_[i], block, this));
+  }
+}
+
+ProgramDesc::ProgramDesc(const proto::ProgramDesc &desc) {
+  desc_ = desc;
+  for (auto &block_desc : *desc_.mutable_blocks()) {
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
+  }
+}
+
+ProgramDesc::ProgramDesc(const std::string &binary_str) {
+  PADDLE_ENFORCE(desc_.ParseFromString(binary_str),
+                 "Fail to parse program_desc from binary string.");
+  for (auto &block_desc : *desc_.mutable_blocks()) {
+    blocks_.emplace_back(new BlockDesc(this, &block_desc));
+  }
+}
+
+const std::vector<std::string> ProgramDesc::GetFeedTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> feed_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFeedOpType) {
+      feed_target_names.insert(feed_target_names.begin(), op->Output("Out")[0]);
+    }
+  }
+  return feed_target_names;
+}
+
+const std::vector<std::string> ProgramDesc::GetFetchTargetNames() {
+  BlockDesc *global_block = blocks_[0].get();
+  std::vector<std::string> fetch_target_names;
+  for (auto *op : global_block->AllOps()) {
+    if (op->Type() == kFetchOpType) {
+      fetch_target_names.push_back(op->Input("X")[0]);
+    }
+  }
+  return fetch_target_names;
+}
+
+}  // namespace framework
+}  // namespace paddle
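Both getters scan only the global block (`blocks_[0]`). A short, hedged sketch of how a caller might enumerate the targets (logging via glog, as elsewhere in this patch; the function name is illustrative):

```c++
#include "glog/logging.h"
#include "paddle/framework/program_desc.h"

// List the feed/fetch target variable names of a program's global block.
void LogProgramTargets(paddle::framework::ProgramDesc* program) {
  for (const auto& name : program->GetFeedTargetNames()) {
    VLOG(3) << "feed target: " << name;
  }
  for (const auto& name : program->GetFetchTargetNames()) {
    VLOG(3) << "fetch target: " << name;
  }
}
```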
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
new file mode 100644
index 0000000000..8e958eab6e
--- /dev/null
+++ b/paddle/framework/program_desc.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <vector>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+
+class BlockDesc;
+
+class ProgramDesc {
+ public:
+  ProgramDesc();
+
+  explicit ProgramDesc(const proto::ProgramDesc &desc);
+
+  ProgramDesc(const ProgramDesc &o);
+
+  explicit ProgramDesc(const std::string &binary_str);
+
+  BlockDesc *AppendBlock(const BlockDesc &parent);
+
+  BlockDesc *MutableBlock(size_t idx) { return blocks_[idx].get(); }
+
+  const BlockDesc &Block(size_t idx) const { return *blocks_[idx]; }
+
+  size_t Size() const { return blocks_.size(); }
+
+  proto::ProgramDesc *Proto();
+
+  const std::vector<std::string> GetFeedTargetNames();
+  const std::vector<std::string> GetFetchTargetNames();
+
+ private:
+  proto::ProgramDesc desc_;
+
+  std::vector<std::unique_ptr<BlockDesc>> blocks_;
+};
+}  // namespace framework
+}  // namespace paddle
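A minimal round-trip sketch against the interface above. It uses only the declared constructors and accessors; the properties noted in the comments are what `program_desc_test.cc` below verifies in more detail:

```c++
#include <string>
#include "paddle/framework/program_desc.h"

void ProgramDescRoundTrip() {
  paddle::framework::ProgramDesc program;  // constructed with the root block
  program.AppendBlock(program.Block(0));   // add a kid of the root block

  // Proto() flushes every BlockDesc into desc_ before exposing it.
  std::string binary;
  program.Proto()->SerializeToString(&binary);

  // Rebuild an equivalent program from the serialized bytes.
  paddle::framework::ProgramDesc restored(binary);
  // restored.Size() == program.Size() == 2
}
```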
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
new file mode 100644
index 0000000000..59947c9f21
--- /dev/null
+++ b/paddle/framework/program_desc_test.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/program_desc.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/block_desc.h"
+
+namespace paddle {
+namespace framework {
+TEST(ProgramDesc, copy_ctor) {
+  ProgramDesc program;
+  auto* global_block = program.MutableBlock(0);
+  auto* x = global_block->Var("X");
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  x->SetLoDLevel(0);
+  x->SetDataType(proto::FP32);
+  x->SetShape({1000, 784});
+
+  auto* y = global_block->Var("Y");
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  y->SetLoDLevel(0);
+  y->SetDataType(proto::FP32);
+  y->SetShape({784, 100});
+
+  auto* op = global_block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {x->Name()});
+  op->SetInput("Y", {y->Name()});
+
+  auto* out = global_block->Var("Out");
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  op->SetOutput("Y", {out->Name()});
+
+  ProgramDesc program_copy(program);
+
+  auto* global_block_copy = program_copy.MutableBlock(0);
+  ASSERT_NE(global_block, global_block_copy);
+
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
+    ASSERT_TRUE(global_block_copy->HasVar(name));
+    auto* copy = global_block_copy->Var(name);
+    ASSERT_NE(copy, var_before);
+    ASSERT_EQ(copy->Name(), var_before->Name());
+    ASSERT_EQ(copy->GetType(), var_before->GetType());
+    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->Proto()->SerializeAsString(),
+              var_before->Proto()->SerializeAsString());
+  };
+
+  ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames());
+  ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size());
+  assert_same_var("X", x);
+  assert_same_var("Y", y);
+  assert_same_var("Out", out);
+
+  for (size_t i = 0; i < global_block->OpSize(); ++i) {
+    auto op_origin = global_block->Op(i);
+    auto op_copy = global_block->Op(i);
+
+    ASSERT_EQ(op_origin->Type(), op_copy->Type());
+    ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
+    ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
+
+    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
+              op_origin->Proto()->SerializeAsString());
+  }
+
+  // We do not check that the blocks' proto strings are identical, because
+  // the order of vars may differ, and that is still correct.
+}
+
+TEST(ProgramDescBind, serialize_and_deserialize) {
+  ProgramDesc program_origin;
+  auto* global_block = program_origin.MutableBlock(0);
+  auto* x = global_block->Var("X");
+  x->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  x->SetLoDLevel(0);
+  x->SetDataType(proto::FP32);
+  x->SetShape({1000, 784});
+
+  auto* y = global_block->Var("Y");
+  y->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  y->SetLoDLevel(0);
+  y->SetDataType(proto::FP32);
+  y->SetShape({784, 100});
+
+  auto* op = global_block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {x->Name()});
+  op->SetInput("Y", {y->Name()});
+
+  auto* out = global_block->Var("Out");
+  out->SetType(proto::VarDesc_VarType_LOD_TENSOR);
+  op->SetOutput("Y", {out->Name()});
+
+  std::string binary_str;
+  program_origin.Proto()->SerializeToString(&binary_str);
+
+  ProgramDesc program_restored(binary_str);
+  auto* global_block_restored = program_restored.MutableBlock(0);
+  ASSERT_NE(global_block, global_block_restored);
+
+  auto assert_same_var = [&](const std::string& name, VarDesc* var_before) {
+    ASSERT_TRUE(global_block_restored->HasVar(name));
+    auto* restored = global_block_restored->Var(name);
+    ASSERT_NE(restored, var_before);
+    ASSERT_EQ(restored->Name(), var_before->Name());
+    ASSERT_EQ(restored->GetType(), var_before->GetType());
+    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->Proto()->SerializeAsString(),
+              var_before->Proto()->SerializeAsString());
+  };
+
+  ASSERT_EQ(global_block->LocalVarNames(),
+            global_block_restored->LocalVarNames());
+  ASSERT_EQ(3UL, global_block_restored->LocalVarNames().size());
+  assert_same_var("X", x);
+  assert_same_var("Y", y);
+  assert_same_var("Out", out);
+
+  for (size_t i = 0; i < global_block->OpSize(); ++i) {
+    auto op_origin = global_block->Op(i);
+    auto op_restored = global_block->Op(i);
+
+    ASSERT_EQ(op_origin->Type(), op_restored->Type());
+    ASSERT_EQ(op_origin->Inputs(), op_restored->Inputs());
+    ASSERT_EQ(op_origin->Outputs(), op_restored->Outputs());
+
+    ASSERT_EQ(op_restored->Proto()->SerializeAsString(),
+              op_origin->Proto()->SerializeAsString());
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/proto_desc.h b/paddle/framework/proto_desc.h
new file mode 100644
index 0000000000..fa01224fef
--- /dev/null
+++ b/paddle/framework/proto_desc.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// The index of the first block in a program, also called the root block.
+constexpr int kRootBlockIndex = 0;
+// The parent index of the root block; such a block does not exist.
+constexpr int kNoneBlockIndex = -1;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
new file mode 100644
index 0000000000..bff8e0bcea
--- /dev/null
+++ b/paddle/framework/prune.cc
@@ -0,0 +1,162 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/prune.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace framework {
+
+const std::string kFeedOpType = "feed";
+const std::string kFetchOpType = "fetch";
+const std::string kDropOutOpType = "dropout";
+const std::string kBatchNormOpType = "batch_norm";
+
+bool HasDependentVar(const proto::OpDesc& op_desc,
+                     const std::set<std::string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool IsTarget(const proto::OpDesc& op_desc) {
+  if (op_desc.has_is_target()) {
+    return op_desc.is_target();
+  }
+  return false;
+}
+
+void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
+                int block_id) {
+  // TODO(tonyyang-svail):
+  //    - will change to use multiple blocks for RNN op and Cond Op
+
+  auto& block = input.blocks(block_id);
+  auto& ops = block.ops();
+
+  bool expect_feed = true;
+  for (auto& op_desc : ops) {
+    PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed,
+                   "All FeedOps are at the beginning of the ProgramDesc");
+    expect_feed = (op_desc.type() == kFeedOpType);
+  }
+
+  bool expect_fetch = true;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+    PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch,
+                   "All FetchOps must at the end of the ProgramDesc");
+    expect_fetch = (op_desc.type() == kFetchOpType);
+  }
+
+  std::set<std::string> dependent_vars;
+  std::vector<bool> should_run;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+
+    if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
+      // insert its input to the dependency graph
+      for (auto& var : op_desc.inputs()) {
+        for (auto& argu : var.arguments()) {
+          dependent_vars.insert(argu);
+        }
+      }
+
+      should_run.push_back(true);
+    } else {
+      should_run.push_back(false);
+    }
+  }
+
+  // since we are traversing the ProgramDesc in reverse order
+  // we reverse the should_run vector
+  std::reverse(should_run.begin(), should_run.end());
+
+  *output = input;
+  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  op_field->Clear();
+  for (size_t i = 0; i < should_run.size(); ++i) {
+    if (should_run[i]) {
+      *op_field->Add() = input.blocks(block_id).ops(i);
+    }
+  }
+
+  // remove the VarDescs in BlockDesc that are not referenced in
+  // the pruned OpDescs
+  std::unordered_map<std::string, proto::VarDesc> var_map;
+  auto* var_field = output->mutable_blocks(block_id)->mutable_vars();
+  for (const auto& var : *var_field) {
+    var_map[var.name()] = var;
+  }
+
+  var_field->Clear();
+  for (const auto& op : *op_field) {
+    // add VarDescs of all input arguments for each OpDesc
+    auto& input_field = op.inputs();
+    for (auto& input_var : input_field) {
+      for (auto& arg : input_var.arguments()) {
+        *var_field->Add() = var_map[arg];
+      }
+    }
+    // add VarDescs of all output arguments for each OpDesc
+    auto& output_field = op.outputs();
+    for (auto& output_var : output_field) {
+      for (auto& arg : output_var.arguments()) {
+        *var_field->Add() = var_map[arg];
+      }
+    }
+  }
+}
+
+// TODO(fengjiayi): Prune() could be inplaced to avoid unnecessary copies
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output) {
+  prune_impl(input, output, 0);
+}
+
+void inference_optimize_impl(const proto::ProgramDesc& input,
+                             proto::ProgramDesc* output, int block_id) {
+  *output = input;
+  auto* op_field = output->mutable_blocks(block_id)->mutable_ops();
+  for (auto& op_desc : *op_field) {
+    if (op_desc.type() == kDropOutOpType ||
+        op_desc.type() == kBatchNormOpType) {
+      for (auto& attr : *op_desc.mutable_attrs()) {
+        if (attr.name() == "is_test") {
+          attr.set_b(true);
+          break;
+        }
+      }
+    }
+  }
+}
+
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output) {
+  inference_optimize_impl(input, output, 0);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/prune.h b/paddle/framework/prune.h
new file mode 100644
index 0000000000..593292523d
--- /dev/null
+++ b/paddle/framework/prune.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/framework.pb.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+void Prune(const proto::ProgramDesc& input, proto::ProgramDesc* output);
+
+void InferenceOptimize(const proto::ProgramDesc& input,
+                       proto::ProgramDesc* output);
+
+}  // namespace framework
+}  // namespace paddle
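A hedged end-to-end sketch of the two entry points above. The choice of target op is illustrative, and `Prune` currently only processes the root block (see the TODO in prune.cc):

```c++
#include "paddle/framework/prune.h"

// Keep only the ops needed for the marked target, then switch
// dropout/batch_norm ops into test mode for deployment.
void PrepareForInference(paddle::framework::proto::ProgramDesc trained,
                         paddle::framework::proto::ProgramDesc* deployable) {
  // Mark the last op of the root block as the target (illustrative choice;
  // assumes the block is non-empty).
  auto* ops = trained.mutable_blocks(0)->mutable_ops();
  ops->Mutable(ops->size() - 1)->set_is_target(true);

  paddle::framework::proto::ProgramDesc pruned;
  paddle::framework::Prune(trained, &pruned);
  paddle::framework::InferenceOptimize(pruned, deployable);
}
```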
diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc
new file mode 100644
index 0000000000..d76c5abca9
--- /dev/null
+++ b/paddle/framework/prune_test.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/prune.h"
+
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/operator.h"
+#include "paddle/operators/net_op.h"
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/program_desc.h"
+
+#include <gtest/gtest.h>
+
+namespace f = paddle::framework;
+namespace ops = paddle::operators;
+
+void AddOp(const std::string &type, const f::VariableNameMap &inputs,
+           const f::VariableNameMap &outputs, f::AttributeMap attrs,
+           paddle::framework::BlockDesc *block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(paddle::framework::proto::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+TEST(Prune, one_operator) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+        block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  f::proto::ProgramDesc pruned;
+
+  f::Prune(*pdesc, &pruned);
+  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 0);
+
+  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
+  f::Prune(*pdesc, &pruned);
+  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 1);
+}
+
+TEST(Prune, forward) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{},
+        block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+
+  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
+    f::proto::ProgramDesc pruned;
+    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
+    f::Prune(*pdesc, &pruned);
+    PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), i + 1);
+  }
+}
+
+TEST(Prune, multi_input_op) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{},
+        block);
+  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}},
+        f::AttributeMap{}, block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
+  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 4);
+}
+
+TEST(Prune, multi_output_op) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+        block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
+  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 2);
+}
+
+TEST(Prune, multi_target) {
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
+        f::AttributeMap{}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{},
+        block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{},
+        block);
+
+  f::proto::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
+  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
+
+  f::proto::ProgramDesc pruned;
+  f::Prune(*pdesc, &pruned);
+  PADDLE_ENFORCE_EQ(pruned.blocks(0).ops_size(), 3);
+}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
new file mode 100644
index 0000000000..af08b2ab81
--- /dev/null
+++ b/paddle/framework/scope.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+
+#include <memory>  // for unique_ptr
+#include <mutex>   // for call_once
+#include "glog/logging.h"
+#include "paddle/framework/threadpool.h"
+#include "paddle/string/printf.h"
+
+DEFINE_bool(benchmark, false,
+            "Doing memory benchmark. It will make deleting scope synchronized, "
+            "and add some memory usage logs."
+            "Default cuda is asynchronous device, set to True will"
+            "force op run in synchronous mode.");
+
+namespace paddle {
+namespace framework {
+
+Scope::~Scope() {
+  DropKids();
+  for (auto& kv : vars_) {
+    VLOG(3) << "Destroy variable " << kv.first;
+    delete kv.second;
+  }
+}
+
+Scope& Scope::NewScope() const {
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
+}
+
+Variable* Scope::Var(const std::string& name) {
+  auto* v = FindVarLocally(name);
+  if (v != nullptr) return v;
+  v = new Variable();
+  vars_[name] = v;
+  VLOG(3) << "Create variable " << name;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+Variable* Scope::Var(std::string* name) {
+  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  if (name != nullptr) {
+    *name = var_name;
+  }
+  return Var(var_name);
+}
+
+Variable* Scope::FindVar(const std::string& name) const {
+  auto var = FindVarLocally(name);
+  if (var != nullptr) {
+    return var;
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+}
+
+const Scope* Scope::FindScope(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+void Scope::DropKids() {
+  for (Scope* s : kids_) delete s;
+  kids_.clear();
+}
+
+std::vector<std::string> Scope::LocalVarNames() const {
+  std::vector<std::string> known_vars;
+  known_vars.reserve(this->vars_.size());
+  for (auto& p : vars_) {
+    known_vars.emplace_back(p.first);
+  }
+  return known_vars;
+}
+
+void Scope::DeleteScope(Scope* scope) {
+  auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
+  PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
+  this->kids_.erase(it);
+  // When making memory benchmark on Fluid, we have to delete scope sync.
+  if (FLAGS_benchmark) {
+    delete scope;
+  } else {
+    Async([scope] { delete scope; });
+  }
+}
+
+void Scope::Rename(const std::string& origin_name,
+                   const std::string& new_name) const {
+  auto origin_it = vars_.find(origin_name);
+  PADDLE_ENFORCE(origin_it != vars_.end(),
+                 "Cannot find original variable with name %s", origin_name);
+  auto new_it = vars_.find(new_name);
+  PADDLE_ENFORCE(new_it == vars_.end(),
+                 "The variable with name %s is already in the scope", new_name);
+  vars_[new_name] = origin_it->second;
+  vars_.erase(origin_it);
+}
+
+std::string Scope::Rename(const std::string& origin_name) const {
+  auto var_name = string::Sprintf("%p.%d", this, vars_.size());
+  Rename(origin_name, var_name);
+  return var_name;
+}
+
+Variable* Scope::FindVarLocally(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return nullptr;
+}
+
+}  // namespace framework
+}  // namespace paddle
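`Var(std::string*)` and the one-argument `Rename` both mint a scope-unique name from the scope's address and current variable count (`Sprintf("%p.%d", ...)`). A short sketch (the variable names are illustrative):

```c++
#include <string>
#include "paddle/framework/scope.h"

void UniqueNameSketch() {
  paddle::framework::Scope scope;

  // Create a variable under a generated, scope-unique name.
  std::string generated;
  scope.Var(&generated);  // e.g. "0x7f...1.0"; the format is an impl detail

  // Move an existing variable to a freshly generated name.
  scope.Var("weights");
  std::string renamed = scope.Rename("weights");
  // scope.FindVar(renamed) != nullptr && scope.FindVar("weights") == nullptr
}
```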
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
new file mode 100644
index 0000000000..a1da81cc79
--- /dev/null
+++ b/paddle/framework/scope.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <list>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/variable.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace framework {
+
+class Scope;
+
+/**
+ * @brief Scope manages all variables.
+ *
+ * A Scope is an association from names to Variables. All variables belong to
+ * a Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`.
+ * One net can run in different scopes and update different variables in each
+ * scope.
+ */
+class Scope {
+ public:
+  Scope() {}
+  ~Scope();
+
+  /// Create a sub-scope. Returns a reference rather than a pointer to
+  /// prevent manual deletion.
+  /// Marked const because a new kid scope cannot change its parent scope.
+  Scope& NewScope() const;
+
+  /// Create a variable with given name if it doesn't exist.
+  Variable* Var(const std::string& name);
+
+  /// Create a variable with a scope-unique name.
+  Variable* Var(std::string* name = nullptr);
+
+  /// Find a variable in the scope or any of its ancestors.  Returns
+  /// nullptr if cannot find.
+  Variable* FindVar(const std::string& name) const;
+
+  const Scope& parent() const { return *parent_; }
+
+  /// Find the scope or an ancestor scope that contains the given variable.
+  const Scope* FindScope(const Variable* var) const;
+
+  void DeleteScope(Scope* scope);
+
+  /// Drop all kids scopes belonged to this scope.
+  void DropKids();
+
+  // Enumerate all the variables this scope currently contains.
+  std::vector<std::string> LocalVarNames() const;
+
+  // Rename variable to a new name
+  void Rename(const std::string& origin_name,
+              const std::string& new_name) const;
+
+  // Rename variable to a new name and return the new name
+  std::string Rename(const std::string& origin_name) const;
+
+  Variable* FindVarLocally(const std::string& name) const;
+
+ private:
+  // Call Scope::NewScope for a sub-scope.
+  explicit Scope(Scope const* parent) : parent_(parent) {}
+
+  mutable std::unordered_map<std::string, Variable*> vars_;
+  mutable std::list<Scope*> kids_;
+  Scope const* parent_{nullptr};
+
+  DISABLE_COPY_AND_ASSIGN(Scope);
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
new file mode 100644
index 0000000000..0f5b86061d
--- /dev/null
+++ b/paddle/framework/scope_test.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+using paddle::framework::Scope;
+using paddle::framework::Variable;
+
+TEST(Scope, VarsShadowing) {
+  Scope s;
+  Scope& ss1 = s.NewScope();
+  Scope& ss2 = s.NewScope();
+
+  Variable* v0 = s.Var("a");
+  Variable* v1 = ss1.Var("a");
+
+  EXPECT_NE(v0, v1);
+
+  EXPECT_EQ(v0, s.FindVar("a"));
+  EXPECT_EQ(v1, ss1.FindVar("a"));
+  EXPECT_EQ(v0, ss2.FindVar("a"));
+}
+
+TEST(Scope, FindVar) {
+  Scope s;
+  Scope& ss = s.NewScope();
+
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_EQ(nullptr, ss.FindVar("a"));
+
+  ss.Var("a");
+
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_NE(nullptr, ss.FindVar("a"));
+}
+
+TEST(Scope, FindScope) {
+  Scope s;
+  Scope& ss = s.NewScope();
+  Variable* v = s.Var("a");
+
+  EXPECT_EQ(&s, s.FindScope(v));
+  EXPECT_EQ(&s, ss.FindScope(v));
+}
+
+TEST(Scope, GetAllNames) {
+  Scope s;
+  Variable* v = s.Var("a");
+  EXPECT_EQ(&s, s.FindScope(v));
+
+  std::vector<std::string> ans = s.LocalVarNames();
+  std::string str;
+  for (auto& var : ans) {
+    str += var;
+  }
+
+  EXPECT_STREQ("a", str.c_str());
+}
diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc
new file mode 100644
index 0000000000..3b3e60177a
--- /dev/null
+++ b/paddle/framework/selected_rows.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/selected_rows.h"
+
+namespace paddle {
+namespace framework {
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx) {
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {
+    // the 2nd field, rows information
+    auto& rows = selected_rows.rows();
+    uint64_t size = rows.size();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    for (uint64_t i = 0; i < size; ++i) {
+      os.write(reinterpret_cast<const char*>(&rows[i]), sizeof(rows[i]));
+    }
+  }
+  {
+    // the 3rd field, the height of SelectedRows
+    int64_t height = selected_rows.height();
+    os.write(reinterpret_cast<const char*>(&height), sizeof(height));
+  }
+  // the 4th field, Tensor data
+  SerializeToStream(os, selected_rows.value(), dev_ctx);
+}
+
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+                           const platform::DeviceContext& dev_ctx) {
+  {
+    // the 1st field, uint32_t version for SelectedRows
+    uint32_t version;
+    is.read(reinterpret_cast<char*>(&version), sizeof(version));
+    PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  }
+  {
+    // the 2nd field, rows information
+    uint64_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    auto& rows = *selected_rows->mutable_rows();
+    rows.resize(size);
+    for (uint64_t i = 0; i < size; ++i) {
+      is.read(reinterpret_cast<char*>(&rows[i]), sizeof(int64_t));
+    }
+  }
+  {
+    // the 3rd field, the height of the SelectedRows
+    int64_t height;
+    is.read(reinterpret_cast<char*>(&height), sizeof(int64_t));
+    selected_rows->set_height(height);
+  }
+  // the 4th field, the tensor which contains the data
+  DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h
new file mode 100644
index 0000000000..30d3dfc1e8
--- /dev/null
+++ b/paddle/framework/selected_rows.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+class SelectedRows {
+ public:
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
+      : rows_(rows), height_(height) {
+    value_.reset(new Tensor());
+  }
+
+  SelectedRows() {
+    height_ = 0;
+    value_.reset(new Tensor());
+  }
+
+  platform::Place place() const { return value_->place(); }
+
+  const Tensor& value() const { return *value_; }
+
+  Tensor* mutable_value() { return value_.get(); }
+
+  int64_t height() const { return height_; }
+
+  void set_height(int64_t height) { height_ = height; }
+
+  const Vector<int64_t>& rows() const { return rows_; }
+
+  Vector<int64_t>* mutable_rows() { return &rows_; }
+
+  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
+
+  DDim GetCompleteDims() const {
+    std::vector<int64_t> dims = vectorize(value_->dims());
+    dims[0] = height_;
+    return make_ddim(dims);
+  }
+
+ private:
+  // Notice: rows can contain duplicates, e.g. {0, 4, 7, 0, 5, 7, 9}.
+  // SelectedRows are simply concatenated when added together; the duplicate
+  // rows are only merged when a SelectedRows is added to a Tensor.
+  Vector<int64_t> rows_;
+  std::unique_ptr<Tensor> value_{nullptr};
+  int64_t height_;
+};
+
+/*
+ * Serialize/Deserialize SelectedRows to/from std::ostream/std::istream.
+ * You can pass an ofstream or ostringstream to serialize to a file or to an
+ * in-memory string. A GPU tensor will be copied to the CPU.
+ */
+void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+                           const platform::DeviceContext& dev_ctx);
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc
new file mode 100644
index 0000000000..8ff3fb6a97
--- /dev/null
+++ b/paddle/framework/selected_rows_test.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/selected_rows.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+
+class SelectedRowsTester : public ::testing::Test {
+ public:
+  void SetUp() override {
+    std::vector<int64_t> rows{0, 4, 7};
+    int64_t height = 10;
+    int64_t row_numel = 100;
+    selected_rows_.reset(new SelectedRows(rows, height));
+
+    Tensor* value = selected_rows_->mutable_value();
+    value->mutable_data<float>(
+        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+  }
+
+ protected:
+  platform::CPUPlace place_;
+  std::unique_ptr<SelectedRows> selected_rows_{nullptr};
+};
+
+TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); }
+
+TEST_F(SelectedRowsTester, dims) {
+  ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100}));
+}
+
+TEST_F(SelectedRowsTester, complete_dims) {
+  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
+}
+
+TEST_F(SelectedRowsTester, SerializeAndDeserialize) {
+  SelectedRows dst_tensor;
+  platform::CPUDeviceContext cpu_ctx(place_);
+  std::ostringstream oss;
+
+  SerializeToStream(oss, *selected_rows_, cpu_ctx);
+
+  std::istringstream iss(oss.str());
+  DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
+
+  ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
+  ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+  ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
+  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
new file mode 100644
index 0000000000..e53cc0cdab
--- /dev/null
+++ b/paddle/framework/shape_inference.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/shape_inference.h"
+#include "grad_op_desc_maker.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+    const std::string &name) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return GetDims(names);
+}
+
+DDim InferShapeContext::GetInputsElementDim(const std::string &name,
+                                            int idx) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return this->GetDim(names[idx]);
+}
+
+void InferShapeContext::SetOutputsDim(
+    const std::string &name, const std::vector<framework::DDim> &dims) {
+  auto &names = Outputs(name);
+  SetDims(names, dims);
+}
+
+std::vector<framework::DDim> InferShapeContext::GetDims(
+    const std::vector<std::string> &names) const {
+  std::vector<framework::DDim> ret;
+  ret.reserve(names.size());
+  std::transform(
+      names.begin(), names.end(), std::back_inserter(ret),
+      [this](const std::string &name) { return this->GetDim(name); });
+  return ret;
+}
+
+void InferShapeContext::SetDims(const std::vector<std::string> &names,
+                                const std::vector<framework::DDim> &dims) {
+  size_t length = names.size();
+  PADDLE_ENFORCE_EQ(length, dims.size());
+  for (size_t i = 0; i < length; ++i) {
+    if (names[i] == framework::kEmptyVarName) {
+      continue;
+    }
+    SetDim(names[i], dims[i]);
+  }
+}
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetInputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Inputs(name));
+}
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetOutputsVarType(
+    const std::string &name) const {
+  return GetVarTypes(Outputs(name));
+}
+std::vector<proto::VarDesc::VarType> InferShapeContext::GetVarTypes(
+    const std::vector<std::string> &names) const {
+  std::vector<proto::VarDesc::VarType> retv;
+  retv.resize(names.size());
+  std::transform(names.begin(), names.end(), retv.begin(),
+                 std::bind(std::mem_fn(&InferShapeContext::GetVarType), this,
+                           std::placeholders::_1));
+  return retv;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
new file mode 100644
index 0000000000..f93319d8f2
--- /dev/null
+++ b/paddle/framework/shape_inference.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+class InferShapeContext {
+ public:
+  virtual ~InferShapeContext() = default;
+  virtual bool HasInput(const std::string &name) const = 0;
+  virtual bool HasOutput(const std::string &name) const = 0;
+
+  std::vector<proto::VarDesc::VarType> GetInputsVarType(
+      const std::string &name) const;
+  std::vector<proto::VarDesc::VarType> GetOutputsVarType(
+      const std::string &name) const;
+
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
+
+  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+
+  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  DDim GetInputsElementDim(const std::string &name, int idx) const;
+
+  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  void SetOutputsDim(const std::string &name,
+                     const std::vector<framework::DDim> &dims);
+
+  virtual AttrReader Attrs() const = 0;
+  virtual const std::vector<std::string> &Inputs(
+      const std::string &name) const = 0;
+  virtual const std::vector<std::string> &Outputs(
+      const std::string &name) const = 0;
+
+  virtual void ShareLoD(const std::string &in, const std::string &out,
+                        size_t i = 0, size_t j = 0) const = 0;
+
+  virtual bool IsRuntime() const = 0;
+
+  // Note: In while op, we need this to be public
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims);
+
+ protected:
+  virtual framework::DDim GetDim(const std::string &name) const = 0;
+  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+
+  std::vector<framework::DDim> GetDims(
+      const std::vector<std::string> &names) const;
+
+  std::vector<proto::VarDesc::VarType> GetVarTypes(
+      const std::vector<std::string> &names) const;
+
+  virtual proto::VarDesc::VarType GetVarType(const std::string &name) const = 0;
+};
+
+}  // namespace framework
+}  // namespace paddle
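Operators program shape inference purely against this interface; concrete contexts (compile time and runtime) supply the pure virtuals. A minimal sketch for a hypothetical elementwise-style op with input `"X"` and output `"Out"` (both names illustrative, not defined in this patch):

```c++
#include "paddle/framework/shape_inference.h"
#include "paddle/platform/enforce.h"

// Hypothetical elementwise shape inference: Out simply takes X's shape.
void ElementwiseInferShape(paddle::framework::InferShapeContext* ctx) {
  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
  PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
  ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  ctx->ShareLoD("X", "Out");
}
```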
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
new file mode 100644
index 0000000000..f922e60624
--- /dev/null
+++ b/paddle/framework/tensor.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {}
+}  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
new file mode 100644
index 0000000000..4aaa29d794
--- /dev/null
+++ b/paddle/framework/tensor.h
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstdint>
+#include <cstring>
+#include <memory>
+#include <typeindex>
+#include <vector>
+
+#include "paddle/framework/data_layout.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+
+namespace framework {
+
+class LoDTensor;
+
+class Tensor {
+ public:
+  template <typename T, size_t D, int MajorType, typename IndexType>
+  friend struct EigenTensor;
+
+  template <typename T, int MajorType, typename IndexType>
+  friend struct EigenMatrix;
+
+  template <typename T, int MajorType, typename IndexType>
+  friend struct EigenVector;
+
+ public:
+  Tensor() : offset_(0) {}
+
+  /*! Return a pointer to mutable memory block. */
+  template <typename T>
+  inline T* data();
+
+  /*! Return a pointer to constant memory block. */
+  template <typename T>
+  inline const T* data() const;
+
+  inline bool IsInitialized() const;
+
+  inline void switch_place(platform::Place new_place);
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    Allocates the memory block if it does not exist yet.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  inline void* mutable_data(platform::Place place, std::type_index type);
+
+  inline void* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      Allocates the memory block if it does not exist yet.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Return the numel of the memory block. */
+  inline int64_t numel() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline Tensor& Resize(const DDim& dims);
+
+  /*! Make this tensor share the same underlying memory block as src. */
+  inline Tensor& ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief  Return a sub-tensor of the given tensor.
+   *
+   * @param[in] begin_idx   The index of the start row(inclusive) to slice.
+   *                        The index number begins from 0.
+   * @param[in] end_idx     The index of the end row(exclusive) to slice.
+   *                        The index number begins from 0.
+   */
+  inline Tensor Slice(int begin_idx, int end_idx) const;
+
+  platform::Place place() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::place() is called.");
+    return holder_->place();
+  }
+
+  std::type_index type() const {
+    PADDLE_ENFORCE_NOT_NULL(
+        holder_, "Tensor not initialized yet when Tensor::type() is called.");
+    return holder_->type();
+  }
+
+  size_t memory_size() const;
+
+  inline void check_memory_size() const;
+
+  inline DataLayout layout() const { return layout_; }
+
+  inline void set_layout(const DataLayout layout) { layout_ = layout; }
+
+ private:
+  friend class LoDTensor;
+
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a template
+   *          parameter of Tensor.
+   */
+  struct Placeholder {
+    virtual ~Placeholder() = default;
+    virtual void* ptr() const = 0;
+    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
+    virtual platform::Place place() const = 0;
+    virtual void set_type(std::type_index type) = 0;
+  };
+
+  template <typename Place>
+  struct PlaceholderImpl : public Placeholder {
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
+          place_(place),
+          size_(size),
+          type_(type) {
+      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory for allocation.",
+                              (is_cpu_place(place_) ? "CPU" : "GPU"));
+    }
+
+    virtual size_t size() const { return size_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
+
+    /*! the pointer of memory block. */
+    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
+
+    /*! the place of memory block. */
+    platform::Place place_;
+
+    /*! the size of memory block. */
+    size_t size_;
+
+    /* the current type of memory */
+    std::type_index type_;
+  };
+
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /**
+   * @brief The dimensions of the tensor's elements.
+   *
+   * @note dims_ does not indicate the size of the memory block.
+   */
+  DDim dims_;
+
+  /**
+   * @brief The layout of the memory block, default is NHWC.
+   *
+   * @note The memory allocation order describes how weights/data are stored.
+   *       For example, a 4-D tensor (rank = 4) has three commonly used
+   *       layouts: NCHW, NHWC, and CHWN, where N, C, H, and W stand for
+   *       the batch size, the number of feature maps, the height, and
+   *       the width, respectively.
+   */
+  DataLayout layout_ = DataLayout::kNHWC;
+
+  /**
+   * @brief   A Placeholder may be shared by more than one tensor.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          Placeholder::ptr_ and where the tensor data really begins.
+   */
+  size_t offset_;
+};
+
+inline void Tensor::switch_place(platform::Place new_place) {
+  if (holder_->place() == new_place) {
+    return;
+  }
+
+  // TODO(tonyyang-svail): do memcpy here.
+  PADDLE_THROW("Not Implemented");
+}
+
+}  // namespace framework
+}  // namespace paddle
+
+#include "paddle/framework/tensor_impl.h"
diff --git a/paddle/majel/README.md b/paddle/framework/tensor.md
similarity index 99%
rename from paddle/majel/README.md
rename to paddle/framework/tensor.md
index 7a80816d8e..0a27ac9bb6 100644
--- a/paddle/majel/README.md
+++ b/paddle/framework/tensor.md
@@ -71,7 +71,7 @@ private:
 ```
 
 ```c++
-typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<CUDAPlace, CpuPlace> Place;
 typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
                        Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
 typedef boost::variant<
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
new file mode 100644
index 0000000000..1340c5e485
--- /dev/null
+++ b/paddle/framework/tensor_impl.h
@@ -0,0 +1,196 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/memory/memcpy.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename... T>
+struct SizeOfTypeFunctor;
+
+template <typename T>
+struct SizeOfTypeFunctor<T> {
+  size_t operator()(std::type_index type) const {
+    if (typeid(T).hash_code() == type.hash_code()) {
+      return sizeof(T);
+    } else {
+      return 0UL;
+    }
+  }
+};
+
+template <>
+struct SizeOfTypeFunctor<> {
+  size_t operator()(std::type_index type) const { return 0UL; }
+};
+
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+  size_t operator()(std::type_index type) const {
+    SizeOfTypeFunctor<HEAD> head;
+    size_t head_size = head(type);
+    if (head_size != 0) {
+      return head_size;
+    }
+    SizeOfTypeFunctor<TAIL...> tail;
+    return tail(type);
+  }
+};
+
+static inline size_t SizeOfType(std::type_index type) {
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
+  size_t size = functor(type);
+  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
+  return size;
+}
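+
+// Editor's note: for example, SizeOfType(typeid(float)) yields sizeof(float),
+// while an unregistered type such as std::string trips the enforce above.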
+
+inline void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE_NOT_NULL(
+      holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE_GE(
+      holder_->size(), memory_size() + offset_,
+      "Tensor's dims_ is out of bounds. Call Tensor::mutable_data "
+      "first to re-allocate memory,\n"
+      "or the required data type may mismatch the data already stored.");
+}
+
+inline size_t Tensor::memory_size() const {
+  return holder_ == nullptr ? 0UL : numel() * SizeOfType(type());
+}
+
+template <typename T>
+inline const T* Tensor::data() const {
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type; it holds %s",
+                 this->holder_->type().name());
+
+  return reinterpret_cast<const T*>(
+      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+}
+
+inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
+
+template <typename T>
+inline T* Tensor::data() {
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type; it holds %s",
+                 this->holder_->type().name());
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  Resize(dims);
+  return mutable_data<T>(place);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+}
+
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
+  PADDLE_ENFORCE_GT(
+      numel(), 0,
+      "When calling this method, the Tensor's numel must be larger than zero. "
+      "Please check that Tensor::Resize has been called first.");
+  int64_t size = numel() * SizeOfType(type);
+  /* some versions of boost::variant don't have operator!= */
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
+    } else if (platform::is_gpu_place(place)) {
+#ifndef PADDLE_WITH_CUDA
+      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+    }
+#else
+      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
+          boost::get<platform::CUDAPlace>(place), size, type));
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+inline void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable_data when the tensor holds no memory.");
+  return mutable_data(place, holder_->type());
+}
+
+inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size();
+  *this = src;
+  return *this;
+}
+
+inline Tensor Tensor::Slice(int begin_idx, int end_idx) const {
+  check_memory_size();
+  PADDLE_ENFORCE_GE(begin_idx, 0,
+                    "The start row index must be greater than or equal to 0.");
+  PADDLE_ENFORCE_LE(end_idx, dims_[0], "The end row index is out of bounds.");
+  PADDLE_ENFORCE_LT(
+      begin_idx, end_idx,
+      "The start row index must be less than the end row index.");
+
+  if (dims_[0] == 1) {
+    return *this;
+  } else {
+    size_t base = numel() / dims_[0];
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.set_layout(layout_);
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.Resize(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
+    return dst;
+  }
+}
+
+inline Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
+
+inline const DDim& Tensor::dims() const { return dims_; }
+
+inline int64_t Tensor::numel() const { return product(dims_); }
+
+inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
+  Tensor res;
+  res.ShareDataWith(src);
+  res.Resize(flatten_to_2d(src.dims(), num_col_dims));
+  return res;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
new file mode 100644
index 0000000000..9a387526ac
--- /dev/null
+++ b/paddle/framework/tensor_test.cc
@@ -0,0 +1,215 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/tensor.h"
+#include <gtest/gtest.h>
+#include <string>
+
+namespace framework = paddle::framework;
+namespace platform = paddle::platform;
+
+TEST(Tensor, Dims) {
+  framework::Tensor tt;
+  tt.Resize({2, 3, 4});
+  framework::DDim dims = tt.dims();
+  ASSERT_EQ(arity(dims), 3);
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(i + 2, dims[i]);
+  }
+}
+
+TEST(Tensor, DataAssert) {
+  framework::Tensor src_tensor;
+
+  bool caught = false;
+  try {
+    src_tensor.data<double>();
+  } catch (platform::EnforceNotMet err) {
+    caught = true;
+    std::string msg =
+        "holder_ should not be null\nTensor holds no memory. Call "
+        "Tensor::mutable_data first.";
+    const char* what = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(what[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+}
+
+TEST(Tensor, MutableData) {
+  {
+    framework::Tensor src_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CPUPlace());
+    EXPECT_NE(p1, nullptr);
+    // set src_tensor a new dim with larger size
+    // memory is supposed to be re-allocated
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CPUPlace());
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set src_tensor a new dim with the same size
+    // memory block is supposed to be unchanged
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CPUPlace());
+    EXPECT_EQ(p1, p2);
+    // set src_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CPUPlace());
+    EXPECT_EQ(p1, p2);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    framework::Tensor src_tensor;
+    float* p1 = nullptr;
+    float* p2 = nullptr;
+    // initialization
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({1, 2, 3}),
+                                        platform::CUDAPlace());
+    EXPECT_NE(p1, nullptr);
+    // set src_tensor a new dim with larger size
+    // memory is supposed to be re-allocated
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({3, 4}),
+                                        platform::CUDAPlace());
+    EXPECT_NE(p2, nullptr);
+    EXPECT_NE(p1, p2);
+    // set src_tensor a new dim with the same size
+    // memory block is supposed to be unchanged
+    p1 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2, 3}),
+                                        platform::CUDAPlace());
+    EXPECT_EQ(p1, p2);
+    // set src_tensor a new dim with smaller size
+    // memory block is supposed to be unchanged
+    p2 = src_tensor.mutable_data<float>(framework::make_ddim({2, 2}),
+                                        platform::CUDAPlace());
+    EXPECT_EQ(p1, p2);
+  }
+#endif
+}
+
+TEST(Tensor, ShareDataWith) {
+  {
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    // Try to share data from an uninitialized tensor
+    bool caught = false;
+    try {
+      dst_tensor.ShareDataWith(src_tensor);
+    } catch (paddle::platform::EnforceNotMet err) {
+      caught = true;
+      std::string msg =
+          "holder_ should not be null\nTensor holds no memory. Call "
+          "Tensor::mutable_data first.";
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
+      }
+    }
+    ASSERT_TRUE(caught);
+
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CPUPlace());
+    dst_tensor.ShareDataWith(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    framework::Tensor src_tensor;
+    framework::Tensor dst_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({2, 3, 4}),
+                                 platform::CUDAPlace());
+    dst_tensor.ShareDataWith(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+#endif
+}
+
+TEST(Tensor, Slice) {
+  {
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<int>(framework::make_ddim({5, 3, 4}),
+                                 platform::CPUPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(1, 3);
+    framework::DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 3);
+    EXPECT_EQ(slice_dims[0], 2);
+    EXPECT_EQ(slice_dims[1], 3);
+    EXPECT_EQ(slice_dims[2], 4);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<int>(src_tensor.dims(), platform::CPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<int>(
+            slice_tensor.dims(), platform::CPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    framework::Tensor src_tensor;
+    src_tensor.mutable_data<double>(framework::make_ddim({6, 9}),
+                                    platform::CUDAPlace());
+    framework::Tensor slice_tensor = src_tensor.Slice(2, 6);
+    framework::DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 2);
+    EXPECT_EQ(slice_dims[0], 4);
+    EXPECT_EQ(slice_dims[1], 9);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+    uintptr_t src_mutable_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.mutable_data<double>(
+            src_tensor.dims(), platform::CUDAPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+    uintptr_t slice_mutable_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.mutable_data<double>(
+            slice_tensor.dims(), platform::CUDAPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+  }
+#endif
+}
+
+TEST(Tensor, ReshapeToMatrix) {
+  framework::Tensor src;
+  int* src_ptr = src.mutable_data<int>({2, 3, 4, 9}, platform::CPUPlace());
+  for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
+    src_ptr[i] = i;
+  }
+  framework::Tensor res = framework::ReshapeToMatrix(src, 2);
+  ASSERT_EQ(res.dims()[0], 2 * 3);
+  ASSERT_EQ(res.dims()[1], 4 * 9);
+}
+
+TEST(Tensor, Layout) {
+  framework::Tensor src;
+  ASSERT_EQ(src.layout(), framework::DataLayout::kNHWC);
+  src.set_layout(framework::DataLayout::kAnyLayout);
+  ASSERT_EQ(src.layout(), framework::DataLayout::kAnyLayout);
+}
diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc
new file mode 100644
index 0000000000..a5b83eaa07
--- /dev/null
+++ b/paddle/framework/tensor_util.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_util.h"
+
+namespace paddle {
+namespace framework {
+template <typename Predicate, typename DevCtx>
+struct AnyDTypeVisitor {
+  Predicate predicate_;
+  const Tensor& tensor_;
+  const DevCtx& ctx_;
+  Tensor* out_;
+
+  AnyDTypeVisitor(Predicate predicate, const Tensor& tensor, const DevCtx& ctx,
+                  Tensor* out)
+      : predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
+
+  template <typename T>
+  void operator()() const {
+    auto t = EigenVector<T>::Flatten(tensor_);
+    auto o = EigenScalar<bool>::From(*out_);
+    // out_ becomes true if predicate_(t) holds for any element.
+    o.device(*ctx_.eigen_device()) = predicate_(t).any();
+  }
+};
+
+template <typename Predicate, typename DevCtx>
+inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
+                    const DevCtx& ctx, framework::Tensor* out) {
+  VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor<Predicate, DevCtx>(
+                                               predicate, tensor, ctx, out));
+}
+
+template <typename Predicate>
+struct AnyVisitor : public boost::static_visitor<bool> {
+  const framework::Tensor& tensor_;
+  Predicate predicate_;
+
+  AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
+      : tensor_(tensor), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  bool operator()(const Place& place) const {
+    framework::Tensor out;
+    out.Resize({1});
+    out.mutable_data<bool>(place);
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    AnyImpl(predicate_, tensor_, *ctx, &out);
+    return this->GetResult(out, place);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CUDAPlace& gpu) const {
+    platform::CPUPlace cpu;
+    framework::Tensor tmp;
+    tmp.Resize({1});
+    tmp.mutable_data<bool>(cpu);
+    auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
+    gpuctx->Wait();
+    Copy(out, cpu, *gpuctx, &tmp);
+    gpuctx->Wait();
+    return GetResult(tmp, cpu);
+  }
+
+  bool GetResult(const framework::Tensor& out,
+                 const platform::CPUPlace& cpu) const {
+    return *out.data<bool>();
+  }
+};
+
+template <typename Predicate>
+inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
+  AnyVisitor<Predicate> visitor(tensor, predicate);
+  auto place = tensor.place();
+  return platform::VisitPlace(place, visitor);
+}
+
+struct HasNANPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isnan()) {
+    // Cast the Eigen vector to a vector of bool: true where the element is NaN.
+    return eigen_vec.isnan();
+  }
+};
+
+bool HasNAN(const framework::Tensor& tensor) {
+  HasNANPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+struct HasInfPredicate {
+  template <typename T>
+  auto operator()(const T& eigen_vec) const
+      -> decltype(std::declval<T>().isinf()) {
+    // Cast the Eigen vector to a vector of bool: true where the element is Inf.
+    return eigen_vec.isinf();
+  }
+};
+
+bool HasInf(const framework::Tensor& tensor) {
+  HasInfPredicate predicate;
+  return Any(tensor, predicate);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util.cu b/paddle/framework/tensor_util.cu
new file mode 120000
index 0000000000..b00e6e59d9
--- /dev/null
+++ b/paddle/framework/tensor_util.cu
@@ -0,0 +1 @@
+./tensor_util.cc
\ No newline at end of file
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
new file mode 100644
index 0000000000..b49c614499
--- /dev/null
+++ b/paddle/framework/tensor_util.h
@@ -0,0 +1,333 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+/**
+ * @brief   Copy the content of an external tensor to a new place.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] dst_place  The dst place.
+ * @param[in] ctx        The device context containing device resources.
+ *
+ * @note    Copy supports CPU <-> CPU, CPU <-> GPU, and GPU <-> GPU.
+ */
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+                 const platform::DeviceContext& ctx, Tensor* dst) {
+  VLOG(3) << "Copy " << src.dims() << " from " << src.place() << " to "
+          << dst_place;
+  src.check_memory_size();
+
+  dst->Resize(src.dims());
+  dst->set_layout(src.layout());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::CUDAPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::CUDAPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::CUDAPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
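+
+// A minimal CPU-to-CPU usage sketch (editor's illustration):
+//   framework::Tensor src, dst;
+//   src.mutable_data<float>(framework::make_ddim({2, 2}),
+//                           platform::CPUPlace());
+//   platform::CPUDeviceContext ctx((platform::CPUPlace()));
+//   framework::Copy(src, platform::CPUPlace(), ctx, &dst);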
+
+/**
+ * @brief Wrapper on
+ *     Copy(const Tensor& src, const platform::Place& dst_place,
+ *              const platform::DeviceContext& ctx, Tensor* dst);
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] dst_place  The dst place.
+ *
+ * @note    Copy supports CPU <-> CPU, CPU <-> GPU, and GPU <-> GPU.
+ */
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+                 Tensor* dst) {
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  const platform::DeviceContext* dev_ctx;
+  if (platform::is_gpu_place(src.place())) {
+    dev_ctx = pool.Get(src.place());
+  } else {
+    dev_ctx = pool.Get(dst_place);
+  }
+  Copy(src, dst_place, *dev_ctx, dst);
+}
+
+/**
+ * @brief   Copy the content of an external vector to a tensor.
+ *
+ * @param[in] src        The external vector.
+ * @param[in] ctx        The device context containing device resources.
+ *
+ * @note    CopyFromVector will resize dst to a 1D tensor with the same
+ *          size as src.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    memory::Copy(
+        boost::get<platform::CUDAPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
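+
+// Usage sketch (editor's illustration): build a 1-D tensor from a vector.
+//   std::vector<int> v = {1, 2, 3};
+//   framework::Tensor t;
+//   platform::CPUDeviceContext ctx((platform::CPUPlace()));
+//   framework::CopyFromVector<int>(v, ctx, &t);  // t.dims() becomes {3}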
+
+/**
+ * @brief CopyFromVector: CPU vector -> CPU Tensor.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src, Tensor* dst) {
+  platform::CPUPlace dst_place = platform::CPUPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
+}
+
+/**
+ * @brief   Copy the content of a tensor to a vector.
+ *
+ * @param[in] src        The external tensor.
+ * @param[in] ctx        The device context containing device resources.
+ *
+ * @note    CopyToVector resizes dst to hold src.numel() elements before
+ *          copying.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::CUDAPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief CopyToVector: CPU Tensor -> CPU vector.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  PADDLE_ENFORCE(platform::is_cpu_place(src.place()));
+
+  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
+               src_ptr, size);
+}
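+
+// Usage sketch (editor's illustration; `cpu_tensor` is a hypothetical,
+// already-initialized CPU tensor):
+//   std::vector<int> out;
+//   framework::CopyToVector<int>(cpu_tensor, &out);  // out.size() == numel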
+
+// Returns true if a tensor contains NAN, i.e., Not A Number.
+bool HasNAN(const framework::Tensor& tensor);
+
+// Returns true if a tensor contains Inf, i.e., Infinity.
+bool HasInf(const framework::Tensor& tensor);
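+
+// Editor's sketch of a quick CPU check (values hypothetical):
+//   framework::Tensor t;
+//   float* p = t.mutable_data<float>({2}, platform::CPUPlace());
+//   p[0] = 1.0f;
+//   p[1] = NAN;
+//   bool bad = framework::HasNAN(t);  // true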
+
+inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
+                              const platform::DeviceContext& dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char*>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    proto::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto* pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto* data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto& gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::CUDAPlace>(tensor.place()),
+                     reinterpret_cast<const void*>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char*>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+
+struct DeserializedDataFunctor {
+  DeserializedDataFunctor(void** buf, Tensor* tensor,
+                          const platform::Place& place)
+      : buf_(buf), tensor_(tensor), place_(place) {}
+
+  template <typename T>
+  void operator()() {
+    *buf_ = tensor_->mutable_data<T>(place_);
+  }
+
+  void** buf_;
+  Tensor* tensor_;
+  platform::Place place_;
+};
+
+inline void DeserializeFromStream(std::istream& is, Tensor* tensor,
+                                  const platform::DeviceContext& dev_ctx) {
+  uint32_t version;
+  is.read(reinterpret_cast<char*>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  proto::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char*>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char*>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+    void* buf;
+    auto ctx = platform::CPUDeviceContext();
+    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      Tensor cpu_tensor;
+      cpu_tensor.Resize(framework::make_ddim(dims));
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      auto dst_place = dev_ctx.GetPlace();
+      framework::Copy(cpu_tensor, dst_place, dev_ctx, tensor);
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), tensor->memory_size());
+    }
+  }
+}
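+
+// Round-trip usage sketch (editor's illustration, CPU case; `tensor`,
+// `restored`, and `cpu_ctx` are hypothetical names):
+//   std::ostringstream oss;
+//   SerializeToStream(oss, tensor, cpu_ctx);
+//   std::istringstream iss(oss.str());
+//   DeserializeFromStream(iss, &restored, cpu_ctx);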
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
new file mode 100644
index 0000000000..906b0b5656
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cc
@@ -0,0 +1,309 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/framework/tensor_util.h"
+#include <gtest/gtest.h>
+#include <cmath>
+#include <string>
+
+namespace paddle {
+namespace framework {
+
+TEST(Copy, Tensor) {
+  Tensor src_tensor;
+  Tensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+
+  int* src_ptr =
+      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  src_tensor.set_layout(DataLayout::kAnyLayout);
+
+  auto cpu_place = new platform::CPUPlace();
+  Copy(src_tensor, *cpu_place, &dst_tensor);
+
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  Copy(slice_tensor, *cpu_place, &dst_tensor);
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+  EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr =
+        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    // CPU Tensor to GPU Tensor
+    auto gpu_place = new platform::CUDAPlace(0);
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    auto cpu_place = new platform::CPUPlace();
+    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+
+    // CPU Slice Tensor to GPU Tensor
+    Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+
+    EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
+  }
+#endif
+}
+
+TEST(CopyFromVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CopyFromVector<int>(src_vec, &cpu_tensor);
+
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, &cpu_tensor);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::CUDAPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    // Copy from GPU to CPU tensor for comparison
+    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
+TEST(CopyToVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    for (int i = 0; i < 3 * 3; ++i) {
+      src_ptr[i] = i;
+    }
+
+    CPUPlace place;
+    std::vector<int> dst;
+    CopyToVector<int>(src, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_ptr[i], dst[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor gpu_tensor;
+    CUDAPlace place;
+    CUDADeviceContext gpu_ctx(place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+
+    std::vector<int> dst;
+    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
+#endif
+}
+
+TEST(HasNAN, CPU) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  float* buf = src.mutable_data<float>({3}, CPUPlace());
+  buf[0] = 0.0;
+  buf[1] = NAN;
+  buf[2] = 0.0;
+
+  ASSERT_TRUE(HasNAN(src));
+}
+
+TEST(HasInf, CPU) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src;
+  double* buf = src.mutable_data<double>({3}, CPUPlace());
+  buf[0] = 1.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.0;
+  ASSERT_TRUE(HasInf(src));
+}
+
+TEST(Tensor, SerializeAndDeserialize) {
+  framework::Tensor src_tensor;
+  int array[6] = {1, 2, 3, 4, 5, 6};
+  src_tensor.Resize({2, 3});
+  int* src_ptr = src_tensor.mutable_data<int>(platform::CPUPlace());
+  for (int i = 0; i < 6; ++i) {
+    src_ptr[i] = array[i];
+  }
+  {
+    framework::Tensor dst_tensor;
+    auto place = new platform::CPUPlace();
+    platform::CPUDeviceContext cpu_ctx(*place);
+    std::ostringstream oss;
+    SerializeToStream(oss, src_tensor, cpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < 6; ++i) {
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+    ASSERT_EQ(dst_tensor.dims(), src_tensor.dims());
+    delete place;
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor gpu_tensor;
+    gpu_tensor.Resize({2, 3});
+    Tensor dst_tensor;
+
+    auto gpu_place = new platform::CUDAPlace();
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+
+    Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    std::ostringstream oss;
+    SerializeToStream(oss, gpu_tensor, gpu_ctx);
+
+    std::istringstream iss(oss.str());
+    DeserializeFromStream(iss, &dst_tensor, gpu_ctx);
+
+    int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
+    for (int i = 0; i < 6; ++i) {
+      ASSERT_EQ(dst_ptr[i], array[i]);
+    }
+    delete gpu_place;
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_util_test.cu b/paddle/framework/tensor_util_test.cu
new file mode 100644
index 0000000000..ebd35fdf6c
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cu
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace framework {
+
+static __global__ void FillNAN(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = NAN;
+}
+static __global__ void FillInf(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = INFINITY;
+  buf[2] = 0.5;
+}
+
+TEST(HasNAN, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasNAN(tensor));
+}
+
+TEST(HasInf, GPU) {
+  Tensor tensor;
+  platform::CUDAPlace gpu(0);
+  auto& pool = platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  float* buf = tensor.mutable_data<float>({3}, gpu);
+  FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+  cuda_ctx->Wait();
+  ASSERT_TRUE(HasInf(tensor));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc
new file mode 100644
index 0000000000..b7d7c00bcf
--- /dev/null
+++ b/paddle/framework/threadpool.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/threadpool.h"
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+std::unique_ptr<ThreadPool> ThreadPool::threadpool_(nullptr);
+std::once_flag ThreadPool::init_flag_;
+
+ThreadPool* ThreadPool::GetInstance() {
+  std::call_once(init_flag_, &ThreadPool::Init);
+  return threadpool_.get();
+}
+
+void ThreadPool::Init() {
+  if (threadpool_.get() == nullptr) {
+    // TODO(Yancey1989): specify the max threads number
+    int num_threads = std::thread::hardware_concurrency();
+    PADDLE_ENFORCE_GT(num_threads, 0);
+    threadpool_.reset(new ThreadPool(num_threads));
+  }
+}
+
+ThreadPool::ThreadPool(int num_threads)
+    : total_threads_(num_threads), idle_threads_(num_threads), running_(true) {
+  threads_.resize(num_threads);
+  for (auto& thread : threads_) {
+    // TODO(Yancey1989): binding the thread on the specify CPU number
+    thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this)));
+  }
+}
+
+ThreadPool::~ThreadPool() {
+  {
+    // notify all threads to stop running
+    running_ = false;
+    scheduled_.notify_all();
+  }
+
+  for (auto& t : threads_) {
+    t->join();
+    t.reset(nullptr);
+  }
+}
+
+void ThreadPool::Wait() {
+  std::unique_lock<std::mutex> lock(mutex_);
+  completed_.wait(lock, [=] { return Done(); });
+}
+
+void ThreadPool::TaskLoop() {
+  while (running_) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; });
+
+    if (!running_) {
+      break;
+    }
+    // pop a task from the task queue
+    auto task = std::move(tasks_.front());
+    tasks_.pop();
+
+    --idle_threads_;
+    lock.unlock();
+
+    // run the task
+    task();
+
+    {
+      std::unique_lock<std::mutex> lock(mutex_);
+      ++idle_threads_;
+      if (Done()) {
+        completed_.notify_all();
+      }
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
new file mode 100644
index 0000000000..4e9b58679d
--- /dev/null
+++ b/paddle/framework/threadpool.h
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <functional>
+#include <future>
+#include <mutex>
+#include <queue>
+#include <thread>
+#include <vector>
+
+#include "paddle/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN
+
+namespace paddle {
+namespace framework {
+
+// ThreadPool maintains a queue of tasks, and runs them using a fixed
+// number of threads.
+class ThreadPool {
+ public:
+  typedef std::packaged_task<void()> Task;
+
+  // Returns the singleton of ThreadPool.
+  static ThreadPool* GetInstance();
+
+  ~ThreadPool();
+
+  // Returns the number of threads created by the constructor.
+  size_t Threads() const { return total_threads_; }
+
+  // Returns the number of currently idle threads.
+  size_t IdleThreads() {
+    std::unique_lock<std::mutex> lock(mutex_);
+    return idle_threads_;
+  }
+
+  // Run pushes a function to the task queue and returns a std::future
+  // object.  To wait for the completion of the task, call
+  // std::future::wait().
+  template <typename Callback>
+  std::future<void> Run(Callback fn) {
+    std::unique_lock<std::mutex> lock(mutex_);
+    Task task(std::bind(fn));
+    std::future<void> f = task.get_future();
+    tasks_.push(std::move(task));
+    lock.unlock();
+    scheduled_.notify_one();
+    return f;
+  }
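+
+  // Usage sketch (editor's illustration; `pool` is a hypothetical
+  // ThreadPool* obtained from GetInstance()):
+  //   std::future<void> f = pool->Run([] { /* do some work */ });
+  //   f.wait();  // block until the task has finished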
+
+  // Wait until all the tasks are completed.
+  void Wait();
+
+ private:
+  DISABLE_COPY_AND_ASSIGN(ThreadPool);
+
+  explicit ThreadPool(int num_threads);
+
+  // If the task queue is empty and idle_threads_ equals the number of
+  // threads, then all tasks are completed; returns true in that case.
+  // Note: this function is not thread-safe, so the caller must hold mutex_.
+  // Note: don't delete the data member total_threads_ and use
+  // threads_.size() instead, because you'd need to lock the mutex
+  // before accessing threads_.
+  bool Done() { return tasks_.empty() && idle_threads_ == total_threads_; }
+
+  // The constructor starts threads to run TaskLoop, which retrieves
+  // and runs tasks from the queue.
+  void TaskLoop();
+
+  // Init is called by GetInstance.
+  static void Init();
+
+ private:
+  static std::unique_ptr<ThreadPool> threadpool_;
+  static std::once_flag init_flag_;
+
+  std::vector<std::unique_ptr<std::thread>> threads_;
+  const size_t total_threads_;
+  size_t idle_threads_;
+
+  std::queue<Task> tasks_;
+  std::mutex mutex_;
+  bool running_;
+  std::condition_variable scheduled_;
+  std::condition_variable completed_;
+};
+
+// Run a function asynchronously.
+// NOTE: The function must return void. If the function needs to return a
+// value, you can use a lambda to capture a value pointer.
+template <typename Callback>
+std::future<void> Async(Callback callback) {
+  return ThreadPool::GetInstance()->Run(callback);
+}
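+
+// Editor's sketch: returning a value through a captured reference.
+//   int result = 0;
+//   framework::Async([&result] { result = 42; }).wait();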
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc
new file mode 100644
index 0000000000..3fbfe7efc8
--- /dev/null
+++ b/paddle/framework/threadpool_test.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <atomic>
+
+#include "threadpool.h"
+
+namespace framework = paddle::framework;
+
+void do_sum(framework::ThreadPool* pool, std::atomic<int>& sum, int cnt) {
+  std::vector<std::future<void>> fs;
+  for (int i = 0; i < cnt; ++i) {
+    fs.push_back(framework::Async([&sum]() { sum.fetch_add(1); }));
+  }
+}
+
+TEST(ThreadPool, ConcurrentInit) {
+  framework::ThreadPool* pool;
+  int n = 50;
+  std::vector<std::thread> threads;
+  for (int i = 0; i < n; ++i) {
+    std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); });
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+}
+
+TEST(ThreadPool, ConcurrentRun) {
+  framework::ThreadPool* pool = framework::ThreadPool::GetInstance();
+  std::atomic<int> sum(0);
+  std::vector<std::thread> threads;
+  int n = 50;
+  // sum = (n * (n + 1)) / 2
+  for (int i = 1; i <= n; ++i) {
+    std::thread t(do_sum, pool, std::ref(sum), i);
+    threads.push_back(std::move(t));
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+  pool->Wait();
+  EXPECT_EQ(sum, ((n + 1) * n) / 2);
+}
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
new file mode 100644
index 0000000000..1eedbbc419
--- /dev/null
+++ b/paddle/framework/type_defs.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+class OpDesc;
+class InferShapeContext;
+class BlockDesc;
+
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+// The order should be the same as in framework.proto.
+using Attribute =
+    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                   std::vector<float>, std::vector<std::string>, bool,
+                   std::vector<bool>, BlockDesc*, int64_t>;
+
+using AttributeMap = std::unordered_map<std::string, Attribute>;
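+
+// Editor's example of populating an AttributeMap (keys are hypothetical):
+//   AttributeMap attrs;
+//   attrs["scale"] = 2.0f;
+//   attrs["shape"] = std::vector<int>{2, 3};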
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDesc>>(
+    const OpDesc&, const std::unordered_set<std::string>& /*no_grad_set*/,
+    std::unordered_map<std::string, std::string>* /*grad_to_var*/,
+    const std::vector<BlockDesc*>& grad_block)>;
+
+using InferVarTypeFN =
+    std::function<void(const OpDesc& /*op_desc*/, BlockDesc* /*block*/)>;
+
+using InferShapeFN = std::function<void(InferShapeContext*)>;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
new file mode 100644
index 0000000000..62ab6593ef
--- /dev/null
+++ b/paddle/framework/var_desc.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/var_desc.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+proto::VarDesc::VarType VarDesc::GetType() const { return desc_.type(); }
+
+void VarDesc::SetType(proto::VarDesc::VarType type) { desc_.set_type(type); }
+
+void VarDesc::SetShape(const std::vector<int64_t> &dims) {
+  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
+}
+
+void VarDesc::SetDataType(proto::DataType data_type) {
+  mutable_tensor_desc()->set_data_type(data_type);
+}
+
+std::vector<int64_t> VarDesc::Shape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+proto::DataType VarDesc::GetDataType() const {
+  return tensor_desc().data_type();
+}
+
+void VarDesc::SetLoDLevel(int32_t lod_level) {
+  switch (desc_.type()) {
+    case proto::VarDesc::LOD_TENSOR:
+      desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+      break;
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      desc_.mutable_tensor_array()->set_lod_level(lod_level);
+      break;
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
+  }
+}
+
+int32_t VarDesc::GetLoDLevel() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().lod_level();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().lod_level();
+    default:
+      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
+                   desc_.tensor_array().lod_level());
+  }
+}
+
+const proto::TensorDesc &VarDesc::tensor_desc() const {
+  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  switch (desc_.type()) {
+    case proto::VarDesc::SELECTED_ROWS:
+      return desc_.selected_rows();
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().tensor();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.tensor_array().tensor();
+    default:
+      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+  }
+}
+
+proto::TensorDesc *VarDesc::mutable_tensor_desc() {
+  PADDLE_ENFORCE(desc_.has_type(),
+                 "The type must be set before calling mutable_tensor_desc()");
+  switch (desc_.type()) {
+    case proto::VarDesc::SELECTED_ROWS:
+      return desc_.mutable_selected_rows();
+    case proto::VarDesc::LOD_TENSOR:
+      return desc_.mutable_lod_tensor()->mutable_tensor();
+    case proto::VarDesc::LOD_TENSOR_ARRAY:
+      return desc_.mutable_tensor_array()->mutable_tensor();
+    default:
+      PADDLE_THROW("Unexpected branch.");
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
new file mode 100644
index 0000000000..9316b14bb6
--- /dev/null
+++ b/paddle/framework/var_desc.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+// Helpers to convert between std::vector and protobuf repeated fields.
+template <typename T>
+inline std::vector<T> RepeatedToVector(
+    const google::protobuf::RepeatedField<T> &repeated_field) {
+  std::vector<T> ret;
+  ret.reserve(repeated_field.size());
+  std::copy(repeated_field.begin(), repeated_field.end(),
+            std::back_inserter(ret));
+  return ret;
+}
+
+template <typename T, typename RepeatedField>
+inline void VectorToRepeated(const std::vector<T> &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Clear();
+  repeated_field->Reserve(vec.size());
+  for (const auto &elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+
+// Specialize vector<bool>.
+template <typename RepeatedField>
+inline void VectorToRepeated(const std::vector<bool> &vec,
+                             RepeatedField *repeated_field) {
+  repeated_field->Clear();
+  repeated_field->Reserve(vec.size());
+  for (auto elem : vec) {
+    *repeated_field->Add() = elem;
+  }
+}
+
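+// Typical usage (a minimal sketch; a variable defaults to LOD_TENSOR):
+//   VarDesc var("x");
+//   var.SetShape({32, 128});
+//   var.SetLoDLevel(0);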
+class VarDesc {
+ public:
+  explicit VarDesc(const std::string &name) {
+    desc_.set_name(name);
+    desc_.set_type(proto::VarDesc::LOD_TENSOR);
+  }
+
+  explicit VarDesc(const proto::VarDesc &desc) : desc_(desc) {}
+
+  proto::VarDesc *Proto() { return &desc_; }
+
+  std::string Name() const { return desc_.name(); }
+
+  void SetName(std::string name) { desc_.set_name(name); }
+
+  void SetShape(const std::vector<int64_t> &dims);
+
+  void SetDataType(proto::DataType data_type);
+
+  std::vector<int64_t> Shape() const;
+
+  proto::DataType GetDataType() const;
+
+  void SetLoDLevel(int32_t lod_level);
+
+  int32_t GetLoDLevel() const;
+
+  proto::VarDesc::VarType GetType() const;
+
+  void SetType(proto::VarDesc::VarType type);
+
+  bool Persistable() const { return desc_.persistable(); }
+
+  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
+
+ private:
+  const proto::TensorDesc &tensor_desc() const;
+  proto::TensorDesc *mutable_tensor_desc();
+
+  proto::VarDesc desc_;
+};
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
new file mode 100644
index 0000000000..5b7a08a087
--- /dev/null
+++ b/paddle/framework/var_type.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+inline proto::VarDesc::VarType ToVarType(std::type_index type) {
+  if (type.hash_code() == typeid(LoDTensor).hash_code()) {
+    return proto::VarDesc_VarType_LOD_TENSOR;
+  } else if (type.hash_code() == typeid(LoDRankTable).hash_code()) {
+    return proto::VarDesc_VarType_LOD_RANK_TABLE;
+  } else if (type.hash_code() == typeid(LoDTensorArray).hash_code()) {
+    return proto::VarDesc_VarType_LOD_TENSOR_ARRAY;
+  } else if (type.hash_code() == typeid(SelectedRows).hash_code()) {
+    return proto::VarDesc_VarType_SELECTED_ROWS;
+  } else {
+    PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
+  }
+}
+
+template <typename Visitor>
+inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
+  switch (ToVarType(var.Type())) {
+    case proto::VarDesc_VarType_LOD_TENSOR:
+      visitor(var.Get<framework::LoDTensor>());
+      return;
+    case proto::VarDesc_VarType_LOD_RANK_TABLE:
+      visitor(var.Get<LoDRankTable>());
+      return;
+    case proto::VarDesc_VarType_LOD_TENSOR_ARRAY:
+      visitor(var.Get<LoDTensorArray>());
+      return;
+    case proto::VarDesc_VarType_SELECTED_ROWS:
+      visitor(var.Get<SelectedRows>());
+      return;
+    default:
+      PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
+  }
+}
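+
+// A minimal sketch of a visitor usable with VisitVarType: a functor with a
+// templated operator() receives whichever concrete type the variable holds.
+//   struct LogTypeVisitor {
+//     template <typename T>
+//     void operator()(const T&) const { VLOG(3) << typeid(T).name(); }
+//   };
+//   VisitVarType(var, LogTypeVisitor());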
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h
new file mode 100644
index 0000000000..6c11f2fee7
--- /dev/null
+++ b/paddle/framework/var_type_inference.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/type_defs.h"
+
+namespace paddle {
+namespace framework {
+
+class VarTypeInference {
+ public:
+  virtual ~VarTypeInference() {}
+  virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc
new file mode 100644
index 0000000000..fa6018b1c5
--- /dev/null
+++ b/paddle/framework/var_type_inference_test.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/var_type_inference.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class SumOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {
+    auto &inputs = op_desc.Input("X");
+    auto default_var_type = proto::VarDesc::SELECTED_ROWS;
+
+    bool any_input_is_lod_tensor = std::any_of(
+        inputs.begin(), inputs.end(), [block](const std::string &name) {
+          return block->Var(name)->GetType() == proto::VarDesc::LOD_TENSOR;
+        });
+    if (any_input_is_lod_tensor) {
+      default_var_type = proto::VarDesc::LOD_TENSOR;
+    }
+
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(default_var_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
+                  paddle::framework::SumOpMaker);
+
+namespace paddle {
+namespace framework {
+
+TEST(InferVarType, sum_op) {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"test_a", "test_b", "test_c"});
+  op->SetOutput("Out", {"test_out"});
+
+  prog.MutableBlock(0)->Var("test_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_c")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test_out");
+
+  op->InferVarType(prog.MutableBlock(0));
+
+  ASSERT_EQ(proto::VarDesc::SELECTED_ROWS,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+
+  prog.MutableBlock(0)->Var("test_b")->SetType(proto::VarDesc::LOD_TENSOR);
+  op->InferVarType(prog.MutableBlock(0));
+  ASSERT_EQ(proto::VarDesc::LOD_TENSOR,
+            prog.MutableBlock(0)->Var("test_out")->GetType());
+}
+
+TEST(InferVarType, sum_op_without_infer_var_type) {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum_without_infer_var_type");
+  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
+  op->SetOutput("Out", {"test2_out"});
+
+  prog.MutableBlock(0)->Var("test2_a")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_b")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_c")->SetType(proto::VarDesc::SELECTED_ROWS);
+  prog.MutableBlock(0)->Var("test2_out");
+
+  op->InferVarType(prog.MutableBlock(0));
+
+  ASSERT_EQ(proto::VarDesc_VarType_LOD_TENSOR,
+            prog.MutableBlock(0)->Var("test2_out")->GetType());
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
new file mode 100644
index 0000000000..3b7ec0a2a9
--- /dev/null
+++ b/paddle/framework/variable.h
@@ -0,0 +1,95 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <memory>
+#include <typeindex>
+#include <typeinfo>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+class Variable {
+ public:
+  template <typename T>
+  const T& Get() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold something");
+    PADDLE_ENFORCE(IsType<T>(),
+                   "Variable must be of type %s, but it holds type %s",
+                   typeid(T).name(), holder_->Type().name());
+    return *static_cast<const T*>(holder_->Ptr());
+  }
+
+  bool IsInitialized() const { return holder_ != nullptr; }
+
+  template <typename T>
+  T* GetMutable() {
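+    // If the variable is empty or holds a different type, the old value is
+    // discarded and a default-constructed T takes its place.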
+    if (!IsType<T>()) {
+      holder_.reset(new PlaceholderImpl<T>(new T()));
+    }
+    return static_cast<T*>(holder_->Ptr());
+  }
+
+  template <typename T>
+  bool IsType() const {
+    return holder_ != nullptr &&
+           std::type_index(typeid(T)) == std::type_index(holder_->Type());
+  }
+
+  void Clear() { holder_.reset(); }
+
+  std::type_index Type() const {
+    PADDLE_ENFORCE(holder_ != nullptr, "Must hold memory");
+    return holder_->Type();
+  }
+
+ private:
+  struct Placeholder {
+    virtual ~Placeholder() {}
+    virtual const std::type_info& Type() const = 0;
+    virtual void* Ptr() const = 0;
+  };
+
+  // Placeholder hides type T, so it doesn't appear as a template
+  // parameter of Variable.
+  template <typename T>
+  struct PlaceholderImpl : public Placeholder {
+    PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {}
+
+    virtual const std::type_info& Type() const { return type_; }
+    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
+
+    std::unique_ptr<T> ptr_;
+    const std::type_info& type_;
+  };
+
+  std::unique_ptr<Placeholder>
+      holder_;  // pointers to a PlaceholderImpl object indeed.
+
+  // name_ is only meaningful with a Scope and accessible by it.
+  //
+  // NOTE: Please don't expose name_ by adding methods like
+  // Variable::Name or Scope::VarName!  A variable could have a human
+  // readable name or an auto-generated scope-unique name.  In the
+  // former case, the caller knows the name and doesn't need to access
+  // the name; in the latter case, the variable should be identified
+  // by its address but not the unreadable name.
+  friend class Scope;
+  const std::string* name_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/variable.md b/paddle/framework/variable.md
new file mode 100644
index 0000000000..442ef6b718
--- /dev/null
+++ b/paddle/framework/variable.md
@@ -0,0 +1,52 @@
+# Design Doc: Variable
+
+
+Variable is also known as *blob* in MXNet and Caffe2.  It is the input and output type of operators, where a neural network is a graph of operators.
+
+## Requirements: Lazy Memory Allocation
+
+For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN.
+
+To use the minimum amount of memory, we would like a variable to allocate memory only when it has to, i.e., lazy memory allocation.  Let's take the following example:
+
+```cpp
+Variable vr, v1, v2;
+
+Tensor* t1 = new Tensor();
+Tensor* t2 = new Tensor();
+
+Randomize(
+  /* malloc */ v1.GetMutable<Tensor>()->mutable_data<float16>(DDim(100, 200)),
+  /* size */ t1->Size());
+
+Randomize(
+  /* malloc */ v2.GetMutable<Tensor>()->mutable_data<float16>(DDim(200, 300)),
+  /* size */ t2->Size());
+
+Mult(
+  /*result*/ vr.GetMutable<Tensor>()->mutable_data<v1.Type()>(SizeOfMult(v1, v2)),
+  /*input1*/ v1.Get<Tensor>().data(),
+  /*input2*/ v2.Get<Tensor>().data());
+```
+
+We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable.  Similarly, a tensor does not get its memory until `Tensor::mutable_data()` is called.
+
+With this syntax, memory allocation happens inside `Randomize` and `Mult`, the functions that mutate the variable, which saves us some lines of C++ code.
+
+
+## Implementation: Type Hiding
+
+To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time.  In other words, `class Variable` cannot be a class template `template <typename T> class Variable`.
+
+Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member.  Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`.
+
+Still, `Variable` needs to know `T` so that it can correctly delete the saved object, and so that `Variable::Get` can check the expected type against the saved object's type.
+
+We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`.  Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter.
+
+Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`.
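+
+The following is a minimal, self-contained sketch of the same idea (names simplified from the real `Variable`):
+
+```cpp
+#include <memory>
+#include <typeinfo>
+
+class Any {
+ public:
+  template <typename T>
+  T* GetMutable() {
+    // Replace the held object when the type does not match.
+    if (holder_ == nullptr || holder_->Type() != typeid(T)) {
+      holder_.reset(new PlaceholderImpl<T>);
+    }
+    return static_cast<T*>(holder_->Ptr());
+  }
+
+ private:
+  struct Placeholder {
+    virtual ~Placeholder() {}
+    virtual const std::type_info& Type() const = 0;
+    virtual void* Ptr() = 0;
+  };
+
+  // T appears only here, not on Any itself.
+  template <typename T>
+  struct PlaceholderImpl : Placeholder {
+    const std::type_info& Type() const override { return typeid(T); }
+    void* Ptr() override { return &obj_; }
+    T obj_;
+  };
+
+  std::unique_ptr<Placeholder> holder_;
+};
+```
+
+An `Any` object can then hold an `int` now and a `std::string` later, with `GetMutable<T>` re-creating the holder whenever the requested type changes.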
+
+
+## Conclusion
+
+The type-hiding technique utilizes C++ class templates, interfaces and derivation, and C++ RTTI (`typeid`).  This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code.
diff --git a/paddle/framework/variable_test.cc b/paddle/framework/variable_test.cc
new file mode 100644
index 0000000000..e5585c8724
--- /dev/null
+++ b/paddle/framework/variable_test.cc
@@ -0,0 +1,41 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/variable.h"
+
+TEST(Variable, GetMutable) {
+  using paddle::framework::Variable;
+
+  struct Tensor {
+    int content_;
+  };
+
+  std::unique_ptr<Variable> v(new Variable());
+
+  Tensor* t = v->GetMutable<Tensor>();
+  t->content_ = 1234;
+
+  const Tensor& tt = v->Get<Tensor>();
+  EXPECT_EQ(1234, tt.content_);
+
+  std::string* s = v->GetMutable<std::string>();
+  *s = "hello";
+
+  const std::string& ss = v->Get<std::string>();
+  EXPECT_EQ("hello", ss);
+}
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
new file mode 100644
index 0000000000..bd0fe119ce
--- /dev/null
+++ b/paddle/function/BlockExpandOp.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * \brief Converts four-dimensional image data (NCHW) into
+ *        three-dimensional sequence data (NST) in the forward calculation,
+ *        and reverses the conversion in the backward calculation. Here N is
+ *        the batch size, S is the length of the sequence after each image is
+ *        expanded, and T is the size of each time step in the sequence.
+ *
+ * Arguments in forward function:
+ * \param inputs[0]  Image data of NCHW format.
+ * \param outputs[0] Sequence data of NST format.
+ *
+ * Arguments in backward function:
+ * \param inputs[0]  Sequence data of NST format.
+ * \param outputs[0] Image data of NCHW format.
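+ *
+ * For example (an assumed configuration, for illustration): a 1 x 1 x 4 x 4
+ * image expanded with 2 x 2 blocks, stride 2 and no padding yields a 2 x 2
+ * grid of blocks, i.e. S = 2 * 2 = 4 time steps, each of size
+ * T = 1 * 2 * 2 = 4.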
+ */
+class BlockExpandFunction : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    blocks_ = config.get<std::vector<size_t>>("blocks");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 1;
+  }
+
+  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
+    // image shape should be 4-dimensional.
+    CHECK_EQ(image.ndims(), (size_t)4);
+    // sequence shape should be 3-dimensional.
+    CHECK_EQ(sequence.ndims(), (size_t)3);
+    // The batchSize of the image needs to be equal to
+    // the batchSize of the sequence.
+    CHECK_EQ(image[0], sequence[0]);
+  }
+
+  // Calculate the shape of colData based on the shape of the image
+  // and the shape of the sequence.
+  TensorShape getColShape(const TensorShape& image,
+                          const TensorShape& sequence) const {
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight =
+        1 +
+        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
+    size_t outputWidth =
+        1 +
+        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
+    CHECK_EQ(seqLength, outputHeight * outputWidth);
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
+
+    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+    return TensorShape({outputHeight,
+                        outputWidth,
+                        inputChannels,
+                        (size_t)blockH(),
+                        (size_t)blockW()});
+  }
+
+protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+template <DeviceType Device>
+class BlockExpandForward : public BlockExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = inputs[0].data<real>();
+    real* seqData = outputs[0].data<real>();
+    Im2ColFunctor<kOCF, Device, real> im2col;
+    for (size_t i = 0; i < batchSize; i++) {
+      // The result of im2col is [outputHeight, outputWidth,
+      // inputChannels, filterHeight, filterWidth], and it is easy to
+      // reshape it into [seqLength, stepSize], where seqLength equals
+      // output_height * output_width and stepSize equals
+      // input_channels * filter_height * filter_width.
+      im2col(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+template <DeviceType Device>
+class BlockExpandBackward : public BlockExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = outputs[0].data<real>();
+    real* seqData = inputs[0].data<real>();
+    Col2ImFunctor<kOCF, Device, real> col2im;
+    for (size_t i = 0; i < batchSize; i++) {
+      col2im(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp
new file mode 100644
index 0000000000..59193a3ec3
--- /dev/null
+++ b/paddle/function/BlockExpandOpTest.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(BlockExpandForward, real) {
+  for (size_t batchSize : {5}) {
+    for (size_t channels : {1, 5}) {
+      for (size_t inputHeight : {5, 33}) {
+        for (size_t inputWidth : {5, 32}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("BlockExpand",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(BlockExpandBackward, real) {
+  for (size_t batchSize : {5}) {
+    for (size_t channels : {1, 5}) {
+      for (size_t inputHeight : {5, 33}) {
+        for (size_t inputWidth : {5, 32}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("BlockExpandGrad",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape),
+                                ADD_TO);
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index 1744f37780..6b8e1e2da9 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -32,7 +32,7 @@ TEST(BufferTest, SequenceIdArg) {
                          sizeOfValuType(VALUE_TYPE_INT32));
   SequenceIdArg buffer(memory.getBuf(), shape);
   EXPECT_EQ(buffer.data(), memory.getBuf());
-  EXPECT_EQ(buffer.numSeqs(), 9);
+  EXPECT_EQ(buffer.numSeqs(), 9U);
 }
 
 }  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 233a53709a..9b2779b42c 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -4,18 +4,31 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 list(APPEND cpp_files BufferArg.cpp)
+list(APPEND cpp_files GemmFunctor.cpp)
+if(USE_EIGEN_FOR_BLAS)
+  list(APPEND cpp_files EigenGemm.cpp)
+endif(USE_EIGEN_FOR_BLAS)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
+if(USE_NNPACK)
+  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
+  if(WITH_TESTING)
+    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
+  endif()
+endif()
+
+list(APPEND cpp_files neon/NeonDepthwiseConv.cpp)
+
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
-add_dependencies(paddle_function gen_proto_cpp)
+add_dependencies(paddle_function paddle_proto)
 
-if(WITH_GPU)
 if(WITH_TESTING)
+if(WITH_GPU)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
@@ -28,7 +41,16 @@ if(WITH_TESTING)
     add_simple_unittest(PadOpTest)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
+    add_simple_unittest(BlockExpandOpTest)
+    add_simple_unittest(CropOpTest)
+    add_simple_unittest(SwitchOpTest)
+    add_simple_unittest(ScaleSubRegionOpTest)
 endif()
+
+add_simple_unittest(Im2ColTest)
+add_simple_unittest(GemmConvOpTest)
+add_simple_unittest(DepthwiseConvOpTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index b87750b742..23916c0f4b 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     CPU,
                     ContextProjectionBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     GPU,
                     ContextProjectionForwardFunc);
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 1a5b404240..4492dea5d8 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "ContextProjectionOp.h"
+#include "hl_base.h"
 
 namespace paddle {
 
@@ -30,7 +30,7 @@ __global__ void KeContextProjectionForward(const real* input,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
@@ -49,8 +49,9 @@ __global__ void KeContextProjectionForward(const real* input,
         } else if ((i + context_start) >= (seq_end - seq_start)) {
           if (padding) {
             value =
-              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
-                         input_dim + idx];
+                weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                           input_dim +
+                       idx];
           } else {
             continue;
           }
@@ -61,7 +62,7 @@ __global__ void KeContextProjectionForward(const real* input,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          output + outy * input_dim * context_length + outx * input_dim;
+            output + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           output_r[idx] += value;
           if (j - outy == outx) break;
@@ -108,13 +109,25 @@ void hl_context_projection_forward(const real* input,
   dim3 grid(blocks_x, blocks_y);
 
   if (weight) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weight, output, input_dim,
-       context_length, context_start, begin_pad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weight, output, input_dim,
-       context_length, context_start, begin_pad);
+    KeContextProjectionForward<true><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
+  } else {
+    KeContextProjectionForward<false><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        input,
+        sequence,
+        weight,
+        output,
+        input_dim,
+        context_length,
+        context_start,
+        begin_pad);
   }
   CHECK_SYNC("hl_context_projection_forward failed");
 }
@@ -148,7 +161,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
   int block_size = blockDim.x;
   int sequenceId = blockIdx.x;
   int seq_start = sequence[sequenceId];
-  int seq_end = sequence[sequenceId+1];
+  int seq_end = sequence[sequenceId + 1];
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
@@ -170,7 +183,7 @@ __global__ void KeContextProjectionBackwardData(const real* out_grad,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          out + outy * input_dim * context_length + outx * input_dim;
+            out + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           value += output_r[idx];
           if (j - outy == outx) break;
@@ -211,8 +224,8 @@ void hl_context_projection_backward_data(const real* out_grad,
   int blocks_y = 1;
   dim3 threads(block_size, 1);
   dim3 grid(blocks_x, blocks_y);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  KeContextProjectionBackwardData<<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad, sequence, input_grad, input_dim, context_length, context_start);
   CHECK_SYNC("hl_context_projection_backward_data failed");
 }
 
@@ -231,7 +244,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                       context_start);
 }
 
-template<int THREADS_X, int THREADS_Y>
+template <int THREADS_X, int THREADS_Y>
 __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,
@@ -254,17 +267,17 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
   if (weight_idx < w_dim) {
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
-      int seq_end = sequence[seqId+1];
-      output_r = const_cast<real*>(out_grad)
-                    + seq_start * w_dim * context_length;
+      int seq_end = sequence[seqId + 1];
+      output_r =
+          const_cast<real*>(out_grad) + seq_start * w_dim * context_length;
 
       if (context_start < 0) {
         if (padId + context_start < 0) {
           instanceId = padId;
         } else {
           // begin_pad > 0;
-          instanceId = (padId - begin_pad) +
-            (seq_end - seq_start) - context_start;
+          instanceId =
+              (padId - begin_pad) + (seq_end - seq_start) - context_start;
         }
       } else {
         if (padId + (seq_end - seq_start) < context_start) {
@@ -275,10 +288,11 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
         }
       }
 
-      int outx = (instanceId - context_length) < 0 ?
-                 instanceId : (context_length - 1);
-      int outy = (instanceId - context_length) < 0 ?
-                 0 : (instanceId - (context_length - 1));
+      int outx =
+          (instanceId - context_length) < 0 ? instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0
+                     ? 0
+                     : (instanceId - (context_length - 1));
       output_r += outy * w_dim * context_length + outx * w_dim;
       for (int j = outy; j < seq_end - seq_start; j++) {
         value += output_r[weight_idx];
@@ -290,7 +304,7 @@ __global__ void KeContextProjectionBackwardWeight(const real* out_grad,
   }
   __syncthreads();
 
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+  for (int stride = THREADS_Y / 2; stride > 0; stride = stride / 2) {
     if (idy < stride) {
       sum_s[idy][idx] += sum_s[idy + stride][idx];
     }
@@ -339,22 +353,27 @@ void hl_context_projection_backward_weight(const real* out_grad,
   dim3 threads(threads_x, threads_y);
   dim3 grid(blocks_x, 1);
 
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (out_grad, sequence, w_grad, num_sequences, w_dim,
-     context_length, context_start, begin_pad);
+  KeContextProjectionBackwardWeight<32,
+                                    32><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      out_grad,
+      sequence,
+      w_grad,
+      num_sequences,
+      w_dim,
+      context_length,
+      context_start,
+      begin_pad);
   CHECK_SYNC("hl_context_projection_backward_weight failed");
 }
 
 template <>
-void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        const GpuMatrix& out_grad,
-        GpuMatrix& w_grad,
-        const GpuIVector& seq_vec,
-        size_t context_length,
-        int context_start,
-        size_t total_pad,
-        size_t begin_pad) {
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                      GpuMatrix& w_grad,
+                                                      const GpuIVector& seq_vec,
+                                                      size_t context_length,
+                                                      int context_start,
+                                                      size_t total_pad,
+                                                      size_t begin_pad) {
   hl_context_projection_backward_weight(out_grad.getData(),
                                         seq_vec.getData(),
                                         w_grad.getData(),
@@ -376,23 +395,18 @@ void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
-    if (in_grad) {
-        ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
-                out_grad,
-                in_grad,
-                sequence,
-                context_length,
-                context_start);
-    }
-    if (is_padding && w_grad) {
-        ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-                out_grad,
-                w_grad,
-                sequence,
-                context_length,
-                context_start,
-                total_pad,
-                begin_pad);
+  if (in_grad) {
+    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+        out_grad, in_grad, sequence, context_length, context_start);
+  }
+  if (is_padding && w_grad) {
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
+                                                     w_grad,
+                                                     sequence,
+                                                     context_length,
+                                                     context_start,
+                                                     total_pad,
+                                                     begin_pad);
   }
 }
 
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 1b25172ca5..9e9dd20e6f 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "ContextProjectionForward",
       FuncConfig()
           .set("context_length", context_length)
@@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "ContextProjectionBackward",
       FuncConfig()
           .set("context_length", context_length)
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
new file mode 100644
index 0000000000..062ea25a11
--- /dev/null
+++ b/paddle/function/ConvOp.h
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/*
+ * \brief ConvFunctionBase is the base class from which the forward
+ *        calculation, backward input calculation, and backward filter
+ *        calculation of convolution operations are implemented.
+ *
+ * Arguments of forward and backward calculation:
+ *   1. Forward calculation of convolution.
+ *      inputs = {INPUT, FILTER}, outputs = {OUTPUT}
+ *      The first and second input arguments are input image and filter data.
+ *      The output argument is output image.
+ *
+ *   2. Backward input calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and filter data.
+ *      The output argument is input grad image.
+ *
+ *   3. Backward filter calculation of convolution.
+ *      inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
+ *      The first and second input arguments are output grad image
+ *      and input image.
+ *      The output argument is filter grad.
+ *
+ * Arguments format of input, filter and output:
+ *   1. The input image, output image, input image gradient, and output image
+ *      gradient are all in NCHW format, where N is the batch size, C is the
+ *      number of channels, and H and W are the height and width of the image
+ *      or image gradient.
+ *
+ *   2. The format of the filter data is MCHW, where M is the number of output
+ *      image channels, C is the number of input image channels, and
+ *      H and W are the height and width of the filter.
+ *
+ *      If `groups` is greater than 1, the filter's data format should be GMCHW,
+ *      where G is the number of `groups`, G * M is the number of output image
+ *      channels, G * C is the number of input image channels, and
+ *      H and W are the height and width of the filter.
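+ *
+ *      For example (illustrative numbers): with groups = 2, 8 output
+ *      channels and 4 input channels, the filter shape is
+ *      [2, 4, 2, filterHeight, filterWidth], since G * M = 8 and G * C = 4.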
+ */
+class ConvFunctionBase : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    dilations_ = config.get<std::vector<size_t>>("dilations");
+    groups_ = config.get<size_t>("groups");
+
+    // number of inputs and outputs
+    numInputs_ = 2;
+    numOutputs_ = 1;
+  }
+
+  // input can be INPUT and INPUT_GRAD
+  // filter can be FILTER and FILTER_GRAD
+  // output can be OUTPUT and OUTPUT_GRAD
+  void checkShape(const TensorShape& input,
+                  const TensorShape& filter,
+                  const TensorShape& output) {
+    // The input and output arguments should be 4-dimensional.
+    CHECK_EQ(input.ndims(), (size_t)4);
+    CHECK_EQ(output.ndims(), (size_t)4);
+    // The batchSize of the input needs to be equal to
+    // the batchSize of the output.
+    CHECK_EQ(input[0], output[0]);
+
+    if (filter.ndims() == (size_t)4) {
+      // If the filter's dimension is 4, groups convolution is not supported.
+      CHECK_EQ(groups_, (size_t)1);
+      // The input and output channel dimensions are the second and first
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[1]);
+      CHECK_EQ(output[1], filter[0]);
+    } else {
+      // filter argument should be 5-dimensional.
+      CHECK_EQ(filter.ndims(), (size_t)5);
+      // The first dimension of the filter is the size of the group
+      CHECK_EQ(filter[0], groups_);
+      // The input and output channel dimensions are the third and second
+      // dimensions of the filter shape.
+      CHECK_EQ(input[1], filter[2] * groups_);
+      CHECK_EQ(output[1], filter[1] * groups_);
+    }
+  }
+
+protected:
+  size_t getFilterHeight(const TensorShape& filter) const {
+    return filter[filter.ndims() - 2];
+  }
+
+  size_t getFilterWidth(const TensorShape& filter) const {
+    return filter[filter.ndims() - 1];
+  }
+
+  // determine whether im2col needs to be performed
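+  // A 1x1 filter with unit stride and zero padding makes the convolution a
+  // plain matrix multiplication over the channel dimension, so the im2col
+  // expansion can be skipped in that case.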
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+             paddingW() == 0);
+  }
+
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> dilations_;
+
+  /// Group size, refer to grouped convolution in
+  /// Alex Krizhevsky's paper: when group=2, the first half of the
+  /// filters are only connected to the first half of the input channels,
+  /// and the second half are only connected to the second half.
+  size_t groups_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int dilationH() const { return dilations_[0]; }
+
+  inline int dilationW() const { return dilations_[1]; }
+
+  // Temporary memory used in the convolution calculation.
+  MemoryHandlePtr memory_;
+
+  template <DeviceType Device>
+  void resizeBuffer(size_t newSize) {
+    if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
+      if (Device == DEVICE_TYPE_CPU) {
+        memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
+      } else {
+        memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/ConvOpTest.h b/paddle/function/ConvOpTest.h
new file mode 100644
index 0000000000..d8d3c792df
--- /dev/null
+++ b/paddle/function/ConvOpTest.h
@@ -0,0 +1,275 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FunctionTest.h"
+
+namespace paddle {
+
+template <DeviceType DType1, DeviceType DType2>
+void forward(Compare2Function<DType1, DType2>& test,
+             const TensorShape& input,
+             const TensorShape& filter,
+             const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_input(Compare2Function<DType1, DType2>& test,
+                    const TensorShape& input,
+                    const TensorShape& filter,
+                    const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_filter(Compare2Function<DType1, DType2>& test,
+                     const TensorShape& input,
+                     const TensorShape& filter,
+                     const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+using Function = void (*)(Compare2Function<DType1, DType2>& test,
+                          const TensorShape& input,
+                          const TensorShape& filter,
+                          const TensorShape& output);
+
+/**
+ * \brief A basic convolution function test interface.
+ *
+ * \param conv1         type name of convolution function 1.
+ * \param conv2         type name of convolution function 2.
+ * \param function      test function; can be one of the forward,
+ *                      backward_input, or backward_filter functions.
+ * Example:
+ * 1. Compare GemmConv's CPU and GPU implementation:
+ *   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+ *      "GemmConv-CPU", "GemmConv-GPU", forward);
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution(const std::string& conv1,
+                 const std::string& conv2,
+                 Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 5}) {
+    for (size_t inputSize : {7, 14, 31}) {
+      for (size_t filterSize : {1, 3, 5}) {
+        for (size_t inputChannels : {3, 16}) {
+          for (size_t outputChannels : {3, 16}) {
+            if (outputChannels < inputChannels) continue;
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                for (size_t dilation : {1, 3}) {
+                  if (padding >= filterSize) break;
+                  size_t filterS = (filterSize - 1) * dilation + 1;
+
+                  if (inputSize + 2 * padding < filterS) break;
+
+                  if ((conv1 == "NaiveConv-CPU" || conv2 == "NaiveConv-CPU" ||
+                       conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      dilation > 1)
+                    break;
+
+                  // NNPACK only supports stride = 1 if batchSize > 1
+                  if ((conv1 == "NNPACKConv-CPU" ||
+                       conv2 == "NNPACKConv-CPU") &&
+                      batchSize > 1 && stride > 1)
+                    break;
+
+                  size_t outputSize =
+                      (inputSize - filterS + 2 * padding + stride) / stride;
+                  VLOG(3) << " batchSize=" << batchSize
+                          << " inputChannels=" << inputChannels
+                          << " inputHeight=" << inputSize
+                          << " inputWidth=" << inputSize
+                          << " outputChannels=" << outputChannels
+                          << " filterHeight=" << filterSize
+                          << " filterWidth=" << filterSize
+                          << " outputHeight=" << outputSize
+                          << " outputWidth=" << outputSize
+                          << " stride=" << stride << " padding=" << padding;
+
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  std::vector<size_t> dilations = {dilation, dilation};
+                  Compare2Function<DType1, DType2> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("dilations", dilations)
+                          .set("groups", (size_t)1)
+                          .set("algo", (std::string) "auto"));
+
+                  TensorShape input{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape filter{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape output{
+                      batchSize, outputChannels, outputSize, outputSize};
+
+                  function(test, input, filter, output);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
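+
+// A hypothetical usage sketch (editor's note): assuming "GemmConv-CPU" and
+// "GemmConv-GPU" are registered Function names, a gtest case could exercise
+// one pass per call:
+//
+//   TEST(GemmConv, Forward) {
+//     Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+//         "GemmConv-CPU", "GemmConv-GPU", forward);
+//   }
+//   TEST(GemmConv, BackwardInput) {
+//     Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+//         "GemmConv-CPU", "GemmConv-GPU", backward_input);
+//   }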
+
+/**
+ * \brief A convolution function test interface for the case where the
+ *        image height is not equal to the image width.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution2(const std::string& conv1,
+                  const std::string& conv2,
+                  Function<DType1, DType2> function) {
+  for (size_t batchSize : {4}) {
+    for (size_t inputHeight : {7, 31}) {
+      for (size_t inputWidth : {10, 54}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t inputChannels : {7}) {
+              for (size_t outputChannels : {7}) {
+                size_t stride = 1;
+                size_t padding = 0;
+                size_t dilation = 1;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputHeight
+                        << " inputWidth=" << inputWidth
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterHeight
+                        << " filterWidth=" << filterWidth
+                        << " outputHeight=" << outputHeight
+                        << " outputWidth=" << outputWidth
+                        << " stride=" << stride << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {dilation, dilation};
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", (size_t)1)
+                        .set("dilations", dilations)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputHeight, inputWidth};
+                TensorShape filter{
+                    outputChannels, inputChannels, filterHeight, filterWidth};
+                TensorShape output{
+                    batchSize, outputChannels, outputHeight, outputWidth};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief A convolution function test interface for depthwise convolution.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void DepthwiseConvolution(const std::string& conv1,
+                          const std::string& conv2,
+                          Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 32}) {
+    for (size_t inputSize : {7, 14, 54}) {
+      for (size_t filterSize : {3, 4}) {
+        for (size_t inputChannels : {32}) {
+          for (size_t outputChannels : {32, 64}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // NNPACK only supports stride = 1 when batchSize > 1,
+                // and it has a bug when batchSize > 1 and groups != 1.
+                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                    batchSize > 1)
+                  break;
+
+                size_t outputSize =
+                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputSize
+                        << " inputWidth=" << inputSize
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterSize
+                        << " filterWidth=" << filterSize
+                        << " outputHeight=" << outputSize
+                        << " outputWidth=" << outputSize << " stride=" << stride
+                        << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> dilations = {1, 1};
+                size_t groups = inputChannels;
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", groups)
+                        .set("dilations", dilations)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputSize, inputSize};
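+                // The filter of a grouped/depthwise conv is 5-D:
+                // {groups, outputChannels/groups, inputChannels/groups,
+                //  filterHeight, filterWidth}. With groups == inputChannels,
+                // each input channel owns filterMultiplier filters.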
+                TensorShape filter{groups,
+                                   outputChannels / groups,
+                                   inputChannels / groups,
+                                   filterSize,
+                                   filterSize};
+                TensorShape output{
+                    batchSize, outputChannels, outputSize, outputSize};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
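+
+// A hypothetical usage sketch (editor's note): comparing a generic CPU
+// convolution against a depthwise GPU kernel might look like
+//
+//   DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+//       "GemmConv-CPU", "DepthwiseConv-GPU", forward);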
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 7ece7b2dfe..2e5c281f37 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -233,7 +233,7 @@ private:
 
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
index c62ab39551..a1f88f479b 100644
--- a/paddle/function/CosSimOpGpu.cu
+++ b/paddle/function/CosSimOpGpu.cu
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CosSimOp.h"
 #include "hl_base.h"
 #include "hl_device_functions.cuh"
-#include "CosSimOp.h"
 
 namespace paddle {
 
-template<int block_size>
+template <int block_size>
 __global__ void KeCosSim(real* output,
                          const real* input1,
                          const real* input2,
@@ -78,8 +78,8 @@ void hlCossim(real* output,
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
 
-  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input1, input2, width, input1_height, input2_height, scale);
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      output, input1, input2, width, input1_height, input2_height, scale);
   CHECK_SYNC("hlCossim failed");
 }
 
@@ -99,7 +99,7 @@ void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
   hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
 }
 
-template<int block_size>
+template <int block_size>
 __global__ void KeCosSimDerivative(const real* grad,
                                    const real* output,
                                    const real* prev_out_x,
@@ -148,14 +148,13 @@ __global__ void KeCosSimDerivative(const real* grad,
   if (xy[0] == 0) {
     real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] +=
-        scale * grad[ty] * prev_out_y[index] * reciprocal;
+      prev_grad_x[index] += scale * grad[ty] * prev_out_y[index] * reciprocal;
       if (input2_height > 1) {
-        prev_grad_y[index] +=
-          scale * grad[ty] * prev_out_x[index] * reciprocal;
+        prev_grad_y[index] += scale * grad[ty] * prev_out_x[index] * reciprocal;
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index,
-          scale * grad[ty] * prev_out_x[index] * reciprocal);
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            scale * grad[ty] * prev_out_x[index] * reciprocal);
       }
     }
   } else {
@@ -163,17 +162,18 @@ __global__ void KeCosSimDerivative(const real* grad,
     real reciprocalSquareSumX = 1.0 / xx[0];
     real reciprocalSquareSumY = 1.0 / yy[0];
     for (int index = tid; index < width; index += block_size) {
-      prev_grad_x[index] += output[ty] * grad[ty] *
-        (prev_out_y[index] * reciprocalXY -
-         prev_out_x[index] * reciprocalSquareSumX);
+      prev_grad_x[index] +=
+          output[ty] * grad[ty] * (prev_out_y[index] * reciprocalXY -
+                                   prev_out_x[index] * reciprocalSquareSumX);
       if (input2_height > 1) {
-        prev_grad_y[index] += output[ty] * grad[ty] *
-          (prev_out_x[index] * reciprocalXY -
-           prev_out_y[index] * reciprocalSquareSumY);
+        prev_grad_y[index] +=
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY);
       } else {
-        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
-          (prev_out_x[index] * reciprocalXY -
-           prev_out_y[index] * reciprocalSquareSumY));
+        paddle::paddleAtomicAdd(
+            prev_grad_y + index,
+            output[ty] * grad[ty] * (prev_out_x[index] * reciprocalXY -
+                                     prev_out_y[index] * reciprocalSquareSumY));
       }
     }
   }
@@ -198,9 +198,17 @@ void hlCossimDerivative(const real* grad,
   const int block_size = 256;
   dim3 threads(block_size, 1);
   dim3 grid(1, input1_height);
-  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
-        input1_height, input2_height, scale);
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>(
+      grad,
+      output,
+      prev_out_x,
+      prev_out_y,
+      prev_grad_x,
+      prev_grad_y,
+      width,
+      input1_height,
+      input2_height,
+      scale);
   CHECK_SYNC("hlCossimDerivate failed");
 }
 
@@ -214,9 +222,9 @@ void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                      real scale) {
   CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
         in2_val.getData() && in1_grad.getData() && in2_grad.getData());
-  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
-        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
-        << "Matrix types are not equally GPU";
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_ &&
+        in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+      << "Matrix types are not equally GPU";
 
   size_t dim = in1_val.getWidth();
   const real* grad = out_grad.getData();
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
index 48c815f027..f6c0041101 100644
--- a/paddle/function/CosSimOpTest.cpp
+++ b/paddle/function/CosSimOpTest.cpp
@@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x,
                        size_t height_y,
                        size_t width,
                        real scale) {
-  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
   // prepare input arguments
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
@@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x,
                         size_t height_y,
                         size_t width,
                         real scale) {
-  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
   // prepare input arguments
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
new file mode 100644
index 0000000000..46f98f12c1
--- /dev/null
+++ b/paddle/function/CropOp.cpp
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropOp.h"
+#include "paddle/function/TensorShape.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Crop<DEVICE_TYPE_CPU>(real* outputs,
+                           const real* inputs,
+                           const TensorShape inShape,
+                           const TensorShape outShape,
+                           const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < outC; c++) {
+      for (int h = 0; h < outH; h++) {
+        int outoff = ((n * outC + c) * outH + h) * outW;
+        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
+        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
+      }
+    }
+  }
+}
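+
+// Worked example (editor's note), matching the CropFunc doc below: with
+// crop_corner = {0, 0, 1, 1}, input shape (2, 2, 2, 3) and output shape
+// (2, 2, 1, 2), the row copied for n = 0, c = 0, h = 0 starts at
+// inoff = ((0*2 + 0 + 0)*2 + 0 + 1)*3 + 1 = 4 and spans outW = 2 elements,
+// yielding [4, 5] from [[1,2,3],[3,4,5]].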
+
+template <>
+void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                               real* outGrad,
+                               const TensorShape inShape,
+                               const TensorShape outShape,
+                               const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
+        int inoff = ((n * inC + c) * inH + h) * inW;
+        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
+        CpuVector outG = CpuVector(inW, outGrad + outoff);
+        outG += inG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Crop the input according to the specified corner and shape.
+ *        The input and output are 4D tensors. In CropFunc, we only
+ *        crop the 2nd to 4th dimensions.
+ *
+ * Argument in this Function:
+ * \param conf_   A FuncConfig object that contains the crop corner and shape.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after cropping.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the input shape is (2,2,2,3)
+ *
+ * conf_: if crop_corner = (0,1,1) and crop_shape = (2,1,2)
+ * Output(2,2,1,2) = [
+ *                    [ [[4,5]],
+ *                      [[6,7]] ],
+ *                    [ [[8,7]],
+ *                      [[3,5]] ]
+ *                  ] # the output shape is (2,2,1,2)
+ */
+template <DeviceType Device>
+class CropFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape inShape = inputs[0].shape();
+    TensorShape outShape = outputs[0].shape();
+
+    Crop<Device>(outputs[0].data<real>(),
+                 inputs[0].data<real>(),
+                 inShape,
+                 outShape,
+                 conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of cropping Function.
+ *
+ * Argument in this Function:
+ * \param conf_    The same meaning as in CropFunc.
+ * \param inputs  The gradient with respect to the output value of CropFunc.
+ * \param outputs The gradient with respect to the input value of CropFunc.
+ */
+
+template <DeviceType Device>
+class CropGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape outShape = outputs[0].shape();
+    TensorShape inShape = inputs[0].shape();
+
+    CropGrad<Device>(inputs[0].data<real>(),
+                     outputs[0].data<real>(),
+                     inShape,
+                     outShape,
+                     conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CropOp.h b/paddle/function/CropOp.h
new file mode 100644
index 0000000000..87986fbdc7
--- /dev/null
+++ b/paddle/function/CropOp.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief  This function crops the input according to the specified start
+ *         point and shape.
+ *
+ * \param[out] outputs  save results.
+ * \param[in]  inputs   input data.
+ * \param[in]  inShape  the shape of the input tensor.
+ * \param[in]  outShape the shape of the output tensor.
+ * \param[in]  conf     the cropping config.
+ */
+template <DeviceType Device>
+void Crop(real* outputs,
+          const real* inputs,
+          const TensorShape inShape,
+          const TensorShape outShape,
+          const FuncConfig& conf);
+
+/**
+ * \brief   Cropping operation backward.
+ *
+ * \param[in]  inGrad   gradient of the output of CropFunc.
+ * \param[out] outGrad  gradient with respect to the input of CropFunc.
+ * \param[in]  inShape  the shape of the input (gradient) tensor.
+ * \param[in]  outShape the shape of the output (gradient) tensor.
+ * \param[in]  conf     the cropping config.
+ */
+template <DeviceType Device>
+void CropGrad(const real* inGrad,
+              real* outGrad,
+              const TensorShape inShape,
+              const TensorShape outShape,
+              const FuncConfig& conf);
+}  // namespace paddle
diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu
new file mode 100644
index 0000000000..241356a9ca
--- /dev/null
+++ b/paddle/function/CropOpGpu.cu
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeCrop(real* outputs,
+                       const real* inputs,
+                       int inC,
+                       int inH,
+                       int inW,
+                       int cropC,
+                       int cropH,
+                       int cropW,
+                       int outC,
+                       int outH,
+                       int outW,
+                       int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
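+    // One thread per output element; unpack the linear index in NCHW order
+    // (w fastest, then h, then c, then n).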
+    const int w = idx % outW;
+    const int h = (idx / outW) % outH;
+    const int c = (idx / outW / outH) % outC;
+    const int n = idx / outW / outH / outC;
+
+    const int off = ((n * inC + c + cropC) * inH + h + cropH) * inW + cropW + w;
+    outputs[idx] = inputs[off];
+  }
+}
+
+template <>
+void Crop<DEVICE_TYPE_GPU>(real* outputs,
+                           const real* inputs,
+                           const TensorShape inShape,
+                           const TensorShape outShape,
+                           const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cropC = crop_corner[1];
+  int cropH = crop_corner[2];
+  int cropW = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  size_t nth = num * outC * outH * outW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
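+  // Ceil-divide so the grid covers the tail when nth is not a multiple of
+  // the block size; the kernel bounds-checks idx against nthreads.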
+
+  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                     inputs,
+                                                     inC,
+                                                     inH,
+                                                     inW,
+                                                     cropC,
+                                                     cropH,
+                                                     cropW,
+                                                     outC,
+                                                     outH,
+                                                     outW,
+                                                     nth);
+  CHECK_SYNC("Crop");
+}
+
+__global__ void KeCropDiff(const real* inGrad,
+                           real* outGrad,
+                           int inC,
+                           int inH,
+                           int inW,
+                           int cropC,
+                           int cropH,
+                           int cropW,
+                           int outC,
+                           int outH,
+                           int outW,
+                           int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % inW;
+    const int h = (idx / inW) % inH;
+    const int c = (idx / inW / inH) % inC;
+    const int n = idx / inW / inH / inC;
+
+    const int off =
+        ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
+
+    outGrad[off] += inGrad[idx];
+  }
+}
+
+template <>
+void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                               real* outGrad,
+                               const TensorShape inShape,
+                               const TensorShape outShape,
+                               const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cropC = crop_corner[1];
+  int cropH = crop_corner[2];
+  int cropW = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  size_t nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeCropDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                         outGrad,
+                                                         inC,
+                                                         inH,
+                                                         inW,
+                                                         cropC,
+                                                         cropH,
+                                                         cropW,
+                                                         outC,
+                                                         outH,
+                                                         outW,
+                                                         nth);
+  CHECK_SYNC("CropGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CropOpTest.cpp b/paddle/function/CropOpTest.cpp
new file mode 100644
index 0000000000..6f11abfdf6
--- /dev/null
+++ b/paddle/function/CropOpTest.cpp
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Crop, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          for (bool test_grad : {false, true}) {
+            CpuGpuFuncCompare compare(
+                test_grad ? "CropGrad" : "Crop",
+                FuncConfig()
+                    .set<std::vector<uint32_t>>("crop_corner", {0, 1, 1, 1})
+                    .set<std::vector<uint32_t>>("crop_shape", {0, 2, 3, 3}));
+            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+            TensorShape outDims{numSamples, 2, 3, 3};
+            compare.addInputs(
+                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
+            compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT,
+                                         test_grad ? inDims : outDims,
+                                         test_grad ? ADD_TO : ASSIGN_TO),
+                               test_grad ? ADD_TO : ASSIGN_TO);
+            compare.run();
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ef878bfbba..9e88669d37 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -336,7 +336,7 @@ private:
 
 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
 #endif
diff --git a/paddle/function/CrossMapNormalOpGpu.cu b/paddle/function/CrossMapNormalOpGpu.cu
index b33dd10834..88b991ff6a 100644
--- a/paddle/function/CrossMapNormalOpGpu.cu
+++ b/paddle/function/CrossMapNormalOpGpu.cu
@@ -12,14 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "CrossMapNormalOp.h"
+#include "hl_base.h"
 
 namespace paddle {
 
-__global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
-                                   real* scale, size_t channels,
-                                   size_t height, size_t width, size_t size,
+__global__ void KeCMRNormFillScale(size_t imageSize,
+                                   const real* in,
+                                   real* scale,
+                                   size_t channels,
+                                   size_t height,
+                                   size_t width,
+                                   size_t size,
                                    real alpha) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {
@@ -51,8 +55,10 @@ __global__ void KeCMRNormFillScale(size_t imageSize, const real* in,
   }
 }
 
-__global__ void KeCMRNormOutput(size_t inputSize, const real* in,
-                                const real* scale, real negative_beta,
+__global__ void KeCMRNormOutput(size_t inputSize,
+                                const real* in,
+                                const real* scale,
+                                real negative_beta,
                                 real* out) {
   const int index = threadIdx.x + blockIdx.x * blockDim.x;
   if (index < inputSize) {
@@ -74,24 +80,30 @@ void CrossMapNormal<DEVICE_TYPE_GPU>(real* outputs,
   size_t imageSize = numSamples * height * width;
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inputs, denoms, channels, height, width, size, scale);
+  KeCMRNormFillScale<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      imageSize, inputs, denoms, channels, height, width, size, scale);
 
-  size_t inputSize = numSamples * height * width *channels;
+  size_t inputSize = numSamples * height * width * channels;
   blockSize = 1024;
   gridSize = (inputSize + 1024 - 1) / 1024;
-  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inputSize, inputs, denoms, -pow, outputs);
+  KeCMRNormOutput<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inputSize, inputs, denoms, -pow, outputs);
 
   CHECK_SYNC("CrossMapNormal");
 }
 
-__global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
-                              const real* top_data, const real* scale,
-                              const real* top_diff, size_t channels,
-                              size_t height, size_t width, size_t size,
-                              real negative_beta, real cache_ratio,
-                              real* bottom_diff ) {
+__global__ void KeCMRNormDiff(size_t imageSize,
+                              const real* bottom_data,
+                              const real* top_data,
+                              const real* scale,
+                              const real* top_diff,
+                              size_t channels,
+                              size_t height,
+                              size_t width,
+                              size_t size,
+                              real negative_beta,
+                              real cache_ratio,
+                              real* bottom_diff) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < imageSize) {
     const int w = idx % width;
@@ -113,17 +125,17 @@ __global__ void KeCMRNormDiff(size_t imageSize, const real* bottom_data,
     while (index < channels + post_pad) {
       if (index < channels) {
         accum += top_diff[index * step] * top_data[index * step] /
-          scale[index * step];
+                 scale[index * step];
       }
       if (index >= size) {
         accum -= top_diff[(index - size) * step] *
-          top_data[(index - size) * step] / scale[(index - size) * step];
+                 top_data[(index - size) * step] / scale[(index - size) * step];
       }
       if (index >= post_pad) {
         bottom_diff[(index - post_pad) * step] +=
-          top_diff[(index - post_pad) * step] *
-          pow(scale[(index - post_pad) * step], negative_beta) - cache_ratio *
-          bottom_data[(index - post_pad) * step] * accum;
+            top_diff[(index - post_pad) * step] *
+                pow(scale[(index - post_pad) * step], negative_beta) -
+            cache_ratio * bottom_data[(index - post_pad) * step] * accum;
       }
       ++index;
     }
@@ -147,9 +159,18 @@ void CrossMapNormalGrad<DEVICE_TYPE_GPU>(real* inputsGrad,
 
   int blockSize = 1024;
   int gridSize = (imageSize + 1024 - 1) / 1024;
-  KeCMRNormDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (imageSize, inputsValue, outputsValue, denoms, outputsGrad, channels,
-      height, width, size, -pow, 2.0f * pow * scale, inputsGrad);
+  KeCMRNormDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(imageSize,
+                                                            inputsValue,
+                                                            outputsValue,
+                                                            denoms,
+                                                            outputsGrad,
+                                                            channels,
+                                                            height,
+                                                            width,
+                                                            size,
+                                                            -pow,
+                                                            2.0f * pow * scale,
+                                                            inputsGrad);
   CHECK_SYNC("CrossMapNormalGrad");
 }
 
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index 51f5da81bf..3b390db77f 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -18,21 +18,21 @@ limitations under the License. */
 namespace paddle {
 
 TEST(CrossMapNormal, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {1, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          for (size_t size : {1, 2, 3, 5, 7}) {
+  for (size_t numSamples : {5}) {
+    for (size_t channels : {1, 5}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (size_t size : {1, 3}) {
             VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
             // init Test object
-            FunctionCompare test("CrossMapNormal",
-                                 FuncConfig()
-                                     .set("size", size)
-                                     .set("scale", (real)1.5)
-                                     .set("pow", (real)0.5));
+            CpuGpuFuncCompare test("CrossMapNormal",
+                                   FuncConfig()
+                                       .set("size", size)
+                                       .set("scale", (real)1.5)
+                                       .set("pow", (real)0.5));
             // prepare input arguments
             TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
@@ -48,20 +48,20 @@ TEST(CrossMapNormal, real) {
 }
 
 TEST(CrossMapNormalGrad, real) {
-  for (size_t numSamples : {5, 32}) {
-    for (size_t channels : {1, 5, 32}) {
-      for (size_t imgSizeH : {5, 33, 100}) {
-        for (size_t imgSizeW : {5, 32, 96}) {
-          for (size_t size : {1, 2, 3, 5, 7}) {
+  for (size_t numSamples : {5}) {
+    for (size_t channels : {1, 5}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (size_t size : {1, 3}) {
             VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare test("CrossMapNormalGrad",
-                                 FuncConfig()
-                                     .set("size", size)
-                                     .set("scale", (real)1.5)
-                                     .set("pow", (real)0.5));
+            CpuGpuFuncCompare test("CrossMapNormalGrad",
+                                   FuncConfig()
+                                       .set("size", size)
+                                       .set("scale", (real)1.5)
+                                       .set("pow", (real)0.5));
             TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
             test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
new file mode 100644
index 0000000000..9863e3ae1d
--- /dev/null
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -0,0 +1,305 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "ConvOp.h"
+
+namespace paddle {
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    // TODO(zhaolong) : cpu implementation of depthwise convolution
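+    // A naive reference sketch (editor's note, not the committed
+    // implementation): loop over every output element and accumulate the
+    // per-channel window, mirroring the GPU kernel's indexing.
+    //
+    //   for (int n = 0; n < batchSize; ++n)
+    //     for (int co = 0; co < outputChannels; ++co)
+    //       for (int ho = 0; ho < outputHeight; ++ho)
+    //         for (int wo = 0; wo < outputWidth; ++wo) {
+    //           int ci = co / filterMultiplier;
+    //           T sum = 0;
+    //           for (int fh = 0; fh < filterHeight; ++fh)
+    //             for (int fw = 0; fw < filterWidth; ++fw) {
+    //               int hi = ho * strideH - paddingH + fh;
+    //               int wi = wo * strideW - paddingW + fw;
+    //               if (hi >= 0 && hi < inputHeight &&
+    //                   wi >= 0 && wi < inputWidth) {
+    //                 sum += filterData[(co * filterHeight + fh) * filterWidth
+    //                                   + fw] *
+    //                        inputData[((n * inputChannels + ci) * inputHeight
+    //                                   + hi) * inputWidth + wi];
+    //               }
+    //             }
+    //           outputData[((n * outputChannels + co) * outputHeight + ho) *
+    //                          outputWidth + wo] = sum;
+    //         }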
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {
+    // TODO(zhaolong) : cpu implementation of depthwise convolution
+  }
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+/*
+ * \brief Forward calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+
+    DepthwiseConvFunctor<Device, real> depthwiseConv;
+    depthwiseConv(inputData,
+                  filterData,
+                  batchSize,
+                  outputChannels,
+                  outputHeight,
+                  outputWidth,
+                  inputChannels,
+                  inputHeight,
+                  inputWidth,
+                  filterMultiplier,
+                  filterHeight,
+                  filterWidth,
+                  strideH(),
+                  strideW(),
+                  paddingH(),
+                  paddingW(),
+                  outputData);
+  }
+};
+
+/*
+ * \brief Backward input calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+
+    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
+    depthwiseConvGradInput(outputGrad,
+                           filterData,
+                           batchSize,
+                           outputChannels,
+                           outputHeight,
+                           outputWidth,
+                           inputChannels,
+                           inputHeight,
+                           inputWidth,
+                           filterMultiplier,
+                           filterHeight,
+                           filterWidth,
+                           strideH(),
+                           strideW(),
+                           paddingH(),
+                           paddingW(),
+                           inputGrad);
+  }
+};
+
+/*
+ * \brief Backward filter calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+
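+    // Editor's note: colData is scratch space for the filter-gradient
+    // reduction, sized one entry per (output channel, filter tap, output
+    // position).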
+    int size = outputChannels * filterHeight * filterWidth * outputHeight *
+               outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
+
+    depthwiseConvGradFilter(outputGrad,
+                            inputData,
+                            batchSize,
+                            outputChannels,
+                            outputHeight,
+                            outputWidth,
+                            inputChannels,
+                            inputHeight,
+                            inputWidth,
+                            filterMultiplier,
+                            filterHeight,
+                            filterWidth,
+                            strideH(),
+                            strideW(),
+                            paddingH(),
+                            paddingW(),
+                            colData,
+                            filterGrad);
+  }
+};
+
+REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    CPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    CPU,
+                    DepthwiseConvGradFilterFunction);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    GPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    GPU,
+                    DepthwiseConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
new file mode 100644
index 0000000000..1bf70e52f3
--- /dev/null
+++ b/paddle/function/DepthwiseConvOp.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorType.h"
+
+namespace paddle {
+
+/**
+ *\brief   Depthwise convolution forward. The outputData
+ *         of depthwise convolution is the same as that of ExpandConvLayer
+ *         when groups equals inputChannels in ExpandConvLayer.
+ *
+ * \param[in]   inputData         input data.
+ * \param[in]   filterData        the parameters of the depthwise conv layer.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of inputData.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals outputChannels / groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  outputData        outputData.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvFunctor {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData);
+};
+
+/**
+ *\brief  Functor to compute the depthwise convolution backprop w.r.t. the
+ *        input.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   filterData        the parameters of the depthwise conv layer.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals outputChannels / groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  inputGrad         the grad data of input.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradInputFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad);
+};
+
+/**
+ *\brief  Functor to compute the depthwise convolution backprop w.r.t. the filter.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   inputData         inputData.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals outputChannels / groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       width of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[in]   colData           auxiliary workspace used when computing
+ *                                filterGrad.
+ * \param[out]  filterGrad        the grad data of filter.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradFilterFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad);
+};
+
+}  // namespace paddle
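
For reference, the semantics these functors implement reduce to a naive CPU
loop. The sketch below is illustrative only and not part of the patch; it
assumes NCHW layout and reuses the header's parameter names, with
c_in = c_out / filterMultiplier selecting the single input channel that each
output channel reads from.

    // Naive CPU reference for the depthwise convolution forward pass
    // (illustrative sketch only; NCHW layout, parameters as in the header).
    template <class T>
    void ReferenceDepthwiseConvForward(const T* inputData,
                                       const T* filterData,
                                       int batchSize,
                                       int outputChannels,
                                       int outputHeight,
                                       int outputWidth,
                                       int inputChannels,
                                       int inputHeight,
                                       int inputWidth,
                                       int filterMultiplier,
                                       int filterHeight,
                                       int filterWidth,
                                       int strideH,
                                       int strideW,
                                       int paddingH,
                                       int paddingW,
                                       T* outputData) {
      for (int n = 0; n < batchSize; ++n)
        for (int c_out = 0; c_out < outputChannels; ++c_out) {
          // Each output channel reads exactly one input channel.
          const int c_in = c_out / filterMultiplier;
          const T* weight = filterData + c_out * filterHeight * filterWidth;
          for (int h_out = 0; h_out < outputHeight; ++h_out)
            for (int w_out = 0; w_out < outputWidth; ++w_out) {
              T value = 0;
              for (int kh = 0; kh < filterHeight; ++kh)
                for (int kw = 0; kw < filterWidth; ++kw) {
                  const int h_in = -paddingH + h_out * strideH + kh;
                  const int w_in = -paddingW + w_out * strideW + kw;
                  if (h_in >= 0 && h_in < inputHeight && w_in >= 0 &&
                      w_in < inputWidth) {
                    value += weight[kh * filterWidth + kw] *
                             inputData[((n * inputChannels + c_in) *
                                            inputHeight + h_in) *
                                           inputWidth + w_in];
                  }
                }
              outputData[((n * outputChannels + c_out) * outputHeight +
                          h_out) * outputWidth + w_out] = value;
            }
        }
    }
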
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
new file mode 100644
index 0000000000..2d722dfcfc
--- /dev/null
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -0,0 +1,376 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "paddle/math/BaseMatrix.h"
+
+namespace paddle {
+
+// CUDA kernel to compute the depthwise convolution forward pass
+template <class T>
+__global__ void ConvolutionDepthwiseForward(const int nthreads,
+                                            const T* const inputData,
+                                            const T* const filterData,
+                                            const int batchSize,
+                                            const int outputChannels,
+                                            const int outputHeight,
+                                            const int outputWidth,
+                                            const int inputChannels,
+                                            const int inputHeight,
+                                            const int inputWidth,
+                                            const int filterMultiplier,
+                                            const int filterHeight,
+                                            const int filterWidth,
+                                            const int strideH,
+                                            const int strideW,
+                                            const int paddingH,
+                                            const int paddingW,
+                                            T* const outputData) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if (index < nthreads) {
+    const int batch = index / outputChannels / outputHeight / outputWidth;
+    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+
+    const int c_in = c_out / filterMultiplier;
+    const T* weight = filterData + c_out * filterHeight * filterWidth;
+    T value = 0;
+    const int h_in_start = -paddingH + h_out * strideH;
+    const int w_in_start = -paddingW + w_out * strideW;
+    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
+    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
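+    // Fast path: the whole filter window lies inside the image, so the
+    // per-element bounds check can be skipped.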
+    if ((h_in_start >= 0) && (h_in_end < inputHeight) && (w_in_start >= 0) &&
+        (w_in_end < inputWidth)) {
+      for (int kh = 0; kh < filterHeight; ++kh) {
+        for (int kw = 0; kw < filterWidth; ++kw) {
+          const int h_in = -paddingH + h_out * strideH + kh;
+          const int w_in = -paddingW + w_out * strideW + kw;
+          const int offset =
+              ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                  inputWidth +
+              w_in;
+          value += (*weight) * inputData[offset];
+          ++weight;
+        }
+      }
+    } else {
+      for (int kh = 0; kh < filterHeight; ++kh) {
+        for (int kw = 0; kw < filterWidth; ++kw) {
+          const int h_in = -paddingH + h_out * strideH + kh;
+          const int w_in = -paddingW + w_out * strideW + kw;
+          if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+              (w_in < inputWidth)) {
+            const int offset =
+                ((batch * inputChannels + c_in) * inputHeight + h_in) *
+                    inputWidth +
+                w_in;
+            value += (*weight) * inputData[offset];
+          }
+          ++weight;
+        }
+      }
+    }
+    outputData[index] = value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <class T>
+__global__ void ConvolutionDepthwiseInputBackward(const int nthreads,
+                                                  const T* const top_diff,
+                                                  const T* const weight_data,
+                                                  const int num,
+                                                  const int outputChannels,
+                                                  const int outputHeight,
+                                                  const int outputWidth,
+                                                  const int inputChannels,
+                                                  const int inputHeight,
+                                                  const int inputWidth,
+                                                  const int filterMultiplier,
+                                                  const int filterHeight,
+                                                  const int filterWidth,
+                                                  const int strideH,
+                                                  const int strideW,
+                                                  const int paddingH,
+                                                  const int paddingW,
+                                                  T* const bottom_diff) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / inputChannels / inputHeight / inputWidth;
+    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
+    const int h_in = (index / inputWidth) % inputHeight;
+    const int w_in = index % inputWidth;
+
+    const int c_out_start = c_in * filterMultiplier;
+
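+    // An output position h_out touches this input row iff
+    // 0 <= h_in + paddingH - h_out * strideH < filterHeight. Solving for
+    // h_out gives the window below; the "+ strideH" in the numerator
+    // implements the ceil with integer division, and both ends are clamped
+    // to [0, outputHeight - 1] (the width bounds are derived the same way).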
+    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + paddingH) / strideH;
+    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + paddingW) / strideW;
+    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
+         c_out++) {
+      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+        const int filter_h = h_in + paddingH - h_out * strideH;
+        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+          const int filter_w = w_in + paddingW - w_out * strideW;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
+          value += top_diff[top_diff_offset] * weight_data[filter_offset];
+        }
+      }
+    }
+    bottom_diff[index] += value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
+template <class T>
+__global__ void ConvolutionDepthwiseFilterBackward(const int num_i,
+                                                   const int nthreads,
+                                                   const T* const top_diff,
+                                                   const T* const inputData,
+                                                   const int num,
+                                                   const int outputChannels,
+                                                   const int outputHeight,
+                                                   const int outputWidth,
+                                                   const int inputChannels,
+                                                   const int inputHeight,
+                                                   const int inputWidth,
+                                                   const int filterMultiplier,
+                                                   const int filterHeight,
+                                                   const int filterWidth,
+                                                   const int strideH,
+                                                   const int strideW,
+                                                   const int paddingH,
+                                                   const int paddingW,
+                                                   T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
+    const int kw = (index / outputHeight / outputWidth) % filterWidth;
+    const int h_in = -paddingH + h_out * strideH + kh;
+    const int w_in = -paddingW + w_out * strideW + kw;
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
+      const int c_in = c_out / filterMultiplier;
+      const int batch = num_i;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth +
+          w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
+      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
+    } else {
+      buffer_data[index] = 0;
+    }
+  }
+}
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
+
+    size_t blocks = (outputSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseForward<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        outputSize,
+        inputData,
+        filterData,
+        batchSize,
+        outputChannels,
+        outputHeight,
+        outputWidth,
+        inputChannels,
+        inputHeight,
+        inputWidth,
+        filterMultiplier,
+        filterHeight,
+        filterWidth,
+        strideH,
+        strideW,
+        paddingH,
+        paddingW,
+        outputData);
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {
+    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
+
+    size_t blocks = (inputSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseInputBackward<T>
+        // NOLINT_NEXT_LINE(whitespace/operators)
+        <<<grid, threads, 0, STREAM_DEFAULT>>>(inputSize,
+                                               outputGrad,
+                                               filterData,
+                                               batchSize,
+                                               outputChannels,
+                                               outputHeight,
+                                               outputWidth,
+                                               inputChannels,
+                                               inputHeight,
+                                               inputWidth,
+                                               filterMultiplier,
+                                               filterHeight,
+                                               filterWidth,
+                                               strideH,
+                                               strideW,
+                                               paddingH,
+                                               paddingW,
+                                               inputGrad);
+  }
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {
+    int colDataSize = outputChannels * filterHeight * filterWidth *
+                      outputHeight * outputWidth;
+
+    size_t blocks = (colDataSize + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
+                                1,
+                                filterGrad,
+                                false,
+                                true);
+
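+    // Each launch fills colData with the per-position products
+    // outputGrad * input for one image; sumRows then sums each row's
+    // outputHeight * outputWidth entries into filterGrad, accumulating
+    // the gradient over the batch.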
+    for (int i = 0; i < batchSize; i++) {
+      ConvolutionDepthwiseFilterBackward<
+          T><<<grid, threads, 0, STREAM_DEFAULT>>>(i,
+                                                   colDataSize,
+                                                   outputGrad,
+                                                   inputData,
+                                                   batchSize,
+                                                   outputChannels,
+                                                   outputHeight,
+                                                   outputWidth,
+                                                   inputChannels,
+                                                   inputHeight,
+                                                   inputWidth,
+                                                   filterMultiplier,
+                                                   filterHeight,
+                                                   filterWidth,
+                                                   strideH,
+                                                   strideW,
+                                                   paddingH,
+                                                   paddingW,
+                                                   colData);
+      int K = outputHeight * outputWidth;
+      int M = colDataSize / K;
+
+      BaseMatrix colMatrix(M, K, colData, false, true);
+      filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
+#else
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
+#endif
+
+}  // namespace paddle
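
All three GPU functors share the same launch arithmetic: a 2D grid of
1024-thread blocks with gridDim.x fixed at 512, and each kernel recovers a
linear element index as (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x +
threadIdx.x. A minimal host-side sketch (illustrative only, not part of the
patch) checking that this mapping visits every element exactly once:

    #include <cassert>
    #include <vector>

    // Mirrors the grid/block setup of the functors above and replays the
    // kernels' index computation on the host (illustrative sketch only).
    void CheckLaunchCoverage(int nthreads) {
      const int blockDimX = 1024;
      const int gridDimX = 512;
      const int blocks = (nthreads + blockDimX - 1) / blockDimX;
      const int gridDimY = (blocks + gridDimX - 1) / gridDimX;

      std::vector<int> hits(nthreads, 0);
      for (int bx = 0; bx < gridDimX; ++bx)
        for (int by = 0; by < gridDimY; ++by)
          for (int tx = 0; tx < blockDimX; ++tx) {
            int index = (bx * gridDimY + by) * blockDimX + tx;
            if (index < nthreads) ++hits[index];  // the kernels' guard
          }
      for (int i = 0; i < nthreads; ++i) assert(hits[i] == 1);
    }
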
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
new file mode 100644
index 0000000000..b1a90da7db
--- /dev/null
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ConvOpTest.h"
+
+namespace paddle {
+
+#ifdef PADDLE_WITH_CUDA
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "DepthwiseConv-GPU", forward);
+}
+
+TEST(DepthwiseConv, BackwardInput) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input);
+}
+
+TEST(DepthwiseConv, BackwardFilter) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter);
+}
+#endif
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NeonDepthwiseConv-CPU", forward);
+}
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
new file mode 100644
index 0000000000..644098a9e7
--- /dev/null
+++ b/paddle/function/EigenGemm.cpp
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+
+template <class T>
+struct EigenBlasGemm {
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
+                           Eigen::Aligned>
+      EigenMatrix;
+
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    Eigen::array<int, 2> sizeA;
+    if (transA) {
+      sizeA[0] = K;
+      sizeA[1] = M;
+      CHECK_EQ(M, lda);
+    } else {
+      sizeA[0] = M;
+      sizeA[1] = K;
+      CHECK_EQ(K, lda);
+    }
+    Eigen::array<int, 2> sizeB;
+    if (transB) {
+      sizeB[0] = N;
+      sizeB[1] = K;
+      CHECK_EQ(K, ldb);
+    } else {
+      sizeB[0] = K;
+      sizeB[1] = N;
+      CHECK_EQ(N, ldb);
+    }
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
+    Eigen::array<int, 2> offsetC = {{0, 0}};
+    Eigen::array<int, 2> extentC = {{M, N}};
+
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
+    EigenMatrix c(C, sizeC);
+
+    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
+    Eigen::array<DimPair, 1> dims;
+    dims[0] = DimPair(1, 0);
+    dims[0].first = transA ? 0 : 1;
+    dims[0].second = transB ? 1 : 0;
+
+    Eigen::DefaultDevice device;
+    if (N == ldc) {
+      if (alpha == T(1) && beta == T(0)) {
+        c.device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(device) += a.contract(b, dims);
+      } else {
+        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      }
+    } else {
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
+    }
+  }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template struct EigenBlasGemm<double>;
+#else
+template struct EigenBlasGemm<float>;
+#endif
+
+}  // namespace paddle
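
EigenBlasGemm::compute realizes C = alpha * op(A) * op(B) + beta * C on
row-major buffers via a rank-1 tensor contraction, with the slice path
handling outputs whose leading dimension ldc exceeds N. A standalone sketch
of the underlying contraction (illustrative only; the transA = transB =
false, alpha = 1, beta = 0 case):

    #include <cassert>
    #include "unsupported/Eigen/CXX11/Tensor"

    int main() {
      float A[6] = {1, 2, 3, 4, 5, 6};     // 2 x 3, row-major
      float B[6] = {7, 8, 9, 10, 11, 12};  // 3 x 2, row-major
      float C[4] = {0, 0, 0, 0};           // 2 x 2

      using Mat =
          Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int>>;
      Mat a(A, 2, 3), b(B, 3, 2), c(C, 2, 2);

      // Contract A's dimension 1 against B's dimension 0: a matrix product.
      Eigen::array<Eigen::IndexPair<int>, 1> dims = {
          Eigen::IndexPair<int>(1, 0)};
      c = a.contract(b, dims);

      assert(C[0] == 58 && C[1] == 64 && C[2] == 139 && C[3] == 154);
      return 0;
    }
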
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index fdf7e631e5..7b0b1c6adb 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -24,14 +24,14 @@ void FunctionApi(typename Tensor<real, DType>::Matrix& output,
 
 template <>
 void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 100);
-  EXPECT_EQ(output.getWidth(), 200);
+  EXPECT_EQ(output.getHeight(), 100U);
+  EXPECT_EQ(output.getWidth(), 200U);
 }
 
 template <>
 void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
-  EXPECT_EQ(output.getHeight(), 10);
-  EXPECT_EQ(output.getWidth(), 20);
+  EXPECT_EQ(output.getHeight(), 10U);
+  EXPECT_EQ(output.getWidth(), 20U);
 }
 
 template <DeviceType DType>
@@ -85,16 +85,16 @@ void testBufferArgs(const BufferArgs& inputs,
 }
 
 void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
-  EXPECT_EQ(inputs.size(), 1);
+  EXPECT_EQ(inputs.size(), 1U);
   check(inputs[0]);
 }
 
 TEST(Arguments, Matrix) {
   MatrixPtr matrix = Matrix::create(100, 200);
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
-    EXPECT_EQ(arg.shape()[0], 100);
-    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.shape()[1], 200U);
     EXPECT_EQ(arg.data(), matrix->getData());
 
     EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
@@ -112,8 +112,8 @@ TEST(Arguments, Matrix) {
 TEST(Arguments, Vector) {
   VectorPtr vector = Vector::create(100, false);
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1);
-    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape().ndims(), 1U);
+    EXPECT_EQ(arg.shape()[0], 100U);
     EXPECT_EQ(arg.data(), vector->getData());
 
     CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
@@ -131,9 +131,9 @@ TEST(Arguments, Vector) {
 TEST(Arguments, CpuSparseMatrix) {
   CpuSparseMatrix sparse(200, 300, 50);
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
-    EXPECT_EQ(arg.shape()[0], 200);
-    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 200U);
+    EXPECT_EQ(arg.shape()[1], 300U);
     EXPECT_EQ(arg.data(), sparse.getData());
     // CHECK_EQ(arg.sparse().nnz(), 50);
     // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
@@ -152,10 +152,10 @@ TEST(Arguments, CpuSparseMatrix) {
 TEST(Arguments, BufferArg) {
   BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
   CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3);
-    EXPECT_EQ(arg.shape()[0], 1);
-    EXPECT_EQ(arg.shape()[1], 2);
-    EXPECT_EQ(arg.shape()[2], 3);
+    EXPECT_EQ(arg.shape().ndims(), 3U);
+    EXPECT_EQ(arg.shape()[0], 1U);
+    EXPECT_EQ(arg.shape()[1], 2U);
+    EXPECT_EQ(arg.shape()[2], 3U);
   };
 
   BufferArgs argments;
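
The U suffixes above are warning fixes, not behavior changes: size(),
getHeight(), and shape()[i] return size_t, and EXPECT_EQ compares its
arguments with their deduced types, so a plain int literal trips
-Wsign-compare (fatal under -Werror). A minimal illustration, assuming a
gtest build with that warning enabled:

    #include <cstddef>
    #include <gtest/gtest.h>

    // Illustrative only: why the unsigned literal suffix is needed.
    TEST(SignCompare, UnsignedLiteral) {
      std::size_t n = 100;
      EXPECT_EQ(n, 100U);   // both operands unsigned: compiles cleanly
      // EXPECT_EQ(n, 100); // size_t vs int: -Wsign-compare inside gtest
    }
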
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 0cfafdb27f..370940532e 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -22,14 +22,62 @@ namespace paddle {
 
 typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
+namespace test {
+template <DeviceType DType>
+struct Allocator;
+
+template <>
+struct Allocator<DEVICE_TYPE_CPU> {
+  using type = CpuMemoryHandle;
+};
+
+template <>
+struct Allocator<DEVICE_TYPE_GPU> {
+  using type = GpuMemoryHandle;
+};
+
+// Copy argument1 to argument2
+template <DeviceType DType1, DeviceType DType2>
+class CopyArgument {
+public:
+  void operator()(const BufferArg& arg1, BufferArg& arg2) {
+    CHECK_EQ(arg1.valueType(), arg2.valueType());
+    CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
+
+    if (arg1.valueType() == VALUE_TYPE_INT32) {
+      IVectorPtr vector1 =
+          IVector::create((int*)arg1.data(),
+                          arg1.shape().getElements(),
+                          DType1 == DEVICE_TYPE_CPU ? false : true);
+      IVectorPtr vector2 =
+          IVector::create((int*)arg2.data(),
+                          arg2.shape().getElements(),
+                          DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    } else {
+      VectorPtr vector1 =
+          Vector::create((real*)arg1.data(),
+                         arg1.shape().getElements(),
+                         DType1 == DEVICE_TYPE_CPU ? false : true);
+      VectorPtr vector2 =
+          Vector::create((real*)arg2.data(),
+                         arg2.shape().getElements(),
+                         DType2 == DEVICE_TYPE_CPU ? false : true);
+      vector2->copyFrom(*vector1);
+    }
+  }
+};
+}  // namespace test
+
 /**
- * \brief A class for comparing CPU and GPU implementations of Function.
- *
+ * \brief A class for comparing two Functions of different implementations.
+ *        For example, it can be used to check that the CPU and GPU
+ *        implementations of a function produce consistent results.
  *
  * Use case:
  *  // Initializes a test object, the corresponding cpu and gpu Function
  *  // are constructed according to FunctionName and FuncConfig.
- *  FunctionCompare test(FunctionName, FuncConfig);
+ *  CpuGpuFuncCompare test(FunctionName, FuncConfig);
  *  // Prepare inputs and outputs arguments.
  *  // Here the input and output can not contain real data,
  *  // only contains the argument type and shape.
@@ -45,28 +93,39 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  *  // Compares CPU and GPU calculation results for consistency.
  *  test.run();
  */
-class FunctionCompare {
+template <DeviceType DType1, DeviceType DType2>
+class Compare2Function {
 public:
-  FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpuFunc_->init(config);
-    gpuFunc_->init(config);
+  typedef typename test::Allocator<DType1>::type Allocator1;
+  typedef typename test::Allocator<DType2>::type Allocator2;
+  typedef typename Tensor<real, DType1>::Vector Vector1;
+  typedef typename Tensor<real, DType2>::Vector Vector2;
+  typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
+  typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
+
+  Compare2Function(const std::string& name1,
+                   const std::string& name2,
+                   const FuncConfig& config)
+      : function1_(FunctionBase::funcRegistrar_.createByType(name1)),
+        function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
+    function1_->init(config);
+    function2_->init(config);
+    initArgsCallback_ = nullptr;
   }
 
-  ~FunctionCompare() {}
+  ~Compare2Function() {}
 
   // input need only contains shape, do not contains data.
   void addInputs(const BufferArg& input) {
     size_t size =
         input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
-    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
-        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
-    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
-        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    func1Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
+    func2Inputs_.emplace_back(std::make_shared<BufferArg>(
+        func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
   }
 
   // assume one copy of sequence is shared by different SequenceArgs
@@ -75,62 +134,61 @@ public:
     size_t batchSize = input.shape()[0];
     size_t numSeqs = batchSize / 10 + 1;
     size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
-    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
-                                              TensorShape{numSeqs + 1});
-    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
-                                              TensorShape{numSeqs + 1});
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
+    seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
+    seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
+                                            TensorShape{numSeqs + 1});
     /// init sequence Id
-    initArg(*cpuSeq_, batchSize);
+    initArg(*seq1_, batchSize);
 
-    // todo(tianbing), delete it
-    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
-
-    CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
-    GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
-    gpuSeq.copyFrom(cpuSeq);
+    copyArg_(*seq1_, *seq2_);
   }
 
   void addInputs(const SequenceArg& input) {
     CHECK_EQ(input.shape().ndims(), 2UL);
     size_t batchSize = input.shape()[0];
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
       addSequence(SequenceIdArg(TensorShape{batchSize}));
     }
 
     size_t size =
         input.shape().getElements() * sizeOfValuType(input.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
     /// SequenceArg
-    cpuInputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+    func1Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                       input.valueType(),
                                       input.shape(),
-                                      *cpuSeq_));
-    gpuInputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      *seq1_));
+    func2Inputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                       input.valueType(),
                                       input.shape(),
-                                      *gpuSeq_));
+                                      *seq2_));
+  }
+
+  void registerInitCallback(std::function<void(BufferArg&, size_t)> callback) {
+    initArgsCallback_ = callback;
   }
 
   // output need only contains shape, do not contains data.
   void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
-    cpuOutputs_.emplace_back(
-        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+    func1Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
                                     output.valueType(),
                                     output.shape(),
                                     argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+    func2Outputs_.emplace_back(
+        std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
                                     output.valueType(),
                                     output.shape(),
                                     argType));
@@ -138,14 +196,14 @@ public:
 
   /// add and init output sparse matrix
   void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
         output.shape()[0],
         output.shape()[1],
         output.nnz(),
         static_cast<SparseValueType>(output.dataType()),
         static_cast<SparseFormat>(output.dataFormat()));
 
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
         output.shape()[0],
         output.shape()[1],
         output.nnz(),
@@ -154,52 +212,52 @@ public:
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
     hl_stream_synchronize(stream);
 
-    cpuOutputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+    func1Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse1_, argType));
+    func2Outputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*sparse2_, argType));
   }
 
   void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
     CHECK_EQ(output.shape().ndims(), 2UL);
     size_t batchSize = output.shape()[0];
 
-    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+    if (!seq1_ || !seq2_) {  // sequence not exist
       addSequence(SequenceIdArg(TensorShape{batchSize}));
     }
     size_t size =
         output.shape().getElements() * sizeOfValuType(output.valueType());
-    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
+    func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
 
     /// SequenceArg
-    cpuOutputs_.emplace_back(
-        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+    func1Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
                                       output.valueType(),
                                       output.shape(),
-                                      *cpuSeq_,
+                                      *seq1_,
                                       argType));
-    gpuOutputs_.emplace_back(
-        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+    func2Outputs_.emplace_back(
+        std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
                                       output.valueType(),
                                       output.shape(),
-                                      *gpuSeq_,
+                                      *seq2_,
                                       argType));
   }
 
   void addInputs(const SparseMatrixArg& input) {
-    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+    sparse1_ = std::make_shared<SparseMatrix1>(
         input.shape()[0],
         input.shape()[1],
         input.nnz(),
         static_cast<SparseValueType>(input.dataType()),
         static_cast<SparseFormat>(input.dataFormat()));
 
-    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+    sparse2_ = std::make_shared<SparseMatrix2>(
         input.shape()[0],
         input.shape()[1],
         input.nnz(),
@@ -208,12 +266,12 @@ public:
 
     /// init sparse matrix
     hl_stream_t stream(HPPL_STREAM_1);
-    cpuSparse_->randomizeUniform();
-    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    sparse1_->randomizeUniform();
+    sparse2_->copyFrom(*sparse1_, stream);
     hl_stream_synchronize(stream);
 
-    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
-    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+    func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
+    func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
   }
 
   void run() {
@@ -236,27 +294,27 @@ public:
       function->calc(inArgs, outArgs);
     };
 
-    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
-    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+    callFunction(function1_.get(), func1Inputs_, func1Outputs_);
+    callFunction(function2_.get(), func2Inputs_, func2Outputs_);
 
     // check outputs
     compareOutputs();
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+  std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
 
 protected:
   // only init cpu argument, gpu argument copy from cpu argument.
   void initArg(BufferArg& arg) {
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
     vector.uniform(0.001, 1);
   }
 
   void initArg(SequenceArg& arg) {
     /// init only matrix
-    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    Vector1 vector(arg.shape().getElements(), (real*)arg.data());
     vector.uniform(0.001, 1);
   }
 
@@ -276,73 +334,77 @@ protected:
   }
 
   void initInputs() {
-    for (size_t i = 0; i < cpuInputs_.size(); i++) {
-      if (cpuInputs_[i]->isSparseArg()) {
+    for (size_t i = 0; i < func1Inputs_.size(); i++) {
+      if (func1Inputs_[i]->isSparseArg()) {
         continue;  /// sparse matrix already init
       }
 
-      if (cpuInputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
+      if (func1Inputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
       } else {
-        initArg(*cpuInputs_[i]);
+        initArg(*func1Inputs_[i]);
+      }
+
+      if (initArgsCallback_ != nullptr) {
+        initArgsCallback_(*func1Inputs_[i], i);
       }
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
-                          (real*)cpuInputs_[i]->data());
-      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
-                          (real*)gpuInputs_[i]->data());
 
-      gpuVector.copyFrom(cpuVector);
+      copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
     }
   }
 
   void initOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
-      if (cpuOutputs_[i]->isSparseArg()) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
+      if (func1Outputs_[i]->isSparseArg()) {
         continue;  /// sparse matrix already init
       }
 
-      if (cpuOutputs_[i]->isSequenceArg()) {
-        initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
+      if (func1Outputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
       } else {
-        initArg(*cpuOutputs_[i]);
+        initArg(*func1Outputs_[i]);
       }
 
-      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
-                          (real*)cpuOutputs_[i]->data());
-      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
-                          (real*)gpuOutputs_[i]->data());
-
-      gpuVector.copyFrom(cpuVector);
+      copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
     }
   }
 
   void compareOutputs() {
-    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+    for (size_t i = 0; i < func1Outputs_.size(); i++) {
       // TODO, Need a BufferCheck used to compare the two buffers.
-      const auto cpu = cpuOutputs_[i];
-      const auto gpu = gpuOutputs_[i];
+      const auto cpu = func1Outputs_[i];
+      const auto gpu = func2Outputs_[i];
       CHECK_EQ(cpu->numElements(), gpu->numElements());
-      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
-      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
+      Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
+      Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
       autotest::TensorCheckErr(cpuVector, gpuVector);
     }
   }
 
 protected:
-  std::shared_ptr<FunctionBase> cpuFunc_;
-  std::shared_ptr<FunctionBase> gpuFunc_;
-  std::vector<CpuMemHandlePtr> cpuMemory_;
-  std::vector<GpuMemHandlePtr> gpuMemory_;
-  std::vector<BufferArgPtr> cpuInputs_;
-  std::vector<BufferArgPtr> cpuOutputs_;
-  std::vector<BufferArgPtr> gpuInputs_;
-  std::vector<BufferArgPtr> gpuOutputs_;
-  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
-  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
-  std::shared_ptr<SequenceIdArg> cpuSeq_;
-  std::shared_ptr<SequenceIdArg> gpuSeq_;
+  std::shared_ptr<FunctionBase> function1_;
+  std::shared_ptr<FunctionBase> function2_;
+  std::vector<std::shared_ptr<Allocator1>> func1Memory_;
+  std::vector<std::shared_ptr<Allocator2>> func2Memory_;
+  std::vector<BufferArgPtr> func1Inputs_;
+  std::vector<BufferArgPtr> func1Outputs_;
+  std::vector<BufferArgPtr> func2Inputs_;
+  std::vector<BufferArgPtr> func2Outputs_;
+  std::shared_ptr<SparseMatrix1> sparse1_;
+  std::shared_ptr<SparseMatrix2> sparse2_;
+  std::shared_ptr<SequenceIdArg> seq1_;
+  std::shared_ptr<SequenceIdArg> seq2_;
+  test::CopyArgument<DType1, DType2> copyArg_;
+  std::function<void(BufferArg&, size_t)> initArgsCallback_;
+};
+
+class CpuGpuFuncCompare
+    : public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
+public:
+  CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
+      : Compare2Function(name + "-CPU", name + "-GPU", config) {}
+
+  ~CpuGpuFuncCompare() {}
 };
 
 }  // namespace paddle
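
A test built on the renamed harness follows the pattern from the class
comment above. A minimal sketch (the function name "SomeFunc" and the
"scale" config key are placeholders, not names from this patch):

    // Illustrative usage sketch of CpuGpuFuncCompare; "SomeFunc" and the
    // "scale" key stand in for a registered Function and its configuration.
    TEST(Example, CpuGpuConsistency) {
      CpuGpuFuncCompare test("SomeFunc", FuncConfig().set("scale", (real)1.0));
      // Arguments carry only value type and shape; the harness allocates
      // buffers, fills the first function's inputs, and copies them over.
      test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{128, 32}));
      test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{128, 32}));
      test.run();  // runs both Functions and compares their outputs
    }
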
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
new file mode 100644
index 0000000000..cbdbf5335d
--- /dev/null
+++ b/paddle/function/GemmConvOp.cpp
@@ -0,0 +1,515 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+#include "GemmFunctor.h"
+#include "Im2Col.h"
+#include "paddle/math/MemoryHandle.h"
+
+namespace paddle {
+
+/*
+ * \brief Forward calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColFunctor<kCFO, Device, real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
+        int M = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int K = inputChannels / groups_ * filterHeight * filterWidth;
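+        // One GEMM per group: output [M x N] = filter [M x K] * colData
+        // [K x N], computing every output position of the group at once.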
+        BlasGemm<Device, real>::compute(false,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        K,
+                                        colData,
+                                        N,
+                                        beta,
+                                        outputData + g * outputOffset,
+                                        N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifdef PADDLE_MOBILE_INFERENCE
+
+/*
+ * \brief Forward calculation of convolution, optimized for mobile.
+ */
+template <DeviceType Device>
+class GemmConvMobileFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // TODO(hedaoyuan): Need to define some index macros,
+    // to avoid using 0 and 1.
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
+    size_t colWidth = outputHeight * outputWidth;
+    // Max col matrix height 256, max col matrix width 2048.
+    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
+    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+
+      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColMobileFunctor<real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    int nStride = colWidth;
+    int kStride = colHeight;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          real beta_ = beta;
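+          // Tiled im2col + GEMM over the K dimension: the first colHeight
+          // tile writes with the caller's beta, later tiles accumulate
+          // their partial products with beta_ = 1.0.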
+          for (size_t colHeightStart = 0; colHeightStart < colHeight;
+               colHeightStart += stepColHeight) {
+            for (size_t colWidthStart = 0; colWidthStart < colWidth;
+                 colWidthStart += stepColWidth) {
+              int N = std::min(colWidth - colWidthStart, stepColWidth);
+              int K = std::min(colHeight - colHeightStart, stepColHeight);
+              // im2col
+              im2col(inputData + g * inputOffset,
+                     imShape,
+                     colData,
+                     colShape,
+                     strideH(),
+                     strideW(),
+                     paddingH(),
+                     paddingW(),
+                     dilationH(),
+                     dilationW(),
+                     colHeightStart,
+                     K,
+                     colWidthStart,
+                     N);
+
+              // gemm
+              int M = outputChannels / groups_;
+              BlasGemm<Device, real>::compute(
+                  false,
+                  false,
+                  M,
+                  N,
+                  K,
+                  1.0f,
+                  filterData + g * filterOffset + colHeightStart,
+                  kStride,
+                  colData,
+                  N,
+                  beta_,
+                  outputData + g * outputOffset + colWidthStart,
+                  nStride);
+            }
+            beta_ = 1.0;
+          }
+        } else {
+          int M = outputChannels / groups_;
+          int N = outputHeight * outputWidth;
+          int K = inputChannels / groups_ * filterHeight * filterWidth;
+          BlasGemm<Device, real>::compute(false,
+                                          false,
+                                          M,
+                                          N,
+                                          K,
+                                          1.0f,
+                                          filterData + g * filterOffset,
+                                          K,
+                                          inputData + g * inputOffset,
+                                          N,
+                                          beta,
+                                          outputData + g * outputOffset,
+                                          N);
+        }
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+
+    memory_.reset();
+  }
+};
+
+#endif
+
+/*
+ * \brief Backward input calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Col2ImFunctor<kCFO, Device, real> col2im;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        int K = outputChannels / groups_;
+        int N = outputHeight * outputWidth;
+        int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
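+        // colData(M x N) = filter(K x M)^T * outputGrad(K x N). When im2col
+        // is not needed, colData points directly at inputGrad and scale is
+        // 1.0, so the gradient is accumulated in place.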
+        BlasGemm<Device, real>::compute(true,
+                                        false,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        filterData + g * filterOffset,
+                                        M,
+                                        outputGrad + g * outputOffset,
+                                        N,
+                                        scale,
+                                        colData,
+                                        N);
+        if (needIm2col) {
+          col2im(inputGrad + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        }
+      }
+      inputGrad += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+/*
+ * \brief Backward filter calculation of convolution.
+ */
+template <DeviceType Device>
+class GemmConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    real beta;
+    if (outputs[0].getArgType() == ADD_TO) {
+      beta = 1.0;
+    } else {
+      beta = 0.0;
+    }
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
+
+    Im2ColFunctor<kCFO, Device, real> im2col;
+    size_t inputOffset = imShape.getElements();
+    size_t outputOffset =
+        (outputChannels / groups_) * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
+    for (size_t i = 0; i < batchSize; i++) {
+      for (size_t g = 0; g < groups_; g++) {
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW(),
+                 dilationH(),
+                 dilationW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
+        int M = outputChannels / groups_;
+        int K = outputHeight * outputWidth;
+        int N = inputChannels / groups_ * filterHeight * filterWidth;
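+        // filterGrad(M x N) += outputGrad(M x K) * colData(N x K)^T. beta is
+        // applied only for the first sample of the batch; the remaining
+        // samples always accumulate with a coefficient of 1.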
+        BlasGemm<Device, real>::compute(false,
+                                        true,
+                                        M,
+                                        N,
+                                        K,
+                                        1.0f,
+                                        outputGrad + g * outputOffset,
+                                        K,
+                                        colData,
+                                        K,
+                                        i == 0 ? beta : 1.0f,
+                                        filterGrad + g * filterOffset,
+                                        N);
+      }
+      inputData += inputChannels * inputHeight * inputWidth;
+      outputGrad += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifdef PADDLE_MOBILE_INFERENCE
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvMobileFunction);
+#else
+REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
+#endif
+REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
+REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
+REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
new file mode 100644
index 0000000000..b5b5e1f35b
--- /dev/null
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ConvOpTest.h"
+
+namespace paddle {
+
+TEST(GemmConv, NaiveConv) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(GemmConv, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+}
+
+TEST(GemmConv, BackwardInput) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+}
+
+TEST(GemmConv, BackwardFilter) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.cpp b/paddle/function/GemmFunctor.cpp
new file mode 100644
index 0000000000..9e25ee58a1
--- /dev/null
+++ b/paddle/function/GemmFunctor.cpp
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "GemmFunctor.h"
+#include "paddle/math/MathFunctions.h"
+
+namespace paddle {
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_CPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+#ifdef PADDLE_USE_EIGEN_FOR_BLAS
+    EigenBlasGemm<T>::compute(
+        transA, transB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
+#else
+    gemm<T>(transA == false ? CblasNoTrans : CblasTrans,
+            transB == false ? CblasNoTrans : CblasTrans,
+            M,
+            N,
+            K,
+            alpha,
+            A,
+            lda,
+            B,
+            ldb,
+            beta,
+            C,
+            ldc);
+#endif
+  }
+};
+
+template <class T>
+struct BlasGemm<DEVICE_TYPE_GPU, T> {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc) {
+    hl_matrix_mul(const_cast<T*>(A),
+                  transA == false ? HPPL_OP_N : HPPL_OP_T,
+                  const_cast<T*>(B),
+                  transB == false ? HPPL_OP_N : HPPL_OP_T,
+                  C,
+                  M,
+                  N,
+                  K,
+                  alpha,
+                  beta,
+                  lda,
+                  ldb,
+                  ldc);
+  }
+};
+
+template struct BlasGemm<DEVICE_TYPE_CPU, real>;
+template struct BlasGemm<DEVICE_TYPE_GPU, real>;
+
+}  // namespace paddle
diff --git a/paddle/function/GemmFunctor.h b/paddle/function/GemmFunctor.h
new file mode 100644
index 0000000000..0809953b4e
--- /dev/null
+++ b/paddle/function/GemmFunctor.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorType.h"
+
+namespace paddle {
+
+// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
+// cblas_dgemm parameter format, it is necessary to introduce BlasGemm as a
+// new interface. Later, when implementing MatMulFunction, we should consider
+// reconstructing the hl_matrix_mul interface.
+template <DeviceType Device, class T>
+struct BlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
+};
+
+// TODO(hedaoyuan): Since the definition of the real type in Paddle conflicts
+// with the Eigen library, the translation units that compile the Eigen code
+// cannot include the Paddle header files, so an EigenBlasGemm template class
+// that does not take the DeviceType parameter is needed.
+// I will fix this problem and merge BlasGemm and EigenBlasGemm into one.
+template <class T>
+struct EigenBlasGemm {
+  static void compute(const bool transA,
+                      const bool transB,
+                      const int M,
+                      const int N,
+                      const int K,
+                      const T alpha,
+                      const T* A,
+                      const int lda,
+                      const T* B,
+                      const int ldb,
+                      const T beta,
+                      T* C,
+                      const int ldc);
+};
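+
+// Illustrative usage sketch (not part of this header): computing C = A * B
+// for densely packed row-major matrices, where A is M x K, B is K x N and C
+// is M x N, so the leading dimensions are the row strides:
+//
+//   BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+//       false, false, M, N, K, 1.0f, A, K, B, N, 0.0f, C, N);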
+
+}  // namespace paddle
diff --git a/paddle/function/GruFunctor.h b/paddle/function/GruFunctor.h
new file mode 100644
index 0000000000..9f6392198e
--- /dev/null
+++ b/paddle/function/GruFunctor.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "GemmFunctor.h"
+#include "hl_cpu_gru.cuh"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+struct GruFunctor {
+  template <class OpResetOutput, class OpFinalOutput>
+  static void compute(OpResetOutput opResetOutput,
+                      OpFinalOutput opFinalOutput,
+                      hl_gru_value value,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__
+    if (value.prevOutValue) {
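+      // gateValue[:, 0:2 * frameSize] += prevOutValue(batchSize x frameSize)
+      // * gateWeight(frameSize x 2 * frameSize). ldc is 3 * frameSize because
+      // the update/reset gates share the gateValue buffer with the candidate
+      // state.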
+      BlasGemm<Device, T>::compute(false,
+                                   false,
+                                   batchSize,
+                                   2 * frameSize,
+                                   frameSize,
+                                   1,
+                                   value.prevOutValue,
+                                   frameSize,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,
+                                   value.gateValue,
+                                   frameSize * 3);
+    }
+
+    forward_reset_output(
+        opResetOutput, value, frameSize, batchSize, active_gate);
+
+    if (value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,
+                                   false,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   value.resetOutputValue,
+                                   frameSize,
+                                   value.stateWeight,
+                                   frameSize,
+                                   1,
+                                   value.gateValue + frameSize * 2,
+                                   frameSize * 3);
+    }
+
+    forward_final_output(
+        opFinalOutput, value, frameSize, batchSize, active_node);
+#endif
+  }
+};
+
+template <DeviceType Device, class T>
+struct GruGradFunctor {
+  template <class OpStateGrad, class OpResetGrad>
+  static void compute(OpStateGrad opStateGrad,
+                      OpResetGrad opResetGrad,
+                      hl_gru_value value,
+                      hl_gru_grad grad,
+                      int frameSize,
+                      int batchSize,
+                      hl_activation_mode_t active_node,
+                      hl_activation_mode_t active_gate) {
+#ifndef __NVCC__
+    backward_state_grad(
+        opStateGrad, value, grad, frameSize, batchSize, active_node);
+
+    if (value.prevOutValue && grad.prevOutGrad) {
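+      // resetOutputGrad(batchSize x frameSize) =
+      //     gateGrad[:, 2 * frameSize:](batchSize x frameSize) * stateWeight^T.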
+      BlasGemm<Device, T>::compute(false,
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize,
+                                   1,
+                                   grad.gateGrad + frameSize * 2,
+                                   frameSize * 3,
+                                   value.stateWeight,
+                                   frameSize,
+                                   0,
+                                   grad.resetOutputGrad,
+                                   frameSize);
+
+      if (grad.stateWeightGrad) {
+        BlasGemm<Device, T>::compute(true,
+                                     false,
+                                     frameSize,
+                                     frameSize,
+                                     batchSize,
+                                     1,
+                                     value.resetOutputValue,
+                                     frameSize,
+                                     grad.gateGrad + frameSize * 2,
+                                     frameSize * 3,
+                                     1,
+                                     grad.stateWeightGrad,
+                                     frameSize);
+      }
+    }
+
+    backward_reset_grad(
+        opResetGrad, value, grad, frameSize, batchSize, active_gate);
+
+    if (grad.prevOutGrad && value.prevOutValue) {
+      BlasGemm<Device, T>::compute(false,
+                                   true,
+                                   batchSize,
+                                   frameSize,
+                                   frameSize * 2,
+                                   1,
+                                   grad.gateGrad,
+                                   frameSize * 3,
+                                   value.gateWeight,
+                                   frameSize * 2,
+                                   1,
+                                   grad.prevOutGrad,
+                                   frameSize);
+
+      if (grad.gateWeightGrad) {
+        BlasGemm<Device, T>::compute(true,
+                                     false,
+                                     frameSize,
+                                     frameSize * 2,
+                                     batchSize,
+                                     1,
+                                     value.prevOutValue,
+                                     frameSize,
+                                     grad.gateGrad,
+                                     frameSize * 3,
+                                     1,
+                                     grad.gateWeightGrad,
+                                     frameSize * 2);
+      }
+    }
+#endif
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
new file mode 100644
index 0000000000..36a9bcf84e
--- /dev/null
+++ b/paddle/function/Im2Col.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "neon/neon_util.h"
+
+namespace paddle {
+
+/* The storage format of the colData in the Im2ColFunctor and Col2ImFunctor. */
+enum ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts three-dimensional image data (CHW) into five-dimensional
+ *        colData in the Im2ColFunctor calculation; the Col2ImFunctor
+ *        calculation performs the reverse conversion.
+ *
+ * \param imData   Image data.
+ * \param imShape  The shape of imData,
+ *                 [inputChannels, inputHeight, inputWidth].
+ * \param colData  Column data.
+ * \param colShape The shape of colData.
+ *
+ * If the template argument Format is kCFO, the shape of colData is:
+ * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ * so it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of the convolution matrix is [height, width], where height equals
+ * inputChannels * filterHeight * filterWidth and width equals
+ * outputHeight * outputWidth.
+ *
+ * Reshape:
+ *     shape of colData           shape of convolution matrix
+ *     [inputChannels,
+ *      filterHeight,
+ *      filterWidth,      ======>      [height, width]
+ *      outputHeight,
+ *      outputWidth]
+ *
+ * If the template argument Format is kOCF, the shape of colData is:
+ * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ * so it is easy to reshape into a sequence matrix for RNN calculation.
+ * The shape of the sequence matrix is [seqLength, stepSize], where seqLength
+ * equals outputHeight * outputWidth and stepSize equals
+ * inputChannels * filterHeight * filterWidth.
+ *
+ * Reshape:
+ *     shape of colData             shape of sequence matrix
+ *     [outputHeight,
+ *      outputWidth,
+ *      inputChannels,    ======>    [seqLength, stepSize]
+ *      filterHeight,
+ *      filterWidth]
+ *
+ * \note The caller needs to ensure that imShape.inputChannels is equal to
+ *       colShape.inputChannels.
+ */
+template <ColFormat Format, DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
+};
+
+template <ColFormat Format, DeviceType Device, class T>
+class Col2ImFunctor {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1);
+};
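+
+// A concrete example of the two layouts (illustrative only): with
+// inputChannels = 2, a 3 x 3 filter and a 4 x 5 output, the kCFO colData has
+// shape [2, 3, 3, 4, 5] and is viewed as an 18 x 20 convolution matrix, while
+// the kOCF colData has shape [4, 5, 2, 3, 3] and is viewed as a 20 x 18
+// sequence matrix; one is the transpose of the other.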
+
+template <class T>
+class Im2ColMobileFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
+                  int colHeightStart,
+                  int colHeightSize,
+                  int colWidthStart,
+                  int colWidthSize) {
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputWidth = colShape[4];
+
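+    // Only a colHeightSize x colWidthSize tile of the full col matrix is
+    // materialized: element (colh, colw) corresponds to row
+    // colHeightStart + colh and column colWidthStart + colw of the
+    // [inputChannels * filterHeight * filterWidth, outputHeight * outputWidth]
+    // matrix produced by the kCFO Im2ColFunctor.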
+    for (int colh = 0; colh < colHeightSize; colh++) {
+      int wOffset = (colHeightStart + colh) % filterWidth;
+      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
+      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
+
+      for (int colw = 0; colw < colWidthSize; colw++) {
+        int h = (colWidthStart + colw) / outputWidth;
+        int w = (colWidthStart + colw) % outputWidth;
+
+        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+        int imColIdx = w * strideWidth + wOffset * dilationWidth;
+        if ((imRowIdx - paddingHeight) < 0 ||
+            (imRowIdx - paddingHeight) >= inputHeight ||
+            (imColIdx - paddingWidth) < 0 ||
+            (imColIdx - paddingWidth) >= inputWidth) {
+          colData[colh * colWidthSize + colw] = static_cast<T>(0);
+        } else {
+          imRowIdx += c_im * inputHeight - paddingHeight;
+          imColIdx -= paddingWidth;
+          colData[colh * colWidthSize + colw] =
+              imData[imRowIdx * inputWidth + imColIdx];
+        }
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
new file mode 100644
index 0000000000..f864d42f80
--- /dev/null
+++ b/paddle/function/Im2ColOp.cpp
@@ -0,0 +1,245 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset * dilationHeight;
+          int imColIdx = w * strideWidth + wOffset * dilationWidth;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = T(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight = 1,
+                  int dilationWidth = 1) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset = outputH * strideHeight +
+                                filterH * dilationHeight - paddingHeight;
+              int imColOffset = outputW * strideWidth +
+                                filterW * dilationWidth - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000..71da11b955
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,464 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include "hl_device_functions.cuh"
+
+namespace paddle {
+
+template <class T>
+__global__ void im2col(const T* data_im,
+                       int numOuts,
+                       int height,
+                       int width,
+                       int blockH,
+                       int blockW,
+                       int strideH,
+                       int strideW,
+                       int paddingH,
+                       int paddingW,
+                       int dilationH,
+                       int dilationW,
+                       int height_col,
+                       int width_col,
+                       T* data_col) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in + i * dilationH);
+        int cIdx = int(w_in + j * dilationW);
+        if ((rIdx - (int)paddingH) >= (int)height ||
+            (rIdx - (int)paddingH) < 0 ||
+            (cIdx - (int)paddingW) >= (int)width ||
+            (cIdx - (int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in * height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx * width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
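+    // One GPU thread handles one (channel, outputH, outputW) triple and
+    // copies the corresponding filterHeight x filterWidth patch into colData.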
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 - 1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                    numKernels,
+                                                    inputHeight,
+                                                    inputWidth,
+                                                    filterHeight,
+                                                    filterWidth,
+                                                    strideHeight,
+                                                    strideWidth,
+                                                    paddingHeight,
+                                                    paddingWidth,
+                                                    dilationHeight,
+                                                    dilationWidth,
+                                                    outputHeight,
+                                                    outputWidth,
+                                                    colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2im(size_t n,
+                       const T* data_col,
+                       size_t height,
+                       size_t width,
+                       size_t channels,
+                       size_t blockH,
+                       size_t blockW,
+                       size_t strideH,
+                       size_t strideW,
+                       size_t paddingH,
+                       size_t paddingW,
+                       size_t dilationH,
+                       size_t dilationW,
+                       size_t height_col,
+                       size_t width_col,
+                       T* data_im) {
+  size_t index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    int filterH = (blockH - 1) * dilationH + 1;
+    int filterW = (blockW - 1) * dilationW + 1;
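+    // filterH and filterW are the effective (dilated) filter extents; each
+    // thread gathers every col element that reads its image pixel, which
+    // avoids atomic adds.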
+
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < (width - 2 * paddingW) &&
+        (h - (int)paddingH) >= 0 && (h - paddingH) < (height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+          (w < (int)filterW) ? 0 : (w - int(filterW)) / (int)strideW + 1;
+      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+          (h < (int)filterH) ? 0 : (h - (int)filterH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int h_k = (h - h_col * strideH);
+          int w_k = (w - w_col * strideW);
+          if (h_k % dilationH == 0 && w_k % dilationW == 0) {
+            h_k /= dilationH;
+            w_k /= dilationW;
+            int c_col =
+                (((c * blockH + h_k) * blockW + w_k) * height_col + h_col) *
+                    width_col +
+                w_col;
+            val += data_col[c_col];
+          }
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
+              h * (width - 2 * paddingW) + w] += val;
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
+                        (inputWidth + 2 * paddingWidth);
+
+    size_t blocks = (numKernels + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid atomic operations, one thread is launched per bottom (image)
+    // element, and each thread accumulates over the top (col) elements that
+    // map to it.
+    col2im<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
+        numKernels,
+        colData,
+        inputHeight + 2 * paddingHeight,
+        inputWidth + 2 * paddingWidth,
+        inputChannels,
+        filterHeight,
+        filterWidth,
+        strideHeight,
+        strideWidth,
+        paddingHeight,
+        paddingWidth,
+        dilationHeight,
+        dilationWidth,
+        outputHeight,
+        outputWidth,
+        imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
+template <class T>
+__global__ void im2colOCF(const T* imData,
+                          T* colData,
+                          int inputChannels,
+                          int inputHeight,
+                          int inputWidth,
+                          int filterHeight,
+                          int filterWidth,
+                          int strideHeight,
+                          int strideWidth,
+                          int paddingHeight,
+                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
+                          int outputHeight,
+                          int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z; channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
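+    // Pick a thread-block tile just large enough to cover the filter; the
+    // z dimension of the block walks the input channels.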
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                       colData,
+                                                       inputChannels,
+                                                       inputHeight,
+                                                       inputWidth,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       strideHeight,
+                                                       strideWidth,
+                                                       paddingHeight,
+                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
+                                                       outputHeight,
+                                                       outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2imOCF(T* imData,
+                          const T* colData,
+                          int inputChannels,
+                          int inputHeight,
+                          int inputWidth,
+                          int filterHeight,
+                          int filterWidth,
+                          int strideHeight,
+                          int strideWidth,
+                          int paddingHeight,
+                          int paddingWidth,
+                          int dilationHeight,
+                          int dilationWidth,
+                          int outputHeight,
+                          int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z; channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset =
+            idx * dilationWidth + swId * strideWidth - paddingWidth;
+        int heightOffset =
+            idy * dilationHeight + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= 0 && heightOffset < inputHeight &&
+            widthOffset >= 0 && widthOffset < inputWidth) {
+          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<T><<<grid, threads, 0, STREAM_DEFAULT>>>(imData,
+                                                       colData,
+                                                       inputChannels,
+                                                       inputHeight,
+                                                       inputWidth,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       strideHeight,
+                                                       strideWidth,
+                                                       paddingHeight,
+                                                       paddingWidth,
+                                                       dilationHeight,
+                                                       dilationWidth,
+                                                       outputHeight,
+                                                       outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
new file mode 100644
index 0000000000..3ba866dcdd
--- /dev/null
+++ b/paddle/function/Im2ColTest.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+                  TensorShape colShape2 = TensorShape({outputHeight,
+                                                       outputWidth,
+                                                       channels,
+                                                       filterHeight,
+                                                       filterWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(width, height, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, Device, T> im2Col1;
+                  Im2ColFunctor<kOCF, Device, T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+
+                  // The transpose of the result for ColFormat == kCFO
+                  // equals the result for ColFormat == kOCF.
+                  MatrixPtr test;
+                  output2->transpose(test, true);
+                  autotest::TensorCheckErr(*output1, *test);
+
+                  Col2ImFunctor<kCFO, Device, T> col2Im1;
+                  Col2ImFunctor<kOCF, Device, T> col2Im2;
+
+                  col2Im1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  col2Im2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape2,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  autotest::TensorCheckErr(*input1, *input2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifdef PADDLE_WITH_CUDA
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
+#endif
+
+template <class T>
+void TestIm2ColMobileFunctor() {
+  for (size_t channels : {32}) {
+    for (size_t inputHeight : {33, 100}) {
+      for (size_t inputWidth : {32, 96}) {
+        for (size_t filterHeight : {5}) {
+          for (size_t filterWidth : {7}) {
+            for (size_t stride : {2}) {
+              for (size_t padding : {1}) {
+                for (size_t dilation : {1, 3}) {
+                  size_t filterSizeH = (filterHeight - 1) * dilation + 1;
+                  size_t filterSizeW = (filterWidth - 1) * dilation + 1;
+                  if (inputHeight + 2 * padding < filterSizeH ||
+                      inputWidth + 2 * padding < filterSizeW)
+                    break;
+                  if (padding >= filterSizeH || padding >= filterSizeW) break;
+                  size_t outputHeight =
+                      (inputHeight - filterSizeH + 2 * padding) / stride + 1;
+                  size_t outputWidth =
+                      (inputWidth - filterSizeW + 2 * padding) / stride + 1;
+
+                  TensorShape imShape =
+                      TensorShape({channels, inputHeight, inputWidth});
+                  TensorShape colShape1 = TensorShape({channels,
+                                                       filterHeight,
+                                                       filterWidth,
+                                                       outputHeight,
+                                                       outputWidth});
+
+                  size_t height = channels * filterHeight * filterWidth;
+                  size_t width = outputHeight * outputWidth;
+                  VectorPtr input1 =
+                      Vector::create(imShape.getElements(), false);
+                  VectorPtr input2 =
+                      Vector::create(imShape.getElements(), false);
+                  MatrixPtr output1 =
+                      Matrix::create(height, width, false, false);
+                  MatrixPtr output2 =
+                      Matrix::create(height, width, false, false);
+                  input1->uniform(0.001, 1);
+                  input2->copyFrom(*input1);
+
+                  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> im2Col1;
+                  Im2ColMobileFunctor<T> im2Col2;
+                  im2Col1(input1->getData(),
+                          imShape,
+                          output1->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation);
+                  im2Col2(input2->getData(),
+                          imShape,
+                          output2->getData(),
+                          colShape1,
+                          stride,
+                          stride,
+                          padding,
+                          padding,
+                          dilation,
+                          dilation,
+                          0,
+                          height,
+                          0,
+                          width);
+
+                  autotest::TensorCheckEqual(*output1, *output2);
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, Mobile) { TestIm2ColMobileFunctor<float>(); }
+
+}  // namespace paddle
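The loop bounds in the tests above follow the standard convolution arithmetic: with dilation, the effective filter extent is (filterSize - 1) * dilation + 1, and outputSize = (inputSize + 2 * padding - effectiveFilterSize) / stride + 1. A quick worked check at one of the test points (purely illustrative, not part of the test suite):

#include <cstddef>
#include <iostream>

int main() {
  // One (inputHeight, filterHeight, dilation, padding, stride) point
  // taken from the loops above.
  size_t inputHeight = 33, filterHeight = 5, dilation = 3;
  size_t padding = 1, stride = 2;
  size_t filterSizeH = (filterHeight - 1) * dilation + 1;  // 13
  size_t outputHeight =
      (inputHeight - filterSizeH + 2 * padding) / stride + 1;  // 12
  std::cout << filterSizeH << " " << outputHeight << "\n";  // prints "13 12"
  return 0;
}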
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 91b4b8ed91..704a8c4132 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -13,18 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MulOp.h"
-/// todo(tianbing), delete it
-#include <iostream>
-#include "paddle/math/MathFunctions.h"
+#include "GemmFunctor.h"
 #include "paddle/math/SIMDFunctions.h"
 #include "paddle/utils/ThreadLocal.h"
 
-#ifndef PADDLE_TYPE_DOUBLE
-#define GEMM paddle::gemm<float>
-#else
-#define GEMM paddle::gemm<double>
-#endif
-
 namespace {
 inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
@@ -114,19 +106,20 @@ void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
                             real scaleT,
                             bool aTrans,
                             bool bTrans) {
-  GEMM(aTrans ? CblasTrans : CblasNoTrans,
-       bTrans ? CblasTrans : CblasNoTrans,
-       out.getHeight(),
-       out.getWidth(),
-       !aTrans ? a.getWidth() : a.getHeight(),
-       scaleAB,
-       a.getData(),
-       a.getStride(),
-       b.getData(),
-       b.getStride(),
-       scaleT,
-       out.getData(),
-       out.getStride());
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
+      aTrans,
+      bTrans,
+      out.getHeight(),
+      out.getWidth(),
+      !aTrans ? a.getWidth() : a.getHeight(),
+      scaleAB,
+      a.getData(),
+      a.getStride(),
+      b.getData(),
+      b.getStride(),
+      scaleT,
+      out.getData(),
+      out.getStride());
 }
 
 /// dense matrix (+)= sparse matrix * dense matrix
@@ -348,7 +341,7 @@ private:
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
 #endif
 }  // namespace paddle
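The change above replaces the preprocessor-selected GEMM macro with the device- and type-templated BlasGemm<DEVICE_TYPE_CPU, real>::compute from GemmFunctor.h, so the float/double choice is made by the template system rather than by a macro. A rough sketch of the idea, assuming a CBLAS backend (the actual interface in GemmFunctor.h may differ; cblas_sgemm and cblas_dgemm are the standard CBLAS entry points):

#include <cblas.h>

// Minimal type-dispatching GEMM wrapper for row-major matrices with
// explicit leading dimensions; illustrative only.
template <class T>
struct CpuBlasGemm;

template <>
struct CpuBlasGemm<float> {
  static void compute(bool transA, bool transB, int M, int N, int K,
                      float alpha, const float* A, int lda, const float* B,
                      int ldb, float beta, float* C, int ldc) {
    cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
                transB ? CblasTrans : CblasNoTrans, M, N, K, alpha, A, lda,
                B, ldb, beta, C, ldc);
  }
};

template <>
struct CpuBlasGemm<double> {
  static void compute(bool transA, bool transB, int M, int N, int K,
                      double alpha, const double* A, int lda, const double* B,
                      int ldb, double beta, double* C, int ldc) {
    cblas_dgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
                transB ? CblasTrans : CblasNoTrans, M, N, K, alpha, A, lda,
                B, ldb, beta, C, ldc);
  }
};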
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
index dcfcb2325d..9449b89056 100644
--- a/paddle/function/MulOpGpu.cu
+++ b/paddle/function/MulOpGpu.cu
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "MulOp.h"
+#include "hl_base.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
index 8753057ebf..d31eb0c74f 100644
--- a/paddle/function/MulOpTest.cpp
+++ b/paddle/function/MulOpTest.cpp
@@ -35,7 +35,7 @@ void testFuncDDDMatrix(
   size_t heightC = dimM;
   size_t widthC = dimN;
   // init Test object
-  FunctionCompare test(
+  CpuGpuFuncCompare test(
       "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
   // prepare input arguments
   /// matrix A : HA * WA
@@ -81,8 +81,8 @@ void testFuncDSparseDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// sparse matrix A : M * K
   test.addInputs(SparseMatrixArg(
@@ -126,8 +126,8 @@ void testFuncDDSparseMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
@@ -172,8 +172,8 @@ void testFuncSparseDDMatrix(
     size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
   real scaleT = 1.0;
   // init Test object
-  FunctionCompare test("MulOp",
-                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  CpuGpuFuncCompare test(
+      "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
   // prepare input arguments
   /// matrix A : M * K
   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
new file mode 100644
index 0000000000..e0692fa06d
--- /dev/null
+++ b/paddle/function/NaiveConvOp.cpp
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvOp.h"
+
+namespace paddle {
+
+/*
+ * The three arguments are stored in memory in row major order.
+ * inputData  = [batchSize, inputChannels, inputHeight, inputWidth]
+ * filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
+ * outputData = [batchSize, outputChannels, outputHeight, outputWidth]
+ */
+template <class T>
+class NaiveConvFunctor {
+public:
+  void operator()(const T* inputData,
+                  size_t batchSize,
+                  size_t inputChannels,
+                  size_t inputHeight,
+                  size_t inputWidth,
+                  const T* filterData,
+                  size_t filterHeight,
+                  size_t filterWidth,
+                  T* outputData,
+                  size_t outputChannels,
+                  size_t outputHeight,
+                  size_t outputWidth,
+                  size_t paddingH,
+                  size_t paddingW,
+                  size_t strideH,
+                  size_t strideW) {
+    for (size_t batch = 0; batch < batchSize; batch++) {
+      for (size_t outC = 0; outC < outputChannels; outC++) {
+        for (size_t outH = 0; outH < outputHeight; outH++) {
+          for (size_t outW = 0; outW < outputWidth; outW++) {
+            const int inStartH = (outH * strideH) - paddingH;
+            const int inStartW = (outW * strideW) - paddingW;
+            T outValue = (T)0;
+            for (size_t inC = 0; inC < inputChannels; inC++) {
+              for (size_t fH = 0; fH < filterHeight; fH++) {
+                for (size_t fW = 0; fW < filterWidth; fW++) {
+                  T inValue;
+                  const int inH = inStartH + fH;
+                  const int inW = inStartW + fW;
+                  if ((inH >= 0 && inH < (int)inputHeight) &&
+                      (inW >= 0 && inW < (int)inputWidth)) {
+                    size_t offsetInput =
+                        batch * inputChannels * inputHeight * inputWidth +
+                        inC * inputHeight * inputWidth + inH * inputWidth + inW;
+                    inValue = inputData[offsetInput];
+                  } else {
+                    inValue = (T)0;
+                  }
+                  size_t offsetFilter =
+                      outC * inputChannels * filterHeight * filterWidth +
+                      inC * filterHeight * filterWidth + fH * filterWidth + fW;
+                  T filterValue = filterData[offsetFilter];
+                  outValue += (inValue * filterValue);
+                }
+              }
+            }
+
+            size_t offset =
+                batch * outputChannels * outputHeight * outputWidth +
+                outC * outputHeight * outputWidth + outH * outputWidth + outW;
+            outputData[offset] = outValue;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <DeviceType Device>
+class NaiveConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+
+    size_t batchSize = inputs[0].shape()[0];
+    size_t inputChannels = inputs[0].shape()[1];
+    size_t inputHeight = inputs[0].shape()[2];
+    size_t inputWidth = inputs[0].shape()[3];
+    size_t filterHeight = inputs[1].shape()[2];
+    size_t filterWidth = inputs[1].shape()[3];
+    size_t outputChannels = outputs[0].shape()[1];
+    size_t outputHeight = outputs[0].shape()[2];
+    size_t outputWidth = outputs[0].shape()[3];
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+    NaiveConvFunctor<real> conv;
+    conv(inputData,
+         batchSize,
+         inputChannels,
+         inputHeight,
+         inputWidth,
+         filterData,
+         filterHeight,
+         filterWidth,
+         outputData,
+         outputChannels,
+         outputHeight,
+         outputWidth,
+         paddingH(),
+         paddingW(),
+         strideH(),
+         strideW());
+  }
+};
+
+REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
+
+}  // namespace paddle
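NaiveConvFunctor is intentionally a straight seven-loop implementation, which makes it a convenient ground-truth oracle for the optimized convolution paths. A hypothetical direct call, assuming the functor were visible to a test (values chosen so the result is easy to check by hand):

#include <iostream>

int main() {
  // 1 batch, 1 input channel, 3x3 input, 1 output channel, 2x2 filter,
  // stride 1, padding 0 -> 2x2 output.
  float input[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  float filter[4] = {1, 0, 0, 1};  // picks in(h, w) + in(h + 1, w + 1)
  float output[4] = {};
  paddle::NaiveConvFunctor<float> conv;
  conv(input, /*batchSize=*/1, /*inputChannels=*/1, /*inputHeight=*/3,
       /*inputWidth=*/3, filter, /*filterHeight=*/2, /*filterWidth=*/2,
       output, /*outputChannels=*/1, /*outputHeight=*/2, /*outputWidth=*/2,
       /*paddingH=*/0, /*paddingW=*/0, /*strideH=*/1, /*strideW=*/1);
  for (float v : output) std::cout << v << " ";  // 6 8 12 14
  return 0;
}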
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index adba7c92ec..eed2f2e308 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -207,7 +207,7 @@ private:
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
 #endif
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
index 9094f15284..5b6f4e6832 100644
--- a/paddle/function/PadOpGpu.cu
+++ b/paddle/function/PadOpGpu.cu
@@ -12,15 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_base.h"
 #include "PadOp.h"
+#include "hl_base.h"
 
 namespace paddle {
 
-__global__ void KePad(real* outputs, const real* inputs,
-                      int inC, int inH, int inW,
-                      int padc, int padh, int padw,
-                      int outC, int outH, int outW, int nthreads) {
+__global__ void KePad(real* outputs,
+                      const real* inputs,
+                      int inC,
+                      int inH,
+                      int inW,
+                      int padc,
+                      int padh,
+                      int padw,
+                      int outC,
+                      int outH,
+                      int outW,
+                      int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;
@@ -50,16 +58,33 @@ void Pad<DEVICE_TYPE_GPU>(real* outputs,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (outputs, inputs, inC, inH, inW, cstart, hstart, wstart,
-     outC, outH, outW, nth);
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(outputs,
+                                                    inputs,
+                                                    inC,
+                                                    inH,
+                                                    inW,
+                                                    cstart,
+                                                    hstart,
+                                                    wstart,
+                                                    outC,
+                                                    outH,
+                                                    outW,
+                                                    nth);
   CHECK_SYNC("Pad");
 }
 
-__global__ void KePadDiff(real* inGrad, const real* outGrad,
-                          int inC, int inH, int inW,
-                          int padc, int padh, int padw,
-                          int outC, int outH, int outW, int nthreads) {
+__global__ void KePadDiff(real* inGrad,
+                          const real* outGrad,
+                          int inC,
+                          int inH,
+                          int inW,
+                          int padc,
+                          int padh,
+                          int padw,
+                          int outC,
+                          int outH,
+                          int outW,
+                          int nthreads) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < nthreads) {
     const int w = idx % inW;
@@ -89,9 +114,18 @@ void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
   int outC = inC + cstart + cend;
   int outH = inH + hstart + hend;
   int outW = inW + wstart + wend;
-  KePadDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
-    (inGrad, outGrad, inC, inH, inW, cstart, hstart, wstart,
-     outC, outH, outW, nth);
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(inGrad,
+                                                        outGrad,
+                                                        inC,
+                                                        inH,
+                                                        inW,
+                                                        cstart,
+                                                        hstart,
+                                                        wstart,
+                                                        outC,
+                                                        outH,
+                                                        outW,
+                                                        nth);
   CHECK_SYNC("PadGrad");
 }
 
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index f77ac2a8c4..e286f4e5b8 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -25,7 +25,7 @@ TEST(Pad, real) {
           VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                   << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
           for (bool test_grad : {false, true}) {
-            FunctionCompare compare(
+            CpuGpuFuncCompare compare(
                 test_grad ? "PadGrad" : "Pad",
                 FuncConfig()
                     .set<std::vector<uint32_t>>("channel", {2, 3})
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
new file mode 100644
index 0000000000..7c802d6627
--- /dev/null
+++ b/paddle/function/RowConvOp.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief The row convolution is also called lookahead convolution. It was
+ * first introduced in the Deep Speech 2 system. A bidirectional RNN learns a
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connectivity of row convolution differs from that of 1D sequence
+ * convolution. Assume the future context length is k; that is, the output at
+ * timestep t is computed from the input features at timesteps t through
+ * (t+k). Assume the hidden dimension of the input activations is d; the
+ * activation r(t, i) of the new layer at timestep t is then:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(j,i) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[1]  The filter (or weight), whose shape is (k+1) x d.
+ * \param outputs[0] The output activations.
+ *
+ * [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
+ *     English and Mandarin. https://arxiv.org/abs/1512.02595
+ */
+
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+
+/**
+ * \brief The backward of the row convolution function. This function
+ * computes the gradient w.r.t the filter and the gradient w.r.t the input
+ * activations (or data).
+ *
+ * Arguments in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight), whose shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t input activations.
+ * \param outputs[1] The gradient w.r.t filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ? wGrad.matrix<Device>()
+                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+  }
+};
+
+REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
+#endif
+
+}  // namespace paddle
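To make the per-element formula above concrete, here is a self-contained scalar reference for a single sequence (illustrative names; it mirrors what the CPU path computes, including truncating the context window at the end of the sequence):

#include <vector>

// Scalar reference for row (lookahead) convolution on one sequence.
// in: steps x dim, filter: context x dim, out: steps x dim (accumulated),
// matching the ADD_TO semantics of the Function above.
void rowConvRef(const std::vector<float>& in, const std::vector<float>& filter,
                std::vector<float>& out, int steps, int dim, int context) {
  for (int t = 0; t < steps; ++t) {
    for (int i = 0; i < dim; ++i) {
      float sum = 0;
      for (int j = 0; j < context && t + j < steps; ++j) {
        sum += filter[j * dim + i] * in[(t + j) * dim + i];
      }
      out[t * dim + i] += sum;
    }
  }
}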
diff --git a/paddle/function/RowConvOp.h b/paddle/function/RowConvOp.h
new file mode 100644
index 0000000000..2c5de6151a
--- /dev/null
+++ b/paddle/function/RowConvOp.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief The forward of row convolution.
+ *
+ * \param[out] out      The output data, whose shape is h x d. h is the total
+ *                      number of time steps over all samples in a mini-batch.
+ * \param[in]  in       The input data, whose shape is h x d.
+ * \param[in]  filter   The filter, whose shape is k x d. k equals the number
+ *                      of lookahead steps plus one.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConv(typename Tensor<real, DType>::Matrix& out,
+             const typename Tensor<real, DType>::Matrix& in,
+             const typename Tensor<real, DType>::Matrix& filter,
+             const typename Tensor<int, DType>::Vector& seq);
+
+/**
+ * \brief The backward of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t input data.
+ * \param[out] filterG  The gradient w.r.t filter.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
+                 const typename Tensor<real, DType>::Matrix& in,
+                 const typename Tensor<real, DType>::Matrix& filter,
+                 typename Tensor<real, DType>::Matrix& inG,
+                 typename Tensor<real, DType>::Matrix& filterG,
+                 const typename Tensor<int, DType>::Vector& seq);
+}  // namespace paddle
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
new file mode 100644
index 0000000000..b0cbd9fd1d
--- /dev/null
+++ b/paddle/function/RowConvOpGpu.cu
@@ -0,0 +1,368 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y,
+                          const real* x,
+                          const real* w,
+                          const int* starts,
+                          const int height,
+                          const int width,
+                          const int numSeq,
+                          const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConv2(real* y,
+                           const real* x,
+                           const real* w,
+                           const int* starts,
+                           const int height,
+                           const int width,
+                           const int numSeq,
+                           const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+        y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
+
+template <int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw,
+                                  const real* x,
+                                  const real* dy,
+                                  const int* starts,
+                                  const int height,
+                                  const int width,
+                                  const int numSeq,
+                                  const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
+  }
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] =
+          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] =
+            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
+        // Warp size and blockDim.x are both 32, so this sums across the warp.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw,
+                                   const real* x,
+                                   const real* dy,
+                                   const int* starts,
+                                   const int height,
+                                   const int width,
+                                   const int numSeq,
+                                   const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      __syncthreads();
+
+      for (int t = 0; t < context; t++) {
+        sh_dy[tidx][tidy] =
+            (xoff < width && (yoff - t) >= start && yoff - t < end)
+                ? dy[(yoff - t) * width + xoff]
+                : 0.0;
+        __syncthreads();
+
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
+        // Warp size and blockDim.x are both 32, so this sums across the warp.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t * width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
+
+template <int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx,
+                                const real* w,
+                                const real* dy,
+                                const int* starts,
+                                const int height,
+                                const int width,
+                                const int numSeq,
+                                const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i * width + gidx + tidx] : 0.0;
+  }
+
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConvBwData2(real* dx,
+                                 const real* w,
+                                 const real* dy,
+                                 const int* starts,
+                                 const int height,
+                                 const int width,
+                                 const int numSeq,
+                                 const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                                  const GpuMatrix& in,
+                                  const GpuMatrix& filter,
+                                  GpuMatrix& inG,
+                                  GpuMatrix& filterG,
+                                  const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 32) {
+      KeRowConvBwWeight<32, 32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+          dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>(
+          dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64><<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+          dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2<<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>(
+          dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+
+  CHECK_SYNC("RowConvGrad");
+}
+
+}  // namespace paddle
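The weight-gradient kernels above reduce partial products across a 32-lane warp with __shfl_down; since blockDim.x is 32, each row of threads is exactly one warp. (On CUDA 9 and later, the synchronized variant __shfl_down_sync(mask, ...) replaces this intrinsic.) The reduction pattern in isolation:

// Minimal warp-sum sketch of the shuffle reduction used above; assumes the
// full warp is active.
__device__ float warpReduceSum(float val) {
  // Halve the stride each step: 16, 8, 4, 2, 1. After the loop, lane 0
  // holds the sum of all 32 lanes.
  for (int offset = 16; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset);
  }
  return val;
}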
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp
new file mode 100644
index 0000000000..f52d18b049
--- /dev/null
+++ b/paddle/function/RowConvOpTest.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  CpuGpuFuncCompare test("RowConv", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
+
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+TEST(RowConv, real) {
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOp.cpp b/paddle/function/ScaleSubRegionOp.cpp
new file mode 100644
index 0000000000..a080505d7d
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.cpp
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "paddle/function/TensorShape.h"
+
+namespace paddle {
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_CPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  memcpy(outputs, inputs, number * channel * height * width * sizeof(real));
+
+  for (int n = 0; n < number; ++n) {
+    // indices start from 1
+    int offset = n * 6;
+    for (int c = indices[offset] - 1; c < indices[offset + 1]; ++c) {
+      for (int h = indices[offset + 2] - 1; h < indices[offset + 3]; ++h) {
+        for (int w = indices[offset + 4] - 1; w < indices[offset + 5]; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          outputs[idx] *= value;
+        }
+      }
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  for (int n = 0; n < number; ++n) {
+    for (int c = 0; c < channel; ++c) {
+      for (int h = 0; h < height; ++h) {
+        for (int w = 0; w < width; ++w) {
+          int idx = ((n * channel + c) * height + h) * width + w;
+          int offset = n * 6;
+          if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+              h >= (indices[offset + 2] - 1) &&
+              h <= (indices[offset + 3] - 1) &&
+              w >= (indices[offset + 4] - 1) &&
+              w <= (indices[offset + 5] - 1)) {
+            outGrad[idx] += inGrad[idx] * value;
+          } else {
+            outGrad[idx] += inGrad[idx];
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief For each instance, ScaleSubRegion multiplies the values in a
+ *        specified contiguous sub-region by a scalar. By providing start and
+ *        end indices for C/H/W, you can specify the location and shape of
+ *        the region.
+ *
+ * Arguments in this Function:
+ * \param inputs    A 4-D tensor with shape [N, C, H, W], only one input.
+ * \param indices   A 2-D tensor with shape [N, 6], indicating the sub-region.
+ * \param outputs   A 4-D tensor with the same shape as inputs, output value.
+ */
+template <DeviceType Device>
+class ScaleSubRegionFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegion<Device>(outputs[0].data<real>(),
+                           inputs[0].data<real>(),
+                           inputs[1].data<real>(),
+                           shape,
+                           conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of the ScaleSubRegion Function.
+ *
+ * Arguments in this Function:
+ * \param inputs  A 4-D tensor with shape [N, C, H, W], the output gradient.
+ * \param indices A 2-D tensor with shape [N, 6], indicating the sub-region.
+ * \param outputs A 4-D tensor with shape [N, C, H, W], gradient of the input.
+ */
+
+template <DeviceType Device>
+class ScaleSubRegionGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape shape = inputs[0].shape();
+
+    ScaleSubRegionGrad<Device>(inputs[0].data<real>(),
+                               outputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               shape,
+                               conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(ScaleSubRegion, CPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, CPU, ScaleSubRegionGradFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(ScaleSubRegion, GPU, ScaleSubRegionFunc);
+REGISTER_TYPED_FUNC(ScaleSubRegionGrad, GPU, ScaleSubRegionGradFunc);
+#endif
+
+}  // namespace paddle
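The six indices per sample are 1-based and inclusive: {cStart, cEnd, hStart, hEnd, wStart, wEnd}. A hypothetical direct call to the CPU kernel above (in practice it is reached through the Function registry) shows the semantics on a tiny [1, 2, 2, 2] tensor:

// Assumes the paddle headers above are available; illustrative only.
void scaleSubRegionExample() {
  real input[8] = {1, 1, 1, 1, 1, 1, 1, 1};  // shape [1, 2, 2, 2]
  real output[8];
  real indices[6] = {1, 1, 1, 1, 1, 2};  // c in [1,1], h in [1,1], w in [1,2]
  FuncConfig conf;
  conf.set<real>("value", 10.0);
  ScaleSubRegion<DEVICE_TYPE_CPU>(
      output, input, indices, TensorShape({1, 2, 2, 2}), conf);
  // output == {10, 10, 1, 1, 1, 1, 1, 1}: only channel 0, row 0 is scaled.
}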
diff --git a/paddle/function/ScaleSubRegionOp.h b/paddle/function/ScaleSubRegionOp.h
new file mode 100644
index 0000000000..0480c8577f
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOp.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief Function to multiply the values in a specified contiguous
+ *        sub-region by a scalar. Indices must be provided to indicate the
+ *        location and shape of the region, and the multiplier is passed via
+ *        the config variable.
+ *
+ * \param[out] outputs  Output value.
+ * \param[in]  inputs   Input data in NCHW layout.
+ * \param[in]  indices  Indices data indicating the sub-region.
+ * \param[in]  shape    Tensor shape of the input value.
+ * \param[in]  conf     Config variable which contains the multiplier value.
+ */
+template <DeviceType Device>
+void ScaleSubRegion(real* outputs,
+                    const real* inputs,
+                    const real* indices,
+                    const TensorShape shape,
+                    const FuncConfig& conf);
+
+/**
+ * \brief Backward propagation function of ScaleSubRegion.
+ *
+ * \param[in]  inGrad   The incoming gradient, i.e. the gradient w.r.t the
+ *                      output of ScaleSubRegion.
+ * \param[out] outGrad  The outgoing gradient, i.e. the gradient w.r.t the
+ *                      input, propagated to the previous layer.
+ * \param[in]  indices  Indices data.
+ * \param[in]  shape    The shape of the input tensor.
+ * \param[in]  conf     Config variable.
+ */
+template <DeviceType Device>
+void ScaleSubRegionGrad(const real* inGrad,
+                        real* outGrad,
+                        const real* indices,
+                        const TensorShape shape,
+                        const FuncConfig& conf);
+}  // namespace paddle
diff --git a/paddle/function/ScaleSubRegionOpGpu.cu b/paddle/function/ScaleSubRegionOpGpu.cu
new file mode 100644
index 0000000000..8aae2e44c3
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeScaleSubRegion(real* outputs,
+                                 const real* inputs,
+                                 const real* indices,
+                                 real value,
+                                 int channel,
+                                 int height,
+                                 int width,
+                                 int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outputs[idx] = inputs[idx] * value;
+    } else {
+      outputs[idx] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegion<DEVICE_TYPE_GPU>(real* outputs,
+                                     const real* inputs,
+                                     const real* indices,
+                                     const TensorShape shape,
+                                     const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegion<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegion");
+}
+
+__global__ void KeScaleSubRegionDiff(const real* inGrad,
+                                     real* outGrad,
+                                     const real* indices,
+                                     real value,
+                                     int channel,
+                                     int height,
+                                     int width,
+                                     int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % width;
+    const int h = (idx / width) % height;
+    const int c = (idx / width / height) % channel;
+    const int n = idx / width / height / channel;
+
+    const int offset = n * 6;
+    if (c >= (indices[offset] - 1) && c <= (indices[offset + 1] - 1) &&
+        h >= (indices[offset + 2] - 1) && h <= (indices[offset + 3] - 1) &&
+        w >= (indices[offset + 4] - 1) && w <= (indices[offset + 5] - 1)) {
+      outGrad[idx] += inGrad[idx] * value;
+    } else {
+      outGrad[idx] += inGrad[idx];
+    }
+  }
+}
+
+template <>
+void ScaleSubRegionGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                                         real* outGrad,
+                                         const real* indices,
+                                         const TensorShape shape,
+                                         const FuncConfig& conf) {
+  real value = conf.get<real>("value");
+
+  int number = shape[0];
+  int channel = shape[1];
+  int height = shape[2];
+  int width = shape[3];
+
+  size_t nth = number * channel * height * width;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeScaleSubRegionDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, indices, value, channel, height, width, nth);
+  CHECK_SYNC("ScaleSubRegionGrad");
+}
+
+}  // namespace paddle
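Both kernels recover `(n, c, h, w)` from the flat thread index by successive division, and the launches round the element count up to a whole number of blocks. A host-side sketch of the same arithmetic (illustration only, not part of the patch):

// Flat-index decomposition for NCHW layout: width varies fastest.
struct Nchw { int n, c, h, w; };
inline Nchw decompose(int idx, int channel, int height, int width) {
  Nchw p;
  p.w = idx % width;
  p.h = (idx / width) % height;
  p.c = (idx / width / height) % channel;
  p.n = idx / width / height / channel;
  return p;
}

// One thread per element, rounded up to a multiple of the block size,
// matching the <<<gridSize, blockSize>>> launches above.
inline int gridSizeFor(size_t nth, int blockSize = 1024) {
  return static_cast<int>((nth + blockSize - 1) / blockSize);
}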
diff --git a/paddle/function/ScaleSubRegionOpTest.cpp b/paddle/function/ScaleSubRegionOpTest.cpp
new file mode 100644
index 0000000000..43331f258d
--- /dev/null
+++ b/paddle/function/ScaleSubRegionOpTest.cpp
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(ScaleSubRegion, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 32}) {
+      for (size_t imgSizeH : {5, 33}) {
+        for (size_t imgSizeW : {5, 32}) {
+          for (real value : {-0.5, 0.0, 0.5}) {
+            for (bool firstHalf : {false, true}) {
+              VLOG(3) << " numSamples=" << numSamples
+                      << " channels=" << channels << " imgSizeH=" << imgSizeH
+                      << " imgSizeW=" << imgSizeW;
+
+              for (bool testGrad : {false, true}) {
+                CpuGpuFuncCompare compare(
+                    testGrad ? "ScaleSubRegionGrad" : "ScaleSubRegion",
+                    FuncConfig().set<real>("value", value));
+
+                TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+                TensorShape indicesShape{numSamples, 6};
+
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+                compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, indicesShape));
+
+                compare.registerInitCallback([=](BufferArg& arg, size_t index) {
+                  if (index == 1) {
+                    real* data = (real*)arg.data();
+
+                    for (size_t i = 0; i < numSamples; ++i) {
+                      size_t offset = i * 6;
+                      data[offset] = firstHalf ? 1 : channels / 2;
+                      data[offset + 1] = firstHalf ? channels / 2 : channels;
+                      data[offset + 2] = firstHalf ? 1 : imgSizeH / 2;
+                      data[offset + 3] = firstHalf ? imgSizeH / 2 : imgSizeH;
+                      data[offset + 4] = firstHalf ? 1 : imgSizeW / 2;
+                      data[offset + 5] = firstHalf ? imgSizeW / 2 : imgSizeW;
+                    }
+                  }
+                });
+
+                compare.addOutputs(
+                    BufferArg(
+                        VALUE_TYPE_FLOAT, shape, testGrad ? ADD_TO : ASSIGN_TO),
+                    testGrad ? ADD_TO : ASSIGN_TO);
+                compare.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
new file mode 100644
index 0000000000..597723a2dd
--- /dev/null
+++ b/paddle/function/SwitchOp.cpp
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void NCHW2NHWC<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int c = 0; c < inC; ++c) {
+      for (int h = 0; h < inH; ++h) {
+        for (int w = 0; w < inW; ++w) {
+          if (argType == ADD_TO) {
+            outputs[((n * inH + h) * inW + w) * inC + c] += *(inputs++);
+          } else {
+            outputs[((n * inH + h) * inW + w) * inC + c] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <>
+void NHWC2NCHW<DEVICE_TYPE_CPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  for (int n = 0; n < num; ++n) {
+    for (int h = 0; h < inH; ++h) {
+      for (int w = 0; w < inW; ++w) {
+        for (int c = 0; c < inC; ++c) {
+          if (argType == ADD_TO) {
+            outputs[((n * inC + c) * inH + h) * inW + w] += *(inputs++);
+          } else {
+            outputs[((n * inC + c) * inH + h) * inW + w] = *(inputs++);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief  Switch the dimension order of an image input.
+ *         The input and output are 4D tensors. Switches the order
+ *         'batch_size, channels, height, width' to
+ *         'batch_size, height, width, channels'.
+ *
+ * Arguments in this Function:
+ * \param inputs  input data with order 'batch_size, channels, height, width'.
+ * \param outputs output data with order 'batch_size, height, width, channels'.
+ */
+template <DeviceType Device>
+class NCHW2NHWCFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    NCHW2NHWC<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inC,
+                      inH,
+                      inW,
+                      outputs[0].getArgType());
+  }
+};
+
+/**
+ * \brief  Switch the dimension order of an image input.
+ *         The input and output are 4D tensors. Switches the order
+ *         'batch_size, height, width, channels' to
+ *         'batch_size, channels, height, width'.
+ *
+ * Arguments in this Function:
+ * \param inputs  input data with order 'batch_size, height, width, channels'.
+ * \param outputs output data with order 'batch_size, channels, height, width'.
+ */
+template <DeviceType Device>
+class NHWC2NCHWFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+
+    size_t num = inputs[0].shape()[0];
+    size_t inH = inputs[0].shape()[1];
+    size_t inW = inputs[0].shape()[2];
+    size_t inC = inputs[0].shape()[3];
+
+    NHWC2NCHW<Device>(outputs[0].data<real>(),
+                      inputs[0].data<real>(),
+                      num,
+                      inH,
+                      inW,
+                      inC,
+                      outputs[0].getArgType());
+  }
+};
+
+REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
+#ifdef PADDLE_WITH_CUDA
+REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
+REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
+#endif
+
+}  // namespace paddle
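The two functions are exact inverses: each reads one layout's linearization and writes the other's. A small standalone sketch of the offset arithmetic with a round-trip check (illustration only, not part of the patch):

#include <cassert>
#include <vector>

inline int nchwOffset(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * C + c) * H + h) * W + w;
}
inline int nhwcOffset(int n, int c, int h, int w, int C, int H, int W) {
  return ((n * H + h) * W + w) * C + c;
}

// NCHW -> NHWC -> NCHW must reproduce the input exactly.
void roundTrip(int N, int C, int H, int W) {
  std::vector<float> a(static_cast<size_t>(N) * C * H * W), b(a.size());
  for (size_t i = 0; i < a.size(); ++i) a[i] = static_cast<float>(i);
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          b[nhwcOffset(n, c, h, w, C, H, W)] =
              a[nchwOffset(n, c, h, w, C, H, W)];
  for (int n = 0; n < N; ++n)
    for (int c = 0; c < C; ++c)
      for (int h = 0; h < H; ++h)
        for (int w = 0; w < W; ++w)
          assert(a[nchwOffset(n, c, h, w, C, H, W)] ==
                 b[nhwcOffset(n, c, h, w, C, H, W)]);
}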
diff --git a/paddle/function/SwitchOp.h b/paddle/function/SwitchOp.h
new file mode 100644
index 0000000000..e4c1c3ac92
--- /dev/null
+++ b/paddle/function/SwitchOp.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief  This function switches the dimension order of an image input.
+ *         The input and output are 4D tensors. Switches the order
+ *         'batch_size, channels, height, width' to
+ *         'batch_size, height, width, channels'.
+ *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  argType type of output argument.
+ */
+template <DeviceType Device>
+void NCHW2NHWC(real* outputs,
+               const real* inputs,
+               const int num,
+               const int inC,
+               const int inH,
+               const int inW,
+               const int argType);
+
+/**
+ * \brief  This function switches the dimension order of an image input.
+ *         The input and output are 4D tensors. Switches the order
+ *         'batch_size, height, width, channels' to
+ *         'batch_size, channels, height, width'.
+ *
+ * \param[out] inGrad  gradients of previous layer.
+ * \param[in]  outGrad output gradients.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  argType type of output argument.
+ */
+template <DeviceType Device>
+void NHWC2NCHW(real* inGrad,
+               const real* outGrad,
+               const int num,
+               const int inH,
+               const int inW,
+               const int inC,
+               const int argType);
+}  // namespace paddle
diff --git a/paddle/function/SwitchOpGpu.cu b/paddle/function/SwitchOpGpu.cu
new file mode 100644
index 0000000000..45390a56c3
--- /dev/null
+++ b/paddle/function/SwitchOpGpu.cu
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOp.h"
+#include "hl_base.h"
+
+namespace paddle {
+
+__global__ void KeNCHW2NHWC(real* outputs,
+                            const real* inputs,
+                            int inC,
+                            int inH,
+                            int inW,
+                            int nthreads,
+                            int argType) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % inW;
+    const int h = (idx / inW) % inH;
+    const int c = (idx / inW / inH) % inC;
+    const int n = idx / inW / inH / inC;
+
+    const int off = ((n * inH + h) * inW + w) * inC + c;
+    if (argType == ADD_TO) {
+      outputs[off] += inputs[idx];
+    } else {
+      outputs[off] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void NCHW2NHWC<DEVICE_TYPE_GPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inC,
+                                const int inH,
+                                const int inW,
+                                const int argType) {
+  size_t nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+  KeNCHW2NHWC<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inC, inH, inW, nth, argType);
+  CHECK_SYNC("NCHW2NHWC");
+}
+
+__global__ void KeNHWC2NCHW(real* outputs,
+                            const real* inputs,
+                            int inH,
+                            int inW,
+                            int inC,
+                            int nthreads,
+                            int argType) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int c = idx % inC;
+    const int w = (idx / inC) % inW;
+    const int h = (idx / inC / inW) % inH;
+    const int n = idx / inW / inH / inC;
+
+    const int off = ((n * inC + c) * inH + h) * inW + w;
+    if (argType == ADD_TO) {
+      outputs[off] += inputs[idx];
+    } else {
+      outputs[off] = inputs[idx];
+    }
+  }
+}
+
+template <>
+void NHWC2NCHW<DEVICE_TYPE_GPU>(real* outputs,
+                                const real* inputs,
+                                const int num,
+                                const int inH,
+                                const int inW,
+                                const int inC,
+                                const int argType) {
+  int nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+  KeNHWC2NCHW<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inH, inW, inC, nth, argType);
+  CHECK_SYNC("NHWC2NCHW");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/SwitchOpTest.cpp b/paddle/function/SwitchOpTest.cpp
new file mode 100644
index 0000000000..03b0dd66dd
--- /dev/null
+++ b/paddle/function/SwitchOpTest.cpp
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Switch, real) {
+  for (size_t numSamples : {1, 4, 8, 16}) {
+    for (size_t channels : {1, 4, 8, 16}) {
+      for (size_t imgSizeH : {1, 4, 8, 16}) {
+        for (size_t imgSizeW : {1, 4, 8, 16}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          for (bool test_grad : {true, false}) {
+            CpuGpuFuncCompare compare(test_grad ? "NHWC2NCHW" : "NCHW2NHWC",
+                                      FuncConfig());
+            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+            TensorShape outDims{numSamples, imgSizeH, imgSizeW, channels};
+            compare.addInputs(
+                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
+            compare.addOutputs(BufferArg(
+                VALUE_TYPE_FLOAT, test_grad ? inDims : outDims, ASSIGN_TO));
+            compare.run();
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp
index 45a2e106e7..e55d516d4a 100644
--- a/paddle/function/TensorShapeTest.cpp
+++ b/paddle/function/TensorShapeTest.cpp
@@ -19,35 +19,35 @@ namespace paddle {
 
 TEST(TensorShape, Constructor) {
   TensorShape t1;
-  EXPECT_EQ(t1.ndims(), 0);
-  EXPECT_EQ(t1.getElements(), 0);
+  EXPECT_EQ(t1.ndims(), 0U);
+  EXPECT_EQ(t1.getElements(), 0U);
 
   TensorShape t2(3);
-  EXPECT_EQ(t2.ndims(), 3);
-  EXPECT_EQ(t2.getElements(), 1);
+  EXPECT_EQ(t2.ndims(), 3U);
+  EXPECT_EQ(t2.getElements(), 1U);
 
   TensorShape t3({8, 10});
-  EXPECT_EQ(t3.ndims(), 2);
-  EXPECT_EQ(t3.getElements(), 80);
+  EXPECT_EQ(t3.ndims(), 2U);
+  EXPECT_EQ(t3.getElements(), 80U);
 
   TensorShape t4(t3);
   EXPECT_EQ(t4.ndims(), t3.ndims());
   EXPECT_EQ(t4.getElements(), t3.getElements());
 
   TensorShape t5({1, 2, 3, 4, 5});
-  EXPECT_EQ(t5.ndims(), 5);
-  EXPECT_EQ(t5.getElements(), 120);
+  EXPECT_EQ(t5.ndims(), 5U);
+  EXPECT_EQ(t5.getElements(), 120U);
 }
 
 TEST(TensorShape, GetAndSet) {
   TensorShape t({1, 2, 3});
-  EXPECT_EQ(t.ndims(), 3);
-  EXPECT_EQ(t.getElements(), 6);
+  EXPECT_EQ(t.ndims(), 3U);
+  EXPECT_EQ(t.getElements(), 6U);
 
-  EXPECT_EQ(t[1], 2);
+  EXPECT_EQ(t[1], 2U);
   t.setDim(1, 100);
-  EXPECT_EQ(t.getElements(), 300);
-  EXPECT_EQ(t[1], 100);
+  EXPECT_EQ(t.getElements(), 300U);
+  EXPECT_EQ(t[1], 100U);
 }
 
 }  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
index e50e46f3e9..d1c559a91e 100644
--- a/paddle/function/TensorTypeTest.cpp
+++ b/paddle/function/TensorTypeTest.cpp
@@ -19,9 +19,9 @@ namespace paddle {
 
 TEST(TensorType, Matrix) {
   Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
-  EXPECT_EQ(matrix.getHeight(), 100);
-  EXPECT_EQ(matrix.getWidth(), 200);
-  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.getHeight(), 100U);
+  EXPECT_EQ(matrix.getWidth(), 200U);
+  EXPECT_EQ(matrix.getElementCnt(), 100U * 200U);
   EXPECT_EQ(matrix.useGpu(), false);
 
   Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
@@ -33,15 +33,15 @@ TEST(TensorType, Vector) {
   Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
   EXPECT_EQ(cpuVector.useGpu(), false);
   EXPECT_EQ(gpuVector.useGpu(), true);
-  EXPECT_EQ(cpuVector.getSize(), 100);
-  EXPECT_EQ(gpuVector.getSize(), 100);
+  EXPECT_EQ(cpuVector.getSize(), 100U);
+  EXPECT_EQ(gpuVector.getSize(), 100U);
 
   Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
   Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
   EXPECT_EQ(cpuIVector.useGpu(), false);
   EXPECT_EQ(gpuIVector.useGpu(), true);
-  EXPECT_EQ(cpuIVector.getSize(), 100);
-  EXPECT_EQ(gpuIVector.getSize(), 100);
+  EXPECT_EQ(cpuIVector.getSize(), 100U);
+  EXPECT_EQ(gpuIVector.getSize(), 100U);
 }
 
 TEST(TensorType, EmptyMatrix) {
diff --git a/paddle/function/neon/NeonDepthwiseConv.cpp b/paddle/function/neon/NeonDepthwiseConv.cpp
new file mode 100644
index 0000000000..38aa667061
--- /dev/null
+++ b/paddle/function/neon/NeonDepthwiseConv.cpp
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NeonDepthwiseConv.h"
+#include "paddle/function/ConvOp.h"
+
+namespace paddle {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <DeviceType Device>
+class NeonDepthwiseConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input
+    float* inputPadding = inputData;
+    int padInputHeight = inputHeight + 2 * paddingH();
+    int padInputWidth = inputWidth + 2 * paddingW();
+    if (paddingH() > 0 || paddingW() > 0) {
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      neon::Padding<float>::run(inputData,
+                                inputPadding,
+                                batchSize * inputChannels,
+                                inputHeight,
+                                inputWidth,
+                                padInputHeight,
+                                padInputWidth);
+    }
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3 && strideW() == 1) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 3 && strideW() == 2) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 2>::run;
+    } else if (filterWidth == 4 && strideW() == 1) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else if (filterWidth == 4 && strideW() == 2) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 2>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifndef PADDLE_TYPE_DOUBLE
+REGISTER_TYPED_FUNC(NeonDepthwiseConv, CPU, NeonDepthwiseConvFunction);
+#endif
+
+#endif
+
+}  // namespace paddle
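Because the input is padded up front and the NEON kernels then run a valid (unpadded) convolution over it, the shapes must satisfy the standard convolution relation. A quick sketch of that arithmetic (illustration only, not part of the patch):

// Output extent the kernels assume: a valid convolution over the
// pre-padded input.
inline int convOutputSize(int inSize, int filterSize, int stride, int padding) {
  int padded = inSize + 2 * padding;          // what Padding<float>::run emits
  return (padded - filterSize) / stride + 1;  // valid convolution over it
}
// e.g. inputWidth = 32, filter 3, stride 1, padding 1 -> outputWidth = 32.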
diff --git a/paddle/function/neon/NeonDepthwiseConv.h b/paddle/function/neon/NeonDepthwiseConv.h
new file mode 100644
index 0000000000..98a86d278f
--- /dev/null
+++ b/paddle/function/neon/NeonDepthwiseConv.h
@@ -0,0 +1,627 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string.h>
+#include "neon_util.h"
+
+namespace paddle {
+namespace neon {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <int filterSize, int stride>
+struct DepthwiseConvKernel {};
+
+inline float32_t conv3x3(const float* r0,
+                         const float* r1,
+                         const float* r2,
+                         float32x4_t k0,
+                         float32x4_t k1,
+                         float32x4_t k2) {
+  float32_t tmp[12];
+  vst1q_f32(&(tmp[0]), k0);
+  vst1q_f32(&(tmp[4]), k1);
+  vst1q_f32(&(tmp[8]), k2);
+  float32_t sum0 = r0[0] * tmp[0] + r0[1] * tmp[1] + r0[2] * tmp[2];
+  float32_t sum1 = r1[0] * tmp[4] + r1[1] * tmp[5] + r1[2] * tmp[6];
+  float32_t sum2 = r2[0] * tmp[8] + r2[1] * tmp[9] + r2[2] * tmp[10];
+  return sum0 + sum1 + sum2;
+}
+
+inline float32_t conv4x4(float32x4_t r0,
+                         float32x4_t r1,
+                         float32x4_t r2,
+                         float32x4_t r3,
+                         float32x4_t k0,
+                         float32x4_t k1,
+                         float32x4_t k2,
+                         float32x4_t k3) {
+  float32x4_t tmp;
+  tmp = vmulq_f32(r0, k0);
+  tmp = vmlaq_f32(tmp, r1, k1);
+  tmp = vmlaq_f32(tmp, r2, k2);
+  tmp = vmlaq_f32(tmp, r3, k3);
+  return vaddvq_f32(tmp);
+}
+
+/**
+ * Each step calculates four elements of the output.
+ * First step:
+ *   R0[0, 1, 2, 3...] * K[0][0]
+ *   R0[1, 2, 3, 4...] * K[0][1]
+ *   R0[2, 3, 4, 5...] * K[0][2]
+ *   R1[0, 1, 2, 3...] * K[1][0]
+ *   R1[1, 2, 3, 4...] * K[1][1]
+ *   R1[2, 3, 4, 5...] * K[1][2]
+ *   R2[0, 1, 2, 3...] * K[2][0]
+ *   R2[1, 2, 3, 4...] * K[2][1]
+ * + R2[2, 3, 4, 5...] * K[2][2]
+ * ------------------------------
+ *     Output[0, 1, 2, 3]
+ */
+template <>
+struct DepthwiseConvKernel<3, 1> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 9) {
+      // Load the filters
+      float32x4_t k[3];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 3);
+      k[2] = vld1q_f32(filterData + 6);
+      k[0] = vsetq_lane_f32(0.f, k[0], 3);
+      k[1] = vsetq_lane_f32(0.f, k[1], 3);
+      k[2] = vsetq_lane_f32(0.f, k[2], 3);
+
+      const float* r0 =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      const float* r1 = r0 + inputWidth;
+      const float* r2 = r0 + inputWidth * 2;
+      float32x4_t input[3][3];
+      for (int h = 0; h < outputHeight; h++) {
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t tmp;
+          input[0][0] = vld1q_f32(r0);
+          tmp = vld1q_f32(r0 + 4);
+          input[0][1] = vextq_f32(input[0][0], tmp, 1);
+          input[0][2] = vextq_f32(input[0][0], tmp, 2);
+          input[1][0] = vld1q_f32(r1);
+          tmp = vld1q_f32(r1 + 4);
+          input[1][1] = vextq_f32(input[1][0], tmp, 1);
+          input[1][2] = vextq_f32(input[1][0], tmp, 2);
+          input[2][0] = vld1q_f32(r2);
+          tmp = vld1q_f32(r2 + 4);
+          input[2][1] = vextq_f32(input[2][0], tmp, 1);
+          input[2][2] = vextq_f32(input[2][0], tmp, 2);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 4;
+          r1 += 4;
+          r2 += 4;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
+          r0++;
+          r1++;
+          r2++;
+          outputData++;
+        }
+
+        r0 += 2;
+        r1 += 2;
+        r2 += 2;
+      }
+    }
+  }
+};
+
+/**
+ * Each step calculates four elements of the output.
+ * First step:
+ *   R0[0, 2, 4, 6...] * K[0][0]
+ *   R0[1, 3, 5, 7...] * K[0][1]
+ *   R0[2, 4, 6, 8...] * K[0][2]
+ *   R1[0, 2, 4, 6...] * K[1][0]
+ *   R1[1, 3, 5, 7...] * K[1][1]
+ *   R1[2, 4, 6, 8...] * K[1][2]
+ *   R2[0, 2, 4, 6...] * K[2][0]
+ *   R2[1, 3, 5, 7...] * K[2][1]
+ *   R2[2, 4, 6, 8...] * K[2][2]
+ * ------------------------------
+ *     Output[0, 1, 2, 3]
+ */
+template <>
+struct DepthwiseConvKernel<3, 2> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 9) {
+      // Load the filters
+      float32x4_t k[3];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 3);
+      k[2] = vld1q_f32(filterData + 6);
+      k[0] = vsetq_lane_f32(0.f, k[0], 3);
+      k[1] = vsetq_lane_f32(0.f, k[1], 3);
+      k[2] = vsetq_lane_f32(0.f, k[2], 3);
+
+      const float* start =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      float32x4_t input[3][3];
+      for (int h = 0; h < outputHeight; h++) {
+        const float* r0 = start + 2 * h * inputWidth;
+        const float* r1 = start + (2 * h + 1) * inputWidth;
+        const float* r2 = start + (2 * h + 2) * inputWidth;
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t data1;
+          float32x4x2_t data2;
+
+          data2 = vld2q_f32(r0);
+          input[0][0] = data2.val[0];
+          input[0][1] = data2.val[1];
+          data1 = vld1q_f32(r0 + 8);
+          input[0][2] = vextq_f32(data2.val[0], data1, 1);
+
+          data2 = vld2q_f32(r1);
+          input[1][0] = data2.val[0];
+          input[1][1] = data2.val[1];
+          data1 = vld1q_f32(r1 + 8);
+          input[1][2] = vextq_f32(data2.val[0], data1, 1);
+
+          data2 = vld2q_f32(r2);
+          input[2][0] = data2.val[0];
+          input[2][1] = data2.val[1];
+          data1 = vld1q_f32(r2 + 8);
+          input[2][2] = vextq_f32(data2.val[0], data1, 1);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][0], k[1], 0);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][1], k[1], 1);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][2], k[1], 2);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          *outputData = conv3x3(r0, r1, r2, k[0], k[1], k[2]);
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          outputData++;
+        }
+      }
+    }
+  }
+};
+
+/**
+ * Each step calculates four elements of the output.
+ */
+template <>
+struct DepthwiseConvKernel<4, 1> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 16) {
+      // Load the filters
+      float32x4_t k[4];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 4);
+      k[2] = vld1q_f32(filterData + 8);
+      k[3] = vld1q_f32(filterData + 12);
+
+      const float* r0 =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      const float* r1 = r0 + inputWidth;
+      const float* r2 = r0 + inputWidth * 2;
+      const float* r3 = r0 + inputWidth * 3;
+      float32x4_t input[4][4];
+      for (int h = 0; h < outputHeight; h++) {
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4_t tmp;
+          input[0][0] = vld1q_f32(r0);
+          tmp = vld1q_f32(r0 + 4);
+          input[0][1] = vextq_f32(input[0][0], tmp, 1);
+          input[0][2] = vextq_f32(input[0][0], tmp, 2);
+          input[0][3] = vextq_f32(input[0][0], tmp, 3);
+
+          input[1][0] = vld1q_f32(r1);
+          tmp = vld1q_f32(r1 + 4);
+          input[1][1] = vextq_f32(input[1][0], tmp, 1);
+          input[1][2] = vextq_f32(input[1][0], tmp, 2);
+          input[1][3] = vextq_f32(input[1][0], tmp, 3);
+
+          input[2][0] = vld1q_f32(r2);
+          tmp = vld1q_f32(r2 + 4);
+          input[2][1] = vextq_f32(input[2][0], tmp, 1);
+          input[2][2] = vextq_f32(input[2][0], tmp, 2);
+          input[2][3] = vextq_f32(input[2][0], tmp, 3);
+
+          input[3][0] = vld1q_f32(r3);
+          tmp = vld1q_f32(r3 + 4);
+          input[3][1] = vextq_f32(input[3][0], tmp, 1);
+          input[3][2] = vextq_f32(input[3][0], tmp, 2);
+          input[3][3] = vextq_f32(input[3][0], tmp, 3);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 4;
+          r1 += 4;
+          r2 += 4;
+          r3 += 4;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          float32x4_t i3 = vld1q_f32(r3);
+          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
+          r0++;
+          r1++;
+          r2++;
+          r3++;
+          outputData++;
+        }
+
+        r0 += 3;
+        r1 += 3;
+        r2 += 3;
+        r3 += 3;
+      }
+    }
+  }
+};
+
+/**
+ * Each step calculates four elements of the output.
+ */
+template <>
+struct DepthwiseConvKernel<4, 2> {
+  static void run(const float* inputData,
+                  const float* filterData,
+                  int inputHeight,
+                  int inputWidth,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int filterMultiplier,
+                  float* outputData) {
+    const int steps = outputWidth >> 2;
+    const int remain = outputWidth & 3;
+    for (int c = 0; c < outputChannels; c++, filterData += 16) {
+      // Load the filters
+      float32x4_t k[4];
+      k[0] = vld1q_f32(filterData);
+      k[1] = vld1q_f32(filterData + 4);
+      k[2] = vld1q_f32(filterData + 8);
+      k[3] = vld1q_f32(filterData + 12);
+
+      const float* start =
+          inputData + (c / filterMultiplier) * (inputHeight * inputWidth);
+      float32x4_t input[4][4];
+      for (int h = 0; h < outputHeight; h++) {
+        const float* r0 = start + 2 * h * inputWidth;
+        const float* r1 = start + (2 * h + 1) * inputWidth;
+        const float* r2 = start + (2 * h + 2) * inputWidth;
+        const float* r3 = start + (2 * h + 3) * inputWidth;
+        for (int s = 0; s < steps; s++) {
+          // Load the inputs
+          float32x4x2_t data1;
+          float32x4x2_t data2;
+
+          data1 = vld2q_f32(r0);
+          data2 = vld2q_f32(r0 + 8);
+          input[0][0] = data1.val[0];
+          input[0][1] = data1.val[1];
+          input[0][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[0][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r1);
+          data2 = vld2q_f32(r1 + 8);
+          input[1][0] = data1.val[0];
+          input[1][1] = data1.val[1];
+          input[1][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[1][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r2);
+          data2 = vld2q_f32(r2 + 8);
+          input[2][0] = data1.val[0];
+          input[2][1] = data1.val[1];
+          input[2][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[2][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          data1 = vld2q_f32(r3);
+          data2 = vld2q_f32(r3 + 8);
+          input[3][0] = data1.val[0];
+          input[3][1] = data1.val[1];
+          input[3][2] = vextq_f32(data1.val[0], data2.val[0], 1);
+          input[3][3] = vextq_f32(data1.val[1], data2.val[1], 1);
+
+          float32x4_t tmp1 = vdupq_n_f32(0.f);
+          float32x4_t tmp2 = vdupq_n_f32(0.f);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][0], k[0], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][1], k[0], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[0][2], k[0], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[0][3], k[0], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][0], k[1], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][1], k[1], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[1][2], k[1], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[1][3], k[1], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][0], k[2], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][1], k[2], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[2][2], k[2], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[2][3], k[2], 3);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][0], k[3], 0);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][1], k[3], 1);
+          tmp1 = vmlaq_laneq_f32(tmp1, input[3][2], k[3], 2);
+          tmp2 = vmlaq_laneq_f32(tmp2, input[3][3], k[3], 3);
+          tmp1 = vaddq_f32(tmp1, tmp2);
+
+          vst1q_f32(outputData, tmp1);
+          r0 += 8;
+          r1 += 8;
+          r2 += 8;
+          r3 += 8;
+          outputData += 4;
+        }
+
+        for (int r = 0; r < remain; r++) {
+          float32x4_t i0 = vld1q_f32(r0);
+          float32x4_t i1 = vld1q_f32(r1);
+          float32x4_t i2 = vld1q_f32(r2);
+          float32x4_t i3 = vld1q_f32(r3);
+          *outputData = conv4x4(i0, i1, i2, i3, k[0], k[1], k[2], k[3]);
+          r0 += 2;
+          r1 += 2;
+          r2 += 2;
+          r3 += 2;
+          outputData++;
+        }
+      }
+    }
+  }
+};
+
+template <class T>
+struct Padding {
+  static void run(const T* input,
+                  T* inputPadding,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int padInputHeight,
+                  int padInputWidth) {
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = T(0);
+        }
+
+        memcpy(inputPadding, input, inputWidth * sizeof(T));
+        inputPadding += inputWidth;
+        input += inputWidth;
+
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = T(0);
+        }
+      }
+
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(T));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+    }
+  }
+};
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+struct Padding<float> {
+  static void run(const float* input,
+                  float* inputPadding,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int padInputHeight,
+                  int padInputWidth) {
+    const int paddingHeight = (padInputHeight - inputHeight) / 2;
+    const int paddingWidth = (padInputWidth - inputWidth) / 2;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = float(0);
+        }
+
+        int step = inputWidth >> 2;
+        int remain = inputWidth & 3;
+        for (int s = 0; s < step; s++) {
+          float32x4_t s0 = vld1q_f32(input);
+          vst1q_f32(inputPadding, s0);
+          input += 4;
+          inputPadding += 4;
+        }
+        for (int r = 0; r < remain; r++) {
+          *inputPadding++ = *input++;
+        }
+
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = float(0);
+        }
+      }
+
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+    }
+  }
+};
+
+// Padding for stride 2: interleave a zero after each input element,
+// in addition to the border padding.
+struct StridePadding {
+  static void run(const float* input,
+                  float* inputPadding,
+                  int channels,
+                  int inputHeight,
+                  int inputWidth,
+                  int padInputHeight,
+                  int padInputWidth) {
+    const int paddingHeight = (padInputHeight - (inputHeight * 2 - 1)) / 2;
+    const int paddingWidth = (padInputWidth - (inputWidth * 2 - 1)) / 2;
+    for (int c = 0; c < channels; c++) {
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+
+      for (int i = 0; i < inputHeight; i++) {
+        // padding head
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = float(0);
+        }
+
+        int step = inputWidth >> 2;
+        int remain = inputWidth & 3;
+        float32x4_t s1 = vdupq_n_f32(0.f);
+        for (int s = 0; s < step; s++) {
+          float32x4_t s0 = vld1q_f32(input);
+          float32x4x2_t v = {{s0, s1}};
+          vst2q_f32(inputPadding, v);
+          input += 4;
+          inputPadding += 8;
+        }
+        for (int r = 0; r < remain; r++) {
+          *inputPadding++ = *input++;
+          *inputPadding++ = float(0);
+        }
+        inputPadding--;
+
+        // padding tail
+        for (int j = 0; j < paddingWidth; j++) {
+          *inputPadding++ = float(0);
+        }
+        if (i != inputHeight - 1) {
+          memset(inputPadding, 0, padInputWidth * sizeof(float));
+          inputPadding += padInputWidth;
+        }
+      }
+
+      if (paddingHeight > 0) {
+        memset(inputPadding, 0, padInputWidth * paddingHeight * sizeof(float));
+        inputPadding += padInputWidth * paddingHeight;
+      }
+    }
+  }
+};
+
+#endif
+
+#endif
+
+}  // namespace neon
+}  // namespace paddle
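All of the stride-1 inner loops above use the same sliding-window idiom: load two adjacent 4-lane registers and synthesize the shifted views with `vextq_f32` rather than issuing three overlapping loads. A standalone sketch (illustration only, not part of the patch; assumes an ARM NEON target):

#include <arm_neon.h>

// r points at 8 consecutive floats; two loads yield the three
// overlapping windows r[0..3], r[1..4] and r[2..5] consumed by the
// 3x3 stride-1 kernel.
inline void loadShifted3(const float* r, float32x4_t out[3]) {
  float32x4_t lo = vld1q_f32(r);      // r[0..3]
  float32x4_t hi = vld1q_f32(r + 4);  // r[4..7]
  out[0] = lo;
  out[1] = vextq_f32(lo, hi, 1);      // r[1..4]
  out[2] = vextq_f32(lo, hi, 2);      // r[2..5]
}

The stride-2 kernels use `vld2q_f32` instead, which deinterleaves even- and odd-indexed elements in a single load.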
diff --git a/paddle/function/neon/NeonDepthwiseConvTranspose.cpp b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
new file mode 100644
index 0000000000..49ca4bc8a0
--- /dev/null
+++ b/paddle/function/neon/NeonDepthwiseConvTranspose.cpp
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NeonDepthwiseConv.h"
+#include "paddle/function/ConvOp.h"
+
+namespace paddle {
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+template <DeviceType Device>
+class NeonDepthwiseConvTransposeFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    int batchSize = input[0];
+    int inputChannels = input[1];
+    int inputHeight = input[2];
+    int inputWidth = input[3];
+    int filterHeight = getFilterHeight(filter);
+    int filterWidth = getFilterWidth(filter);
+    int outputChannels = output[1];
+    int outputHeight = output[2];
+    int outputWidth = output[3];
+    int filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(static_cast<size_t>(inputChannels), groups_);
+
+    // only support strideH() == strideW() and filterHeight == filterWidth.
+    CHECK_EQ(strideH(), strideW());
+    CHECK_EQ(paddingH(), paddingW());
+    CHECK_EQ(filterHeight, filterWidth);
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    // padding the input, input -> inputPadding
+    float* inputPadding = inputData;
+    int padInputHeight =
+        (inputHeight - 1) * strideH() + 2 * filterHeight - 1 - 2 * paddingH();
+    int padInputWidth =
+        (inputWidth - 1) * strideW() + 2 * filterWidth - 1 - 2 * paddingW();
+
+    if (padInputHeight > inputHeight || padInputWidth > inputWidth) {
+      int newSize = batchSize * inputChannels * padInputHeight * padInputWidth;
+      resizeBuffer<Device>(newSize);
+      inputPadding = reinterpret_cast<float*>(memory_->getBuf());
+      if (strideH() == 1) {
+        neon::Padding<float>::run(inputData,
+                                  inputPadding,
+                                  batchSize * inputChannels,
+                                  inputHeight,
+                                  inputWidth,
+                                  padInputHeight,
+                                  padInputWidth);
+      } else if (strideH() == 2) {
+        neon::StridePadding::run(inputData,
+                                 inputPadding,
+                                 batchSize * inputChannels,
+                                 inputHeight,
+                                 inputWidth,
+                                 padInputHeight,
+                                 padInputWidth);
+      } else {
+        LOG(FATAL) << "Not supported";
+      }
+    }
+
+    std::function<void(
+        const float*, const float*, int, int, int, int, int, int, float*)>
+        DepthWiseConv;
+
+    if (filterWidth == 3) {
+      DepthWiseConv = neon::DepthwiseConvKernel<3, 1>::run;
+    } else if (filterWidth == 4) {
+      DepthWiseConv = neon::DepthwiseConvKernel<4, 1>::run;
+    } else {
+      LOG(FATAL) << "Not supported";
+    }
+
+    for (int i = 0; i < batchSize; i++) {
+      DepthWiseConv(inputPadding,
+                    filterData,
+                    padInputHeight,
+                    padInputWidth,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    filterMultiplier,
+                    outputData);
+      inputPadding += inputChannels * padInputHeight * padInputWidth;
+      outputData += outputChannels * outputHeight * outputWidth;
+    }
+  }
+};
+
+#ifndef PADDLE_TYPE_DOUBLE
+
+REGISTER_TYPED_FUNC(NeonDepthwiseConvTranspose,
+                    CPU,
+                    NeonDepthwiseConvTransposeFunction);
+
+#endif
+
+#endif
+
+}  // namespace paddle
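The transpose is implemented as a stride-1 valid convolution over a zero-stuffed, border-padded input: `StridePadding` inserts a zero between adjacent elements for stride 2, and the border brings the extent to `(in - 1) * stride + 2 * filter - 1 - 2 * pad`. A sketch of the resulting shape arithmetic (illustration only, not part of the patch):

inline int padInputSize(int in, int filter, int stride, int pad) {
  return (in - 1) * stride + 2 * filter - 1 - 2 * pad;
}
// A valid stride-1 convolution over the padded input,
// padInputSize(...) - filter + 1, gives the usual
// transposed-convolution output size:
inline int transposeOutputSize(int in, int filter, int stride, int pad) {
  return (in - 1) * stride + filter - 2 * pad;
}
// e.g. in = 16, filter 4, stride 2, pad 1 -> padded 35, output 32.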
diff --git a/paddle/function/neon/neon_util.h b/paddle/function/neon/neon_util.h
new file mode 100644
index 0000000000..e2db045067
--- /dev/null
+++ b/paddle/function/neon/neon_util.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include <arm_neon.h>
+
+namespace paddle {
+
+namespace neon {
+
+inline float32x4_t vld1q_f32_aligned(const float* p) {
+  return vld1q_f32(
+      (const float*)__builtin_assume_aligned(p, sizeof(float32x4_t)));
+}
+
+#ifndef __aarch64__
+inline float32_t vaddvq_f32(float32x4_t a) {
+  float32x2_t v = vadd_f32(vget_high_f32(a), vget_low_f32(a));
+  return vget_lane_f32(vpadd_f32(v, v), 0);
+}
+
+#define vmlaq_laneq_f32(a, b, v, lane) \
+  vmlaq_n_f32(a, b, vgetq_lane_f32(v, lane))
+#endif
+
+}  // namespace neon
+}  // namespace paddle
+
+#endif
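On 32-bit ARM, `vaddvq_f32` and `vmlaq_laneq_f32` are not available as intrinsics, so the header supplies equivalents. Their scalar semantics, for reference (illustration only, not part of the patch):

// vaddvq_f32: horizontal sum of the four lanes of a vector.
inline float addvRef(const float a[4]) { return a[0] + a[1] + a[2] + a[3]; }

// vmlaq_laneq_f32(acc, b, v, lane): per-lane multiply-accumulate with a
// broadcast scalar, acc[i] += b[i] * v[lane].
inline void mlaLaneRef(float acc[4], const float b[4], const float v[4],
                       int lane) {
  for (int i = 0; i < 4; ++i) acc[i] += b[i] * v[lane];
}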
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
new file mode 100644
index 0000000000..6ccc487cf1
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -0,0 +1,247 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "nnpack.h"
+#include "paddle/function/ConvOp.h"
+
+DEFINE_bool(nnpack_allocate_outside,
+            true,
+            "Allocate and free workspace memory outside the NNPACK interface.");
+DEFINE_int32(nnpack_num_threads,
+             0,
+             "The number of nnpack threads. "
+             "Default: 0, which disables the threadpool.");
+
+namespace paddle {
+
+nnp_convolution_algorithm get_nnp_convolution_algorithm(
+    const std::string& algorithm) {
+  if (algorithm == "auto") {
+    return nnp_convolution_algorithm_auto;
+  } else if (algorithm == "ft8x8") {
+    return nnp_convolution_algorithm_ft8x8;
+  } else if (algorithm == "ft16x16") {
+    return nnp_convolution_algorithm_ft16x16;
+  } else if (algorithm == "wt8x8") {
+    return nnp_convolution_algorithm_wt8x8;
+  } else if (algorithm == "implicit-gemm") {
+    return nnp_convolution_algorithm_implicit_gemm;
+  } else if (algorithm == "direct") {
+    return nnp_convolution_algorithm_direct;
+  } else {
+    return nnp_convolution_algorithm_auto;
+  }
+}
+
+template <DeviceType Device>
+class NNPACKConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
+    transform_strategy_ = nnp_convolution_transform_strategy_compute;
+    nnp_status status = nnp_initialize();
+    CHECK_EQ(status, nnp_status_success);
+    workspaceBuffer_ = nullptr;
+    workspaceSize_ = 0;
+
+    create_nnpack_threadpool();
+  }
+
+  ~NNPACKConvFunction() {
+    if (workspaceBuffer_) {
+      free(workspaceBuffer_);
+    }
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+
+    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
+    nnp_padding padding = {.top = (size_t)paddingH(),
+                           .right = (size_t)paddingW(),
+                           .bottom = (size_t)paddingH(),
+                           .left = (size_t)paddingW()};
+    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
+    nnp_size outputSubsampling = {.width = (size_t)strideW(),
+                                  .height = (size_t)strideH()};
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    void* bufferPtr = nullptr;
+    size_t* sizePtr = nullptr;
+    size_t needSize;
+    if (FLAGS_nnpack_allocate_outside) {
+      if (batchSize == 1) {
+        nnp_status status = nnp_convolution_inference(algorithm_,
+                                                      transform_strategy_,
+                                                      inputChannels,
+                                                      outputChannels,
+                                                      inputSize,
+                                                      padding,
+                                                      kernelSize,
+                                                      outputSubsampling,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      &needSize,
+                                                      nnp_activation_identity,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      } else {
+        // only supports stride = 1
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
+        nnp_status status = nnp_convolution_output(algorithm_,
+                                                   batchSize,
+                                                   inputChannels,
+                                                   outputChannels,
+                                                   inputSize,
+                                                   padding,
+                                                   kernelSize,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   &needSize,
+                                                   nnp_activation_identity,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
+
+      VLOG(3) << "workspace size is " << needSize;
+      if (needSize > workspaceSize_) {
+        workspaceSize_ = needSize;
+        if (workspaceBuffer_) {
+          free(workspaceBuffer_);
+          workspaceBuffer_ = nullptr;
+        }
+        // always reallocate so the workspace matches the new size.
+        CHECK_EQ(posix_memalign(&workspaceBuffer_, 64, needSize), 0);
+      }
+
+      if (needSize) {
+        bufferPtr = workspaceBuffer_;
+        sizePtr = &needSize;
+      }
+    }
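+
+    // Summary of the block above: NNPACK is first queried with null data
+    // pointers to report the required workspace size in needSize; the
+    // workspace is then (re)allocated 64-byte aligned and handed to the
+    // real convolution calls below via bufferPtr/sizePtr.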
+
+    size_t inputOffset = inputChannels / groups_ * inputHeight * inputWidth;
+    size_t outputOffset = outputChannels / groups_ * outputHeight * outputWidth;
+    size_t filterOffset = filter.getElements() / groups_;
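+    // Worked example (illustrative numbers only): with inputChannels = 4,
+    // groups_ = 2 and a 3x3 input, inputOffset = 4 / 2 * 3 * 3 = 18, so
+    // group g reads from inputData + 18 * g.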
+
+    if (batchSize == 1) {
+      for (size_t g = 0; g < groups_; g++) {
+        nnp_status status =
+            nnp_convolution_inference(algorithm_,
+                                      transform_strategy_,
+                                      inputChannels / groups_,
+                                      outputChannels / groups_,
+                                      inputSize,
+                                      padding,
+                                      kernelSize,
+                                      outputSubsampling,
+                                      inputData + inputOffset * g,
+                                      filterData + filterOffset * g,
+                                      nullptr, /* bias */
+                                      outputData + outputOffset * g,
+                                      bufferPtr,
+                                      sizePtr,
+                                      nnp_activation_identity,
+                                      nullptr,
+                                      threadpool_, /* threadpool */
+                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
+    } else {
+      // only supports stride = 1
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
+
+      // TODO(hedaoyuan): There is a bug when batchSize > 1 and groups_ > 1.
+      CHECK_EQ(groups_, static_cast<size_t>(1));
+      nnp_status status = nnp_convolution_output(algorithm_,
+                                                 batchSize,
+                                                 inputChannels,
+                                                 outputChannels,
+                                                 inputSize,
+                                                 padding,
+                                                 kernelSize,
+                                                 inputData,
+                                                 filterData,
+                                                 nullptr, /* bias */
+                                                 outputData,
+                                                 bufferPtr,
+                                                 sizePtr,
+                                                 nnp_activation_identity,
+                                                 nullptr,
+                                                 threadpool_, /* threadpool */
+                                                 nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    }
+  }
+
+  static void create_nnpack_threadpool() {
+    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
+private:
+  nnp_convolution_algorithm algorithm_;
+  nnp_convolution_transform_strategy transform_strategy_;
+  void* workspaceBuffer_;
+  size_t workspaceSize_;
+  static pthreadpool_t threadpool_;
+};
+
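+// The static threadpool is created lazily by create_nnpack_threadpool()
+// and shared by all instances of the same DeviceType.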
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
+
+REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
+
+}  // namespace paddle
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
new file mode 100644
index 0000000000..4dd3982487
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/function/ConvOpTest.h"
+
+namespace paddle {
+
+TEST(NNPACK, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}
+
+TEST(NNPACK, Depthwise) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 93a6a99848..3d6ced713f 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -23,6 +23,27 @@ endmacro()
 
 filter_test(GSERVER_HEADER)
 filter_test(GSERVER_SOURCES)
+
+if(NOT WITH_MKLDNN)
+    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
+    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
+    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
+else()
+    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
+endif()
+
+if(NOT WITH_MKLML)
+    file(GLOB_RECURSE MKL_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.h")
+    file(GLOB_RECURSE MKL_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLPacked*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${MKL_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${MKL_SOURCES})
+    message(STATUS "Skip compiling with MKLPackedLayers")
+else()
+    message(STATUS "Compile with MKLPackedLayers")
+endif()
+
 if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
         layers/CudnnConvBaseLayer.h
@@ -49,6 +70,75 @@ if(NOT WITH_PYTHON)
             dataproviders/PyDataProvider.h)
 endif()
 
+if(MOBILE_INFERENCE)
+    # Remove evaluators
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/ValidationLayer.cpp
+         evaluators/Evaluator.cpp
+         evaluators/DetectionMAPEvaluator.cpp
+         evaluators/CTCErrorEvaluator.cpp
+         evaluators/ChunkEvaluator.cpp)
+
+    # Remove dataproviders
+    list(REMOVE_ITEM GSERVER_SOURCES
+         dataproviders/DataProvider.cpp
+         dataproviders/MultiDataProvider.cpp
+         dataproviders/PyDataProvider2.cpp
+         dataproviders/PyDataProvider.cpp)
+
+    # Remove gradient machines that are not needed for inference
+    list(REMOVE_ITEM GSERVER_SOURCES
+         gradientmachines/MultiNetwork.cpp
+         gradientmachines/RecurrentGradientMachine.cpp
+         gradientmachines/ParallelNeuralNetwork.cpp
+         gradientmachines/GradientMachineMode.cpp
+         gradientmachines/MultiGradientMachine.cpp)
+
+    # Remove layers that are only used in training
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/RecurrentLayerGroup.cpp
+         layers/CostLayer.cpp
+         layers/MultiBoxLossLayer.cpp
+         layers/WarpCTCLayer.cpp
+         layers/CTCLayer.cpp
+         layers/LinearChainCTC.cpp
+         layers/PrintLayer.cpp)
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/OuterProdLayer.cpp
+         layers/SumToOneNormLayer.cpp
+         layers/ConvShiftLayer.cpp
+         layers/InterpolationLayer.cpp
+         layers/AgentLayer.cpp
+         layers/DotMulOperator.cpp
+         layers/GruStepLayer.cpp
+         layers/LstmStepLayer.cpp
+         layers/ConvexCombinationLayer.cpp
+         layers/Conv3DLayer.cpp
+         layers/DeConv3DLayer.cpp
+         layers/CropLayer.cpp
+         layers/CrossEntropyOverBeam.cpp
+         layers/DataNormLayer.cpp
+         layers/FeatureMapExpandLayer.cpp
+         layers/HierarchicalSigmoidLayer.cpp
+         layers/MultinomialSampler.cpp
+         layers/NCELayer.cpp
+         layers/KmaxSeqScoreLayer.cpp
+         layers/MDLstmLayer.cpp
+         layers/MultiplexLayer.cpp
+         layers/PadLayer.cpp
+         layers/Pool3DLayer.cpp
+         layers/ResizeLayer.cpp
+         layers/RotateLayer.cpp
+         layers/RowConvLayer.cpp
+         layers/RowL2NormLayer.cpp
+         layers/SamplingIdLayer.cpp
+         layers/ScaleShiftLayer.cpp
+         layers/SelectiveFullyConnectedLayer.cpp
+         layers/SpatialPyramidPoolLayer.cpp
+         layers/BilinearInterpLayer.cpp
+         layers/ClipLayer.cpp)
+endif()
+
 if(WITH_GPU)
     cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
 else()
@@ -58,7 +148,7 @@ endif()
 
 add_style_check_target(paddle_gserver ${GSERVER_SOURCES})
 add_style_check_target(paddle_gserver ${GSERVER_HEADER})
-add_dependencies(paddle_gserver gen_proto_cpp)
+add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index a40530f413..57c890e488 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -22,9 +22,12 @@ limitations under the License. */
 #include <type_traits>
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
-
 #include "paddle/utils/Logging.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "MKLDNNActivation.h"
+#endif
+
 namespace paddle {
 
 static ClassRegistrar<ActivationFunction> gActivationRegistrar;
@@ -112,7 +115,6 @@ BEGIN_DEFINE_ACTIVATION(softmax)
 private:
 MatrixPtr sftMaxSum_;
 MatrixPtr sftMaxDot_;
-MatrixPtr one_;
 
 public:
 Error __must_check forward(Argument& act) {
@@ -138,14 +140,6 @@ Error __must_check backward(Argument& act) {
                            1,
                            /* trans */ false,
                            useGpu(act.deviceId));
-    if (!one_ || one_->getWidth() != outputG->getWidth()) {
-      Matrix::resizeOrCreate(one_,
-                             1,
-                             outputG->getWidth(),
-                             /* trans */ false,
-                             useGpu(act.deviceId));
-      one_->one();
-    }
 
     sftMaxDot_->dotMul(*outputG, *outputV);
     sftMaxSum_->colMerge(*sftMaxDot_);
@@ -186,7 +180,10 @@ Error __must_check forward(Argument& act) {
                                     useGpu(act.deviceId));
   }
 
-  auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
+  auto starts =
+      act.hasSubseq()
+          ? act.subSequenceStartPositions->getVector(useGpu(act.deviceId))
+          : act.sequenceStartPositions->getVector(useGpu(act.deviceId));
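+  // with nested sequences, softmax is normalized within each sub-sequence
+  // rather than across the whole sequence.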
   act.value->sequenceSoftmax(*act.value, *starts);
   return Error();
 }
@@ -197,8 +194,9 @@ Error __must_check backward(Argument& act) {
         "Input width for each timestep of sequence softmax should be 1");
   }
 
-  size_t numSequences = act.getNumSequences();
-  const int* starts = act.sequenceStartPositions->getData(false);
+  size_t numSequences =
+      act.hasSubseq() ? act.getNumSubSequences() : act.getNumSequences();
+  const int* starts = act.getCpuStartPositions();
 
   for (size_t i = 0; i < numSequences; ++i) {
     // TODO(Dangqingqing) optimization for GPU
@@ -207,13 +205,44 @@ Error __must_check backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    Error status = softmax_.backward(argument_);
-    if (!status) return status;
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
   }
   return Error();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
+/**
+ * @brief SoftSign Activation.
+ * \f[
+ * f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
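+
+/*
+ * Derivation note: since f(z) = z / (1 + |z|), f'(z) = 1 / (1 + |z|)^2 for
+ * all z. backward() therefore squares the saved denominator (1 + |z|),
+ * takes its elementwise reciprocal via scalarDiv(..., 1.), and multiplies
+ * the result into the gradient.
+ */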
+
 /**
  * @brief Relu Activation.
  * forward. y = max(0, z)
@@ -461,6 +490,12 @@ Error __must_check backward(Argument& act) {
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
+#ifdef PADDLE_WITH_MKLDNN
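+  // activation types prefixed with "mkldnn_" are created by the MKLDNN
+  // registrar instead of the default one.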
+  if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) {
+    return MKLDNNActivation::create(type);
+  }
+#endif
+
   return gActivationRegistrar.createByType(type);
 }
 
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
new file mode 100644
index 0000000000..f3ccd68160
--- /dev/null
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -0,0 +1,249 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNActivation.h"
+#include "mkldnn.hpp"
+#include "paddle/utils/ClassRegistrar.h"
+
+namespace paddle {
+
+static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
+/**
+ * @def MKLDNN_ACTIVATION_CLASS_NAME
+ * @note MKLDNN_ACTIVATION_CLASS_NAME(relu) relu_;
+ * means mkldnn_reluActivation relu_;
+ */
+#define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
+
+/**
+ * @def BEGIN_MKLDNN_ACTIVATION
+ */
+#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
+/**
+ * @def END_MKLDNN_ACTIVATION
+ */
+#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
+private:                                                           \
+  static const std::string name;                                   \
+                                                                   \
+public:                                                            \
+  const std::string& getName() const { return name; }              \
+  }                                                                \
+  ;                                                                \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
+      "mkldnn_" #ACT_TYPE;                                         \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
+    gMKLDNNActivationRegistrar                                     \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
+            "mkldnn_" #ACT_TYPE);                                  \
+  });
+
+/**
+ * @def DEFINE_MKLDNN_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)
+
+/**
+ * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
+ */
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
+    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
+private:                                                             \
+  static const float alpha;                                          \
+  static const float bwdAlpha;                                       \
+                                                                     \
+public:                                                              \
+  float getAlpha() const { return alpha; }                           \
+  float getBwdAlpha() const { return bwdAlpha; }                     \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
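+
+/**
+ * Expansion sketch (illustration only, not generated verbatim):
+ * DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
+ * roughly yields
+ *   class mkldnn_reluActivation : public MKLDNNEltwiseActivation {
+ *     float getAlpha() const { return alpha; }        // alpha = -0.f
+ *     float getBwdAlpha() const { return bwdAlpha; }  // bwdAlpha = 0.f
+ *     const std::string& getName() const { return name; }  // "mkldnn_relu"
+ *   };
+ * plus an InitFunction that registers the class as "mkldnn_relu".
+ */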
+
+/**
+ * @brief MKLDNN Relu Activation.
+ * mkldnn_relu is actually a leaky ReLU:
+ *  f(x) = x                   (x >= 0)
+ *  f(x) = negative_slope * x  (x <  0)
+ * @note negative_slope should be -0.f in the forward pass.
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
+
+/**
+ * @brief MKLDNN Tanh Activation.
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+/**
+ * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
+ *  f(x) = x                              (x >= 0)
+ *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
+ */
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
+  const std::map<std::string, mkldnn::algorithm> algoMap = {
+      {"relu", algorithm::eltwise_relu},
+      {"tanh", algorithm::eltwise_tanh},
+      {"elu", algorithm::eltwise_elu}};
+  type.erase(0, 7);  // strip the "mkldnn_" prefix
+  algorithm algo = (algorithm)0;
+  mapGet(type, algoMap, &algo);
+  return algo;
+}
+
+void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  // note: alpha represents the NegativeSlope when used in relu.
+  float alpha = getAlpha();
+  float beta = getBeta();
+  algorithm algo = getAlgo(this->getName());
+  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                   algo,
+                                   val_->getMemoryDesc(),
+                                   alpha,
+                                   beta);
+  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
+  // use inplace for forward but save input value before submit
+  inVal_ = val_;
+  copyInVal_ = nullptr;
+  if (act.grad && algo == algorithm::eltwise_tanh) {
+    // tanh needs to save the src input for backward
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
+    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+    CHECK(copyInVal_) << "should not be empty";
+    pipelineFwd_.push_back(*copyInVal_);
+  }
+  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+  needResetBwd_ = true;
+}
+
+void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
+  if (!needResetBwd_) {
+    return;
+  }
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+  needResetBwd_ = false;
+  algorithm algo = getAlgo(this->getName());
+  float alpha = getBwdAlpha();
+  float beta = getBeta();
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
+  auto eng = CPUEngine::Instance().getEngine();
+  auto bwdDesc = eltwise_bwd::desc(
+      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+  CHECK(inVal_);
+  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwd_);
+}
+
+/**
+ * @brief MKLDNN Softmax Activation
+ */
+DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
+
+void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);
+  int axis = 1;
+  auto fwdDesc = softmax_fwd::desc(
+      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
+  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
+  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+}
+
+Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  real* v = act.value->getData();
+  real threshold = exp(-64);
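+  // clamp outputs to at least exp(-64) ~ 1.6e-28, keeping exact zero
+  // probabilities out of the result (e.g. so a later log() stays finite).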
+#pragma omp parallel for
+  for (size_t i = 0; i < act.value->getElementCnt(); ++i) {
+    v[i] = v[i] < threshold ? threshold : v[i];
+  }
+  return Error();
+}
+
+Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
+  MatrixPtr outputV = act.value;
+  MatrixPtr outputG = act.grad;
+  Matrix::resizeOrCreate(sftMaxDot_,
+                         outputG->getHeight(),
+                         outputG->getWidth(),
+                         /* trans */ false,
+                         /* useGpu */ false);
+  Matrix::resizeOrCreate(sftMaxSum_,
+                         outputG->getHeight(),
+                         1,
+                         /* trans */ false,
+                         /* useGpu */ false);
+  sftMaxDot_->dotMul(*outputG, *outputV);
+  sftMaxSum_->colMerge(*sftMaxDot_);
+  act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
+  return Error();
+}
+
+ActivationFunction* MKLDNNActivation::create(const std::string& type) {
+  return gMKLDNNActivationRegistrar.createByType(type);
+}
+
+std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
+  std::vector<std::string> types;
+  gMKLDNNActivationRegistrar.forEachType(
+      [&](const std::string& type) { types.push_back(type); });
+  return types;
+}
+
+void MKLDNNActivation::resetFwd(Argument& act) {
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+  cnt_ = act.value->getElementCnt();
+  pipelineFwd_.clear();
+  stream_.reset(new MKLDNNStream());
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+  if (val_ == nullptr) {
+    int bs = act.getBatchSize();
+    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+    int ic = cnt_ / bs / ih / iw;
+    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
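+    // e.g. (illustrative) cnt_ = 50176 with bs = 1 and ih = iw = 14 gives
+    // ic = 50176 / 1 / 14 / 14 = 256.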
+    val_ = MKLDNNMatrix::create(
+        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
+    CHECK(val_);
+    val_->downSpatial();
+  }
+}
+
+Error __must_check MKLDNNActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  return Error();
+}
+Error __must_check MKLDNNActivation::backward(Argument& act) {
+  resetBwd(act);
+  stream_->submit(pipelineBwd_);
+  return Error();
+}
+}  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
new file mode 100644
index 0000000000..dd16421fd6
--- /dev/null
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "ActivationFunction.h"
+#include "mkldnn.hpp"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/math/MKLDNNMatrix.h"
+#include "paddle/parameter/Argument.h"
+
+namespace paddle {
+
+/**
+ * @brief Base class of MKLDNN Activation.
+ * Common activation function are provieded,
+ * Common activation functions are provided,
+ * including mkldnn_relu, mkldnn_elu, mkldnn_tanh and mkldnn_softmax.
+class MKLDNNActivation : public ActivationFunction {
+protected:
+  // input value element count
+  size_t cnt_;
+  // do not merge resetBwd into resetFwd,
+  // because the grad data may change before the backward pass.
+  bool needResetBwd_;
+  // mkldnn matrix, primitive, stream and pipeline
+  MKLDNNMatrixPtr val_;
+  MKLDNNMatrixPtr grad_;
+  std::shared_ptr<mkldnn::engine> engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwd_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+public:
+  MKLDNNActivation() : cnt_(0), needResetBwd_(true) {}
+  ~MKLDNNActivation() {}
+  static ActivationFunction* create(const std::string& type);
+  static std::vector<std::string> getAllRegisteredTypes();
+  virtual const std::string& getName() const = 0;
+  /**
+   * reset the forward primitives
+   */
+  virtual void resetFwd(Argument& act);
+  /**
+   * reset the backward primitives;
+   * this cannot be merged into resetFwd because the grad data
+   * may change before the backward pass.
+   */
+  virtual void resetBwd(Argument& act) {}
+  virtual Error __must_check forward(Argument& act);
+  virtual Error __must_check backward(Argument& act);
+};
+
+/**
+ * @brief Base class of MKLDNN Eltwise Activation,
+ * includes mkldnn_relu, mkldnn_elu and mkldnn_tanh.
+ */
+class MKLDNNEltwiseActivation : public MKLDNNActivation {
+  typedef mkldnn::eltwise_forward eltwise_fwd;
+  typedef mkldnn::eltwise_backward eltwise_bwd;
+  typedef mkldnn::algorithm algorithm;
+
+protected:
+  // save the forward primitive desc, which is reused in the backward pass
+  std::shared_ptr<eltwise_fwd::primitive_desc> fwdPD_;
+  // eltwise_bwd needs the src input value
+  MKLDNNMatrixPtr inVal_;
+  // used for copying data
+  std::shared_ptr<mkldnn::reorder> copyInVal_;
+
+public:
+  MKLDNNEltwiseActivation() {}
+  ~MKLDNNEltwiseActivation() {}
+  virtual const std::string& getName() const = 0;
+
+  // In general, the alpha of forward and backward should be equal,
+  // but for relu they should be opposite to avoid negative values.
+  virtual float getAlpha() const = 0;
+  virtual float getBwdAlpha() const = 0;
+  virtual float getBeta() const { return 0.f; }
+  virtual algorithm getAlgo(std::string type) const;
+  void resetFwd(Argument& act) override;
+  void resetBwd(Argument& act) override;
+};
+
+/**
+ * @brief Base class of MKLDNN softmax Activation,
+ * where only the forward pass uses MKL-DNN and the backward pass falls
+ * back to a CPU implementation.
+ */
+class MKLDNNSoftmaxActivation : public MKLDNNActivation {
+  typedef mkldnn::softmax_forward softmax_fwd;
+
+private:
+  // for backward
+  MatrixPtr sftMaxSum_;
+  MatrixPtr sftMaxDot_;
+
+public:
+  MKLDNNSoftmaxActivation() {}
+  ~MKLDNNSoftmaxActivation() {}
+  virtual const std::string& getName() const = 0;
+  void resetFwd(Argument& act) override;
+  Error __must_check forward(Argument& act) override;
+  Error __must_check backward(Argument& act) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 0478256f9c..106cf5b622 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 4003676217..265dbb5493 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -205,10 +205,8 @@ public:
       hl_destroy_event(hlEvent_);
       hlEvent_ = NULL;
     }
-    if (batchData_) {
-      delete batchData_;
-      batchData_ = NULL;
-    }
+    delete batchData_;
+    batchData_ = NULL;
   }
 
   void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
deleted file mode 100644
index c6f5cab191..0000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ /dev/null
@@ -1,932 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ProtoDataProvider.h"
-#include <algorithm>
-#include <fstream>
-#include <istream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "DataProviderGroup.h"
-#include "paddle/utils/Logging.h"
-
-DEFINE_double(memory_threshold_on_load_data,
-              1.0,
-              "stop loading data when memory is not sufficient");
-
-namespace paddle {
-
-REGISTER_DATA_PROVIDER(proto_group, DataProviderGroup<ProtoDataProvider>);
-REGISTER_DATA_PROVIDER(proto_sequence_group,
-                       DataProviderGroup<ProtoSequenceDataProvider>);
-
-ProtoDataProvider::ProtoDataProvider(const DataConfig& config,
-                                     bool useGpu,
-                                     bool loadDataAll)
-    : DataProvider(config, useGpu), sampleNums_(0), currentSequenceIndex_(0) {
-  if (loadDataAll) {
-    loadData(config_.files());
-  }
-}
-
-void ProtoDataProvider::loadData(const std::vector<std::string>& fileList) {
-  for (auto& file : fileList) {
-    if (FLAGS_memory_threshold_on_load_data < 1.0) {
-      double memUsage = getMemoryUsage();
-      if (memUsage > FLAGS_memory_threshold_on_load_data) {
-        LOG(INFO) << "memUsage is " << memUsage << ", > "
-                  << FLAGS_memory_threshold_on_load_data
-                  << " therefore SKIP ALL REMAINING file.";
-        break;
-      }
-    }
-    LOG(INFO) << "load data file " << file;
-    loadDataFile(file);
-  }
-
-  if (sequenceStartPositions_.size() == sampleNums_) {
-    // This means that each sample is one sequence
-    shuffledSequenceIds_.swap(sequenceStartPositions_);
-  } else {
-    sequenceStartPositions_.push_back(sampleNums_);
-    shuffledSequenceIds_.reserve(sequenceStartPositions_.size() - 1);
-    for (size_t i = 0; i < sequenceStartPositions_.size() - 1; ++i) {
-      shuffledSequenceIds_.push_back(i);
-    }
-  }
-
-  LOG(INFO) << "read done, num of instance=" << sampleNums_;
-  showDataStats();
-}
-
-void ProtoDataProvider::loadData(const std::string& fileName) {
-  std::vector<std::string> fileList;
-  loadFileList(fileName, fileList);
-  loadData(fileList);
-}
-
-void ProtoDataProvider::checkDataHeader(const DataHeader& header) {
-  if (header_.slot_defs_size()) {
-    // header_ is already set. Need to check consistency.
-    CHECK_EQ(header_.slot_defs_size(), header.slot_defs_size())
-        << "Different header";
-    for (int i = 0; i < header.slot_defs_size(); ++i) {
-      CHECK_EQ(header_.slot_defs(i).type(), header.slot_defs(i).type());
-      CHECK_EQ(header_.slot_defs(i).dim(), header.slot_defs(i).dim());
-    }
-    return;
-  }
-
-  // header_ is not set before
-  CHECK(header.slot_defs_size()) << "Invalid header: no slot is defined";
-  int i;
-  for (i = 0; i < header.slot_defs_size(); ++i) {
-    if (header.slot_defs(i).type() == SlotDef::INDEX ||
-        header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX) {
-      break;
-    }
-    constexpr int kBufLen = 100;
-    char buf[kBufLen];
-    snprintf(buf, kBufLen, "slot%d_nnz", i);
-    nnzStats_.push_back(getStat(buf));
-  }
-  numVecSlots_ = i;
-
-  // Check that INDEX slots are after VECTOR slots
-  for (int i = numVecSlots_; i < header.slot_defs_size(); ++i) {
-    CHECK(header.slot_defs(i).type() == SlotDef::INDEX ||
-          header.slot_defs(i).type() == SlotDef::VAR_MDIM_INDEX);
-  }
-
-  slots_.clear();
-  slots_.reserve(header.slot_defs_size());
-  for (int i = 0; i < header.slot_defs_size(); ++i) {
-    slots_.emplace_back();
-    slots_.back().type = header.slot_defs(i).type();
-    slots_.back().dim = header.slot_defs(i).dim();
-    if (SlotDef::VECTOR_SPARSE_NON_VALUE == header.slot_defs(i).type() ||
-        SlotDef::VECTOR_SPARSE_VALUE == header.slot_defs(i).type()) {
-      slots_.back().indices.push_back(0);
-    }
-  }
-
-  header_ = header;
-}
-
-void ProtoDataProvider::checkSample(const DataSample& sample) {
-  CHECK_EQ(numVecSlots_, sample.vector_slots_size());
-  CHECK(header_.slot_defs_size() == numVecSlots_ + sample.id_slots_size() ||
-        header_.slot_defs_size() == numVecSlots_ + sample.var_id_slots_size());
-  for (int i = 0; i < numVecSlots_; ++i) {
-    uint32_t dim = header_.slot_defs(i).dim();
-    switch (header_.slot_defs(i).type()) {
-      case SlotDef::VECTOR_DENSE: {
-        CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          CHECK_EQ(0, sample.vector_slots(i).values_size());
-          break;
-        }
-        CHECK_LT(0, sample.vector_slots(i).values_size());
-        CHECK_GE(static_cast<int>(dim), sample.vector_slots(i).values_size());
-        CHECK_EQ(sample.vector_slots(i).values_size(),
-                 sample.vector_slots(i).ids_size());
-        auto maxId = *std::max_element(sample.vector_slots(i).ids().begin(),
-                                       sample.vector_slots(i).ids().end());
-        CHECK_GT(dim, maxId);
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        if (static_cast<int>(dim) != 0) {
-          CHECK_EQ(static_cast<int>(dim), sample.vector_slots(i).values_size());
-          if (sample.vector_slots(i).dims_size() != 0) {
-            int totalDim = sample.vector_slots(i).dims(0);
-            for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-              totalDim *= sample.vector_slots(i).dims(j);
-            }
-            CHECK_EQ(static_cast<int>(dim), totalDim);
-          }
-        } else {
-          CHECK_NE(sample.vector_slots(i).dims_size(), 0);
-          int totalDim = sample.vector_slots(i).dims(0);
-          for (int j = 1; j < sample.vector_slots(i).dims_size(); ++j) {
-            totalDim *= sample.vector_slots(i).dims(j);
-          }
-          CHECK_EQ(totalDim, sample.vector_slots(i).values_size());
-        }
-        break;
-      }
-      case SlotDef::STRING: {
-        CHECK_EQ(static_cast<int>(1), sample.vector_slots(i).strs_size());
-        CHECK_EQ(0, sample.vector_slots(i).ids_size());
-        CHECK_EQ(0, sample.vector_slots(i).values_size());
-        break;
-      }
-      default:
-        LOG(FATAL) << "BUG: Should not reach here";
-    }
-  }
-  for (int i = numVecSlots_; i < header_.slot_defs_size(); ++i) {
-    if (header_.slot_defs(i).type() != SlotDef::VAR_MDIM_INDEX) {
-      uint32_t id = sample.id_slots(i - numVecSlots_);
-      if (id == -1U) continue;
-      CHECK_LT(id, header_.slot_defs(i).dim());
-    } else {
-      for (int j = 0; j < sample.var_id_slots(i - numVecSlots_).ids_size();
-           ++j) {
-        uint32_t id = sample.var_id_slots(i - numVecSlots_).ids(j);
-        CHECK_LT(id, header_.slot_defs(i).dim());
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::loadDataFile(const std::string& fileName) {
-  std::ifstream is(fileName);
-  CHECK(is) << "Fail to open " << fileName;
-  bool dataCompression = str::endsWith(fileName, ".gz");
-  std::unique_ptr<ProtoReader> reader(new ProtoReader(&is, dataCompression));
-  CHECK(reader) << "Fail to create proto data input stream";
-
-  DataHeader header;
-  CHECK(reader->read(&header));
-  checkDataHeader(header);
-
-  DataSample sample;
-  do {
-    if (!reader->read(&sample)) {
-      break;
-    }
-    checkSample(sample);
-    if (sample.is_beginning()) {
-      sequenceStartPositions_.push_back(sampleNums_);
-    }
-    fillSlots(sample);
-    ++sampleNums_;
-  } while (true);
-
-  CHECK(is.eof()) << "Fail to read file";
-  reader.reset(nullptr);
-  is.close();
-}
-
-// checkSample has done before, no check here
-void ProtoDataProvider::fillSlots(const DataSample& sample) {
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    int dim = slot.dim;
-    switch (slot.type) {
-      case SlotDef::VECTOR_DENSE: {
-        size_t oldSize = slot.denseData.size();
-        slot.denseData.resize(oldSize + dim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(values, values + dim, slot.denseData.begin() + oldSize);
-#else
-        memcpy(slot.denseData.data() + oldSize, values, sizeof(real) * dim);
-#endif
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        int slotSize = sample.vector_slots(i).ids_size();
-        int subSlotSize = 0;
-        int id = 0;  // the slot id
-        // find whether this vector_slots has subseq. If not has subseq,
-        // subSlotSize = 0.
-        for (id = 0; id < sample.subseq_slots_size(); id++) {
-          if (sample.subseq_slots(id).slot_id() == i) {
-            subSlotSize = sample.subseq_slots(id).lens_size();
-            break;
-          }
-        }
-        if (subSlotSize && slot.subIndices.size() == 0UL) {
-          // If has subSeq, the first element of subIndices = 0.
-          slot.subIndices.push_back(0);
-        }
-        if (slotSize == 0UL) {
-          // if has no id, new indices = old indices.
-          slot.indices.push_back(slot.indices.back());
-          // if has subSeq, new subIndices = old subIndices.
-          if (slot.subIndices.size()) {
-            slot.subIndices.push_back(slot.subIndices.back());
-          }
-          break;
-        }
-        slot.sparseNonValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        memcpy(slot.sparseNonValueData.data() + slot.indices.back(),
-               ids,
-               sizeof(*ids) * slotSize);
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        if (subSlotSize) {
-          for (int ii = 0; ii < subSlotSize; ++ii) {
-            slot.subIndices.push_back(slot.subIndices.back() +
-                                      sample.subseq_slots(id).lens(ii));
-          }
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (0 == sample.vector_slots(i).ids_size()) {
-          slot.indices.push_back(slot.indices.back());
-          break;
-        }
-        int slotSize = sample.vector_slots(i).ids_size();
-        slot.sparseFloatValueData.resize(slot.indices.back() + slotSize);
-        const unsigned int* ids = sample.vector_slots(i).ids().data();
-        const float* values = sample.vector_slots(i).values().data();
-        for (int ii = 0; ii < slotSize; ++ii) {
-          slot.sparseFloatValueData[slot.indices.back() + ii].col = ids[ii];
-          slot.sparseFloatValueData[slot.indices.back() + ii].value =
-              values[ii];
-        }
-        slot.indices.push_back(slot.indices.back() + slotSize);
-        break;
-      }
-      case SlotDef::INDEX: {
-        slot.indexData.push_back(sample.id_slots(i - numVecSlots_));
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        size_t oldSize = slot.varDenseData.size();
-        slot.varDenseData.resize(oldSize + 1);
-        size_t varDim = sample.vector_slots(i).values_size();
-        slot.varDenseData[oldSize].data.resize(varDim);
-        const float* values = sample.vector_slots(i).values().data();
-#ifdef PADDLE_TYPE_DOUBLE
-        std::copy(
-            values, values + varDim, slot.varDenseData[oldSize].data.data());
-#else
-        memcpy(slot.varDenseData[oldSize].data.data(),
-               values,
-               sizeof(real) * varDim);
-#endif
-        slot.varDenseData[oldSize].dims.resize(
-            sample.vector_slots(i).dims_size());
-        memcpy(slot.varDenseData[oldSize].dims.data(),
-               sample.vector_slots(i).dims().data(),
-               sizeof(uint32_t) * sample.vector_slots(i).dims_size());
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        size_t oldSize = slot.varIndices.size();
-        slot.varIndices.resize(oldSize + 1);
-        size_t varDim = sample.var_id_slots(i - numVecSlots_).ids_size();
-        slot.varIndices[oldSize].resize(varDim);
-        memcpy(slot.varIndices[oldSize].data(),
-               sample.var_id_slots(i - numVecSlots_).ids().data(),
-               sizeof(uint32_t) * varDim);
-        break;
-      }
-      case SlotDef::STRING: {
-        slot.strData.push_back(sample.vector_slots(i).strs(0));
-        break;
-      }
-    }
-  }
-}
-
-void ProtoDataProvider::showDataStats() {
-  std::ostringstream oss;
-  for (size_t i = 0; i < slots_.size(); ++i) {
-    auto& slot = slots_[i];
-    if (slot.type == SlotDef::VECTOR_SPARSE_NON_VALUE) {
-      size_t nnz = slot.sparseNonValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    } else if (slot.type == SlotDef::VECTOR_SPARSE_VALUE) {
-      size_t nnz = slot.sparseFloatValueData.size();
-      oss << "slot" << i << ":avgNNZ=" << ((double)nnz / sampleNums_) << "; ";
-    }
-  }
-  LOG(INFO) << oss.str();
-}
-
-void ProtoDataProvider::reset() {
-  currentSequenceIndex_ = 0;
-  if (!skipShuffle_) {
-    shuffle();
-  }
-
-  DataProvider::reset();
-}
-
-void ProtoDataProvider::shuffle() {
-  std::shuffle(shuffledSequenceIds_.begin(),
-               shuffledSequenceIds_.end(),
-               ThreadLocalRandomEngine::get());
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sequence ranging from [begin, end),
-  op(begin, end) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sequenceLoop(Op op, int64_t size) {
-  int64_t sz = 0;
-  size_t i;
-  size_t sequenceCount = shuffledSequenceIds_.size();
-  if (usageRatio_ < 1.0f) {
-    sequenceCount = static_cast<int64_t>(sequenceCount * usageRatio_);
-  }
-  for (i = currentSequenceIndex_; i < sequenceCount; ++i) {
-    size_t id = shuffledSequenceIds_[i];
-    int64_t begin = sequenceStartPositions_[id];
-    int64_t end = sequenceStartPositions_[id + 1];
-    int64_t len = end - begin;
-    if (sz + len > size && sz > 0) break;
-    sz += len;
-    op(begin, end);
-  }
-  return i - currentSequenceIndex_;
-}
-
-/*
-  Loop through sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::sampleLoop(Op op, int64_t size) {
-  if (iidData()) {
-    size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-    for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-         ++i) {
-      size_t pos = shuffledSequenceIds_[i];
-      op(pos);
-    }
-    return size;
-  } else {
-    auto f = [op](int64_t begin, int64_t end) {
-      for (int64_t pos = begin; pos < end; ++pos) {
-        op(pos);
-      }
-    };
-    return sequenceLoop(f, size);
-  }
-}
-
-/*
-  Loop through sub-sequences starting from currentSequenceIndex_
-  for at most size samples. For each sample of each sub-sequence at position
-  pos, op(pos) will be called.
-
-  return the number of sub-sequences scanned
-*/
-template <class Op>
-int64_t ProtoDataProvider::subSampleLoop(Op op, int64_t size, int slot) {
-  CHECK(iidData()) << "subSampleLoop only accepts iid data";
-  size = std::min<int64_t>(sampleNums_ - currentSequenceIndex_, size);
-  int subSize = 0;
-  for (int64_t i = currentSequenceIndex_; i < currentSequenceIndex_ + size;
-       ++i) {
-    size_t pos = shuffledSequenceIds_[i];
-    int64_t* indexs = slots_[slot].indices.data();
-    int64_t* subIndexs = slots_[slot].subIndices.data();
-    int64_t subSeqStart = 0;
-    int64_t subSeqEnd = 0;
-    for (int j = 0; j < (int)slots_[slot].subIndices.size(); j++) {
-      if (subIndexs[j] == indexs[pos]) {
-        subSeqStart = j;
-        if (subIndexs[pos] == subIndexs[pos + 1]) {
-          subSeqEnd = j + 1;
-          break;
-        }
-      } else if (subIndexs[j] == indexs[pos + 1]) {
-        subSeqEnd = j;
-        break;
-      }
-    }
-    for (int j = subSeqStart; j < subSeqEnd; j++) {
-      op(j);
-    }
-    subSize += subSeqEnd - subSeqStart;
-  }
-  return subSize;
-}
-
-int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
-                                                DataBatch* batch) {
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  if (iidData()) {
-    size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-    numScannedSeqs = numSequences = size;
-  } else {
-    int64_t sz = 0;
-    auto op = [&sz, &numSequences](int64_t begin, int64_t end) {
-      ++numSequences;
-      sz += end - begin;
-    };
-    numScannedSeqs = sequenceLoop(op, size);
-    VLOG_IF(1, numScannedSeqs > numSequences)
-        << numScannedSeqs - numSequences
-        << " sequences are skipped because longer than " << size;
-    size = sz;
-  }
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  if (!iidData()) {
-    ICpuGpuVector::resizeOrCreate(cpuArguments[0].sequenceStartPositions,
-                                  numSequences + 1,
-                                  /* useGpu= */ false);
-    int* buf = cpuArguments[0].sequenceStartPositions->getMutableData(false);
-    int pos = 0;
-    int i = 0;
-    auto op = [buf, &pos, &i](int64_t begin, int64_t end) {
-      buf[i] = pos;
-      pos += end - begin;
-      ++i;
-    };
-    sequenceLoop(op, size);
-    buf[i] = size;
-    for (size_t slot = 1; slot < cpuArguments.size(); ++slot) {
-      cpuArguments[slot].sequenceStartPositions =
-          cpuArguments[0].sequenceStartPositions;
-    }
-  }
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    size_t dim = header_.slot_defs(slot).dim();
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_DENSE: {
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         NO_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseNonValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        size_t numElements = 0;
-        for (auto pos : dataPos) {
-          numElements +=
-              slots_[slot].indices[pos + 1] - slots_[slot].indices[pos];
-        }
-        nnzStats_[slot]->addSample(numElements);
-
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        if (!(cpuArguments[slot].value)) {
-          cpuArguments[slot].value =
-              Matrix::createSparseMatrix(size,
-                                         dim,
-                                         size /*DEFAULT_AVG_WIDTH = 1*/,
-                                         FLOAT_VALUE,
-                                         SPARSE_CSR,
-                                         false,
-                                         useGpu_);
-        }
-        auto mat = cpuArguments[slot].value;
-        mat->resize(size, dim);
-        if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data(),
-              HPPL_STREAM_1);
-        } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
-              dataPos.data(),
-              slots_[slot].indices.data(),
-              slots_[slot].sparseFloatValueData.data());
-        } else {
-          LOG(FATAL) << "Not Supported";
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE: {
-        CHECK_EQ(size, 1);
-        auto mat = cpuArguments[slot].value;
-        size_t totalDim = slots_[slot].varDenseData[dataPos[0]].data.size();
-
-        CHECK_EQ(slots_[slot].varDenseData[dataPos[0]].dims.size(), size_t(3));
-        size_t height, width, depth, oldWidth;
-        /* dims[2] is depth, will be changed to dims[0] in future */
-        depth = slots_[slot].varDenseData[dataPos[0]].dims[2];
-        height = slots_[slot].varDenseData[dataPos[0]].dims[1];
-        width = slots_[slot].varDenseData[dataPos[0]].dims[0];
-        oldWidth = width;
-        /* process the undesirable sample */
-        if (oldWidth < height) {
-          width = height;
-        }
-        cpuArguments[slot].setFrameHeight(height);
-        cpuArguments[slot].setFrameWidth(width);
-
-        if (oldWidth < height) {
-          totalDim = width * height * depth;
-        }
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               totalDim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        cpuArguments[slot].value->zeroMem();
-        if (oldWidth < height) {
-          real* srcBuf = slots_[slot].varDenseData[dataPos[0]].data.data();
-          for (size_t i = 0; i < depth; i++) {
-            for (size_t j = 0; j < height; j++) {
-              for (size_t k = 0; k < oldWidth; k++) {
-                buf[i * height * width + j * width + k] =
-                    srcBuf[i * height * oldWidth + j * oldWidth + k];
-              }
-            }
-          }
-        } else {
-          memcpy(buf,
-                 slots_[slot].varDenseData[dataPos[0]].data.data(),
-                 sizeof(real) * totalDim);
-        }
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        bufStarts[1] = 1;
-        break;
-      }
-      case SlotDef::VAR_MDIM_INDEX: {
-        CHECK_EQ(size, 1);
-        size_t totalDim = slots_[slot].varIndices[dataPos[0]].size();
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalDim,
-                                /*  useGpu= */ false);
-        int* buf = cpuArguments[slot].ids->getData();
-        memcpy(buf,
-               slots_[slot].varIndices[dataPos[0]].data(),
-               sizeof(int) * totalDim);
-
-        ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                      size + 1, /* size == 1 currently */
-                                      /* useGpu= */ false);
-        int* bufStarts =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        bufStarts[0] = 0;
-        /* we expand the convolutinal feature map to a sequence data,
-         * so there should be a corresponding sequence labels */
-        bufStarts[1] = totalDim;
-        break;
-      }
-      case SlotDef::STRING: {
-        if (cpuArguments[slot].strs) {
-          cpuArguments[slot].strs->resize(size);
-        } else {
-          cpuArguments[slot].strs =
-              std::make_shared<std::vector<std::string>>(size);
-        }
-        for (int i = 0; i < size; ++i) {
-          (*cpuArguments[slot].strs)[i] = slots_[slot].strData[dataPos[i]];
-        }
-        break;
-      }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (int i = 0; i < header_.slot_defs_size(); ++i) {
-      SlotDef::SlotType slotType = header_.slot_defs(i).type();
-      if (SlotDef::VECTOR_SPARSE_VALUE == slotType ||
-          SlotDef::VECTOR_SPARSE_NON_VALUE == slotType) {
-        gpuArguments[i] = cpuArguments[i];
-        gpuArguments[i].sequenceStartPositions =
-            cpuArguments[i].sequenceStartPositions;
-      } else {
-        gpuArguments[i].resizeAndCopyFrom(
-            cpuArguments[i], useGpu_, HPPL_STREAM_1);
-      }
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-
-  return batch->getSize();
-}
-
-ProtoSequenceDataProvider::ProtoSequenceDataProvider(const DataConfig& config,
-                                                     bool useGpu,
-                                                     bool loadDataAll)
-    : ProtoDataProvider(config, useGpu, loadDataAll) {}
-
-int64_t ProtoSequenceDataProvider::getNextBatchInternal(int64_t size,
-                                                        DataBatch* batch) {
-  CHECK(iidData()) << "ProtoSequenceDataProvider only accepts iid data";
-  int64_t numSequences = 0;  // actual number of sequences in the batch
-
-  // the number of sequences scanned, including those skipped because too long
-  int64_t numScannedSeqs = 0;
-  std::lock_guard<RWLock> guard(lock_);
-  size = std::min<int64_t>(getSize() - currentSequenceIndex_, size);
-  numScannedSeqs = numSequences = size;
-  if (size <= 0) return 0;
-
-  DataBatch& cpuBatch = *cpuBatch_;
-  std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-  cpuBatch.setSize(size);
-  cpuArguments.resize(header_.slot_defs_size());
-
-  for (int slot = 0; slot < header_.slot_defs_size(); ++slot) {
-    SlotDef::SlotType slotType = header_.slot_defs(slot).type();
-
-    std::vector<int64_t> dataPos;
-    dataPos.reserve(size);
-    auto op = [this, &dataPos](int64_t pos) { dataPos.push_back(pos); };
-    sampleLoop(op, size);
-
-    // current slot: sequenceStartPositions
-    ICpuGpuVector::resizeOrCreate(cpuArguments[slot].sequenceStartPositions,
-                                  size + 1,
-                                  /* useGpu= */ false);
-
-    switch (slotType) {
-      case SlotDef::VECTOR_SPARSE_VALUE:
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "ProtoSequenceDataProvider only support"
-                   << " VECTOR_DENSE, VECTOR_SPARSE_NON_VALUE and INDEX slots";
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // copy to IDS, not value
-        // pointers used in current slot
-        sparse_non_value_t* data = slots_[slot].sparseNonValueData.data();
-        int64_t* indexs = slots_[slot].indices.data();
-        int64_t* seqs = dataPos.data();
-
-        // current slot: i need size instances. what is the total length?
-        int totalFeatureInCurrentSlot = 0;
-        for (int ins = 0; ins < size; ins++) {
-          int64_t currInsId = seqs[ins];
-          totalFeatureInCurrentSlot +=
-              indexs[currInsId + 1] - indexs[currInsId];
-          // special: if current instance has NO feature in current slot
-          if (indexs[currInsId + 1] == indexs[currInsId]) {
-            totalFeatureInCurrentSlot++;
-          }
-        }
-        // done
-
-        // current slot: ids
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                totalFeatureInCurrentSlot,
-                                /* useGpu= */ false);
-
-        // where to write
-        int* currPosOfArgumentId = cpuArguments[slot].ids->getData();
-        int* currPosOfArgumentSeqStart =
-            cpuArguments[slot].sequenceStartPositions->getMutableData(false);
-        int allSequenceLength = 0;
-        currPosOfArgumentSeqStart[0] = 0;
-        // for each instance, copy data and fill sequence positions
-        for (int instance = 0; instance < size; instance++) {
-          int64_t currInstanceId = seqs[instance];
-          int64_t currInstanceLength =
-              indexs[currInstanceId + 1] - indexs[currInstanceId];
-          sparse_non_value_t* currInstanceData = data + indexs[currInstanceId];
-          // write sequenceStartPositions
-          allSequenceLength += currInstanceLength;
-          currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-          // copy features
-          for (int featCopier = 0; featCopier < currInstanceLength;
-               featCopier++) {
-            currPosOfArgumentId[featCopier] = currInstanceData[featCopier].col;
-          }
-          currPosOfArgumentId += currInstanceLength;
-          // special: if current instance has NO feature in current slot
-          if (currInstanceLength == 0) {
-            allSequenceLength++;
-            currPosOfArgumentSeqStart[instance + 1] = allSequenceLength;
-            currPosOfArgumentId[0] = -1;
-            currPosOfArgumentId++;
-          }
-          // done
-        }
-        if (slots_[slot].subIndices.size()) {
-          std::vector<int64_t> dataSubPos;
-          auto op = [this, &dataSubPos](int64_t pos) {
-            dataSubPos.push_back(pos);
-          };
-          int subSize = subSampleLoop(op, size, slot);
-          ICpuGpuVector::resizeOrCreate(
-              cpuArguments[slot].subSequenceStartPositions, subSize + 1, false);
-          int* currPosOfArgumentSubSeqStart =
-              cpuArguments[slot].subSequenceStartPositions->getMutableData(
-                  false);
-          int64_t* subSeqs = dataSubPos.data();
-          int64_t* subIndexs = slots_[slot].subIndices.data();
-          int allSubSequenceLength = 0;
-          currPosOfArgumentSubSeqStart[0] = 0;
-          // for each instance, compute sub-sequence number
-          for (int instance = 0; instance < subSize; instance++) {
-            int64_t currSubInstanceId = subSeqs[instance];
-            int64_t currSubInstanceLength =
-                subIndexs[currSubInstanceId + 1] - subIndexs[currSubInstanceId];
-            // write subSequenceStartPositions
-            allSubSequenceLength += currSubInstanceLength;
-            currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            // special: if current instance has NO feature in current slot
-            if (currSubInstanceLength == 0) {
-              allSubSequenceLength++;
-              currPosOfArgumentSubSeqStart[instance + 1] = allSubSequenceLength;
-            }
-          }
-          cpuArguments[slot].checkSubset();
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        // label slot
-        IVector::resizeOrCreate(cpuArguments[slot].ids,
-                                size,
-                                /* useGpu= */ false);
-        // fill labels
-        int* buf = cpuArguments[slot].ids->getData();
-        for (int i = 0; i < size; ++i) {
-          buf[i] = slots_[slot].indexData[dataPos[i]];
-        }
-        // label HAS sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // copy values
-        size_t dim = header_.slot_defs(slot).dim();
-        Matrix::resizeOrCreate(cpuArguments[slot].value,
-                               size,
-                               dim,
-                               false,   // trans = false
-                               false);  // useGpu = false
-        real* buf = cpuArguments[slot].value->getData();
-        for (int i = 0; i < size; ++i) {
-          memcpy(buf + i * dim,
-                 slots_[slot].denseData.data() + dataPos[i] * dim,
-                 sizeof(real) * dim);
-        }
-        // sequence structure
-        cpuArguments[slot].sequenceStartPositions->fillSequence(false);
-        break;
-      }
-      default: { LOG(FATAL) << "should not reach here"; }
-    }
-  }
-
-  if (useGpu_) {
-    std::vector<Argument>& cpuArguments = cpuBatch.getStreams();
-    DataBatch& gpuBatch = *gpuBatch_;
-    std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
-    gpuArguments.resize(cpuArguments.size());
-    gpuBatch.setSize(size);
-    for (size_t i = 0; i < cpuArguments.size(); ++i) {
-      gpuArguments[i].resizeAndCopyFrom(
-          cpuArguments[i], useGpu_, HPPL_STREAM_1);
-    }
-    hl_stream_synchronize(HPPL_STREAM_1);
-    *batch = gpuBatch;
-  } else {
-    *batch = cpuBatch;
-  }
-
-  currentSequenceIndex_ += numScannedSeqs;
-  return batch->getSize();
-}
-
-}  // namespace paddle
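The removed VECTOR_SPARSE_NON_VALUE path above maintains an invariant worth noting: an instance with no features still occupies one slot in the ids array, filled with -1, so no span in sequenceStartPositions is empty. A minimal standalone sketch of that rule, using a hypothetical appendInstance helper (not a Paddle API):

    #include <vector>

    // starts must be seeded with {0}; after each call,
    // starts[i+1] - starts[i] >= 1 holds for every instance i.
    void appendInstance(const std::vector<int>& feats,
                        std::vector<int>* ids,
                        std::vector<int>* starts) {
      if (feats.empty()) {
        ids->push_back(-1);  // placeholder id for an empty instance
      } else {
        ids->insert(ids->end(), feats.begin(), feats.end());
      }
      starts->push_back(static_cast<int>(ids->size()));
    }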
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
deleted file mode 100644
index 7dd45e0622..0000000000
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-
-#include "DataFormat.pb.h"
-#include "paddle/utils/Stat.h"
-
-#include "DataProvider.h"
-#include "ProtoReader.h"
-
-namespace paddle {
-
-/**
- * @brief Provider data from protobuf data file with each sample
- * specified by proto message
- *
- * DataSample defined in DataFormat.proto.
- *
- * The file format is
- *
- *    header
- *
- *    sample1
- *
- *    sample2
- *
- *    ...
- *
- *    sampleN
- *
- * @note: In the data file, each message is prefixed with its length.
- * The read/write of the protbuf are implemented in ProtoReader.h
- */
-class ProtoDataProvider : public DataProvider {
-public:
-  ProtoDataProvider(const DataConfig& config,
-                    bool useGpu,
-                    bool loadDataAll = true);
-  virtual void reset();
-
-  /**
-   * @note this size includes the sequences which are skipped because they
-   * are longer than the batch size.
-   */
-  virtual int64_t getSize() {
-    int64_t size = sampleNums_;
-    if (usageRatio_ < 1.0f) {
-      size = static_cast<int64_t>(size * usageRatio_);
-    }
-    return size;
-  }
-  virtual void shuffle();
-
-  void loadData(const std::vector<std::string>& fileList);
-
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-
-protected:
-  /**
-   * @brief load protobuf data from a list of file
-   * @param[in]  fileName  file name of a file which contains
-   * a list of file names
-   */
-  void loadData(const std::string& fileName);
-
-  /**
-   * @brief load protobuf data from file
-   * @param[in]  fileName   data file name
-   */
-  void loadDataFile(const std::string& fileName);
-  /** @brief check data header of each data sample
-   *  @param[in] header     data header read from protobuf data
-   */
-  void checkDataHeader(const DataHeader& header);
-  /**
-   * @brief fill protobuf data into slot_,
-   * slot_ is a vector of ProtoSlot in memory.
-   * @param[in]  sample     data sample read from protobuf data
-   */
-  void fillSlots(const DataSample& sample);
-
-  /**
-   * @brief return true if each sample is one sequence, i.e., independent
-   * of other samples.
-   */
-  inline bool iidData() const { return sequenceStartPositions_.empty(); }
-
-  /**
-   * @brief check that sample is consistent with header_
-   */
-  void checkSample(const DataSample& sample);
-
-  template <class Op>
-  int64_t sequenceLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t sampleLoop(Op op, int64_t size);
-
-  template <class Op>
-  int64_t subSampleLoop(Op op, int64_t size, int slot);
-
-  void showDataStats();
-
-protected:
-  struct ProtoVarSlot {
-    std::vector<real> data;
-    std::vector<int> dims;
-  };
-
-  struct ProtoSlot {
-    SlotDef::SlotType type;
-    int dim;
-    std::vector<int> indexData;
-    std::vector<real> denseData;
-    std::vector<sparse_non_value_t> sparseNonValueData;
-    std::vector<sparse_float_value_t> sparseFloatValueData;
-    std::vector<int64_t> indices;
-    std::vector<int64_t> subIndices;
-
-    std::vector<ProtoVarSlot> varDenseData;
-    std::vector<std::vector<int>> varIndices;
-    std::vector<std::string> strData;
-  };
-  DataHeader header_;
-  int numVecSlots_;
-
-  std::vector<ProtoSlot> slots_;
-  size_t sampleNums_;
-
-  /**
-   * The starting position of each sequence in samples.
-   * The last element should be num of samples.
-   * If empty, each sample is one sequence.
-   */
-  std::vector<size_t> sequenceStartPositions_;
-
-  int64_t currentSequenceIndex_;
-
-  // The size should be the number of sequences.
-  std::vector<size_t> shuffledSequenceIds_;
-
-  ThreadLocalD<DataBatch> cpuBatch_;
-  ThreadLocalD<DataBatch> gpuBatch_;
-
-  RWLock lock_;
-  std::vector<StatPtr> nnzStats_;  // stats for number of none-zeros entries
-};
-
-/**
- * @brief Special use for Proto data: instances should contain sparse-non-value
- * slots
- * and label.
- *
- * @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
- */
-class ProtoSequenceDataProvider : public ProtoDataProvider {
-public:
-  ProtoSequenceDataProvider(const DataConfig& config,
-                            bool useGpu,
-                            bool loadDataAll = true);
-  ~ProtoSequenceDataProvider() {}
-  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
-};
-
-}  // namespace paddle
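The deleted header documents that every message in the data file is prefixed with its length, with the actual read/write living in ProtoReader.h. A minimal sketch of consuming one such record via protobuf's CodedInputStream, assuming a varint length prefix (the exact prefix encoding used by ProtoReader.h is not shown in this diff):

    #include <google/protobuf/io/coded_stream.h>
    #include <google/protobuf/message_lite.h>

    // Returns false at end of file or on a corrupt record.
    bool readLengthPrefixed(google::protobuf::io::CodedInputStream* in,
                            google::protobuf::MessageLite* msg) {
      google::protobuf::uint32 size = 0;
      if (!in->ReadVarint32(&size)) return false;  // no more records
      auto limit = in->PushLimit(size);  // confine parsing to this record
      bool ok = msg->ParseFromCodedStream(in) && in->ConsumedEntireMessage();
      in->PopLimit(limit);
      return ok;
    }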
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 132119015f..92087fa32b 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -14,18 +14,20 @@ limitations under the License. */
 
 #include "Evaluator.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/utils/StringUtil.h"
 
 namespace paddle {
 
 /**
  * calculate sequence-to-sequence edit distance
  */
-class CTCErrorEvaluator : public NotGetableEvaluator {
+class CTCErrorEvaluator : public Evaluator {
 private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
   real deletions_, insertions_, substitutions_;
   int seqClassficationError_;
+  mutable std::unordered_map<std::string, real> evalResults_;
 
   std::vector<int> path2String(const std::vector<int>& path) {
     std::vector<int> str;
@@ -183,6 +185,18 @@ private:
     return stringAlignment(gtStr, recogStr);
   }
 
+  void storeLocalValues() const {
+    evalResults_["error"] = numSequences_ ? totalScore_ / numSequences_ : 0;
+    evalResults_["deletion_error"] =
+        numSequences_ ? deletions_ / numSequences_ : 0;
+    evalResults_["insertion_error"] =
+        numSequences_ ? insertions_ / numSequences_ : 0;
+    evalResults_["substitution_error"] =
+        numSequences_ ? substitutions_ / numSequences_ : 0;
+    evalResults_["sequence_error"] =
+        (real)seqClassficationError_ / numSequences_;
+  }
+
 public:
   CTCErrorEvaluator()
       : numTimes_(0),
@@ -245,16 +259,12 @@ public:
   }
 
   virtual void printStats(std::ostream& os) const {
-    os << config_.name() << "="
-       << (numSequences_ ? totalScore_ / numSequences_ : 0);
-    os << "  deletions error"
-       << "=" << (numSequences_ ? deletions_ / numSequences_ : 0);
-    os << "  insertions error"
-       << "=" << (numSequences_ ? insertions_ / numSequences_ : 0);
-    os << "  substitutions error"
-       << "=" << (numSequences_ ? substitutions_ / numSequences_ : 0);
-    os << "  sequences error"
-       << "=" << (real)seqClassficationError_ / numSequences_;
+    storeLocalValues();
+    os << config_.name() << " error = " << evalResults_["error"];
+    os << " deletions error = " << evalResults_["deletion_error"];
+    os << " insertions error = " << evalResults_["insertion_error"];
+    os << " substitution error = " << evalResults_["substitution_error"];
+    os << " sequence error = " << evalResults_["sequence_error"];
   }
 
   virtual void distributeEval(ParameterClient2* client) {
@@ -272,6 +282,37 @@ public:
     seqClassficationError_ = (int)buf[4];
     numSequences_ = (int)buf[5];
   }
+
+  void getNames(std::vector<std::string>* names) {
+    storeLocalValues();
+    names->reserve(names->size() + evalResults_.size());
+    for (auto it = evalResults_.begin(); it != evalResults_.end(); ++it) {
+      names->push_back(config_.name() + "." + it->first);
+    }
+  }
+
+  real getValue(const std::string& name, Error* err) const {
+    storeLocalValues();
+
+    std::vector<std::string> buffers;
+    paddle::str::split(name, '.', &buffers);
+    auto it = evalResults_.find(buffers[buffers.size() - 1]);
+
+    if (it == evalResults_.end()) {
+      *err = Error("Evaluator does not have the key %s", name.c_str());
+      return 0.0f;
+    }
+
+    return it->second;
+  }
+
+  std::string getType(const std::string& name, Error* err) const {
+    this->getValue(name, err);
+    if (!err->isOK()) {
+      return "";
+    }
+    return "ctc_edit_distance";
+  }
 };
 
 REGISTER_EVALUATOR(ctc_edit_distance, CTCErrorEvaluator);
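Switching CTCErrorEvaluator from NotGetableEvaluator to Evaluator, together with the getNames/getValue/getType methods above, makes every sub-metric queryable by dotted name instead of only printable. A hypothetical caller (the evaluator pointer and its configured name are assumptions):

    std::vector<std::string> names;
    evaluator->getNames(&names);  // e.g. "ctc.error", "ctc.deletion_error", ...
    for (const auto& name : names) {
      paddle::Error err;
      real value = evaluator->getValue(name, &err);
      if (err.isOK()) {
        LOG(INFO) << name << " (" << evaluator->getType(name, &err)
                  << ") = " << value;
      }
    }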
diff --git a/paddle/gserver/evaluators/ChunkEvaluator.cpp b/paddle/gserver/evaluators/ChunkEvaluator.cpp
index 1658282f3a..a2ab15eede 100644
--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -268,7 +268,13 @@ public:
   }
 
   // get type of evaluator
-  std::string getTypeImpl() const { return "chunk"; }
+  std::string getType(const std::string& name, Error* err) const {
+    this->getValue(name, err);
+    if (!err->isOK()) {
+      return "";
+    }
+    return "chunk";
+  }
 
 private:
   void storeLocalValues() const {
diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
new file mode 100644
index 0000000000..9b825db574
--- /dev/null
+++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp
@@ -0,0 +1,308 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Evaluator.h"
+#include "paddle/gserver/layers/DetectionUtil.h"
+
+using std::map;
+using std::vector;
+using std::pair;
+using std::make_pair;
+
+namespace paddle {
+
+/**
+ * @brief Detection mAP evaluator
+ *
+ * The config file api is detection_map_evaluator.
+ */
+class DetectionMAPEvaluator : public Evaluator {
+public:
+  DetectionMAPEvaluator()
+      : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {}
+
+  virtual void start() {
+    Evaluator::start();
+    allTruePos_.clear();
+    allFalsePos_.clear();
+    numPos_.clear();
+  }
+
+  virtual real evalImp(std::vector<Argument>& arguments) {
+    overlapThreshold_ = config_.overlap_threshold();
+    backgroundId_ = config_.background_id();
+    evaluateDifficult_ = config_.evaluate_difficult();
+    apType_ = config_.ap_type();
+
+    MatrixPtr detectTmpValue = arguments[0].value;
+    Matrix::resizeOrCreate(cpuOutput_,
+                           detectTmpValue->getHeight(),
+                           detectTmpValue->getWidth(),
+                           false,
+                           false);
+
+    MatrixPtr labelTmpValue = arguments[1].value;
+    Matrix::resizeOrCreate(cpuLabel_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    cpuOutput_->copyFrom(*detectTmpValue);
+    cpuLabel_->copyFrom(*labelTmpValue);
+
+    Argument label = arguments[1];
+    const int* labelIndex = label.sequenceStartPositions->getData(false);
+    size_t batchSize = label.getNumSequences();
+
+    vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes;
+    vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes;
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      map<size_t, vector<NormalizedBBox>> bboxes;
+      for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) {
+        vector<NormalizedBBox> bbox;
+        getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox);
+        int c = cpuLabel_->getData()[i * 6];
+        bboxes[c].push_back(bbox[0]);
+      }
+      allGTBBoxes.push_back(bboxes);
+    }
+
+    size_t n = 0;
+    const real* cpuOutputData = cpuOutput_->getData();
+    for (size_t imgId = 0; imgId < batchSize; ++imgId) {
+      map<size_t, vector<pair<real, NormalizedBBox>>> bboxes;
+      // Guard each read: n can reach the end of the output matrix before
+      // the last image in the batch has been processed.
+      while (n < cpuOutput_->getHeight() &&
+             static_cast<size_t>((cpuOutputData + n * 7)[0]) == imgId) {
+        vector<real> label;
+        vector<real> score;
+        vector<NormalizedBBox> bbox;
+        getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox);
+        bboxes[label[0]].push_back(make_pair(score[0], bbox[0]));
+        ++n;
+      }
+      allDetectBBoxes.push_back(bboxes);
+    }
+
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (map<size_t, vector<NormalizedBBox>>::iterator it =
+               allGTBBoxes[n].begin();
+           it != allGTBBoxes[n].end();
+           ++it) {
+        size_t count = 0;
+        if (evaluateDifficult_) {
+          count = it->second.size();
+        } else {
+          for (size_t i = 0; i < it->second.size(); ++i)
+            if (!(it->second[i].isDifficult)) ++count;
+        }
+        // operator[] value-initializes absent keys to 0, so one accumulation
+        // handles both the first and subsequent batches
+        numPos_[it->first] += count;
+      }
+    }
+
+    // calcTFPos
+    calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes);
+
+    return 0;
+  }
+
+  virtual void printStats(std::ostream& os) const {
+    real mAP = calcMAP();
+    os << "Detection mAP=" << mAP;
+  }
+
+  virtual void distributeEval(ParameterClient2* client) {
+    LOG(FATAL) << "Distribute detection evaluation not implemented.";
+  }
+
+protected:
+  void calcTFPos(const size_t batchSize,
+                 const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes,
+                 const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>&
+                     allDetectBBoxes) {
+    for (size_t n = 0; n < allDetectBBoxes.size(); ++n) {
+      if (allGTBBoxes[n].size() == 0) {
+        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
+                 it = allDetectBBoxes[n].begin();
+             it != allDetectBBoxes[n].end();
+             ++it) {
+          size_t label = it->first;
+          for (size_t i = 0; i < it->second.size(); ++i) {
+            allTruePos_[label].push_back(make_pair(it->second[i].first, 0));
+            allFalsePos_[label].push_back(make_pair(it->second[i].first, 1));
+          }
+        }
+      } else {
+        for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator
+                 it = allDetectBBoxes[n].begin();
+             it != allDetectBBoxes[n].end();
+             ++it) {
+          size_t label = it->first;
+          vector<pair<real, NormalizedBBox>> predBBoxes = it->second;
+          if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) {
+            for (size_t i = 0; i < predBBoxes.size(); ++i) {
+              allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
+              allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1));
+            }
+          } else {
+            vector<NormalizedBBox> gtBBoxes =
+                allGTBBoxes[n].find(label)->second;
+            vector<bool> visited(gtBBoxes.size(), false);
+            // Sort detections in descending order by score
+            std::sort(predBBoxes.begin(),
+                      predBBoxes.end(),
+                      sortScorePairDescend<NormalizedBBox>);
+            for (size_t i = 0; i < predBBoxes.size(); ++i) {
+              real maxOverlap = -1.0;
+              size_t maxIdx = 0;
+              for (size_t j = 0; j < gtBBoxes.size(); ++j) {
+                real overlap =
+                    jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]);
+                if (overlap > maxOverlap) {
+                  maxOverlap = overlap;
+                  maxIdx = j;
+                }
+              }
+              if (maxOverlap > overlapThreshold_) {
+                if (evaluateDifficult_ ||
+                    (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) {
+                  if (!visited[maxIdx]) {
+                    allTruePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 1));
+                    allFalsePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 0));
+                    visited[maxIdx] = true;
+                  } else {
+                    allTruePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 0));
+                    allFalsePos_[label].push_back(
+                        make_pair(predBBoxes[i].first, 1));
+                  }
+                }
+              } else {
+                allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0));
+                allFalsePos_[label].push_back(
+                    make_pair(predBBoxes[i].first, 1));
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  real calcMAP() const {
+    real mAP = 0.0;
+    size_t count = 0;
+    for (map<size_t, size_t>::const_iterator it = numPos_.begin();
+         it != numPos_.end();
+         ++it) {
+      size_t label = it->first;
+      size_t labelNumPos = it->second;
+      if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end())
+        continue;
+      vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second;
+      vector<pair<real, size_t>> labelFalsePos =
+          allFalsePos_.find(label)->second;
+      // Compute average precision.
+      vector<size_t> tpCumSum;
+      getAccumulation(labelTruePos, &tpCumSum);
+      vector<size_t> fpCumSum;
+      getAccumulation(labelFalsePos, &fpCumSum);
+      std::vector<real> precision, recall;
+      size_t num = tpCumSum.size();
+      // Compute Precision.
+      for (size_t i = 0; i < num; ++i) {
+        CHECK_LE(tpCumSum[i], labelNumPos);
+        precision.push_back(static_cast<real>(tpCumSum[i]) /
+                            static_cast<real>(tpCumSum[i] + fpCumSum[i]));
+        recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos);
+      }
+      // VOC2007 style
+      if (apType_ == "11point") {
+        vector<real> maxPrecisions(11, 0.0);
+        int startIdx = num - 1;
+        for (int j = 10; j >= 0; --j)
+          for (int i = startIdx; i >= 0; --i) {
+            if (recall[i] < j / 10.) {
+              startIdx = i;
+              if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j];
+              break;
+            } else {
+              if (maxPrecisions[j] < precision[i])
+                maxPrecisions[j] = precision[i];
+            }
+          }
+        for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11;
+        ++count;
+      } else if (apType_ == "Integral") {
+        // Natural (continuous) integral of the precision-recall curve
+        real averagePrecisions = 0.;
+        real prevRecall = 0.;
+        for (size_t i = 0; i < num; ++i) {
+          if (fabs(recall[i] - prevRecall) > 1e-6)
+            averagePrecisions += precision[i] * fabs(recall[i] - prevRecall);
+          prevRecall = recall[i];
+        }
+        mAP += averagePrecisions;
+        ++count;
+      } else {
+        LOG(FATAL) << "Unkown ap version: " << apType_;
+      }
+    }
+    if (count != 0) mAP /= count;
+    return mAP * 100;
+  }
+
+  void getAccumulation(vector<pair<real, size_t>> inPairs,
+                       vector<size_t>* accuVec) const {
+    std::stable_sort(
+        inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>);
+    accuVec->clear();
+    size_t sum = 0;
+    for (size_t i = 0; i < inPairs.size(); ++i) {
+      sum += inPairs[i].second;
+      accuVec->push_back(sum);
+    }
+  }
+
+  std::string getTypeImpl() const { return "detection_map"; }
+
+  real getValueImpl() const { return calcMAP(); }
+
+private:
+  real overlapThreshold_;   // IoU threshold for matching a detection to a
+                            // ground-truth box
+  bool evaluateDifficult_;  // whether to count difficult ground-truth boxes
+  size_t backgroundId_;     // class index of the background label
+  std::string apType_;      // how to calculate mAP ("Integral" or "11point")
+
+  MatrixPtr cpuOutput_;
+  MatrixPtr cpuLabel_;
+
+  map<size_t, size_t> numPos_;  // number of ground-truth objects per class
+  map<size_t, vector<pair<real, size_t>>>
+      allTruePos_;  // per class: (score, is true positive) per detection
+  map<size_t, vector<pair<real, size_t>>>
+      allFalsePos_;  // per class: (score, is false positive) per detection
+};
+
+REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator);
+
+}  // namespace paddle
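For reference, the "11point" branch of calcMAP() computes VOC2007-style interpolated average precision: for each recall threshold r in {0.0, 0.1, ..., 1.0}, take the maximum precision over all points with recall >= r, then average the eleven values. An equivalent, unoptimized sketch (the member function above does the same with a single backward scan):

    #include <algorithm>
    #include <vector>

    float elevenPointAP(const std::vector<float>& precision,
                        const std::vector<float>& recall) {
      float ap = 0.0f;
      for (int j = 0; j <= 10; ++j) {
        const float r = j / 10.0f;
        float maxPrec = 0.0f;  // best precision among points with recall >= r
        for (size_t i = 0; i < recall.size(); ++i) {
          if (recall[i] >= r) maxPrec = std::max(maxPrec, precision[i]);
        }
        ap += maxPrec / 11;
      }
      return ap;
    }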
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index 9db6d252d9..8e66b1f0db 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -395,14 +395,24 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
   CHECK_LE(arguments.size(), (size_t)3);
   MatrixPtr output = arguments[0].value;
   IVectorPtr label = arguments[1].ids;
+  MatrixPtr labelval = arguments[1].value;
   bool supportWeight = (3 == arguments.size()) ? true : false;
   MatrixPtr weight = supportWeight ? arguments[2].value : nullptr;
-  if (nullptr == output || nullptr == label ||
-      (supportWeight && nullptr == weight)) {
+
+  if (nullptr == output || (supportWeight && nullptr == weight)) {
     return 0;
   }
   size_t insNum = output->getHeight();
   size_t outputDim = output->getWidth();
+  // Copy label from value to a vector.
+  if (nullptr == label && nullptr != labelval) {
+    // label width is 1
+    CHECK_EQ(1U, labelval->getWidth());
+    VectorPtr vec =
+        Vector::create(labelval->getData(), insNum, output->useGpu());
+    label = vec->castToInt();
+  }
+
   CHECK_EQ(insNum, label->getSize());
   if (supportWeight) {
     CHECK_EQ(insNum, weight->getHeight());
@@ -443,6 +453,7 @@ real AucEvaluator::evalImp(std::vector<Argument>& arguments) {
   int* labelD = label->getData();
   real* weightD = supportWeight ? weight->getData() : nullptr;
   size_t pos = realColumnIdx_;
+
   for (size_t i = 0; i < insNum; ++i) {
     real value = outputD[pos];
     uint32_t binIdx = static_cast<uint32_t>(value * kBinNum_);
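The fallback added to AucEvaluator::evalImp above accepts labels delivered as a dense (insNum x 1) value matrix rather than an id vector; it wraps the existing buffer and casts it element-wise. The two calls it relies on, in isolation (labelval and insNum as in the hunk):

    // labelval: (insNum x 1) matrix holding real-valued class labels
    VectorPtr vec =
        Vector::create(labelval->getData(), insNum, output->useGpu());
    IVectorPtr label = vec->castToInt();  // element-wise cast to integer ids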
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index b114500e2b..90203553e0 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -211,6 +211,7 @@ public:
     *err = Error("Not implemented");
     return .0f;
   }
+
   std::string getType(const std::string& name, Error* err) const {
     *err = Error("Not implemented");
     return "";
@@ -331,6 +332,7 @@ private:
 protected:
   std::string getTypeImpl() const;
 };
+
 /**
  * @brief precision, recall and f1 score Evaluator
  * \f[
@@ -358,6 +360,12 @@ public:
 
   virtual void distributeEval(ParameterClient2* client);
 
+  void getNames(std::vector<std::string>* names);
+
+  real getValue(const std::string& name, Error* err) const;
+
+  std::string getType(const std::string& name, Error* err) const;
+
   struct StatsInfo {
     /// numbers of true positives
     double TP;
@@ -428,11 +436,6 @@ private:
   mutable std::unordered_map<std::string, real> values_;
 
   void storeLocalValues() const;
-  // Evaluator interface
-public:
-  void getNames(std::vector<std::string>* names);
-  real getValue(const std::string& name, Error* err) const;
-  std::string getType(const std::string& name, Error* err) const;
 };
 
 /*
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index b44e4dc202..de5faf5e1e 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -17,12 +17,15 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/utils/Logging.h"
 
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "GradientMachineMode.h"
 #include "MultiGradientMachine.h"
 #include "MultiNetwork.h"
-#include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
-#include "hl_gpu.h"
+#endif
 
 namespace paddle {
 
@@ -30,13 +33,16 @@ GradientMachine* GradientMachine::create(
     const ModelConfig& config,
     int mode,
     const std::vector<ParameterType>& parameterTypes) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
     return gm;
   }
   if (FLAGS_trainer_count > 1) {
     return new MultiGradientMachine(config, FLAGS_use_gpu);
   }
+#endif
   if (FLAGS_trainer_count == 1) {  // single
+#ifndef PADDLE_MOBILE_INFERENCE
     NeuralNetwork* nn;
     if (config.type() == "multi_nn") {
       /* multi submodel calculate, thread(s) will be initialized inside */
@@ -48,6 +54,9 @@ GradientMachine* GradientMachine::create(
       /* single thread calculate */
       nn = NeuralNetwork::create(config);
     }
+#else
+    NeuralNetwork* nn = NeuralNetwork::create(config);
+#endif
     ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
       para->enableType(PARAMETER_VALUE);
     };
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index f9c82a2bef..4ab54a5022 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -20,13 +20,16 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "TrainerConfig.pb.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdaterBase.h"
 #include "paddle/utils/Thread.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "paddle/gserver/evaluators/Evaluator.h"
+#endif
+
 namespace paddle {
 /**
  * @brief A gradient machine is capable of calculating some outputs given
@@ -147,6 +150,7 @@ public:
 
   virtual void onPassEnd() = 0;
 
+#ifndef PADDLE_MOBILE_INFERENCE
   /**
    * Create an evaluator which can be used for eval()
    */
@@ -156,6 +160,7 @@ public:
    * evaluate using the given evaluator
    */
   virtual void eval(Evaluator* evaluator) const = 0;
+#endif
 
   std::vector<ParameterPtr>& getParameters() { return parameters_; }
 
@@ -228,6 +233,13 @@ public:
     (void)numProcessed;
   }
 
+  /**
+   * @brief   Release the middle layers' output memory.
+   *
+   * @note    This function is used for memory optimization in inference.
+   */
+  virtual void releaseOutput() {}
+
 protected:
   virtual void onLoadParameter() {}
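The releaseOutput() hook declared above exists so that inference code can drop intermediate activations once the outputs have been read. A hypothetical usage (machine, inArgs, and consume are illustrative names, not Paddle APIs):

    std::vector<Argument> outArgs;
    machine->forward(inArgs, &outArgs, PASS_TEST);
    consume(outArgs);          // read the network outputs first
    machine->releaseOutput();  // then free the middle layers' output memory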
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 3159026e6b..018da6c76d 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
 
   outArgStream_ = HPPL_STREAM_1;
 
+  start();
+}
+
+void MultiGradientMachine::start() {
   for (auto& thread : threads_) {
     thread->start();
   }
 }
 
+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
   std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
   }
 }
 
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 
   gradStream_ = HPPL_STREAM_2;
   valueStream_ = HPPL_STREAM_3;
-  stopping_ = false;
+  stopping_ = true;
   updateCounter_ = 0;
   parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
   gradientMachine_->start();
 
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
@@ -593,7 +601,7 @@ void TrainerThread::backward() {
 
 void TrainerThread::backwardCallback(Parameter* para) {
   // CPU parameters are merged in the end
-  if (!para->useGpu()) return;
+  if (!para->useGpu() || para->isStatic()) return;
 
   int paramId = para->getID();
   if (multiMachine_->getNumThreads() == 1) {
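Initializing stopping_ to true and guarding start() makes the trainer threads restartable, so a MultiGradientMachine can be stopped between phases and brought back up. A hypothetical lifecycle (the pass functions are illustrative):

    machine->start();   // spins up trainer threads; no-op if already running
    runTrainingPass();
    machine->finish();  // joins the threads
    machine->start();   // safe to call again thanks to the stopping_ guard
    runTestPass();
    machine->finish();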
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 70203bbb97..5e7622f929 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -176,6 +176,10 @@ public:
 
   explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
 
+  virtual void start();
+
+  virtual void finish();
+
   virtual void prefetch(const std::vector<Argument>& inArgs);
 
   virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:
 
   virtual void onPassEnd();
 
-  virtual void finish();
-
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 4512aacc81..1f2aa61b6f 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -14,15 +14,21 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+#endif
+
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
-#include "NeuralNetwork.h"
 #include "RecurrentGradientMachine.h"
-#include "hl_gpu.h"
 #include "paddle/gserver/layers/AgentLayer.h"
-#include "paddle/utils/Stat.h"
+#endif
 
 namespace paddle {
 void parameterInitNN(int paramId,
@@ -54,6 +60,7 @@ void parameterInitNN(int paramId,
 }
 
 NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (config.type() == "recurrent_nn") {
     return newNeuralNetwork("root");
   } else if (config.type() == "multi_nn") {
@@ -61,6 +68,9 @@ NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
   } else {
     return newNeuralNetwork();
   }
+#else
+  return new NeuralNetwork();
+#endif
 }
 
 std::map<std::string, bool> NeuralNetwork::dllInitMap;
@@ -177,14 +187,41 @@ void NeuralNetwork::init(const ModelConfig& config,
     CHECK(it != layerMap_.end());
     outputLayers_.push_back(it->second);
   }
+
+  for (const auto& layer : layers_) {
+    const auto& name = layer->getName();
+    bool isMiddleLayer = true;
+
+    // if data layer
+    for (const auto& dataLayer : dataLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    // if output layer
+    for (const auto& outputLayer : outputLayers_) {
+      if (name == outputLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    if (isMiddleLayer) {
+      middleLayers_.push_back(layer);
+    }
+  }
 }
 
 void NeuralNetwork::connect(LayerPtr agentLayer,
                             LayerPtr realLayer,
                             int height) {
+#ifndef PADDLE_MOBILE_INFERENCE
   AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
   CHECK_NOTNULL(agent);
   agent->setRealLayer(realLayer, height);
+#endif
 }
 
 void NeuralNetwork::connect(std::string agentLayerName,
@@ -202,7 +239,7 @@ void NeuralNetwork::prefetch(const std::vector<Argument>& inArgs) {
         auto mat = dynamic_cast<SparsePrefetchRowCpuMatrix*>(
             para->getMat(PARAMETER_VALUE).get());
         para->clearGradient();
-        mat->clearIndices();
+        if (mat) mat->clearIndices();
       }
     }
   }
@@ -241,11 +278,14 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
     dataLayers_[i]->setData(inArgs[i]);
   }
 
+  gLayerStackTrace.set_stage(true);
+
   {
     for (auto& layer : layers_) {
       REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str());
       gLayerStackTrace.push(layer->getName());
       layer->forward(passType);
+      gLayerStackTrace.pop(layer->getName());
     }
   }
 
@@ -254,9 +294,6 @@ void NeuralNetwork::forward(const std::vector<Argument>& inArgs,
   for (auto& layer : outputLayers_) {
     outArgs->push_back(layer->getOutput());
   }
-  if (passType == PASS_TEST) {
-    gLayerStackTrace.clear();
-  }
 }
 
 void NeuralNetwork::resetState() {
@@ -283,9 +320,10 @@ void NeuralNetwork::getState(MachineState& machineState) {
 }
 
 void NeuralNetwork::backward(const UpdateCallback& callback) {
-  gLayerStackTrace.pop("");  // tell layer trace is during backward.
+  gLayerStackTrace.set_stage(false);
   FOR_EACH_R(layer, layers_) {
     REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str());
+    gLayerStackTrace.push((*layer)->getName());
     if ((*layer)->needGradient()) {
       (*layer)->backward(callback);
     }
@@ -293,6 +331,17 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
+void NeuralNetwork::finish() {
+#ifdef PADDLE_WITH_MKLDNN
+  FOR_EACH_R(layer, layers_) {
+    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
+    if (dnnLayer) {
+      dnnLayer->convertWeightsToPaddle();
+    }
+  }
+#endif
+}
+
 Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
   return getLayer(layerName)->getOutput();
 }
@@ -303,40 +352,49 @@ void NeuralNetwork::onPassEnd() {
   }
 }
 
+void NeuralNetwork::releaseOutput() {
+  for (auto& layer : middleLayers_) {
+    Argument& arg = layer->getOutput();
+    arg.value.reset();
+  }
+}
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 class CombinedEvaluator : public Evaluator {
 public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
-  virtual void start() {
+  void start() override {
     for (auto& evaluator : evaluators_) {
       evaluator->start();
     }
   }
 
-  virtual void finish() {
+  void finish() override {
     for (auto& evaluator : evaluators_) {
       evaluator->finish();
     }
   }
 
-  virtual void eval(const NeuralNetwork& nn) {
+  void eval(const NeuralNetwork& nn) override {
     for (auto& evaluator : evaluators_) {
       evaluator->eval(nn);
     }
   }
-  virtual real evalImp(std::vector<Argument>& arguments) {
+  real evalImp(std::vector<Argument>& arguments) override {
     (void)arguments;
     return -1;
   }
-  virtual void printStats(std::ostream& os) const {
+  void printStats(std::ostream& os) const override {
     for (auto& evaluator : evaluators_) {
       evaluator->printStats(os);
       os << ' ';
     }
   }
 
-  virtual void distributeEval(ParameterClient2* client) {
+  void distributeEval(ParameterClient2* client) override {
     for (auto& evaluator : evaluators_) {
       evaluator->distributeEval(client);
     }
@@ -351,7 +409,7 @@ public:
    * @brief getNames will return all inside evaluators' names.
    * @param names [out]: return names.
    */
-  void getNames(std::vector<std::string>* names) {
+  void getNames(std::vector<std::string>* names) override {
     for (auto& eval : evaluators_) {
       eval->getNames(names);
     }
@@ -360,7 +418,7 @@ public:
   /**
    * @brief getValue could get all inside evaluators' value.
    */
-  real getValue(const std::string& name, Error* err) const {
+  real getValue(const std::string& name, Error* err) const override {
     return this->getMethodHelper<real>(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
           return eval->getValue(name, err);
@@ -370,7 +428,7 @@ public:
   /**
    * @brief getType could get all inside evaluators' type.
    */
-  std::string getType(const std::string& name, Error* err) const {
+  std::string getType(const std::string& name, Error* err) const override {
     return this->getMethodHelper<std::string>(
         name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
           return eval->getType(name, err);
@@ -395,6 +453,30 @@ private:
   }
 };
 
+class SubnetEvaluator : public CombinedEvaluator {
+public:
+  SubnetEvaluator(const std::string& layerName,
+                  std::unique_ptr<Evaluator>&& evaluator)
+      : layerName_(layerName) {
+    addEvaluator(std::move(evaluator));
+  }
+  void eval(const NeuralNetwork& nn) override {
+    const LayerPtr& layer = nn.getLayer(layerName_);
+    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
+                 << nn.getName();
+    bool accessed = false;
+    layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) {
+      subnet.eval(evaluators_[0].get());
+      accessed = true;
+    });
+    CHECK(accessed) << "There is no subnetwork for layer " << layerName_
+                    << " in submodel " << nn.getName();
+  }
+
+protected:
+  std::string layerName_;
+};
+
 Evaluator* NeuralNetwork::makeEvaluator() const {
   CombinedEvaluator* combinedEvaluator = new CombinedEvaluator();
   auto subModelConfig = std::find_if(config_.sub_models().begin(),
@@ -421,6 +503,15 @@ Evaluator* NeuralNetwork::makeEvaluator() const {
         combinedEvaluator->addEvaluator(std::move(evaluator));
       }
     }
+    for (auto& layer : layers_) {
+      layer->accessSubNetwork(
+          [layer, combinedEvaluator](NeuralNetwork& subnet) {
+            std::unique_ptr<Evaluator> subEvaluator(new SubnetEvaluator(
+                layer->getName(),
+                std::unique_ptr<Evaluator>(subnet.makeEvaluator())));
+            combinedEvaluator->addEvaluator(std::move(subEvaluator));
+          });
+    }
   } else {
     for (const EvaluatorConfig& evalConfig : config_.evaluators()) {
       std::unique_ptr<Evaluator> evaluator(Evaluator::create(evalConfig));
@@ -432,6 +523,8 @@ Evaluator* NeuralNetwork::makeEvaluator() const {
 
 void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
 
+#endif
+
 void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   CHECK_GE(outputLayers_.size(), args.size());
   for (size_t i = 0; i < args.size(); ++i) {
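With the SubnetEvaluator added above, NeuralNetwork::makeEvaluator() now also descends into layers that own subnetworks, so evaluators configured inside, e.g., a recurrent group surface through the top-level CombinedEvaluator. Sketch of the resulting call pattern (rootNetwork is an illustrative name):

    std::unique_ptr<Evaluator> eval(rootNetwork->makeEvaluator());
    eval->start();
    rootNetwork->eval(eval.get());  // visits subnetwork evaluators as well
    eval->finish();
    std::vector<std::string> names;
    eval->getNames(&names);  // includes names gathered from the subnetworks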
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index e7b6c43840..968e198cf6 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -97,9 +97,12 @@ public:
 
   virtual void onPassEnd();
 
+#ifndef PADDLE_MOBILE_INFERENCE
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
+#endif
+
   virtual void resetState();
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
@@ -129,6 +132,18 @@ public:
   static NeuralNetwork* newNeuralNetwork(const std::string& name = "",
                                          NeuralNetwork* rootNetwork = nullptr);
 
+  const std::string& getName() const { return subModelName_; }
+
+  /// Finish work, such as converting the weight format of MKLDNNLayers
+  void finish();
+
+  /**
+   * @brief   Release the middle layers' output memory.
+   *
+   * @note    This function is used for memory optimization in inference.
+   */
+  void releaseOutput();
+
 protected:
   /**
    * The constructor of NeuralNetwork.
@@ -150,6 +165,7 @@ protected:
 
   std::vector<DataLayerPtr> dataLayers_;
   std::vector<LayerPtr> outputLayers_;
+  std::vector<LayerPtr> middleLayers_;
 
   static std::map<std::string, bool> dllInitMap;
 
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 01158d1dce..9f29b97466 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -184,7 +184,7 @@ public:
   }
 
   void backward(const UpdateCallback& callback) override {
-    if (biases_) {
+    if (biases_ && biases_->getWGrad()) {
       backwardActivation();
       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
       biases_->getParameterPtr()->incUpdate(callback);
@@ -208,13 +208,13 @@ void RecurrentGradientMachine::init(
                    });
   CHECK(subModelConfig != config.sub_models().end());
   reversed_ = subModelConfig->reversed();
+  generating_ = subModelConfig->has_generator();
 
   inFrameLines_.resize(subModelConfig->in_links_size());
   for (size_t i = 0; i < inFrameLines_.size(); ++i) {
     inFrameLines_[i].linkName = subModelConfig->in_links(i).link_name();
     inFrameLines_[i].inLayer =
         rootNetwork_->getLayer(subModelConfig->in_links(i).layer_name());
-    inFrameLines_[i].hasSubseq = subModelConfig->in_links(i).has_subseq();
   }
 
   outFrameLines_.resize(subModelConfig->out_links_size());
@@ -241,11 +241,8 @@ void RecurrentGradientMachine::init(
           rootNetwork_->getLayer(memoryConfig.boot_layer_name());
 
       LayerConfig scatterConfig = *agentConfig;
-      memoryFrameLines_[i].is_sequence = memoryConfig.is_sequence();
       memoryFrameLines_[i].rootAgent.reset(
-          memoryConfig.is_sequence()
-              ? new SequenceScatterAgentLayer(scatterConfig)
-              : new ScatterAgentLayer(scatterConfig));
+          new ScatterAgentLayer(scatterConfig));
       memoryFrameLines_[i].rootAgent->init(LayerMap(), parameterMap_);
 
       memoryFrameLines_[i].bootLayer = memoryFrameLines_[i].rootAgent;
@@ -267,9 +264,7 @@ void RecurrentGradientMachine::init(
     if (subModelConfig->has_generator()) {
       memoryFrameLines_[i].scatterAgents.resize(2);
       for (auto& agent : memoryFrameLines_[i].scatterAgents) {
-        agent.reset(memoryConfig.is_sequence()
-                        ? new SequenceScatterAgentLayer(*agentConfig)
-                        : new ScatterAgentLayer(*agentConfig));
+        agent.reset(new ScatterAgentLayer(*agentConfig));
         agent->init(LayerMap(), parameterMap_);
       }
     }
@@ -293,12 +288,6 @@ void RecurrentGradientMachine::init(
       parameterIds_.push_back(para->getID());
     }
   }
-
-  if (subModelConfig->evaluator_names_size() > 0) {
-    evaluator_.reset(frames_[0]->makeEvaluator());
-  }
-
-  targetInfoInlinkId_ = subModelConfig->target_inlinkid();
 }
 
 void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) {
@@ -376,108 +365,102 @@ void RecurrentGradientMachine::prefetch(const std::vector<Argument>& inArgs) {
   LOG(FATAL) << "should not use this function";
 }
 
-void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
-                                       std::vector<Argument>* outArgs,
-                                       PassType passType) {
-  if (inFrameLines_.empty() && passType == PASS_TEST) {
-    generateSequence();
-    return;
-  }  // else forward..
-
-  const Argument& input = inFrameLines_[0].inLayer->getOutput();
-  CHECK(input.sequenceStartPositions);
-  int batchSize = input.getBatchSize();
-  size_t numSequences = input.getNumSequences();
-  const int* starts = input.sequenceStartPositions->getData(false);
-  bool hasSubseq = input.hasSubseq();
-
-  // In case of !hasSubseq or targetInfoInlinkId_ == -1, all inlinks share the
-  // same inframe info
-  bool shareInlinkInfo = !hasSubseq || targetInfoInlinkId_ == -1;
-
-  // Defaultly, share info with the first inlink
-  if (shareInlinkInfo) {
-    targetInfoInlinkId_ = 0;
-  }
-
-  // check hasSubseq in both config and input are the same
-  CHECK_EQ(hasSubseq, inFrameLines_[0].hasSubseq);
-
-  CHECK_EQ(starts[numSequences], batchSize);
-  CHECK(input.sequenceStartPositions);
-
-  // check other inputs has same sequence length and start
-  for (size_t i = 1; i < inFrameLines_.size(); ++i) {
-    const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-    CHECK_EQ((size_t)input1.getNumSequences(), numSequences);
-    // check all inputs should have same hasSubseq flag
-    CHECK_EQ(input.hasSubseq(), inFrameLines_[0].hasSubseq);
-
-    // if shareInlinkInfo, checks:
-    // 1. all inlinks have same number of total tokens
-    // 2. all inlinks have same number of tokens for each sentence of each
-    //    sample. If hasSubseq, one sample has multiple sentence, else, one
-    //    sample is one sentence
-    if (shareInlinkInfo) {
-      CHECK_EQ(input1.getBatchSize(), batchSize);
-      CHECK(std::equal(starts,
-                       starts + numSequences + 1,
-                       input1.sequenceStartPositions->getData(false)));
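+// All sequence inlinks must agree with each other: the first inlink that
+// carries sequence information establishes commonSeqInfo_, and every later
+// inlink is checked against it, sequence by sequence.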
+void RecurrentGradientMachine::checkInputConsistency(
+    int inlinkId, const std::vector<Argument::SeqInfo>& seqInfo) {
+  if (commonSeqInfo_.empty()) {
+    commonSeqInfo_.resize(seqInfo.size());
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      commonSeqInfo_[i].topLevelLength = seqInfo[i].topLevelLength;
+      commonSeqInfo_[i].seqId = seqInfo[i].seqId;
+    }
+  } else {
+    CHECK_EQ(commonSeqInfo_.size(), seqInfo.size())
+        << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+        << " has mismatched number of sequences";
+    for (size_t i = 0; i < seqInfo.size(); ++i) {
+      CHECK_EQ(commonSeqInfo_[i].topLevelLength, seqInfo[i].topLevelLength)
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
+      CHECK_EQ(commonSeqInfo_[i].seqId, seqInfo[i].seqId)
+          << " RecurrentGroup " << subModelName_ << " input " << inlinkId
+          << " has mismatched sequence length";
     }
   }
+}
 
-  if (hasSubseq) {
-    CHECK(input.subSequenceStartPositions);
-    size_t numSubSequences = input.getNumSubSequences();
-    const int* subStarts = input.subSequenceStartPositions->getData(false);
-    CHECK_EQ(subStarts[numSubSequences], batchSize);
-    // if hasSubseq, check other inputs has same sub-sequence and sub-start
-    for (size_t i = 1; i < inFrameLines_.size(); ++i) {
-      const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-      CHECK_EQ((size_t)input1.getNumSubSequences(), numSubSequences);
-      if (shareInlinkInfo) {
-        CHECK(std::equal(subStarts,
-                         subStarts + numSubSequences + 1,
-                         input1.subSequenceStartPositions->getData(false)));
-      }
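+// numSeqs_[j] counts how many sequences are still active at step j. Since
+// commonSeqInfo_ (as produced by Argument::getSeqInfo) is ordered by
+// decreasing topLevelLength, the last write to numSeqs_[j] comes from the
+// longest prefix of sequences whose length exceeds j; e.g., top-level
+// lengths {4, 2, 1} yield numSeqs_ = {3, 2, 1, 1}.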
+void RecurrentGradientMachine::calcNumSequencesAtEachStep() {
+  int numSequences = commonSeqInfo_.size();
+  numSeqs_.resize(maxSequenceLength_);
+  for (int i = 0; i < numSequences; ++i) {
+    for (int j = 0; j < commonSeqInfo_[i].topLevelLength; ++j) {
+      numSeqs_[j] = i + 1;
     }
   }
+}
 
+void RecurrentGradientMachine::reorganizeInput(PassType passType) {
   info_.clear();
   info_.resize(inFrameLines_.size());
 
+  commonSeqInfo_.clear();
   seqInfos_.clear();
   seqInfos_.resize(inFrameLines_.size());
 
+  for (size_t i = 0; i < inFrameLines_.size(); i++) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      continue;
+    }
+    input.getSeqInfo(&seqInfos_[i]);
+    checkInputConsistency(i, seqInfos_[i]);
+  }
+  CHECK(!commonSeqInfo_.empty())
+      << "At least one input needs to be sequence or subsequence";
+  maxSequenceLength_ = commonSeqInfo_[0].topLevelLength;
+
+  calcNumSequencesAtEachStep();
+
+  for (size_t i = 0; i < inFrameLines_.size(); ++i) {
+    const Argument& input = inFrameLines_[i].inLayer->getOutput();
+    if (!input.hasSeq()) {
+      seqInfos_[i] = commonSeqInfo_;
+    }
+    createInFrameInfo(i, input, passType);
+  }
+
   {
     AsyncGpuBlock asyncGpuBlock;
-    // if shareInlinkInfo, only calculate info of the first inlink
-    // else, calculate info for each inlink
-    if (shareInlinkInfo) {
-      input.getSeqInfo(&seqInfos_[0]);
-      maxSequenceLength_ = seqInfos_[0][0].topLevelLength;
-      createInFrameInfo(0, input, passType);
-    } else {
-      for (size_t i = 0; i < inFrameLines_.size(); i++) {
-        const Argument& input1 = inFrameLines_[i].inLayer->getOutput();
-        input1.getSeqInfo(&seqInfos_[i]);
-        maxSequenceLength_ = seqInfos_[i][0].topLevelLength;
-        createInFrameInfo(i, input1, passType);
-      }
-    }
 
     // inFrameLine select rows in real layer one time
     for (size_t i = 0; i < inFrameLines_.size(); i++) {
-      int curInlinkId = shareInlinkInfo ? 0 : i;
       selectRowsOneTime(inFrameLines_[i].inLayer,
-                        info_[curInlinkId].allIds,
+                        info_[i].allIds,
                         &(inFrameLines_[i].outArg),
                         passType);
     }
   }
-  resizeOrCreateFrames(maxSequenceLength_);
-  resizeBootFrame(numSequences);
+}
+
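+// Reassemble the per-step outputs into batch order: compute the output
+// sequence start positions, then hand each outlink's gather agent the
+// scattered ids and sequence information it needs to build one Argument.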
+void RecurrentGradientMachine::reorganizeOutput(PassType passType) {
+  calcSequenceStartPositions();
+  for (size_t i = 0; i < outFrameLines_.size(); ++i) {
+    Info info;
+    auto& outFrameLine = outFrameLines_[i];
+    ICpuGpuVectorPtr sequenceStartPositions;
+    ICpuGpuVectorPtr subSequenceStartPositions;
+    createOutFrameInfo(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+    auto gatherAgent =
+        dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
+    CHECK_NOTNULL(gatherAgent);
+    gatherAgent->copyIdAndSequenceInfo(sequenceStartPositions,
+                                       subSequenceStartPositions,
+                                       info.allIds,
+                                       info.idIndex);
+  }
+}
 
+void RecurrentGradientMachine::connectFrames(PassType passType) {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     if (memoryFrameLine.rootAgent) {
       auto scatterAgent =
@@ -487,8 +470,9 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
                                           memoryFrameLine.outArg,
                                           memoryFrameLine.allIds,
                                           /* idIndex */ 0,
-                                          memoryFrameLine.allIds->getSize());
-      if (memoryFrameLine.is_sequence) {  // memoryConfig is sequence
+                                          memoryFrameLine.allIds->getSize(),
+                                          /* handleBackward */ true);
+      if (memoryFrameLine.sequenceStartPositions) {
         int size = memoryFrameLine.sequenceStartPositions->getSize();
         scatterAgent->setSequenceStartPositions(
             memoryFrameLine.sequenceStartPositions,
@@ -501,28 +485,26 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
   for (auto& outFrameLine : outFrameLines_) {
     auto gatherAgent =
         dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
-    CHECK_NOTNULL(gatherAgent);
-    gatherAgent->copyIdAndSequenceInfo(input,
-                                       info_[targetInfoInlinkId_].allIds,
-                                       info_[targetInfoInlinkId_].idIndex);
+    gatherAgent->clearRealLayers();
   }
-
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    int idSize = 0;
     // connect in_links
     for (size_t j = 0; j < inFrameLines_.size(); ++j) {
-      Info& info = info_[shareInlinkInfo ? 0 : j];
+      Info& info = info_[j];
       // idSize denotes the total number of tokens at step i
-      idSize = info.idIndex[i + 1] - info.idIndex[i];
+      int idIndex = info.idIndex.empty() ? 0 : info.idIndex[i];
+      int idSize = info.idIndex.empty() ? numSeqs_[i]
+                                        : info.idIndex[i + 1] - info.idIndex[i];
       InFrameLine inFrameLine = inFrameLines_[j];
       auto scatterAgent =
           dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
       scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
                                           inFrameLine.outArg,
                                           info.allIds,
-                                          info.idIndex[i],
-                                          idSize);
-      if (hasSubseq) {
+                                          idIndex,
+                                          idSize,
+                                          i == 0);
+      if (info.sequenceStartPositions) {
         // size: the length of subsequence
         int size = info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
         scatterAgent->setSequenceStartPositions(
@@ -536,11 +518,6 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
           dynamic_cast<GatherAgentLayer*>(outFrameLine.agentLayer.get());
       gatherAgent->addRealLayer(outFrameLine.frames[i]);
     }
-    // connect memory links
-    // Adopt info_[0].idIndex because seq which has_subseq=True
-    // doesn't support Memory with !hasSubseq bootlayer;
-    // And inlinks that !hasSubSeq must have same inlink length.
-    idSize = info_[0].idIndex[i + 1] - info_[0].idIndex[i];
     for (auto& memoryFrameLine : memoryFrameLines_) {
       NeuralNetwork::connect(
           memoryFrameLine.agents[i],
@@ -548,6 +525,28 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
           numSeqs_[i] /*height of agent*/);
     }
   }
+}
+
+void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
+                                       std::vector<Argument>* outArgs,
+                                       PassType passType) {
+  /* inArgs and outArgs are not used.
+     The inputs are inFrameLines_[i].inLayer.
+     The outputs are outFrameLines_[i].agentLayer
+   */
+
+  if (generating_) {
+    generateSequence();
+    return;
+  }  // else forward..
+
+  reorganizeInput(passType);
+  int numSequences = commonSeqInfo_.size();
+
+  resizeOrCreateFrames(maxSequenceLength_);
+  resizeBootFrame(numSequences);
+
+  connectFrames(passType);
 
   REGISTER_TIMER_INFO("RecurrentFwTime", "RecurrentFwTime");
   // forward
@@ -558,19 +557,15 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     frames_[i]->forward(inArgs, &outArgs, passType);
-    if (hasSubseq) {
-      for (auto& outFrameLine : outFrameLines_) {
-        CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
-            << "In hierachical RNN, all out links should be from sequences.";
-      }
-    }
-  }
-  if (evaluator_ && passType == PASS_TEST) {
-    this->eval(evaluator_.get());
   }
+
+  reorganizeOutput(passType);
 }
 
 void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
+  if (generating_) {
+    return;
+  }
   REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime");
   AsyncGpuBlock asyncGpuBlock;
   for (int i = maxSequenceLength_ - 1; i >= 0; --i) {
@@ -579,11 +574,6 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     memoryFrameLine.bootLayer->backward(nullptr);
   }
-
-  // call printers here so the gradient can be printed
-  if (evaluator_) {
-    this->eval(evaluator_.get());
-  }
 }
 
 void RecurrentGradientMachine::forwardBackward(
@@ -597,9 +587,9 @@ void RecurrentGradientMachine::forwardBackward(
 void RecurrentGradientMachine::eval(Evaluator* evaluator) const {
   // call printers frame by frame
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    LOG(INFO) << "Recurrent Layer Group eval frame " << i << " begin";
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin";
     evaluator->eval(*(frames_[i].get()));
-    LOG(INFO) << "Recurrent Layer Group eval frame " << i << " end";
+    VLOG(2) << "Recurrent Layer Group eval frame " << i << " end";
   }
 }
 
@@ -634,76 +624,228 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
     this->beamSearchStatistics_ = nullptr;
   }
 }
+
+namespace {
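+// Convert a vector of per-sequence lengths into cumulative start offsets, in
+// place; e.g., a vector whose first three entries are the lengths {3, 1, 2}
+// (plus one trailing slot) becomes {0, 3, 4, 6}.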
+void lenToStarts(std::vector<int>& starts) {
+  int pos = 0;
+  starts.back() = 0;
+  for (auto& start : starts) {
+    int tmp = start;
+    start = pos;
+    pos += tmp;
+  }
+  starts.back() = pos;
+}
+}  // namespace
+
+void RecurrentGradientMachine::calcSequenceStartPositions() {
+  std::vector<int> starts(commonSeqInfo_.size() + 1);
+  for (auto& seqInfo : commonSeqInfo_) {
+    starts[seqInfo.seqId] = seqInfo.topLevelLength;
+  }
+  lenToStarts(starts);
+  ICpuGpuVector::resizeOrCreate(sequenceStartPositions_, starts.size(), false);
+  std::copy(starts.begin(),
+            starts.end(),
+            sequenceStartPositions_->getMutableData(false));
+}
+
+void RecurrentGradientMachine::checkOutputConsistency(
+    OutFrameLine& outFrameLine) {
+  bool hasSeq = outFrameLine.frames[0]->getOutput().hasSeq();
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    CHECK_EQ(hasSeq, frame->getOutput().hasSeq());
+    int numSequences = frame->getOutput().getNumSequences();
+    CHECK_EQ(numSeqs_[i], numSequences);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  checkOutputConsistency(outFrameLine);
+
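+  // When each frame's output is not itself a sequence, gathering one step
+  // per frame produces a plain sequence outlink; when each frame's output is
+  // a sequence, the gathered outlink becomes a nested (sub)sequence.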
+  if (!outFrameLine.frames[0]->getOutput().hasSeq()) {
+    createOutFrameInfo_seq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  } else {
+    createOutFrameInfo_subseq(
+        outFrameLine, info, sequenceStartPositions, subSequenceStartPositions);
+  }
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_seq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+
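+  // Row j of frame i holds token i of the j-th active sequence (or token
+  // seqLength - 1 - i when reversed_), so map it back to its slot in the
+  // concatenated output laid out by sequenceStartPositions_.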
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int seqStart = starts[commonSeqInfo_[j].seqId];
+      int seqLength = commonSeqInfo_[j].topLevelLength;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    info.idIndex.push_back(allIds.size());
+  }
+  sequenceStartPositions = sequenceStartPositions_;
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+void RecurrentGradientMachine::createOutFrameInfo_subseq(
+    OutFrameLine& outFrameLine,
+    Info& info,
+    ICpuGpuVectorPtr& sequenceStartPositions,
+    ICpuGpuVectorPtr& subSequenceStartPositions) {
+  size_t numSequences = commonSeqInfo_.size();
+  std::vector<int> allIds;
+  info.idIndex.resize(1, 0);  // first idIndex = 0
+
+  const int* starts = sequenceStartPositions_->getData(false);
+  std::vector<int> subStarts(starts[numSequences] + 1);
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    const int* seqStarts =
+        frame->getOutput().sequenceStartPositions->getData(false);
+    for (size_t j = 0; j < numSequences; ++j) {
+      subStarts[starts[commonSeqInfo_[j].seqId] + i] =
+          seqStarts[j + 1] - seqStarts[j];
+    }
+  }
+  lenToStarts(subStarts);
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    LayerPtr frame = outFrameLine.frames[i];
+    size_t numSequences = frame->getOutput().getNumSequences();
+    for (size_t j = 0; j < numSequences; ++j) {
+      int pos = starts[commonSeqInfo_[j].seqId] + i;
+      int subSeqStart = subStarts[pos];
+      int subSeqEnd = subStarts[pos + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
+      }
+    }
+    info.idIndex.push_back(allIds.size());
+  }
+
+  ICpuGpuVector::resizeOrCreate(
+      subSequenceStartPositions, subStarts.size(), false);
+  int* cpuSubSequenceStartPositions =
+      subSequenceStartPositions->getMutableData(false);
+  std::copy(subStarts.begin(), subStarts.end(), cpuSubSequenceStartPositions);
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  int* cpuSequenceStartPositions =
+      sequenceStartPositions->getMutableData(false);
+  for (size_t i = 0; i <= numSequences; ++i) {
+    cpuSequenceStartPositions[i] = subStarts[starts[i]];
+  }
+  copyScattedId(allIds, &info.allIds, allIds.size());
+  CHECK_EQ(info.idIndex.size(), static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
 /* create scattered id information for all realLayers of inFrameLines one
  * time. If hasSubseq, will also create scattered sequenceStartPositions
  * information for all realLayers of inFrameLines one time.
  */
-
 void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                  const Argument& input,
                                                  PassType passType) {
-  bool hasSubseq = input.hasSubseq();
-  // numSequences: # samples(sequences) in a batch
-  size_t numSequences = input.getNumSequences();
+  if (!input.hasSeq()) {
+    createInFrameInfo_nonseq(inlinkId, input, passType);
+  } else if (!input.hasSubseq()) {
+    createInFrameInfo_seq(inlinkId, input, passType);
+  } else {
+    createInFrameInfo_subseq(inlinkId, input, passType);
+  }
+}
+
+void RecurrentGradientMachine::createInFrameInfo_nonseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
   std::vector<int> allIds;
 
   auto& seqInfo = seqInfos_[inlinkId];
-
-  numSeqs_.clear();
   Info* inlinkInfo = &info_[inlinkId];
   inlinkInfo->idIndex.clear();
-  inlinkInfo->idIndex.push_back(0);  // first idIndex = 0
+  for (size_t i = 0; i < seqInfo.size(); ++i) {
+    allIds.push_back(seqInfo[i].seqId);
+  }
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+}
+
+void RecurrentGradientMachine::createInFrameInfo_seq(int inlinkId,
+                                                     const Argument& input,
+                                                     PassType passType) {
+  std::vector<int> allIds;
+  auto& seqInfo = seqInfos_[inlinkId];
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
+
+  for (int i = 0; i < maxSequenceLength_; ++i) {
+    for (int j = 0; j < numSeqs_[i]; ++j) {
+      int seqLength = seqInfo[j].topLevelLength;
+      int seqStart = seqInfo[j].seqStart;
+      allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
+                                 : (seqStart + i));
+    }
+    inlinkInfo->idIndex.push_back(allIds.size());
+  }
 
+  // copy and check scatterId
+  copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
+  CHECK_EQ(inlinkInfo->idIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+}
+
+void RecurrentGradientMachine::createInFrameInfo_subseq(int inlinkId,
+                                                        const Argument& input,
+                                                        PassType passType) {
+  std::vector<int> allIds;
+
+  auto& seqInfo = seqInfos_[inlinkId];
+
+  Info* inlinkInfo = &info_[inlinkId];
+  inlinkInfo->idIndex.resize(1, 0);  // first idIndex = 0
   std::vector<int> sequenceStartPositions;
   const int* subSequenceStartPositions = nullptr;
 
-  if (hasSubseq) {  // for sequenceScatterAgentLayer
-    subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-    inlinkInfo->seqStartPosIndex.clear();
-    inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
-  }
-  // maxSequenceLength_: max topLevelLength in allsamples
+  subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
+  inlinkInfo->seqStartPosIndex.clear();
+  inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
   for (int i = 0; i < maxSequenceLength_; ++i) {
-    if (hasSubseq) {
-      sequenceStartPositions.push_back(0);  // first element = 0
-    }
-    int numSeqs = 0;
-    for (size_t j = 0; j < numSequences; ++j) {
-      int seqLength = seqInfo[j].topLevelLength;
-      if (i >= seqLength) {
-        break;
-      }
-      ++numSeqs;
-      if (hasSubseq) {
-        int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
-        int subSeqEnd =
-            subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
-        for (int k = subSeqStart; k < subSeqEnd; ++k) {
-          allIds.push_back(k);
-        }
-        sequenceStartPositions.push_back(sequenceStartPositions.back() +
-                                         subSeqEnd - subSeqStart);
-      } else {
-        int seqStart = seqInfo[j].seqStart;
-        allIds.push_back(reversed_ ? (seqStart + seqLength - 1 - i)
-                                   : (seqStart + i));
+    sequenceStartPositions.push_back(0);  // first element = 0
+    for (int j = 0; j < numSeqs_[i]; ++j) {
+      int subSeqStart = subSequenceStartPositions[seqInfo[j].subSeqStart + i];
+      int subSeqEnd = subSequenceStartPositions[seqInfo[j].subSeqStart + i + 1];
+      for (int k = subSeqStart; k < subSeqEnd; ++k) {
+        allIds.push_back(k);
       }
+      sequenceStartPositions.push_back(sequenceStartPositions.back() +
+                                       subSeqEnd - subSeqStart);
     }
     inlinkInfo->idIndex.push_back(allIds.size());
-    numSeqs_.push_back(numSeqs);
-    if (hasSubseq) {
-      inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
-    }
-  }
-  if (hasSubseq) {
-    // inFrameLine create sequenceStartPositions one time
-    CHECK_EQ(
-        sequenceStartPositions.size(),
-        static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
-    CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
-             static_cast<size_t>(maxSequenceLength_ + 1));
-    createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
+    inlinkInfo->seqStartPosIndex.push_back(sequenceStartPositions.size());
   }
+  // inFrameLine create sequenceStartPositions one time
+  CHECK_EQ(
+      sequenceStartPositions.size(),
+      static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
+  CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
+           static_cast<size_t>(maxSequenceLength_ + 1));
+  createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
 
   // copy and check scatterId
   copyScattedId(allIds, &inlinkInfo->allIds, input.getBatchSize());
@@ -717,11 +859,11 @@ void RecurrentGradientMachine::createMemoryFrameInfo(
   const Argument& input = (*memoryFrameLine).rootLayer->getOutput();
   size_t numSequences = input.getNumSequences();
   std::vector<int> allIds;
-  bool seqFlag = (*memoryFrameLine).is_sequence;
+  bool seqFlag = input.hasSeq();
+  CHECK(!input.hasSubseq())
+      << "Subsequence boot layer for memory is not supported";
 
   if (seqFlag) {  // for sequenceScatterAgentLayer
-    CHECK(input.sequenceStartPositions)
-        << "boot layer must be a sequence when is_sequence = true";
     std::vector<int> sequenceStartPositions;
     sequenceStartPositions.push_back(0);  // first element = 0
     const int* starts = input.sequenceStartPositions->getData(false);
@@ -804,8 +946,7 @@ size_t RecurrentGradientMachine::getGenBatchSize() {
   for (auto& memoryFrameLine : memoryFrameLines_) {
     if (!memoryFrameLine.rootLayer) continue;
     Argument& bootArg = memoryFrameLine.rootLayer->getOutput();
-    size_t batchSize = memoryFrameLine.is_sequence ? bootArg.getNumSequences()
-                                                   : bootArg.getBatchSize();
+    size_t batchSize = bootArg.getNumSequences();
     if (numSequences) {
       CHECK_EQ(numSequences, batchSize);
     } else {
@@ -826,8 +967,9 @@ void RecurrentGradientMachine::generateSequence() {
   size_t numSequences = getGenBatchSize();
 
   resizeBootFrame(numSequences);
-  // We create only two sub-network in generation for alternate use.
-  // Thus, we can reduce total memory of output_ in layer forward.
+  // We create only two sub-networks in generation, for alternate use: one
+  // stores the states of all layers at the previous time step, and the other
+  // stores the states at the current time step.
   resizeOrCreateFrames(2);
 
   // outFrameLines_.size() > 1UL
@@ -845,12 +987,7 @@ void RecurrentGradientMachine::generateSequence() {
     if (memoryFrameLine.rootAgent) {
       auto scatterAgent =
           dynamic_cast<ScatterAgentLayer*>(memoryFrameLine.rootAgent.get());
-      bool seqFlag = memoryFrameLine.is_sequence;
-      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids, seqFlag);
-      if (seqFlag) {
-        CHECK(memoryFrameLine.rootLayer->getOutput().sequenceStartPositions)
-            << "boot layer must be a sequence when is_sequence = true";
-      }
+      scatterAgent->setRealLayer(memoryFrameLine.rootLayer, ids);
     }
     NeuralNetwork::connect(
         memoryFrameLine.agents[0], memoryFrameLine.bootLayer, ids.size());
@@ -858,16 +995,16 @@ void RecurrentGradientMachine::generateSequence() {
 
   // boot layer forward
   AsyncGpuBlock asyncGpuBlock;
+
   for (auto& memoryFrameLine : memoryFrameLines_) {
     memoryFrameLine.bootLayer->forward(PASS_TEST);
   }
 
   // init outArg
   size_t resultNum = generator_.config.num_results_per_sample();
-  IVector::resizeOrCreate(
-      generator_.outArg.ids,
-      generator_.config.max_num_frames() * numSequences * resultNum,
-      false);
+  size_t maxGenWordCount =
+      generator_.config.max_num_frames() * numSequences * resultNum;
+  IVector::resizeOrCreate(generator_.outArg.ids, maxGenWordCount, false);
   if (resultNum > 1) {
     CHECK_LE(resultNum, static_cast<size_t>(generator_.config.beam_size()));
     Matrix::resizeOrCreate(generator_.outArg.in,
@@ -884,7 +1021,7 @@ void RecurrentGradientMachine::generateSequence() {
   } else {
     oneWaySearch(numSequences);
   }
-  if (dataArgsSize_) createDataOutlink(batchMachineIdVec_);
+  if (dataArgsSize_) createDataOutlink();
 
   size_t size = generator_.ids.size();
   generator_.outArg.ids->resize(size);
@@ -930,8 +1067,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
         auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
             memoryFrameLine.scatterAgents[machineCur].get());
         scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                                   scatterIds,
-                                   memoryFrameLine.is_sequence);
+                                   scatterIds);
         scatterAgent->forward(PASS_TEST);
         NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
                                memoryFrameLine.scatterAgents[machineCur]);
@@ -949,10 +1085,6 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
 
     copyDataOutlinkFrame(machineCur);
 
-    // call value printer
-    if (evaluator_) {
-      evaluator_->eval(*(frames_[machineCur].get()));
-    }
     // check eos
     const IVectorPtr& eosVec =
         eosFrameLine_->layers[machineCur]->getOutput().ids;
@@ -969,6 +1101,7 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) {
   }
 
   batchMachineIdVec_.clear();
+  batchMachineStartPos_.clear();
   int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
   starts[0] = 0;
   generator_.ids.clear();
@@ -1003,8 +1136,7 @@ void RecurrentGradientMachine::connectPrevFrame(int stepId,
     auto scatterAgent = dynamic_cast<ScatterAgentLayer*>(
         memoryFrameLine.scatterAgents[machineCur].get());
     scatterAgent->setRealLayer(memoryFrameLine.frames[machinePrev],
-                               isOutIds ? topIds_ : machineIds_,
-                               memoryFrameLine.is_sequence);
+                               isOutIds ? topIds_ : machineIds_);
     scatterAgent->forward(PASS_TEST);
     NeuralNetwork::connect(memoryFrameLine.agents[machineCur],
                            memoryFrameLine.scatterAgents[machineCur]);
@@ -1176,36 +1308,44 @@ void RecurrentGradientMachine::fillGenOutputs() {
     finalPaths_[i].resize(minFinalPathsSize);
   }
 
-  batchMachineIdVec_.clear();
   generator_.ids.clear();
+  int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false);
+  starts[0] = 0;
   if (numResults > 1) {
+    int idsProbSaveSize = 0;
+    for (const auto& inSeq : finalPaths_) {
+      for (const auto& path : inSeq) idsProbSaveSize += path.ids.size();
+      idsProbSaveSize += inSeq.size();
+    }
+    Matrix::resizeOrCreate(
+        generator_.outArg.value, idsProbSaveSize, 1, false, false);
+    real* idsProb = generator_.outArg.value->getData();
+
     real* probs = generator_.outArg.in->getData();
-    int* starts =
-        generator_.outArg.sequenceStartPositions->getMutableData(false);
-    starts[0] = 0;
+    size_t curPos = 0;
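+    // Pack ids and per-word log probabilities sequence by sequence; each
+    // generated sequence is terminated with a -1 marker in generator_.ids
+    // and a -1.0 sentinel in idsProb.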
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
         Path& path = finalPaths_[i][j];
-        generator_.ids.push_back(path.ids.size());  // sequence size
+        size_t genLen = path.ids.size();
+        generator_.ids.push_back(genLen);  // sequence size
         generator_.ids.insert(
             generator_.ids.end(), path.ids.begin(), path.ids.end());
         generator_.ids.push_back(-1);  // end of sequence
-        probs[i * numResults + j] = path.logProb;
 
-        if (!j && dataArgsSize_) {
-          // in beam search, here only reserved the top 1 generated result
-          // for out_links that are not the generated word indices.
-          batchMachineIdVec_.insert(batchMachineIdVec_.end(),
-                                    path.machineIdVec.begin(),
-                                    path.machineIdVec.end());
-        }
+        memcpy(idsProb + curPos, path.idsProb.data(), sizeof(real) * genLen);
+        curPos += genLen;
+        idsProb[curPos++] = -1.0;
+        probs[i * numResults + j] = path.logProb;
       }
       starts[i + 1] = generator_.ids.size();
     }
   } else {
     for (size_t i = 0; i < finalPaths_.size(); ++i) {
       CHECK(!finalPaths_[i].empty());
-      generator_.ids = finalPaths_[i][0].ids;
+      Path& path = finalPaths_[i][0];
+      generator_.ids.insert(
+          generator_.ids.end(), path.ids.begin(), path.ids.end());
+      starts[i + 1] = starts[i] + path.ids.size();
     }
   }
 }
@@ -1219,25 +1359,76 @@ void RecurrentGradientMachine::copyDataOutlinkFrame(size_t machineCur) {
   }
 }
 
-void RecurrentGradientMachine::createDataOutlink(
-    std::vector<int>& machineIdVec) {
-  size_t seqNum =
-      getBeamSize() > 1UL ? finalPaths_.size() : finalPaths_[0].size();
-  std::vector<int> starts(seqNum + 1, 0);
-  for (size_t i = 0; i < seqNum; ++i) {
-    size_t seqLen = getBeamSize() > 1UL ? finalPaths_[i][0].ids.size()
-                                        : finalPaths_[0][i].ids.size();
-    starts[i + 1] = starts[i] + seqLen;
+void RecurrentGradientMachine::createDataOutlinkSelRowsInfo(
+    bool isSeq, std::vector<Argument>& outArgs) {
+  batchMachineIdVec_.clear();
+
+  size_t seqIdx = 0;
+  for (size_t i = 0; i < finalPaths_.size(); ++i) {
+    for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+      std::vector<int>& machineIdVec = finalPaths_[i][j].machineIdVec;
+      if (isSeq) {
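+        // machineIdVec holds, for each generation step, this path's row in
+        // that step's output Argument; for sequence outlinks, translate the
+        // row into the start offset of its sequence.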
+        for (size_t k = 0; k < machineIdVec.size(); ++k) {
+          size_t rowId = machineIdVec[k];
+          int* seqPos =
+              outArgs[k].sequenceStartPositions->getMutableData(false);
+          batchMachineIdVec_.push_back(seqPos[rowId]);
+        }
+      } else {
+        batchMachineIdVec_.insert(
+            batchMachineIdVec_.end(), machineIdVec.begin(), machineIdVec.end());
+      }
+      seqIdx++;
+    }
   }
+}
 
+void RecurrentGradientMachine::createDataOutlinkCopySizeInfo(
+    bool isSeq, std::vector<Argument>& outArgs, std::vector<int>& copySize) {
+  size_t totalSeqNum = std::accumulate(
+      finalPaths_.begin(),
+      finalPaths_.end(),
+      0UL,
+      [](size_t a, const std::vector<Path>& b) { return a + b.size(); });
+  copySize.resize(totalSeqNum, 1);
+
+  batchMachineStartPos_.resize(totalSeqNum + 1, 0);
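+  // batchMachineStartPos_[s + 1] accumulates the generated length of the
+  // first s + 1 sequences, i.e. the offset at which each sequence starts in
+  // the concatenated outlink data.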
+  if (isSeq) {
+    ICpuGpuVectorPtr inputSeqStartPos = outArgs[0].sequenceStartPositions;
+    CHECK_EQ(static_cast<size_t>(inputSeqStartPos->getSize() - 1),
+             getBeamSize() > 1 ? finalPaths_.size() : finalPaths_[0].size());
+    int* starts = inputSeqStartPos->getMutableData(false);
+    int seqId = 0;
+    for (size_t i = 0; i < finalPaths_.size(); ++i) {
+      for (size_t j = 0; j < finalPaths_[i].size(); ++j) {
+        copySize[seqId] = getBeamSize() > 1 ? starts[i + 1] - starts[i]
+                                            : starts[j + 1] - starts[j];
+        batchMachineStartPos_[seqId + 1] =
+            batchMachineStartPos_[seqId] + finalPaths_[i][j].ids.size();
+        seqId++;
+      }
+    }
+  } else {
+    for (size_t i = 0; i < finalPaths_[0].size(); ++i)
+      batchMachineStartPos_[i + 1] =
+          batchMachineStartPos_[i] + finalPaths_[0][i].ids.size();
+  }
+}
+
+void RecurrentGradientMachine::createDataOutlink() {
   for (size_t i = 0; i < dataArgsSize_; i++) {
+    bool isSeq = dataArgsFrame_[i][0].hasSeq();
+    std::vector<int> copySize;
+    createDataOutlinkCopySizeInfo(isSeq, dataArgsFrame_[i], copySize);
+    createDataOutlinkSelRowsInfo(isSeq, dataArgsFrame_[i]);
+
     dataArgs_[i].concat(dataArgsFrame_[i],
-                        machineIdVec,
-                        starts,
+                        batchMachineIdVec_,
+                        batchMachineStartPos_,
+                        copySize,
                         useGpu_,
                         HPPL_STREAM_1,
                         PASS_TEST);
-
     auto dataAgent =
         dynamic_cast<DataLayer*>(outFrameLines_[i + 1].agentLayer.get());
     CHECK_NOTNULL(dataAgent);
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index c2bc52709a..c16fae6d17 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -189,6 +189,11 @@ public:
      */
     std::vector<int> ids;
 
+    /**
+     * @brief idsProb, log probability of each generated word.
+     */
+    std::vector<real> idsProb;
+
     /**
      * @brief logProb, current probability of path.
      */
@@ -228,11 +233,13 @@ public:
      */
     Path(Path& old, int newId, real logProb, int machineId, int topIndex)
         : ids(old.ids),
+          idsProb(old.idsProb),
           logProb(old.logProb + logProb),
           machineId(machineId),
           topIndex(topIndex),
           seqId(old.seqId) {
       ids.push_back(newId);
+      idsProb.push_back(logProb);
       if (!old.probHistory.empty()) {
         this->probHistory = old.probHistory;
         // probHistory store current prob, not sum
@@ -284,6 +291,16 @@ public:
   }
 
 protected:
+  std::vector<Argument::SeqInfo> commonSeqInfo_;
+  ICpuGpuVectorPtr sequenceStartPositions_;
+  void calcSequenceStartPositions();
+  void checkInputConsistency(int inlinkId,
+                             const std::vector<Argument::SeqInfo>& seqInfo);
+  void reorganizeInput(PassType passType);
+  void reorganizeOutput(PassType passType);
+  void connectFrames(PassType passType);
+  void calcNumSequencesAtEachStep();
+
   void resizeOrCreateFrames(int numFrames);
   void resizeBootFrame(int numSequences);
 
@@ -295,8 +312,7 @@ protected:
     std::string linkName;
     LayerPtr inLayer;
     std::vector<LayerPtr> agents;  // Scatter Agents to reform batch input
-    bool hasSubseq;
-    Argument outArg;  // scatter output argument
+    Argument outArg;               // scatter output argument
   };
   std::vector<InFrameLine> inFrameLines_;
 
@@ -318,7 +334,6 @@ protected:
     std::vector<LayerPtr> agents;
     std::vector<LayerPtr> scatterAgents;  // scatter agent used by beam search
     Argument outArg;                      // scatter output argument
-    bool is_sequence;
     // Different memoryFrameLine have different element as follows
     IVectorPtr allIds;  // scattered id of realLayer
     ICpuGpuVectorPtr
@@ -330,22 +345,27 @@ protected:
-  // and all outFrameLines(outlinks) share the info with one inFrameLine,
-  // which is assigned by targetInfoInlinkId_.
   struct Info {
-    IVectorPtr allIds;         // scattered id of realLayer
-    std::vector<int> idIndex;  // index of allIds
+    // The rows' original positions in the input batch
+    IVectorPtr allIds;  // scattered id of realLayer [batchSize]
+
+    // index of allIds for each step [maxSequenceLength_ + 1]
+    // idIndex[i] is the offset into allIds at which step i begins, i.e. the
+    // total number of rows in the first i steps
+    std::vector<int> idIndex;
+
     ICpuGpuVectorPtr
         sequenceStartPositions;         // scattered sequenceStartPositions
     std::vector<int> seqStartPosIndex;  // index of sequenceStartPositions
   };
-  std::vector<Info> info_;
+  std::vector<Info> info_;  // for input
 
   // numSeqs_[i] is the number of sequences that are longer than i (for
   // sequence data) or have more than i subsequences (for subsequence data).
+  // Equivalently, numSeqs_[i] is the number of sequences still active at
+  // step i.
   std::vector<int> numSeqs_;
 
   std::vector<std::vector<Argument::SeqInfo>> seqInfos_;
 
-  // the id of inlink which share info with outlinks
-  int targetInfoInlinkId_;
+  void checkOutputConsistency(OutFrameLine& outFrameLine);
 
   /* create scattered id information for all realLayers of inFrameLines one
    *  time. If hasSubseq, will also create scattered sequenceStartPositions
    *  information
@@ -354,6 +374,28 @@ protected:
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
+  void createInFrameInfo_nonseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+  void createInFrameInfo_seq(int inlinks_id,
+                             const Argument& input,
+                             PassType passType);
+  void createInFrameInfo_subseq(int inlinks_id,
+                                const Argument& input,
+                                PassType passType);
+
+  void createOutFrameInfo(OutFrameLine& outFrameLine,
+                          Info& info,
+                          ICpuGpuVectorPtr& sequenceStartPositions,
+                          ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_seq(OutFrameLine& outFrameLine,
+                              Info& info,
+                              ICpuGpuVectorPtr& sequenceStartPositions,
+                              ICpuGpuVectorPtr& subSequenceStartPositions);
+  void createOutFrameInfo_subseq(OutFrameLine& outFrameLine,
+                                 Info& info,
+                                 ICpuGpuVectorPtr& sequenceStartPositions,
+                                 ICpuGpuVectorPtr& subSequenceStartPositions);
 
   void createMemoryFrameInfo(MemoryFrameLine* memoryFrameLine,
                              PassType passType);
@@ -376,9 +418,11 @@ protected:
 
   struct Generator {
     GeneratorConfig config;
-    std::vector<int> ids;  // store generated sequences
-    Argument outArg;       // final output argument
+    std::vector<int> ids;       // store generated sequences
+    std::vector<real> idsProb;  // log probability of each generated word
+    Argument outArg;            // final output argument
   };
+  bool generating_;
   Generator generator_;
 
   std::vector<std::unique_ptr<NeuralNetwork>> frames_;
@@ -386,17 +430,13 @@ protected:
   NeuralNetwork* rootNetwork_;
   bool reversed_;
 
-  // if hasSubseq: max number of sentences(subseq)in batchsize samples
-  // else: max number of tokens in batchsize samples(sentences)
-  int maxSequenceLength_;
+  int maxSequenceLength_;  // max top-level sequence length in a batch
   bool useGpu_;
   bool stopBeamSearch_;
 
   std::vector<int>
       parameterIds_;  // parameters actually used by this Layer Group
 
-  std::unique_ptr<Evaluator> evaluator_;  // frame printers in this layer group
-
   // store final argument of outFrameLines_
   std::vector<Argument> dataArgs_;
   // store each frame's output argument of outFrameLines_
@@ -432,15 +472,43 @@ private:
   void copyDataOutlinkFrame(size_t machineCur);
 
   /*
-   * @brief In generation, if the layer group has more than 1 outlink, outlinks
-   * except the first one are data outlinks. This function creates the data
-   * outlinks.
-   * @note In beam search, only one generated sequence with the hightest log
-   * probabilites are retained.
-   * @param machineIdVec : select a row of output matrix in each frame
-   * that the generation process expanded.
+   * @brief In generation, if the layer group has more than 1 outlink, every
+   * outlink except the first one is a data outlink. In a RecurrentLayerGroup,
+   * each time step is a separate Network whose layer outputs are stored in
+   * separate Arguments. When a layer inside the RecurrentLayerGroup is
+   * specified as an outlink, this function collects its outputs across the
+   * time steps of each generated sequence, which are dispersed in separate
+   * Arguments, into a single new Argument that forms the output of the
+   * RecurrentLayerGroup.
+   */
+  void createDataOutlink();
+
+  /*
+   * @brief Decide how many rows to select, from a start position, out of the
+   * Matrix that stores the forward pass results.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
+   * @param copySize: the returned result, the number of rows to select, from
+   * a start position, out of the Matrix that stores the forward pass results.
+   */
+  void createDataOutlinkCopySizeInfo(bool isSeq,
+                                     std::vector<Argument>& outArgs,
+                                     std::vector<int>& copySize);
+
+  /*
+   * @brief Decide the index of the start row, for each time step of a
+   * generated sequence, in the Matrix that stores the forward pass results
+   * of the entire beam search batch.
+   *
+   * @param isSeq: a flag indicating whether the layer to be output of the
+   * RecurrentGradientMachine is a sequence or not
+   * @param outArgs: all of the returned Arguments of the forward pass
+   * during the generation process.
    */
-  void createDataOutlink(std::vector<int>& machineIdVec);
+  void createDataOutlinkSelRowsInfo(bool isSeq, std::vector<Argument>& outArgs);
 
   /*
    * @brief used in beam search, connect previous frame to form recurrent link
@@ -503,6 +571,7 @@ private:
   std::vector<int> topIds_;
   std::vector<int> seqIds_;
   std::vector<int> batchMachineIdVec_;
+  std::vector<int> batchMachineStartPos_;
   std::vector<std::vector<Path>> finalPaths_;
   std::vector<real> minFinalPathLogProb_;
   BeamSearchControlCallbacks* beamSearchCtrlCallbacks_;
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 7b1b99b135..bdae7e623a 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -36,14 +36,23 @@ void AgentLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   Argument& realOutput = realLayer_->getOutput();
-  int realHeight = realOutput.getBatchSize();
-  CHECK_LE(numSamples_, realHeight);
+  int realNumSequences = realOutput.getNumSequences();
+  CHECK_LE(numSamples_, realNumSequences);
 
   // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realHeight) {
-    if (realOutput.ids) {
-      output_.ids =
-          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
+  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
+    if (realOutput.hasSeq()) {
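+      // Take the first numSamples_ sequences: rows [0, numRows) of the real
+      // output together with the first numSamples_ + 1 sequence start
+      // positions.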
+      int numRows =
+          realOutput.sequenceStartPositions->getData(false)[numSamples_];
+      output_.subArgFrom(realOutput,
+                         /* offset */ 0,
+                         numRows,
+                         getSize(),
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ 0,
+                         /* seqSize */ numSamples_ + 1);
     } else {
       output_.subArgFrom(
           realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
@@ -53,34 +62,6 @@ void AgentLayer::forward(PassType passType) {
   }
 }
 
-void SequenceAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  Argument& realOutput = realLayer_->getOutput();
-  int realNumSequences = realOutput.getNumSequences();
-  CHECK_LE(numSamples_, realNumSequences);
-
-  // get Arguments from real layers
-  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows =
-        realOutput.sequenceStartPositions->getData(false)[numSamples_];
-    CHECK(!realOutput.ids) << "Not supported";
-    output_.subArgFrom(realOutput,
-                       /* offset */ 0,
-                       numRows,
-                       getSize(),
-                       useGpu_,
-                       /* trans */ false,
-                       /* seqFlag */ true,
-                       /* seqStart */ 0,
-                       /* seqSize */ numSamples_ + 1);
-  } else {
-    output_ = realOutput;
-  }
-}
-
-REGISTER_LAYER(sequence_agent, SequenceAgentLayer);
-
 bool GatherAgentLayer::init(const LayerMap& layerMap,
                             const ParameterMap& parameterMap) {
   CHECK_EQ(config_.inputs_size(), 0);
@@ -91,18 +72,26 @@ bool GatherAgentLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-void GatherAgentLayer::copyIdAndSequenceInfo(const Argument& input,
-                                             const IVectorPtr& ids,
-                                             const std::vector<int>& idIndex) {
-  output_.sequenceStartPositions = input.sequenceStartPositions;
-  output_.subSequenceStartPositions = input.subSequenceStartPositions;
-  realLayers_.clear();
+void GatherAgentLayer::copyIdAndSequenceInfo(
+    ICpuGpuVectorPtr sequenceStartPositions,
+    ICpuGpuVectorPtr subSequenceStartPositions,
+    const IVectorPtr& ids,
+    const std::vector<int>& idIndex) {
+  output_.sequenceStartPositions = sequenceStartPositions;
+  output_.subSequenceStartPositions = subSequenceStartPositions;
   allIds_ = ids;
   idIndex_ = idIndex;
 }
 
 void GatherAgentLayer::forward(PassType passType) {
   Layer::forward(passType);
+  forwardIds(passType);
+  forwardValue(passType);
+}
+
+void GatherAgentLayer::forwardValue(PassType passType) {
+  MatrixPtr valueReal = realLayers_[0]->getOutputValue();
+  if (!valueReal) return;
 
   int height = allIds_->getSize();
   int width = this->getSize();
@@ -120,6 +109,40 @@ void GatherAgentLayer::forward(PassType passType) {
   }
 }
 
+namespace {
+
+// dest[index[i]] <- src[i] for each i
+void copyElements(const IVector& srcVec,
+                  const IVector& indexVec,
+                  IVector& destVec) {
+  const int* src = srcVec.getData();
+  const int* index = indexVec.getData();
+  int* dest = destVec.getData();
+  int len = indexVec.getSize();
+  CHECK_EQ(srcVec.getSize(), indexVec.getSize());
+  for (int i = 0; i < len; ++i) {
+    dest[index[i]] = src[i];
+  }
+}
+}  // namespace
+
+void GatherAgentLayer::forwardIds(PassType passType) {
+  IVectorPtr realId = realLayers_[0]->getOutputLabel();
+  if (!realId) return;
+
+  IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_);
+  IVectorPtr outId = output_.ids;
+  idsVec_.resize(idIndex_.size());
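+  // Each real layer holds the ids produced at one time step; scatter them
+  // back to their original batch positions, which are recorded in allIds_.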
+
+  for (size_t i = 0; i < realLayers_.size(); ++i) {
+    const IVectorPtr& realId = realLayers_[i]->getOutputLabel();
+    idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i],
+                                 /* size */ realId->getSize(),
+                                 useGpu_);
+    execViaCpu(&copyElements, *realId, *idsVec_[i], *outId);
+  }
+}
+
 void GatherAgentLayer::backward(const UpdateCallback& callback) {
   (void)callback;
   const MatrixPtr& outputGrad = getOutputGrad();
@@ -147,21 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) {
   CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
 
   int width = this->getSize();
-  if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(
-        realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
-  } else {  // used in generation
-    if (realLayer_->getOutput().ids) {
-      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-    }
-    if (realLayer_->getOutput().value) {
-      int height = ids_->getSize();
-      resetOutput(height, width);
-
-      const MatrixPtr& outV = getOutputValue();
-      const MatrixPtr& realV = realLayer_->getOutputValue();
-      outV->selectRows(*realV, *ids_);
+  if (selectionMode_) {
+    forwardWithSelection(passType);
+  } else {
+    if (realOutArg_.hasSeq()) {
+      output_.subArgFrom(realOutArg_,
+                         /* offset */ idIndex_,
+                         idSize_,
+                         width,
+                         useGpu_,
+                         /* trans */ false,
+                         /* seqFlag */ true,
+                         /* seqStart */ seqStartPosIndex_,
+                         /* seqSize */ numSequences_);
+    } else {
+      output_.subArgFrom(
+          realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_);
     }
   }
 }
@@ -169,12 +193,14 @@ void ScatterAgentLayer::forward(PassType passType) {
 void ScatterAgentLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
+  CHECK(!selectionMode_);
+
   const MatrixPtr& outputGrad = realOutArg_.grad;
   const MatrixPtr& realGrad = realLayer_->getOutputGrad();
   if (realGrad) {
     // for agent in inFrameLines and memoryFrameLines,
     // only first scatterAgentLayer should do addToRows in backward
-    if (idIndex_ == 0) {
+    if (handleBackward_) {
       outputGrad->addToRows(*realGrad, *ids_);
     }
   }
@@ -183,42 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) {
 REGISTER_LAYER(gather_agent, GatherAgentLayer);
 REGISTER_LAYER(scatter_agent, ScatterAgentLayer);
 
-void SequenceGatherAgentLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  int height = 0;
-  int* starts = output_.subSequenceStartPositions->getMutableData(false);
-  IVectorPtr idReal = realLayers_[0]->getOutputLabel();
-  if (idReal) {
-    // Gather generator.idsVec
-    // if is beam search generation result. Get first result.
-    if (idReal->getData()[idReal->getSize() - 1] == -1) {
-      for (size_t i = 0; i < realLayers_.size(); ++i) {
-        // The first element stores first result size
-        idReal = realLayers_[i]->getOutputLabel();
-        idReal->subVecFrom(*idReal, 1, idReal->getData()[0]);
-      }
-    }
-    for (size_t i = 0; i < realLayers_.size(); ++i) {
-      CHECK(realLayers_[i]->getOutputLabel());
-      starts[i] = height;
-      height += realLayers_[i]->getOutputLabel()->getSize();
-    }
-    starts[realLayers_.size()] = height;
-    output_.sequenceStartPositions->getMutableData(false)[1] = height;
-
-    IVector::resizeOrCreate(output_.ids, height, false);
-    for (size_t i = 0; i < realLayers_.size(); ++i) {
-      output_.ids->subVec(starts[i], starts[i + 1] - starts[i])
-          ->copyFrom(*realLayers_[i]->getOutputLabel());
-    }
-  } else {
-    // Gather output.value, same as GatherAgentLayer
-    CHECK(output_.subSequenceStartPositions);
-    GatherAgentLayer::forward(passType);
-  }
-}
-
-void SequenceScatterAgentLayer::forward(PassType passType) {
+void ScatterAgentLayer::forwardWithSelection(PassType passType) {
   Layer::forward(passType);
   CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
 
@@ -229,18 +220,21 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
   AsyncGpuBlock asyncGpuBlock;
   REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());
 
-  if (realOutArg_.value || realOutArg_.ids) {
-    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_,
-                       /* offset */ idIndex_,
-                       idSize_,
-                       width,
-                       useGpu_,
-                       /* trans */ false,
-                       /* seqFlag */ true,
-                       /* seqStart */ seqStartPosIndex_,
-                       /* seqSize */ numSequences_);
+  if (!input.hasSeq()) {
+    if (realLayer_->getOutput().ids) {
+      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
+      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
+    }
+    if (realLayer_->getOutput().value) {
+      int height = ids_->getSize();
+      resetOutput(height, width);
+
+      const MatrixPtr& outV = getOutputValue();
+      const MatrixPtr& realV = realLayer_->getOutputValue();
+      outV->selectRows(*realV, *ids_);
+    }
   } else {
+    // Putting the generation logic here is really an ugly hack!
     // used in generation
     int height = 0;
     size_t numSequences = ids_->getSize();
@@ -284,7 +278,4 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
   }
 }
 
-REGISTER_LAYER(sequence_gather_agent, SequenceGatherAgentLayer);
-REGISTER_LAYER(sequence_scatter_agent, SequenceScatterAgentLayer);
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index b6dac7ae6f..29681b29c6 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -49,18 +49,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
-/**
- * like AgentLayer, but use first *numSamples* sequences
- */
-class SequenceAgentLayer : public AgentLayer {
-public:
-  explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
-  ~SequenceAgentLayer() {}
-
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback = nullptr) override {}
-};
-
 /**
  * Like AgentLayer, but it can gather many real layers. Each real
  * layer give a few rows of a sequence, after gather all real layers,
@@ -83,7 +71,10 @@ public:
             const ParameterMap& parameterMap) override;
 
   // call before addRealLayer
-  void copyIdAndSequenceInfo(const Argument& input,
+  void clearRealLayers() { realLayers_.clear(); }
+
+  void copyIdAndSequenceInfo(ICpuGpuVectorPtr sequenceStartPositions,
+                             ICpuGpuVectorPtr subSequenceStartPositions,
                              const IVectorPtr& allIds,
                              const std::vector<int>& idIndex);
 
@@ -92,24 +83,8 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
-};
-
-/**
- * Like GatherAgentLayer, but select a few sequence in real layer.
- * *ids* in addRealLayer() are the ids of selected sequence.
- * It's used to reorder sequence output.
- */
-class SequenceGatherAgentLayer : public GatherAgentLayer {
-public:
-  explicit SequenceGatherAgentLayer(const LayerConfig& config)
-      : GatherAgentLayer(config) {}
-  virtual ~SequenceGatherAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    // same as GatherAgentLayer
-    GatherAgentLayer::backward(callback);
-  }
+  void forwardValue(PassType passType);
+  void forwardIds(PassType passType);
 };
 
 /**
@@ -129,6 +104,14 @@ protected:
   int idSize_;
   int seqStartPosIndex_;
   int numSequences_;  // number of sequences in this scatterAgentLayer
+  bool handleBackward_;
+
+  // used to store the expanded cpuStartPositions or subSequenceStartPositions
+  // of the real layer.
+  ICpuGpuVectorPtr inputStartPos_;
+
+  // true for setRealLayer, false for setRealLayerAndOutput
+  bool selectionMode_;
 
 public:
   explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {}
@@ -147,20 +130,17 @@ public:
    *                        false(default) in ScatterAgentLayer, and
    *                        true in SequenceScatterAgentLayer.
    */
-  void setRealLayer(LayerPtr layer,
-                    const std::vector<int>& ids,
-                    bool copyId = false) {
+  void setRealLayer(LayerPtr layer, const std::vector<int>& ids) {
     realLayer_ = layer;
     IVector::resizeOrCreate(ids_, ids.size(), useGpu_);
     ids_->copyFrom(ids.data(), ids.size());
-    if (copyId) {
-      if (useGpu_) {
-        IVector::resizeOrCreate(cpuIds_, ids.size(), false);
-        cpuIds_->copyFrom(ids.data(), ids.size());
-      } else {
-        cpuIds_ = ids_;
-      }
+    if (useGpu_) {
+      IVector::resizeOrCreate(cpuIds_, ids.size(), false);
+      cpuIds_->copyFrom(ids.data(), ids.size());
+    } else {
+      cpuIds_ = ids_;
     }
+    selectionMode_ = true;
   }
 
   // set real layer and output, [idIndex, idIndex + idSize) of *ids*
@@ -169,12 +149,15 @@ public:
                              const Argument& outArg,
                              const IVectorPtr& ids,
                              int idIndex,
-                             int idSize) {
+                             int idSize,
+                             bool handleBackward) {
     realLayer_ = layer;
     realOutArg_ = outArg;
     ids_ = ids;
     idIndex_ = idIndex;
     idSize_ = idSize;
+    handleBackward_ = handleBackward;
+    selectionMode_ = false;
   }
 
   void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions,
@@ -187,28 +170,8 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
-};
 
-/**
- * Like ScatterAgentLayer, but select a few sequence in real layer.
- * *ids* in setRealLayer() or setRealLayerAndOutput() are the ids of
- * selected sequence. It's used to reorder sequence input.
- */
-class SequenceScatterAgentLayer : public ScatterAgentLayer {
-protected:
-  // use to store expanded cpuStartPositions or subSequenceStartPositions
-  // of real layer.
-  ICpuGpuVectorPtr inputStartPos_;
-
-public:
-  explicit SequenceScatterAgentLayer(const LayerConfig& config)
-      : ScatterAgentLayer(config) {}
-  virtual ~SequenceScatterAgentLayer() {}
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {
-    ScatterAgentLayer::backward(callback);
-  }
+  void forwardWithSelection(PassType passType);
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index 332552a304..db4a17bfb0 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -25,6 +25,10 @@ namespace paddle {
  * If SequenceLevel = kNonSeq:
  *    Output: output size is the number of input sequences (NOT input instances)
  *    output[i] = average_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shortened sequence. Stride is the step size by which we slide a
+ *              window upon the input sequence, and the average pooling
+ *              operation is then applied to each interval independently.
  * If SequenceLevel = kSeq:
  *    Check input sequence must has sub-sequence
  *    Output: output size is the number of input sub-sequences
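Note: the stride_ > 0 behavior documented above amounts to fixed-width window averaging over each sequence. A minimal sketch under that reading (plain C++, not the layer's actual implementation):

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Average each stride-sized window of one input sequence, producing a
// shortened output sequence (the last window may be shorter).
std::vector<float> strideAveragePool(const std::vector<float>& seq,
                                     std::size_t stride) {
  std::vector<float> out;
  for (std::size_t start = 0; start < seq.size(); start += stride) {
    std::size_t end = std::min(seq.size(), start + stride);
    float sum = 0.0f;
    for (std::size_t i = start; i < end; ++i) sum += seq[i];
    out.push_back(sum / static_cast<float>(end - start));
  }
  return out;
}
```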
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 1ceaaaa206..925af31289 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "BatchNormalizationLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnBatchNormLayer.h"
 #endif
 
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
 
   weight_.reset(new Weight(1, channels_, parameters_[0]));
   movingMean_.reset(new Weight(1, channels_, parameters_[1]));
@@ -62,14 +63,18 @@ void BatchNormBaseLayer::calFeatureMapSize() {
   const ImageConfig& conf = config_.inputs(0).image_conf();
   imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
   imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
+  imageD_ = inputLayers_[0]->getOutput().getFrameDepth();
+
+  if (0 == imageD_) imageD_ = conf.img_size_z();
   if (imageH_ == 0 && imageW_ == 0) {
     imageH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
     imageW_ = conf.img_size();
   } else {
     getOutput().setFrameHeight(imageH_);
     getOutput().setFrameWidth(imageW_);
+    getOutput().setFrameDepth(imageD_);
   }
-  imgPixels_ = imageH_ * imageW_;
+  imgPixels_ = imageH_ * imageW_ * imageD_;
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 230bafc31d..2ac3cd9d67 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -80,6 +80,7 @@ protected:
 
   /// Height or width of input image feature.
   /// Both of them are 1 if the input is fully-connected layer.
+  int imageD_;
   int imageH_;
   int imageW_;
   /// Height * Width.
@@ -93,6 +94,8 @@ protected:
   bool useGlobalStats_;
   // use to compute moving mean and variance.
   real movingAvgFraction_;
+  // Epsilon is a small constant added to the variance for numerical stability.
+  real epsilon_;
 };
 
 }  // namespace paddle
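Note: the new epsilon_ field replaces the fixed EPS constant removed from BatchNormalizationLayer; it shifts the variance before the square root so the normalization never divides by zero. A scalar sketch, assuming the standard batch-norm formula:

```cpp
#include <cmath>

// Scalar sketch: epsilon shifts the variance before the square root
// (cf. savedInvVar_->subScalar(-epsilon_) followed by sqrt2 above).
float batchNormNormalize(float x, float mean, float var, float epsilon) {
  return (x - mean) / std::sqrt(var + epsilon);
}
```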
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index 412762d384..25ab5cd927 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "hl_batch_transpose.h"
 #endif
 #include "BatchNormalizationLayer.h"
@@ -22,8 +22,6 @@ namespace paddle {
 
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
 
-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
 
   calMovingMeanAndVar();
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->copyFrom(*(movingVar_->getW()));
   savedInvVar_->downClip(real(0.0));
 
-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
 
@@ -90,7 +88,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
   size_t batchSize = in->getHeight();
   CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
@@ -127,7 +125,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
   }
   CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index f6115801fc..1fdb5e2070 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -39,9 +39,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
 
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb9215..3b1f346359 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,22 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "BlockExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+  createFunction(backward_,
+                 "BlockExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+
   return true;
 }
 
@@ -63,48 +79,27 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each block as one row */
-    MatrixPtr inputTmp =
-        Matrix::create(input->getData() + i * input->getWidth(),
-                       1,
-                       input->getWidth(),
-                       false,
-                       useGpu_);
-    outVTrans_->convExpand(*inputTmp,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_);
-    MatrixPtr outVTmp =
-        Matrix::create(outV->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    outVTrans_->transpose(outVTmp, false);
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
@@ -113,48 +108,13 @@ void BlockExpandLayer::forward(PassType passType) {
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  size_t blockNum = outputH_ * outputW_;
-  size_t blockSize = blockH_ * blockW_ * channels_;
   /* Calculate the input layers error */
-  MatrixPtr preGrad = inputLayers_[0]->getOutputGrad();
-  if (!preGrad) {
-    return;
-  }
-  MatrixPtr grad = getOutputGrad();
-  MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
-  size_t batchSize = preGrad->getHeight();
-
-  CHECK_EQ(batchSize * blockNum, grad->getHeight());
-  CHECK_EQ(blockSize, grad->getWidth());
-
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr gradTmp =
-        Matrix::create(grad->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    gradTmp->transpose(gradTrans, false);
-    MatrixPtr preGradTmp =
-        Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
-                       1,
-                       preGrad->getWidth(),
-                       false,
-                       useGpu_);
-    preGradTmp->convShrink(*gradTrans,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_,
-                           1.0,
-                           1.0);
+  if (getInputGrad(0)) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 8f347400e6..15ce73ab8b 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -50,8 +50,8 @@ protected:
   size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
   size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
 
-  /// auxiliary variable, which saves the transposed output value.
-  MatrixPtr outVTrans_;
+  TensorShape inputShape_;
+  TensorShape outputShape_;
 
 public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
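Note: the refactor above delegates the expansion to the BlockExpand/BlockExpandGrad functions, configured with strides, paddings, and blocks. The assumed output geometry is the standard im2col arithmetic; a sketch (helper names are hypothetical):

```cpp
// Per image, BlockExpand is assumed to produce blockNum rows of blockSize
// values, with the usual im2col output-size formula.
struct BlockExpandGeom {
  int outputH, outputW, blockNum, blockSize;
};

BlockExpandGeom blockExpandGeom(int imgH, int imgW, int channels, int blockH,
                                int blockW, int strideH, int strideW, int padH,
                                int padW) {
  BlockExpandGeom g;
  g.outputH = (imgH + 2 * padH - blockH) / strideH + 1;
  g.outputW = (imgW + 2 * padW - blockW) / strideW + 1;
  g.blockNum = g.outputH * g.outputW;        // rows contributed per image
  g.blockSize = blockH * blockW * channels;  // width of output_.value
  return g;
}
```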
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index 0b54442009..867303b4fa 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -101,8 +101,10 @@ void CRFLayer::backward(const UpdateCallback& callback) {
                               : real(1.0f);
     instanceWeight *= coeff_;
 
-    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (output.grad) {
+      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+      grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    }
     if (needWGrad) {
       weight_->getWGrad()->add(
           *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp
new file mode 100644
index 0000000000..13f16c9537
--- /dev/null
+++ b/paddle/gserver/layers/ClipLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for clipping the input value to the interval [p_{1}, p_{2}].
+ * \f[
+ *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+ * \f]
+ */
+
+class ClipLayer : public Layer {
+protected:
+  double min_;
+  double max_;
+
+public:
+  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(clip, ClipLayer);
+
+bool ClipLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+  auto layerConf = config_.inputs(0).clip_conf();
+  min_ = layerConf.min();
+  max_ = layerConf.max();
+  CHECK_LT(min_, max_);
+  return true;
+}
+
+void ClipLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(*inV);
+  outV->clip(min_, max_);
+}
+
+void ClipLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  if (inG) {
+    MatrixPtr outV = getOutputValue();
+    MatrixPtr outG = getOutputGrad();
+    MatrixPtr tmpMtx;
+    Matrix::resizeOrCreate(
+        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
+    tmpMtx->clipDerivative(*inV, min_, max_);
+    inG->addDotMul(*outG, *tmpMtx, 1, 1);
+  }
+}
+
+}  // namespace paddle
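Note: in the backward pass above, clipDerivative builds a mask that is 1 where the input lay inside [min_, max_] and 0 elsewhere, and addDotMul applies it to the output gradient. A scalar sketch of the same pair (boundary handling assumed; illustration only):

```cpp
#include <algorithm>

// Forward: clamp the input to [mn, mx].
float clipForward(float x, float mn, float mx) {
  return std::min(std::max(x, mn), mx);
}

// Backward: the gradient passes through only where the input was strictly
// inside the interval; elsewhere the derivative is zero.
float clipBackward(float x, float outGrad, float mn, float mx) {
  return (x > mn && x < mx) ? outGrad : 0.0f;
}
```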
diff --git a/paddle/gserver/layers/Conv3DLayer.cpp b/paddle/gserver/layers/Conv3DLayer.cpp
new file mode 100644
index 0000000000..9deda2de98
--- /dev/null
+++ b/paddle/gserver/layers/Conv3DLayer.cpp
@@ -0,0 +1,253 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Conv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(conv3d, Conv3DLayer);
+
+bool Conv3DLayer::init(const LayerMap &layerMap,
+                       const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    M_.push_back(numFilters_ / conf.groups());
+    K_.push_back(filterPixels_[index] * filterChannels_[index]);
+
+    // create a new weight
+    size_t height, width;
+    width = filterPixels_[index] * filterChannels_[index];
+    height = numFilters_;
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    ++index;
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t Conv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  outputH_.clear();
+  outputW_.clear();
+  outputD_.clear();
+  N_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    outputW_.push_back(outputSize(
+        imgSizeW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    outputH_.push_back(outputSize(
+        imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    outputD_.push_back(outputSize(
+        imgSizeD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += N_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(outputH_[0]);
+  getOutput().setFrameWidth(outputW_[0]);
+  getOutput().setFrameDepth(outputD_[0]);
+  return layerSize;
+}
+
+void Conv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+
+  REGISTER_TIMER_INFO("FwdConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr &inMat = getInputValue(i);
+    const MatrixPtr &outMat = getOutputValue();
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    MatrixPtr wMat = weights_[i]->getW();
+    for (int n = 0; n < batchSize; ++n) {
+      colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                       channels_[i],
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i]);
+
+      real *outData = outMat->getData() + n * outMat->getStride();
+      MatrixPtr outMatSub =
+          Matrix::create(outData, groups_[i] * M, N, false, useGpu_);
+      for (int g = 0; g < groups_[i]; g++) {
+        MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+        MatrixPtr in = colBuf_->subMatrix(g * K, K);
+        MatrixPtr out = outMatSub->subMatrix(g * M, M);
+        out->mul(*wMatSub, *in, 1.0, 1.0);
+      }
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    this->addBias();
+  }
+  forwardActivation();
+}
+
+void Conv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+
+  REGISTER_TIMER_INFO("BwdConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad()) {
+      bpropWeights(i);
+    }
+    if (getInputGrad(i)) {
+      bpropData(i);
+    }
+    weights_[i]->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void Conv3DLayer::bpropWeights(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  const MatrixPtr &inMat = getInputValue(i);
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wGradMat = weights_[i]->getWGrad();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    colBuf_->vol2Col(inMat->getData() + n * inMat->getStride(),
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i]);
+
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, groups_[i] * M, N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr inMatSub = colBuf_->subMatrix(g * K, K);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr wGradSub = wGradMat->subMatrix(g * M, M);
+      wGradSub->mul(*outG, *(inMatSub->getTranspose()), 1.0, 1.0);
+    }
+  }
+}
+
+void Conv3DLayer::bpropData(int i) {
+  int M = M_[i];
+  int N = N_[i];
+  int K = K_[i];
+  Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+  MatrixPtr wMat = weights_[i]->getW();
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  for (int n = 0; n < batchSize; ++n) {
+    real *outGradData =
+        getOutputGrad()->getData() + n * getOutputGrad()->getStride();
+    real *preGradData =
+        getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+    MatrixPtr outGradSub =
+        Matrix::create(outGradData, M * groups_[i], N, false, useGpu_);
+    for (int g = 0; g < groups_[i]; ++g) {
+      MatrixPtr wMatSub = wMat->subMatrix(g * M, M);
+      MatrixPtr outG = outGradSub->subMatrix(g * M, M);
+      MatrixPtr inGradMatSub = colBuf_->subMatrix(g * K, K);
+      inGradMatSub->mul(*(wMatSub->getTranspose()), *outG, 1.0, 0.0);
+    }
+    colBuf_->col2Vol(preGradData,
+                     channels_[i],
+                     imgSizeD_[i],
+                     imgSizeH_[i],
+                     imgSizeW_[i],
+                     filterSizeZ_[i],
+                     filterSizeY_[i],
+                     filterSize_[i],
+                     strideZ_[i],
+                     strideY_[i],
+                     stride_[i],
+                     paddingZ_[i],
+                     paddingY_[i],
+                     padding_[i],
+                     1.0,
+                     1.0);
+  }
+}
+
+void Conv3DLayer::bpropBiases() {
+  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
+                                    1,
+                                    biases_->getWGrad()->getElementCnt(),
+                                    false,
+                                    useGpu_);
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (this->sharedBiases_) {
+    biases->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void Conv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
+                                  1,
+                                  biases_->getW()->getElementCnt(),
+                                  false,
+                                  useGpu_);
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(bias), 1.0f);
+  } else {
+    outMat->addBias(*(bias), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Conv3DLayer.h b/paddle/gserver/layers/Conv3DLayer.h
new file mode 100644
index 0000000000..b622508d0c
--- /dev/null
+++ b/paddle/gserver/layers/Conv3DLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of convolution layer.
+ * This layer expands the input and uses matrix multiplication to
+ * compute the convolution operation.
+ */
+class Conv3DLayer : public ConvBaseLayer {
+public:
+  explicit Conv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~Conv3DLayer() {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for the individual GEMMs.
+  IntV M_;  /// numFilters_ / groups_
+  IntV N_;  /// outputD_ * outputH_ * outputW_
+  IntV K_;  /// filterChannels_ * filterSizeZ_ * filterSizeY_ * filterSize_
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
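Note: to make the per-group GEMM shapes concrete, a worked example with hypothetical sizes, following the M/K/N definitions in init() and getSize() above:

```cpp
#include <cstdio>

int main() {
  // Hypothetical configuration, just to make the shapes concrete.
  int numFilters = 8, groups = 2, filterChannels = 4;
  int kD = 3, kH = 3, kW = 3;           // filter depth/height/width
  int outD = 10, outH = 10, outW = 10;  // output volume
  int M = numFilters / groups;            // rows of each weight sub-matrix
  int K = filterChannels * kD * kH * kW;  // weight cols == colBuf_ rows
  int N = outD * outH * outW;             // output positions per sample
  std::printf("M=%d K=%d N=%d\n", M, K, N);  // prints M=4 K=108 N=1000
  return 0;
}
```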
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index 7b234dc2a6..b848ab6bdd 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -32,11 +32,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     const ConvConfig& conf = inputConfig.conv_conf();
     padding_.push_back(conf.padding());
     stride_.push_back(conf.stride());
+    dilation_.push_back(conf.dilation());
     filterSize_.push_back(conf.filter_size());
     paddingY_.push_back(conf.padding_y());
     strideY_.push_back(conf.stride_y());
+    dilationY_.push_back(conf.dilation_y());
     filterSizeY_.push_back(conf.filter_size_y());
-    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
     channels_.push_back(conf.channels());
     imgSizeH_.push_back(conf.has_img_size_y() ? conf.img_size_y()
                                               : conf.img_size());
@@ -45,31 +46,20 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
     filterChannels_.push_back(conf.filter_channels());
     outputH_.push_back(conf.has_output_y() ? conf.output_y() : conf.output_x());
     outputW_.push_back(conf.output_x());
+
+    paddingZ_.push_back(conf.padding_z());
+    strideZ_.push_back(conf.stride_z());
+    filterSizeZ_.push_back(conf.filter_size_z());
+    imgSizeD_.push_back(conf.img_size_z());
+    outputD_.push_back(conf.output_z());
+    filterPixels_.push_back(filterSize_.back() * filterSizeY_.back() *
+                            filterSizeZ_.back());
   }
 
   CHECK(inputLayers_.size() == parameters_.size());
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    size_t height, width;
-    height = filterPixels_[i] * filterChannels_[i];
-    width = (!isDeconv_) ? numFilters_ : channels_[i];
-
-    // create a new weight
-    CHECK_EQ(parameters_[i]->getSize(), width * height);
-    Weight* w = new Weight(height, width, parameters_[i]);
-    weights_.emplace_back(w);
-  }
 
-  /* initialize the biases_ */
-  if (biasParameter_.get()) {
-    if (sharedBiases_) {
-      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
-    } else {
-      biases_ =
-          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
-    }
-  }
+  // create new weights_ in derived class
+  // create new biases_ in derived class
 
   // default caffe model
   caffeMode_ = true;
@@ -89,7 +79,11 @@ size_t ConvBaseLayer::calOutputSize() {
   size_t layerSize = 0;
 
   auto setLayerSize = [&](IntV& inH, IntV& inW, IntV& outH, IntV& outW) {
+    size_t filterSizeY;
+    size_t filterSize;
     for (size_t i = 0; i < inputLayers_.size(); i++) {
+      filterSizeY = (filterSizeY_[i] - 1) * dilationY_[i] + 1;
+      filterSize = (filterSize_[i] - 1) * dilation_[i] + 1;
       inH.push_back(inputLayers_[i]->getOutput().getFrameHeight());
       inW.push_back(inputLayers_[i]->getOutput().getFrameWidth());
       const ConvConfig& conf = config_.inputs(i).conv_conf();
@@ -98,17 +92,17 @@ size_t ConvBaseLayer::calOutputSize() {
           inH[i] = conf.has_output_y() ? conf.output_y() : conf.output_x();
         if (inW[i] == 0) inW[i] = conf.output_x();
         outH.push_back(imageSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
-        outW.push_back(imageSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
+        outW.push_back(
+            imageSize(inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       } else {
         if (inH[i] == 0)
           inH[i] = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
         if (inW[i] == 0) inW[i] = conf.img_size();
         outH.push_back(outputSize(
-            inH[i], filterSizeY_[i], paddingY_[i], strideY_[i], caffeMode_));
+            inH[i], filterSizeY, paddingY_[i], strideY_[i], caffeMode_));
         outW.push_back(outputSize(
-            inW[i], filterSize_[i], padding_[i], stride_[i], caffeMode_));
+            inW[i], filterSize, padding_[i], stride_[i], caffeMode_));
       }
       CHECK_EQ(outH[i], outH[0]);
       CHECK_EQ(outW[i], outW[0]);
@@ -118,11 +112,7 @@ size_t ConvBaseLayer::calOutputSize() {
     layerSize = outH[0] * outW[0] * size_t(numFilters_);
   };
 
-  if (isDeconv_) {
-    setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_);
-  } else {
-    setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
-  }
+  setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
 
   return layerSize;
 }
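Note: the dilation handling above relies on one identity — a filter of size k with dilation d covers (k - 1) * d + 1 input pixels, and that effective size is what outputSize()/imageSize() receive. A one-line sketch with a worked case:

```cpp
// Effective receptive extent of a dilated filter along one axis.
int effectiveFilterSize(int filterSize, int dilation) {
  return (filterSize - 1) * dilation + 1;
}
// Worked case: effectiveFilterSize(3, 2) == 5, so a 7-pixel row with no
// padding and stride 1 yields (7 - 5) / 1 + 1 = 3 output pixels.
```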
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index e9d15d94f8..ccd170d9d8 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -40,6 +40,10 @@ protected:
   IntV stride_;
   /// The y dimension of the stride.
   IntV strideY_;
+  /// The x dimension of the dilation.
+  IntV dilation_;
+  /// The y dimension of the dilation.
+  IntV dilationY_;
   /// The x dimension of a filter kernel.
   IntV filterSize_;
   /// The y dimension of a filter kernel.
@@ -58,6 +62,13 @@ protected:
   IntV outputH_;
   /// The spatial dimensions of output feature map width.
   IntV outputW_;
+
+  IntV outputD_;
+  IntV imgSizeD_;
+  IntV filterSizeZ_;
+  IntV strideZ_;
+  IntV paddingZ_;
+
   /// Group size, refer to grouped convolution in
   /// Alex Krizhevsky's paper: when group=2, the first half of the
   /// filters are only connected to the first half of the input channels,
diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp
index 5c23198629..5469c41c87 100644
--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
@@ -59,7 +59,8 @@ void ConvBaseOperator::allocConvWorkSpace() {
                     &bwdDataAlgo_,
                     &bwdDataLimitBytes_,
                     &bwdFilterAlgo_,
-                    &bwdFilterLimitBytes_);
+                    &bwdFilterLimitBytes_,
+                    /*useDilation*/ false);
 
   size_t maxWorkSpace = 0;
   maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index d1e932ded5..19efed7b52 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ThreadLocalD<std::vector<MemoryHandlePtr>> ConvBaseProjection::convMem_;
 
 ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
                                        ParameterPtr parameter,
@@ -41,6 +41,11 @@ void ConvBaseProjection::getConvParams() {
   strideH_ = conf.stride_y();
   strideW_ = conf.stride();
 
+  dilationH_ = conf.dilation_y();
+  dilationW_ = conf.dilation();
+  CHECK_GT(dilationH_, 0);
+  CHECK_GT(dilationW_, 0);
+
   filterH_ = conf.filter_size_y();
   filterW_ = conf.filter_size();
 
@@ -77,7 +82,9 @@ void ConvBaseProjection::initCudnn() {
                                    paddingH_,
                                    paddingW_,
                                    strideH_,
-                                   strideW_);
+                                   strideW_,
+                                   dilationH_,
+                                   dilationW_);
 
   // initialize all to default algorithms
   fwdAlgo_ = 0;
@@ -87,9 +94,6 @@ void ConvBaseProjection::initCudnn() {
   bwdDataLimitBytes_ = 0;
   bwdFilterLimitBytes_ = 0;
   workSpaceInBytes_ = 0;
-
-  batchNum_ = 0;
-  isSelectAlgo_ = false;
 }
 
 void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
@@ -134,7 +138,9 @@ void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
                                   paddingH_,
                                   paddingW_,
                                   strideH_,
-                                  strideW_);
+                                  strideW_,
+                                  dilationH_,
+                                  dilationW_);
 }
 
 void ConvBaseProjection::reshape(int batchSize) {
@@ -142,47 +148,45 @@ void ConvBaseProjection::reshape(int batchSize) {
   CHECK_EQ(width, out_->value->getWidth());
   CHECK_EQ(calInputSize(), in_->value->getWidth());
 
-  isSelectAlgo_ = (batchSize == batchNum_);
-  batchNum_ = batchSize;
-
-  if (!isSelectAlgo_) {
-    reshapeTensorDesc(batchSize);
-    hl_conv_workspace(imageDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    workSpaceInBytes_ = maxWorkSpace;
-
-    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
+  reshapeTensorDesc(batchSize);
+  bool useDilation = false;
+  if (dilationH_ > 1 || dilationW_ > 1) {
+    useDilation = true;
   }
-
-  isSelectAlgo_ = true;
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_,
+                    useDilation);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  workSpaceInBytes_ = maxWorkSpace;
+
+  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
+  std::vector<MemoryHandlePtr> &convMem = *convMem_;
   if (convMem.empty()) {
     int numDevices = hl_get_device_count();
     convMem.resize(numDevices);
   }
 
   int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
+  MemoryHandlePtr &localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    localMem = std::make_shared<GpuMemoryHandle>(size);
   }
-  return (*localMem)->getBuf();
+  return localMem->getBuf();
 }
 
 ConvBaseProjection::~ConvBaseProjection() {
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index 4a33aa1837..bb7ffa627b 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -63,6 +63,7 @@ protected:
   int configChannels_, configNumFilters_;
   int paddingH_, paddingW_;
   int strideH_, strideW_;
+  int dilationH_, dilationW_;
   int filterH_, filterW_;
   /// One group offset of input data.
   int inputOffset_;
@@ -101,16 +102,10 @@ protected:
   size_t bwdFilterLimitBytes_;
   /// Size of total work space.
   size_t workSpaceInBytes_;
-
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
   bool bias_;
 
   std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  static ThreadLocalD<std::vector<MemoryHandlePtr>> convMem_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 5b7ecc5560..6f0106b713 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -25,12 +25,12 @@ size_t ConvProjection::calOutputSize() {
   if (imageH_ == 0) imageH_ = configImgH_;
   if (imageW_ == 0) imageW_ = configImgW_;
   outputH_ = outputSize(imageH_,
-                        filterH_,
+                        (filterH_ - 1) * dilationH_ + 1,
                         paddingH_,
                         strideH_,
                         /* caffeMode */ true);
   outputW_ = outputSize(imageW_,
-                        filterW_,
+                        (filterW_ - 1) * dilationW_ + 1,
                         paddingW_,
                         strideW_,
                         /* caffeMode */ true);
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
index 48132a3ce4..e7f081c023 100644
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() {
   if (outputH_ == 0) outputH_ = configOutH_;
   if (outputW_ == 0) outputW_ = configOutW_;
   imageH_ = imageSize(outputH_,
-                      filterH_,
+                      (filterH_ - 1) * dilationH_ + 1,
                       paddingH_,
                       strideH_,
                       /* caffeMode */ true);
 
   imageW_ = imageSize(outputW_,
-                      filterW_,
+                      (filterW_ - 1) * dilationW_ + 1,
                       paddingW_,
                       strideW_,
                       /* caffeMode */ true);
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 6bfdea3c6e..0bb6f84c22 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -462,8 +462,8 @@ void LambdaCost::calcGrad(const real* outputScore,
       real score_j = score[index_j];
       real dcgDif = 0;
       if (j < sortSize) {
-        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) /
-                 (std::log(i + 2) - std::log(j + 2));
+        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
+                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
       } else {
         dcgDif =
             (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
@@ -572,13 +572,8 @@ void MultiBinaryLabelCrossEntropy::backwardImp(Matrix& output,
   }
 }
 
-//
-// Huber loss for robust 2-classes classification
-//
-REGISTER_LAYER(huber, HuberTwoClass);
-
-bool HuberTwoClass::init(const LayerMap& layerMap,
-                         const ParameterMap& parameterMap) {
+bool HuberCost::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
   CostLayer::init(layerMap, parameterMap);
   if (useGpu_) {
     tmpCpuInput_.reserve(inputLayers_.size());
@@ -589,7 +584,7 @@ bool HuberTwoClass::init(const LayerMap& layerMap,
   return true;
 }
 
-void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
+void HuberCost::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
   if (useGpu_) {
     for (size_t i = 0; i < inputLayers_.size(); i++) {
       tmpCpuInput_[i].resizeAndCopyFrom(
@@ -597,13 +592,87 @@ void HuberTwoClass::forwardImp(Matrix& output, Argument& label, Matrix& cost) {
     }
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
-  forwardImpIn(output, label, cost);
 }
 
-void HuberTwoClass::forwardImpIn(Matrix& output,
-                                 Argument& label,
-                                 Matrix& target) {
+//
+// Huber loss for robust regression.
+//
+REGISTER_LAYER(huber_regression, HuberRegressionLoss);
+
+bool HuberRegressionLoss::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  HuberCost::init(layerMap, parameterMap);
+  delta_ = config_.delta();
+  return true;
+}
+
+void HuberRegressionLoss::forwardImp(Matrix& output,
+                                     Argument& label,
+                                     Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
+  size_t numSamples = target.getHeight();
+  size_t dim = output.getWidth();
+  CHECK(label.value);
+  CHECK_EQ((*label.value).getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(dim, (*label.value).getWidth());
+  CHECK_EQ(target.getWidth(), (size_t)1);
+
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  std::vector<real> cost(numSamples, 0);
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = std::abs(lbl[index] - out[index]);
+      if (a <= delta_)
+        cost[i] += a * a / 2;
+      else
+        cost[i] += delta_ * (a - delta_ / 2);
+    }
+  }
+  target.copyFrom(cost.data(), numSamples);
+}
+
+void HuberRegressionLoss::backwardImp(Matrix& output,
+                                      Argument& label,
+                                      Matrix& outputG) {
+  size_t numSamples = output.getHeight();
+  size_t dim = output.getWidth();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  real* lbl =
+      useGpu_ ? tmpCpuInput_[1].value->getData() : (*label.value).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
+  for (size_t i = 0; i < numSamples; ++i) {
+    for (size_t j = 0; j < dim; ++j) {
+      int index = i * dim + j;
+      real a = lbl[index] - out[index];
+      if (std::abs(a) <= delta_)
+        grad[index] += -a;
+      else
+        grad[index] += a > 0 ? -delta_ : delta_;
+    }
+  }
+  if (useGpu_) outputG.copyFrom(grad, numSamples * dim);
+}
+
+//
+// Huber loss for robust 2-classes classification
+//
+REGISTER_LAYER(huber_classification, HuberTwoClassification);
+
+bool HuberTwoClassification::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  return HuberCost::init(layerMap, parameterMap);
+}
+
+void HuberTwoClassification::forwardImp(Matrix& output,
+                                        Argument& label,
+                                        Matrix& target) {
+  HuberCost::forwardImp(output, label, target);
   size_t numSamples = target.getHeight();
+  CHECK(label.ids);
   CHECK_EQ((*label.ids).getSize(), numSamples);
   CHECK_EQ(output.getHeight(), numSamples);
   CHECK_EQ(output.getWidth(), (size_t)1);
@@ -611,47 +680,35 @@ void HuberTwoClass::forwardImpIn(Matrix& output,
 
   real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
   int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
-  std::vector<real> cost(numSamples);
+  std::vector<real> cost(numSamples, 0);
   for (size_t i = 0; i < numSamples; ++i) {
     int y = 2 * lbl[i] - 1;
-    if (out[i] * y < -1)
-      cost[i] = -4 * out[i] * y;
-    else if (out[i] * y < 1)
-      cost[i] = (1 - out[i] * y) * (1 - out[i] * y);
-    else
-      cost[i] = 0;
+    real a = out[i] * y;
+    if (a < -1)
+      cost[i] = -4 * a;
+    else if (a < 1)
+      cost[i] = (1 - a) * (1 - a);
   }
   target.copyFrom(cost.data(), numSamples);
 }
 
-void HuberTwoClass::backwardImp(Matrix& outputValue,
-                                Argument& label,
-                                Matrix& outputGrad) {
-  if (useGpu_) {
-    backwardImpIn(
-        *tmpCpuInput_[0].value, tmpCpuInput_[1], *tmpCpuInput_[0].grad);
-    outputGrad.copyFrom(*tmpCpuInput_[0].grad);
-  } else {
-    backwardImpIn(outputValue, label, outputGrad);
-  }
-}
-
-void HuberTwoClass::backwardImpIn(Matrix& output,
-                                  Argument& label,
-                                  Matrix& outputG) {
+void HuberTwoClassification::backwardImp(Matrix& output,
+                                         Argument& label,
+                                         Matrix& outputG) {
   size_t numSamples = output.getHeight();
-  real* out = output.getData();
-  real* grad = outputG.getData();
-  int* lbl = (*label.ids).getData();
+  real* out = useGpu_ ? tmpCpuInput_[0].value->getData() : output.getData();
+  int* lbl = useGpu_ ? tmpCpuInput_[1].ids->getData() : (*label.ids).getData();
+  real* grad = useGpu_ ? tmpCpuInput_[0].grad->getData() : outputG.getData();
   for (size_t i = 0; i < numSamples; ++i) {
     int y = 2 * lbl[i] - 1;
-    if (y * out[i] < -1)
+    real a = out[i] * y;
+    if (a < -1)
       grad[i] += -4 * y;
-    else if (y * out[i] < 1)
-      grad[i] += -2 * (1 - y * out[i]) * y;
+    else if (a < 1)
+      grad[i] += -2 * (1 - a) * y;
   }
+  if (useGpu_) outputG.copyFrom(grad, numSamples);
 }
-
 /**
  * This cost layer compute the sum of its input as loss.
  * \f[
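Note: the LambdaCost hunk above fixes the delta-DCG term of the LambdaRank gradient — swapping the documents at ranks i and j changes DCG by the gain difference times the difference of positional discounts, i.e. a product of reciprocal logs rather than a quotient of log differences. A scalar sketch (natural log, as in the code):

```cpp
#include <cmath>

// Delta-DCG of swapping two documents with scores scoreI, scoreJ at
// zero-based ranks i and j (both inside the sorted prefix).
double dcgDifSketch(double scoreI, double scoreJ, int i, int j) {
  double gainDif = std::pow(2.0, scoreI) - std::pow(2.0, scoreJ);
  double discountDif = 1.0 / std::log(i + 2.0) - 1.0 / std::log(j + 2.0);
  return gainDif * discountDif;
}
```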
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 14c0b33ec1..0f655b48ee 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -304,37 +304,70 @@ public:
                    Matrix& outputGrad) override;
 };
 
-/**
- * Huber loss for robust 2-classes classification.
- *
- * For label={0, 1}, let y=2*label-1. Given output f, the loss is:
- * \f[
- * Loss =
- * \left\{\begin{matrix}
- *  4 * y * f     &   \textit{if}  \ \  y* f < -1 \\
- *  (1 - y * f)^2 &  \textit{if}   \ \  -1 < y * f < 1  \\
- *  0             &                    \textit{otherwise}
- * \end{matrix}\right.
- * \f]
+/*
+ * A base layer for HuberRegressionLoss and HuberTwoClassification.
  */
-class HuberTwoClass : public CostLayer {
+class HuberCost : public CostLayer {
+public:
   std::vector<Argument> tmpCpuInput_;
 
-public:
-  explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
+  explicit HuberCost(const LayerConfig& config) : CostLayer(config) {}
 
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
 
   void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override {}
+};
+
+/**
+ * Huber loss for robust regression.
+ *
+ * Given output f(x), label y and delta, the loss is:
+ * Loss = 0.5 * (y - f)^2, if abs(y - f) <= delta \\
+ * Loss = delta * abs(y - f) - 0.5 * delta^2, otherwise
+ */
+class HuberRegressionLoss : public HuberCost {
+public:
+  explicit HuberRegressionLoss(const LayerConfig& config) : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
   void backwardImp(Matrix& outputValue,
                    Argument& label,
                    Matrix& outputGrad) override;
 
-  void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+protected:
+  real delta_;
+};
+
+/**
+ * Huber loss for robust 2-classes classification.
+ *
+ * For label={0, 1}, let y=2*label-1. Given output f(x), the loss is:
+ * Loss = -4 * y * f, if y * f < -1 \\
+ * Loss = (1 - y * f)^2, if -1 < y * f < 1  \\
+ * Loss = 0, otherwise
+ */
+class HuberTwoClassification : public HuberCost {
+public:
+  explicit HuberTwoClassification(const LayerConfig& config)
+      : HuberCost(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 typedef std::shared_ptr<CostLayer> CostLayerPtr;
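Note: as a scalar cross-check of the two cost layers declared above, their losses reduce to the following (mirrors the forwardImp() implementations in CostLayer.cpp; illustration only, not Paddle's API):

```cpp
#include <cmath>

// Huber regression loss for one output/label pair.
float huberRegressionLoss(float f, float y, float delta) {
  float a = std::fabs(y - f);
  return a <= delta ? 0.5f * a * a : delta * (a - 0.5f * delta);
}

// Huber two-class loss; label is in {0, 1}, mapped to y in {-1, +1}.
float huberTwoClassLoss(float f, int label) {
  int y = 2 * label - 1;
  float a = f * y;
  if (a < -1.0f) return -4.0f * a;
  if (a < 1.0f) return (1.0f - a) * (1.0f - a);
  return 0.0f;
}
```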
diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/gserver/layers/CropLayer.cpp
new file mode 100644
index 0000000000..69ad913420
--- /dev/null
+++ b/paddle/gserver/layers/CropLayer.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(crop, CropLayer);
+
+bool CropLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
+  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
+  crop_axis_ = config_.axis();
+  for (int i = 0; i < config_.offset_size(); i++) {
+    crop_offsets_.push_back(config_.offset(i));
+  }
+
+  // 1. get input_0 shape
+  auto& input0_img_conf = config_.inputs(0).image_conf();
+  inDims_ = TensorShape({0,
+                         input0_img_conf.channels(),
+                         input0_img_conf.has_img_size_y()
+                             ? input0_img_conf.img_size_y()
+                             : input0_img_conf.img_size(),
+                         input0_img_conf.img_size()});
+  // 2. get target dims from config
+  if (config_.inputs_size() == 1) {
+    targetDims_ = TensorShape({config_.shape(0),
+                               config_.shape(1),
+                               config_.shape(2),
+                               config_.shape(3)});
+  } else {
+    // otherwise, get target dims from input_1's shape
+    auto& input1_img_conf = config_.inputs(1).image_conf();
+    targetDims_ = TensorShape({0,
+                               input1_img_conf.channels(),
+                               input1_img_conf.has_img_size_y()
+                                   ? input1_img_conf.img_size_y()
+                                   : input1_img_conf.img_size(),
+                               input1_img_conf.img_size()});
+  }
+
+  // 3. get final crop corner
+  int dimSize = 4;
+  crop_corner_ = {0, 0, 0, 0};
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      if (crop_offsets_.size() > 1) {
+        crop_corner_[i] = crop_offsets_[i - crop_axis_];
+      } else {
+        crop_corner_[i] = crop_offsets_[0];
+      }
+    }
+  }
+
+  outDims_ = TensorShape(4);
+
+  createFunction(
+      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
+  createFunction(
+      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
+
+  return true;
+}
+
+void CropLayer::setOutDims() {
+  MatrixPtr input = inputLayers_[1]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  // get target dims from input_1
+  if (config_.inputs_size() == 2) {
+    targetDims_.setDim(0, batchSize);
+    int ch = config_.inputs(0).image_conf().channels();
+    if (ch != 0) targetDims_.setDim(1, ch);
+    int h = inputLayers_[1]->getOutput().getFrameHeight();
+    if (h != 0) targetDims_.setDim(2, h);
+    int w = inputLayers_[1]->getOutput().getFrameWidth();
+    if (w != 0) targetDims_.setDim(3, w);
+  }
+  // get final crop shape from target dims and crop axis
+  std::vector<uint32_t> crop_shape;
+  int dimSize = 4;
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      crop_shape.push_back(targetDims_[i]);
+    } else {
+      crop_shape.push_back(inDims_[i]);
+    }
+  }
+
+  outDims_.reshape(
+      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
+  output_.setFrameHeight(crop_shape[2]);
+  output_.setFrameWidth(crop_shape[3]);
+}
+
+void CropLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+}
+
+void CropLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(outDims_[0], size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("CropForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void CropLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h
new file mode 100644
index 0000000000..6b62026210
--- /dev/null
+++ b/paddle/gserver/layers/CropLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer crops the input according to the specified
+ *         configuration.
+ *         input_0: input to be cropped
+ *         input_1: optional reference input
+ *         axis: first dimension to be cropped
+ *         offset: cropping offset for each dimension
+ *         shape: if no reference input layer is set, crop the input
+ *                to the shape given in the config
+class CropLayer : public Layer {
+public:
+  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CropLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  void setOutDims();
+  void setInDims();
+
+  int32_t crop_axis_;
+  std::vector<uint32_t> crop_offsets_;
+  std::vector<uint32_t> crop_corner_;
+  TensorShape inDims_;
+  TensorShape targetDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
index 3fbccc1103..d72503217f 100644
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -36,6 +36,16 @@ MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
       data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
 }
 
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 void CrossChannelNormLayer::forward(PassType passType) {
   Layer::forward(passType);
   MatrixPtr inV = getInputValue(0);
@@ -51,9 +61,7 @@ void CrossChannelNormLayer::forward(PassType passType) {
   Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
   Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
   Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
-  normBuffer_->zeroMem();
-  // add eps to avoid overflow
-  normBuffer_->addScalar(*normBuffer_, 1e-6);
+
   inV->square2(*dataBuffer_);
   for (size_t i = 0; i < batchSize; i++) {
     const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
@@ -63,6 +71,8 @@ void CrossChannelNormLayer::forward(PassType passType) {
 
     // compute norm.
     spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // add eps to avoid overflow
+    spatialBuffer_->add(1e-6);
     spatialBuffer_->sqrt2(*spatialBuffer_);
     normTmp->copyFrom(*spatialBuffer_);
     outVTmp->copyFrom(*inVTmp);
@@ -82,6 +92,9 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
   size_t dataDim = inG->getWidth();
   size_t spatialDim = dataDim / channels_;
 
+  MatrixPtr inGBuffer;
+  Matrix::resizeOrCreate(inGBuffer, channels_, spatialDim, false, useGpu_);
+
   dataBuffer_->dotMul(*outG, *outV);
   Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
   Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
@@ -100,22 +113,24 @@ void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
     scaleDiff_->add(*channelBuffer_, 1.);
 
     sampleBuffer_->dotMul(*inVTmp, *outGTmp);
-    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
     // scale the grad
-    inGTmp->copyFrom(*inVTmp);
-    inGTmp->mulRowVector(*spatialBuffer_);
+    inGBuffer->copyFrom(*inVTmp);
+    inGBuffer->mulRowVector(*spatialBuffer_);
     // divide by square of norm
     spatialBuffer_->dotMul(*normTmp, *normTmp);
-    inGTmp->divRowVector(*spatialBuffer_);
+    inGBuffer->divRowVector(*spatialBuffer_);
     // subtract
-    inGTmp->add(*outGTmp, -1, 1);
+    inGBuffer->add(*outGTmp, -1, 1);
     // divide by norm
-    inGTmp->divRowVector(*normTmp);
+    inGBuffer->divRowVector(*normTmp);
     // scale the diff
-    inGTmp->mulColVector(*scale_->getW());
+    inGBuffer->mulColVector(*scale_->getW());
+
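+    // Note: accumulating through inGBuffer and then add()-ing it into
+    // inGTmp keeps any gradient already stored in the input grad intact,
+    // instead of overwriting it as the previous copyFrom() did.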
+    inGTmp->add(*inGBuffer);
   }
   // updata scale
-  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  if (scale_->getWGrad()) scale_->getWGrad()->add(*scaleDiff_);
   scale_->getParameterPtr()->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.cpp b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
new file mode 100644
index 0000000000..578bdbbe72
--- /dev/null
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.cpp
@@ -0,0 +1,393 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CrossEntropyOverBeam.h"
+
+namespace paddle {
+
+void CostForOneSequence::calValidExpandStep() {
+  validExpansionCount_ = 0;
+  goldAsExtraPath_ = true;
+
+  for (size_t i = 0; i < beams_->expansionCount; ++i) {
+    real gold = static_cast<real>(beams_->gold[i]);
+    if (i) {
+      real* start = beams_->candidateIds[i - 1]->getData();
+      goldRowIds_[i] = std::count_if(
+          start,
+          start + goldRowIds_[i - 1] * beamSize_ + goldColIds_[i - 1],
+          [](const real& val) { return val != -1.; });
+    } else {
+      goldRowIds_[i] = 0;
+    }
+
+    real* start =
+        beams_->candidateIds[i]->getData() + goldRowIds_[i] * beamSize_;
+    real* findEnd = std::find(start, start + beamSize_, gold);
+    validExpansionCount_++;
+
+    if (start + beamSize_ == findEnd) return;
+    goldColIds_[i] = findEnd - start;
+  }
+  if (goldColIds_[beams_->expansionCount - 1] != -1) goldAsExtraPath_ = false;
+}
+
+size_t CostForOneSequence::initLastExpansion() {
+  int beamId = validExpansionCount_ - 1;
+  const MatrixPtr candidates = beams_->candidateIds[beamId];
+  size_t height = candidates->getHeight();
+
+  /* initialize the last expansion. */
+  size_t pathCount = std::count_if(candidates->getData(),
+                                   candidates->getData() + height * beamSize_,
+                                   [](const real& val) { return val != -1; });
+  /*
+   * if the gold sequence falls off the beam during search, add the gold
+   * sequence as the last path into all the expanded candidates.
+   */
+  if (goldAsExtraPath_) goldIdsInFinalExpansion_ = pathCount++;
+
+  pathRowIdsInEachBeam_.clear();
+  pathRowIdsInEachBeam_.resize(validExpansionCount_,
+                               std::vector<int>(pathCount, 0));
+  parentIdsInBeam_.clear();
+  parentIdsInBeam_.resize(pathCount, 0);
+
+  if (goldAsExtraPath_) {
+    /* add gold sequence into the total expansion. */
+    pathRowIdsInEachBeam_[beamId].back() =
+        beams_->gold[beamId] +
+        getSeqStartPos(beamId, goldRowIds_[validExpansionCount_ - 1]);
+    parentIdsInBeam_.back() = goldRowIds_[validExpansionCount_ - 1];
+  } else {
+    size_t goldOffset = goldRowIds_[beamId] * beamSize_ + goldColIds_[beamId];
+    goldIdsInFinalExpansion_ =
+        std::count_if(candidates->getData(),
+                      candidates->getData() + goldOffset,
+                      [](const real& val) { return val != -1.; });
+  }
+
+  /*
+   * TODO(caoying): fix this, store the indices of selected candidate
+   * paths into Argument.ids
+   */
+  real* ids = candidates->getData();
+  size_t curIdx = 0;
+  for (size_t i = 0; i < height; ++i) {
+    int basePos = getSeqStartPos(beamId, i);
+    for (size_t j = 0; j < beamSize_; ++j) {
+      int id = ids[i * beamSize_ + j];
+      if (id == -1) continue;
+      pathRowIdsInEachBeam_[beamId][curIdx] = id + basePos;
+      parentIdsInBeam_[curIdx++] = i;
+    }
+  }
+  return pathCount;
+}
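+
+/*
+ * A reading of the bookkeeping above: pathRowIdsInEachBeam_[beamId][k]
+ * stores the row (into the scores of expansion beamId) selected by path k,
+ * and parentIdsInBeam_[k] stores the row of the candidate matrix that
+ * path k was expanded from. constructTotalExpansion() below fills in the
+ * earlier expansions by walking these parent ids backwards.
+ */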
+
+void CostForOneSequence::constructTotalExpansion() {
+  /*
+   * construct the entire expanded beam, beginning with the last search
+   * step in which the gold sequence falls off the beam.
+   */
+  size_t totalPathCount = initLastExpansion();
+
+  for (int beamId = validExpansionCount_ - 2; beamId >= 0; --beamId) {
+    const MatrixPtr candidates = beams_->candidateIds[beamId];
+    real* ids = candidates->getData();
+
+    int lastParentIdInBeam = -1;
+    int basePos = -1;
+    for (size_t i = 0;
+         i < (goldAsExtraPath_ ? totalPathCount - 1 : totalPathCount);
+         ++i) {
+      int id = ids[parentIdsInBeam_[i]];
+      int parentRowId = std::div(parentIdsInBeam_[i], beamSize_).quot;
+      if (parentIdsInBeam_[i] != lastParentIdInBeam)
+        basePos = getSeqStartPos(beamId, parentRowId);
+
+      pathRowIdsInEachBeam_[beamId][i] = id + basePos;
+      lastParentIdInBeam = parentIdsInBeam_[i];
+      parentIdsInBeam_[i] = parentRowId;
+
+      if (goldAsExtraPath_)
+        pathRowIdsInEachBeam_[beamId][totalPathCount - 1] =
+            beams_->gold[beamId] + getSeqStartPos(beamId, goldRowIds_[beamId]);
+    }
+  }
+}
+
+real CostForOneSequence::globallyNormalizedScore() {
+  expandedPathScores_.resize(validExpansionCount_);
+
+  Matrix::resizeOrCreate(
+      softmaxOut_, 1, pathRowIdsInEachBeam_[0].size(), false, false);
+  softmaxOut_->zeroMem();
+  MatrixPtr tmp = Matrix::create(
+      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
+
+  for (size_t i = 0; i < validExpansionCount_; ++i) {
+    Matrix::resizeOrCreate(expandedPathScores_[i],
+                           pathRowIdsInEachBeam_[i].size(),
+                           1,
+                           false,
+                           false);
+    expandedPathScores_[i]->zeroMem();
+
+    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
+                                        pathRowIdsInEachBeam_[i].size(),
+                                        false);
+    expandedPathScores_[i]->selectRows(*(beams_->scores[i]), *rowIds);
+    tmp->add(*expandedPathScores_[i]);
+  }
+
+  softmaxOut_->softmax(*softmaxOut_);
+  return -std::log(softmaxOut_->getData()[goldIdsInFinalExpansion_]);
+}
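+
+/*
+ * In short: the path scores are summed over all valid expansions into
+ * softmaxOut_, normalized with a softmax, and the cost is the negative
+ * log-likelihood of the gold path:
+ *
+ *   cost = -log( exp(s_gold) / sum_j exp(s_j) )
+ */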
+
+real CostForOneSequence::forward() {
+  calValidExpandStep();
+  constructTotalExpansion();
+  return globallyNormalizedScore();
+}
+
+void CostForOneSequence::backward() {
+  /*
+   * When the softmax layer is the output layer and is combined with
+   * cross-entropy as the cost, the derivative with respect to the
+   * softmax input is simply:
+   *
+   * grad_i = softmax_out_i - target_i,
+   *
+   * where a hard (one-hot) label is used here.
+   */
+  softmaxOut_->getData()[goldIdsInFinalExpansion_] -= 1.;
+
+  MatrixPtr tmp = Matrix::create(
+      softmaxOut_->getData(), softmaxOut_->getWidth(), 1, false, false);
+
+  for (size_t i = 0; i < validExpansionCount_; ++i) {
+    IVectorPtr rowIds = IVector::create(pathRowIdsInEachBeam_[i].data(),
+                                        pathRowIdsInEachBeam_[i].size(),
+                                        false);
+    /*
+      beams_->scoreGrad[i] has been initialized outside this class; this
+      class only keeps a pointer to the original input gradients,
+      so there is no need to allocate or initialize memory here.
+    */
+    tmp->addToRows(*beams_->scoreGrad[i], *rowIds);
+  }
+}
+
+REGISTER_LAYER(cross_entropy_over_beam, CrossEntropyOverBeam);
+
+bool CrossEntropyOverBeam::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(0U, inputLayers_.size() % 3) << "Error input number.";
+
+  beamExpanCount_ = inputLayers_.size() / 3;
+
+  candidateScores_.resize(beamExpanCount_);
+  candidateScoreGrad_.resize(beamExpanCount_);
+
+  candidateInBeam_.resize(beamExpanCount_);
+  goldSequence_.resize(beamExpanCount_);
+  gradToInputs_.resize(beamExpanCount_);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
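+
+/*
+ * The inputs of this layer come in triples; for the i-th beam expansion
+ * (as checked in checkInputs() below):
+ *   input[3 * i]     : candidate scores, a (nested) sequence of width 1,
+ *   input[3 * i + 1] : ids of the selected candidates, width = beamSize_,
+ *   input[3 * i + 2] : the gold sequence.
+ */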
+
+void CrossEntropyOverBeam::checkInputs() {
+  batchSize_ = 0;
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    const Argument& scores = getInput(i * 3);
+    const Argument& selCandidates = getInput(i * 3 + 1);
+    const Argument& goldSeq = getInput(i * 3 + 2);
+
+    if (i) {
+      CHECK(scores.hasSubseq()) << "input " << i << " "
+                                << inputLayers_[i * 3]->getName()
+                                << " should be a nested sequence";
+      CHECK_EQ(getInputValue(i * 3 + 1)->getWidth(), beamSize_);
+      CHECK_EQ(batchSize_, static_cast<size_t>(scores.getNumSequences()));
+      CHECK_EQ(scores.getNumSubSequences(), selCandidates.getBatchSize());
+    } else {
+      CHECK(scores.hasSeq()) << "input " << i << " "
+                             << inputLayers_[i]->getName()
+                             << " should be a sequence";
+      batchSize_ = scores.getNumSequences();
+      beamSize_ = getInputValue(i * 3 + 1)->getWidth();
+      CHECK_EQ(batchSize_, static_cast<size_t>(selCandidates.getBatchSize()));
+    }
+    CHECK_EQ(1U, scores.value->getWidth());
+    CHECK_EQ(batchSize_, static_cast<size_t>(goldSeq.getBatchSize()));
+  }
+}
+
+void CrossEntropyOverBeam::copyInputsToCpu() {
+  auto copyValue = [](const MatrixPtr& src, MatrixPtr& trg) {
+    if (dynamic_cast<GpuMatrix*>(src.get())) {
+      Matrix::resizeOrCreate(
+          trg, src->getHeight(), src->getWidth(), false, false);
+      trg->copyFrom(*src);
+    } else {
+      trg = std::move(src);
+    }
+  };
+
+  auto copyIds = [](const IVectorPtr& src, IVectorPtr& trg) {
+    if (dynamic_cast<GpuIVector*>(src.get())) {
+      IVector::resizeOrCreate(trg, src->getSize(), false);
+      trg->copyFrom(*src);
+    } else {
+      trg = std::move(src);
+    }
+  };
+
+  beamSplitPos_.clear();
+  beamSplitPos_.resize(batchSize_, std::vector<int>(beamExpanCount_, 0));
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    copyValue(getInputValue(i * 3), candidateScores_[i]);
+    copyValue(getInputValue(i * 3 + 1), candidateInBeam_[i]);
+    copyIds(getInput(i * 3 + 2).ids, goldSequence_[i]);
+
+    if (i) {
+      ICpuGpuVectorPtr seqInfo = getInput(i * 3).sequenceStartPositions;
+      const int* seqStarts = seqInfo->getMutableData(false);
+      ICpuGpuVectorPtr subSeqInfo = getInput(i * 3).subSequenceStartPositions;
+      const int* subSeqStarts = subSeqInfo->getMutableData(false);
+
+      size_t seqId = 1;
+      for (size_t subSeqId = 0; subSeqId < subSeqInfo->getSize() - 1;
+           ++subSeqId) {
+        CHECK_LT(seqId, seqInfo->getSize());
+        if (subSeqStarts[subSeqId] == seqStarts[seqId]) {
+          beamSplitPos_[seqId][i] = beamSplitPos_[seqId - 1][i];
+          seqId++;
+        }
+        beamSplitPos_[seqId - 1][i]++;
+      }
+    } else {
+      for (size_t j = 0; j < batchSize_; ++j) beamSplitPos_[j][i] = j + 1;
+    }
+  }
+}
+
+void CrossEntropyOverBeam::splitBatchBeams() {
+  beamCosts_.resize(batchSize_);
+  beamPerSeq_.resize(batchSize_, BeamExpansion(beamExpanCount_));
+
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    int* seqStarts =
+        getInput(i * 3).sequenceStartPositions->getMutableData(false);
+
+    int* subSeqStarts = nullptr;
+    int maxLen = 0;
+    if (i) {
+      subSeqStarts =
+          getInput(i * 3).subSequenceStartPositions->getMutableData(false);
+      maxLen = getInput(i * 3).subSequenceStartPositions->getSize() - 1;
+    } else {
+      maxLen = getInput(i).sequenceStartPositions->getSize() - 1;
+    }
+
+    for (size_t j = 0; j < batchSize_; ++j) {
+      beamPerSeq_[j].scores[i] =
+          Matrix::create(candidateScores_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+      beamPerSeq_[j].scoreGrad[i] =
+          Matrix::create(candidateScoreGrad_[i]->getData() + seqStarts[j],
+                         seqStarts[j + 1] - seqStarts[j],
+                         1,
+                         false,
+                         false);
+
+      int offset = j ? beamSplitPos_[j - 1][i] : 0;
+      int height = beamSplitPos_[j][i] - (j ? beamSplitPos_[j - 1][i] : 0);
+      CHECK_GE(maxLen, offset + height);
+      beamPerSeq_[j].seqInfo[i] = IVector::create(
+          (i ? subSeqStarts : seqStarts) + offset, height + 1, false);
+
+      beamPerSeq_[j].candidateIds[i] =
+          Matrix::create(candidateInBeam_[i]->getData() + offset * beamSize_,
+                         height,
+                         beamSize_,
+                         false,
+                         false);
+      beamPerSeq_[j].gold[i] = goldSequence_[i]->getData()[j];
+
+      CHECK_LE(beamPerSeq_[j].gold[i], seqStarts[j + 1] - seqStarts[j]);
+    }
+  }
+}
+
+void CrossEntropyOverBeam::resizeOutput() {
+  Matrix::resizeOrCreate(output_.value, batchSize_, 1, false, false);
+  output_.value->zeroMem();
+
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    MatrixPtr inGrad = getInputGrad(i * 3);
+    if (dynamic_cast<GpuMatrix*>(inGrad.get())) {
+      Matrix::resizeOrCreate(candidateScoreGrad_[i],
+                             inGrad->getHeight(),
+                             inGrad->getWidth(),
+                             false,
+                             false);
+    } else {
+      candidateScoreGrad_[i] = std::move(inGrad);
+    }
+    candidateScoreGrad_[i]->zeroMem();
+  }
+}
+
+void CrossEntropyOverBeam::copyGradToGpu(size_t copyCount) {
+  for (size_t i = 0; i < beamExpanCount_; ++i) {
+    if (dynamic_cast<GpuMatrix*>(getInputGrad(i * 3).get()))
+      getInputGrad(i * 3)->copyFrom(*candidateScoreGrad_[i]);
+
+    if (i == copyCount - 1) break;
+  }
+}
+
+void CrossEntropyOverBeam::forward(PassType passType) {
+  Layer::forward(passType);
+
+  checkInputs();
+  copyInputsToCpu();
+
+  resizeOutput();
+  splitBatchBeams();
+
+  MatrixPtr outputValue = getOutputValue();
+  for (size_t i = 0; i < batchSize_; ++i) {
+    BeamExpansionPtr ptr = std::make_shared<BeamExpansion>(beamPerSeq_[i]);
+    beamCosts_[i].setData(std::move(ptr), beamSize_);
+    outputValue->getData()[i] = beamCosts_[i].forward();
+  }
+}
+
+void CrossEntropyOverBeam::backward(const UpdateCallback& callback) {
+  for (size_t i = 0; i < batchSize_; ++i) {
+    beamCosts_[i].backward();
+    copyGradToGpu(beamCosts_[i].getValidExpansionCount());
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CrossEntropyOverBeam.h b/paddle/gserver/layers/CrossEntropyOverBeam.h
new file mode 100644
index 0000000000..5643556f43
--- /dev/null
+++ b/paddle/gserver/layers/CrossEntropyOverBeam.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "CrossEntropyOverBeam.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/* This struct stores the beams in all search steps for a single sequence. */
+struct BeamExpansion {
+  std::vector<MatrixPtr> scores;
+  std::vector<IVectorPtr> seqInfo;
+
+  std::vector<MatrixPtr> candidateIds;
+  std::vector<int> gold;
+
+  std::vector<MatrixPtr> scoreGrad;
+
+  size_t expansionCount;
+
+  explicit BeamExpansion(int n) {
+    expansionCount = n;
+    scores.resize(expansionCount);
+    seqInfo.resize(expansionCount);
+    candidateIds.resize(expansionCount);
+    scoreGrad.resize(expansionCount);
+
+    gold.resize(expansionCount);
+  }
+};
+typedef std::shared_ptr<BeamExpansion> BeamExpansionPtr;
+
+class CostForOneSequence {
+public:
+  CostForOneSequence()
+      : beamSize_(0), validExpansionCount_(0), goldAsExtraPath_(false) {}
+  void setData(const BeamExpansionPtr bPtr, size_t beamSize) {
+    beams_ = bPtr;
+    beamSize_ = beamSize;
+
+    expandedPathScores_.clear();
+    expandedPathScores_.resize(beams_->expansionCount);
+
+    goldRowIds_.clear();
+    goldRowIds_.resize(beams_->expansionCount, 0);
+    goldColIds_.clear();
+    goldColIds_.resize(beams_->expansionCount, -1);
+  }
+  size_t getValidExpansionCount() { return validExpansionCount_; }
+
+  real forward();
+  void backward();
+
+private:
+  void calValidExpandStep();
+  void constructTotalExpansion();
+  size_t initLastExpansion();
+  real globallyNormalizedScore();
+
+  int getSeqStartPos(size_t beamId, size_t rowId) {
+    CHECK_GT(beams_->seqInfo[beamId]->getSize() - 1, rowId);
+    int* starts = beams_->seqInfo[beamId]->getData();
+    return starts[rowId] - starts[0];
+  }
+
+  size_t beamSize_;
+  size_t validExpansionCount_;
+  bool goldAsExtraPath_;
+  std::vector<int> goldRowIds_;
+  std::vector<int> goldColIds_;
+
+  BeamExpansionPtr beams_;
+  std::vector<std::vector<int>> pathRowIdsInEachBeam_;
+  std::vector<int> parentIdsInBeam_;
+  size_t goldIdsInFinalExpansion_;
+
+  std::vector<MatrixPtr> expandedPathScores_;
+
+  MatrixPtr softmaxOut_;
+};
+
+class CrossEntropyOverBeam : public Layer {
+public:
+  explicit CrossEntropyOverBeam(const LayerConfig& config) : Layer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+
+private:
+  void checkInputs();
+  void copyInputsToCpu();
+  void resizeOutput();
+  void copyGradToGpu(size_t copyCount);
+  void splitBatchBeams();
+
+  size_t beamExpanCount_;
+  size_t batchSize_;
+  size_t beamSize_;
+
+  /*
+   * The process of constructing beams is not GPU-friendly, so currently
+   * this layer only runs on the CPU; if any of its inputs resides in GPU
+   * memory, it is copied to CPU memory first.
+   */
+  std::vector<MatrixPtr> candidateScores_;
+  std::vector<MatrixPtr> candidateScoreGrad_;
+  std::vector<MatrixPtr> candidateInBeam_;
+  std::vector<MatrixPtr> gradToInputs_;
+  std::vector<IVectorPtr> goldSequence_;
+  std::vector<std::vector<int>> beamSplitPos_;
+
+  /*
+   * split the entire batch of beams into per-sequence beams and store the
+   * result in this member.
+   */
+  std::vector<BeamExpansion> beamPerSeq_;
+  /* beamCosts_ is used to propagate the error within one sequence. */
+  std::vector<CostForOneSequence> beamCosts_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index 09dac05a7a..8390b55026 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -14,14 +14,13 @@ limitations under the License. */
 
 #include "CudnnBatchNormLayer.h"
 #include "Layer.h"
+#include "paddle/cuda/include/hl_batch_norm.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
 
 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
 
-const double CudnnBatchNormLayer::EPS = 1E-5;
-
 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -36,7 +35,7 @@ bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
 }
 
 void CudnnBatchNormLayer::reshape(int batchSize) {
-  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_, imageW_);
+  hl_tensor_reshape(ioDesc_, batchSize, channels_, imageH_ * imageD_, imageW_);
 }
 
 void CudnnBatchNormLayer::forward(PassType passType) {
@@ -60,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
   real* movingMean = movingMean_->getW()->getData();
   real* movingVar = movingVar_->getW()->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   if (!useGlobalStats_) {
     REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
     real* savedMean = savedMean_->getData();
@@ -74,21 +76,38 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                    1.0 - movingAvgFraction_,
                                    movingMean,
                                    movingVar,
-                                   EPS,
+                                   eps_,
                                    savedMean,
                                    savedInvVar);
   } else {
     // used movingMean and movingVar in testing
-    hl_batch_norm_forward_inference(ioDesc_,
-                                    input,
-                                    ioDesc_,
-                                    output,
-                                    bnParamDesc_,
-                                    gamma,
-                                    beta,
-                                    movingMean,
-                                    movingVar,
-                                    EPS);
+    if (batchSize <= 1024) {
+      hl_batch_norm_forward_inference(ioDesc_,
+                                      input,
+                                      ioDesc_,
+                                      output,
+                                      bnParamDesc_,
+                                      gamma,
+                                      beta,
+                                      movingMean,
+                                      movingVar,
+                                      eps_);
+    } else {
+      // There is a limitation in the cuDNN library: with cuDNN v5.1,
+      // cudnnBatchNormalizationForwardInference fails when the batch
+      // size is larger than 1024.
+      hl_batch_norm_cuda_inference(input,
+                                   output,
+                                   gamma,
+                                   beta,
+                                   movingMean,
+                                   movingVar,
+                                   eps_,
+                                   batchSize,
+                                   channels_,
+                                   imageH_ * imageD_,
+                                   imageW_);
+    }
   }
 
   /* activation */ {
@@ -110,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
   real* savedMean = savedMean_->getData();
   real* savedInvVar = savedInvVar_->getData();
 
+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
     Matrix::resizeOrCreate(m, h, w, false, true);
     m->zeroMem();
@@ -139,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                          gamma,
                          gammaGrad,
                          betaGrad,
-                         EPS,
+                         eps_,
                          savedMean,
                          savedInvVar);
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index 413efd4d3e..1a3f0c0cbf 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cudnn.h>
 #include "BatchNormBaseLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  /**
-   * Epsilon value used in the batch normalization formula.
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
-   * Same epsilon value should be used in forward and backward functions.
-   */
-  static const double EPS;
+  /// Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;
 
   /// Input/output tensor descriptor desc
   hl_tensor_descriptor ioDesc_;
diff --git a/paddle/gserver/layers/CudnnConvBaseLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
index 24363bb8b0..9e954615cd 100644
--- a/paddle/gserver/layers/CudnnConvBaseLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -46,8 +46,26 @@ bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
     projConf_.emplace_back(conf);
     projections_.emplace_back(
         Projection::create(*projConf_[i], parameters_[i], useGpu_));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[i] * filterChannels_[i];
+    width = (!isDeconv_) ? numFilters_ : channels_[i];
+    CHECK_EQ(parameters_[i]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[i]);
+    weights_.emplace_back(w);
   }
 
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
   if (biases_.get() && sharedBiases_) {
     hl_create_tensor_descriptor(&biasDesc_);
     hl_create_tensor_descriptor(&outputDesc_);
@@ -70,14 +88,8 @@ void CudnnConvBaseLayer::forward(PassType passType) {
   if (biases_) {
     REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
     int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-    int outH, outW;
-    if (isDeconv_) {
-      outH = imgSizeH_[0];
-      outW = imgSizeW_[0];
-    } else {
-      outH = outputH_[0];
-      outW = outputW_[0];
-    }
+    int outH = outputH_[0];
+    int outW = outputW_[0];
 
     hl_tensor_reshape(outputDesc_,
                       batchSize,
diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp
index 4adb2d4709..810a1af2d0 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@@ -29,9 +29,9 @@ bool CudnnPoolLayer::typeCheck(const std::string &poolType,
     if (mode) {
       *mode = HL_POOLING_AVERAGE;
     }
-  } else if (poolType == "cudnn-avg-excl-pad-pool") {
+  } else if (poolType == "cudnn-avg-incl-pad-pool") {
     if (mode) {
-      *mode = HL_POOLING_AVERAGE_EXCLUDE_PADDING;
+      *mode = HL_POOLING_AVERAGE_INCLUDE_PADDING;
     }
   } else {
     return false;
diff --git a/paddle/gserver/layers/DeConv3DLayer.cpp b/paddle/gserver/layers/DeConv3DLayer.cpp
new file mode 100644
index 0000000000..3eea638649
--- /dev/null
+++ b/paddle/gserver/layers/DeConv3DLayer.cpp
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DeConv3DLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(deconv3d, DeConv3DLayer);
+
+bool DeConv3DLayer::init(const LayerMap &layerMap,
+                         const ParameterMap &parameterMap) {
+  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
+  // For deconv, the kernel dimensions are
+  // channel * output * depth * height * width.
+  // Matrix storage format: (output * depth * height * width) x channel
+  for (int index = 0; index < config_.inputs().size(); ++index) {
+    M_.push_back(filterChannels_[index]);
+    K_.push_back(filterPixels_[index] * (numFilters_ / groups_[index]));
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * numFilters_;
+    width = filterChannels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+  }
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(numFilters_, 1, biasParameter_));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(getSize(), 1, biasParameter_));
+    }
+  }
+  return true;
+}
+
+size_t DeConv3DLayer::getSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  imgSizeW_.clear();
+  imgSizeH_.clear();
+  imgSizeD_.clear();
+  N_.clear();
+  NOut_.clear();
+  size_t layerSize = 0;
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    imgSizeW_.push_back(
+        imageSize(outputW_[i], filterSize_[i], padding_[i], stride_[i], true));
+    imgSizeH_.push_back(imageSize(
+        outputH_[i], filterSizeY_[i], paddingY_[i], strideY_[i], true));
+    imgSizeD_.push_back(imageSize(
+        outputD_[i], filterSizeZ_[i], paddingZ_[i], strideZ_[i], true));
+    NOut_.push_back(imgSizeD_[i] * imgSizeH_[i] * imgSizeW_[i]);
+    N_.push_back(outputD_[i] * outputH_[i] * outputW_[i]);
+    CHECK(layerSize == 0 || N_[i] * size_t(numFilters_) == layerSize);
+    layerSize += NOut_[i] * numFilters_;
+  }
+  getOutput().setFrameHeight(imgSizeH_[0]);
+  getOutput().setFrameWidth(imgSizeW_[0]);
+  getOutput().setFrameDepth(imgSizeD_[0]);
+  return layerSize;
+}
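+
+/*
+ * imageSize() above inverts the convolution output-size formula; under the
+ * usual caffe-style convention (an assumption of this note, not spelled
+ * out here) that means
+ *   imgSize = (output - 1) * stride + filterSize - 2 * padding.
+ */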
+
+void DeConv3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  int outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  const MatrixPtr outMat = getOutputValue();
+
+  REGISTER_TIMER_INFO("FwdDeConv3D", getName().c_str());
+  for (size_t i = 0; i != inputLayers_.size(); ++i) {
+    const MatrixPtr &inMat = getInputValue(i);
+    int M = M_[i];
+    int N = N_[i];
+    int K = K_[i];
+    MatrixPtr wMat = weights_[i]->getW();
+    Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+    for (int n = 0; n < batchSize; ++n) {
+      real *inData = inMat->getData() + n * inMat->getStride();
+      for (int g = 0; g < groups_[i]; ++g) {
+        MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+        MatrixPtr wMatSub = wMat->subMatrix(g * K, K);
+        MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+        colBufDataSub->mul(*wMatSub, *inMatSub, 1.0, 0.0);
+        inData += M * N;
+      }
+      colBuf_->col2Vol(outMat->getData() + n * outMat->getStride(),
+                       numFilters_,
+                       imgSizeD_[i],
+                       imgSizeH_[i],
+                       imgSizeW_[i],
+                       filterSizeZ_[i],
+                       filterSizeY_[i],
+                       filterSize_[i],
+                       strideZ_[i],
+                       strideY_[i],
+                       stride_[i],
+                       paddingZ_[i],
+                       paddingY_[i],
+                       padding_[i],
+                       1.0,
+                       1.0);
+    }
+  }
+  if (nullptr != this->biasParameter_) {
+    this->addBias();
+  }
+  forwardActivation();
+}
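+
+/*
+ * Per-group GEMM shapes in forward() (following M_/K_/N_ from init()):
+ * colBuf[K x N] = wMatSub[K x M] * inMatSub[M x N], after which col2Vol
+ * scatters the K x N column buffer back into the output volume.
+ */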
+
+void DeConv3DLayer::backward(const UpdateCallback &callback) {
+  backwardActivation();
+  int batchSize = getOutputGrad()->getHeight();
+  if (biases_ && biases_->getWGrad()) {
+    bpropBiases();
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+  REGISTER_TIMER_INFO("BwdDeConv3D", getName().c_str());
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    if (weights_[i]->getWGrad() || this->needGradient_) {
+      int M = M_[i];
+      int N = N_[i];
+      int K = K_[i];
+      Matrix::resizeOrCreate(colBuf_, K * groups_[i], N, false, useGpu_);
+      const MatrixPtr &inMat = getInputValue(i);
+      for (int n = 0; n < batchSize; ++n) {
+        colBuf_->vol2Col(
+            getOutputGrad()->getData() + n * getOutputGrad()->getStride(),
+            numFilters_,
+            imgSizeD_[i],
+            imgSizeH_[i],
+            imgSizeW_[i],
+            filterSizeZ_[i],
+            filterSizeY_[i],
+            filterSize_[i],
+            strideZ_[i],
+            strideY_[i],
+            stride_[i],
+            paddingZ_[i],
+            paddingY_[i],
+            padding_[i]);
+        if (weights_[i]->getWGrad()) {
+          real *inData = inMat->getData() + n * inMat->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr colBufDataSub = colBuf_->subMatrix(g * K, K);
+            MatrixPtr wGradMatSub =
+                weights_[i]->getWGrad()->subMatrix(g * K, K);
+            MatrixPtr inMatSub = Matrix::create(inData, M, N, false, useGpu_);
+            wGradMatSub->mul(
+                *colBufDataSub, *(inMatSub->getTranspose()), 1.0, 1.0);
+            inData += M * N;
+          }
+        }
+        if (getInputGrad(i)) {
+          real *preGrad =
+              getInputGrad(i)->getData() + n * getInputGrad(i)->getStride();
+          for (int g = 0; g < groups_[i]; ++g) {
+            MatrixPtr w = weights_[i]->getW()->subMatrix(g * K, K);
+            MatrixPtr outGradMat = colBuf_->subMatrix(g * K, K);
+            MatrixPtr inGradMatSub =
+                Matrix::create(preGrad, M, N, false, useGpu_);
+            inGradMatSub->mul(*(w->getTranspose()), *outGradMat, 1.0, 1.0);
+            preGrad += M * N;
+          }
+        }
+      }
+      weights_[i]->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+void DeConv3DLayer::bpropWeights(int i) {}
+void DeConv3DLayer::bpropData(int i) {}
+
+void DeConv3DLayer::bpropBiases() {
+  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
+                                    1,
+                                    biases_->getWGrad()->getElementCnt(),
+                                    false,
+                                    useGpu_);
+  const MatrixPtr &outGradMat = getOutputGrad();
+
+  if (this->sharedBiases_) {
+    biases->collectSharedBias(*outGradMat, 1.0f);
+  } else {
+    biases->collectBias(*outGradMat, 1.0f);
+  }
+}
+
+void DeConv3DLayer::addBias() {
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
+                                  1,
+                                  biases_->getW()->getElementCnt(),
+                                  false,
+                                  useGpu_);
+  if (this->sharedBiases_) {
+    outMat->addSharedBias(*(bias), 1.0f);
+  } else {
+    outMat->addBias(*(bias), 1.0f);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DeConv3DLayer.h b/paddle/gserver/layers/DeConv3DLayer.h
new file mode 100644
index 0000000000..a2a3d3f827
--- /dev/null
+++ b/paddle/gserver/layers/DeConv3DLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "ConvBaseLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of the 3D deconvolution layer.
+ * This layer expands the input and uses matrix multiplication to
+ * compute the 3D deconvolution operation.
+ */
+class DeConv3DLayer : public ConvBaseLayer {
+public:
+  explicit DeConv3DLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  ~DeConv3DLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+  void addBias();
+  void backward(const UpdateCallback& callback);
+  void bpropBiases();
+  void bpropData(int i);
+  void bpropWeights(int i);
+  size_t getSize();
+
+protected:
+  // Figure out the dimensions for individual gemms.
+  IntV M_;  /// filterChannels_
+  IntV N_;  /// outputD_ * outputH_ * outputW_
+  IntV K_;  /// filterPixels_ * (numFilters_ / groups_)
+  IntV NOut_;
+  MatrixPtr colBuf_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.cpp b/paddle/gserver/layers/DetectionOutputLayer.cpp
new file mode 100644
index 0000000000..f9040f7ae7
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
@@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionOutputLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(detection_output, DetectionOutputLayer);
+
+bool DetectionOutputLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  nmsThreshold_ = layerConf.nms_threshold();
+  confidenceThreshold_ = layerConf.confidence_threshold();
+  nmsTopK_ = layerConf.nms_top_k();
+  keepTopK_ = layerConf.keep_top_k();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void DetectionOutputLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      confTmpBuffer_, confSizeSum_ / numClasses_, numClasses_, false, useGpu_);
+
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).detection_output_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locTmpBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confTmpBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  MatrixPtr priorValue;
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(
+        confCpuBuffer_, confSizeSum_ / numClasses_, numClasses_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  confBuffer_->softmax(*confBuffer_);
+
+  size_t numPriors = priorValue->getElementCnt() / 8;
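+  // Each prior box takes 8 values in priorValue: 4 corner coordinates
+  // (xMin, yMin, xMax, yMax) followed by what we read as 4 encoding
+  // variances (see getBBoxFromPriorData / getBBoxVarFromPriorData).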
+  std::vector<std::vector<NormalizedBBox>> allDecodedBBoxes;
+  for (size_t n = 0; n < batchSize; ++n) {
+    std::vector<NormalizedBBox> decodedBBoxes;
+    for (size_t i = 0; i < numPriors; ++i) {
+      size_t priorOffset = i * 8;
+      size_t locPredOffset = n * numPriors * 4 + i * 4;
+      std::vector<NormalizedBBox> priorBBoxVec;
+      getBBoxFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVec);
+      std::vector<std::vector<real>> priorBBoxVar;
+      getBBoxVarFromPriorData(
+          priorValue->getData() + priorOffset, 1, priorBBoxVar);
+      std::vector<real> locPredData;
+      for (size_t j = 0; j < 4; ++j)
+        locPredData.push_back(*(locBuffer_->getData() + locPredOffset + j));
+      NormalizedBBox bbox =
+          decodeBBoxWithVar(priorBBoxVec[0], priorBBoxVar[0], locPredData);
+      decodedBBoxes.push_back(bbox);
+    }
+    allDecodedBBoxes.push_back(decodedBBoxes);
+  }
+
+  std::vector<std::map<size_t, std::vector<size_t>>> allIndices;
+  size_t numKept = getDetectionIndices(confBuffer_->getData(),
+                                       numPriors,
+                                       numClasses_,
+                                       backgroundId_,
+                                       batchSize,
+                                       confidenceThreshold_,
+                                       nmsTopK_,
+                                       nmsThreshold_,
+                                       keepTopK_,
+                                       allDecodedBBoxes,
+                                       &allIndices);
+
+  if (numKept > 0) {
+    resetOutput(numKept, 7);
+  } else {
+    MatrixPtr outV = getOutputValue();
+    if (outV) outV->resize(0, 0);
+    return;
+  }
+  MatrixPtr outV = getOutputValue();
+  getDetectionOutput(confBuffer_->getData(),
+                     numKept,
+                     numPriors,
+                     numClasses_,
+                     batchSize,
+                     allIndices,
+                     allDecodedBBoxes,
+                     *outV);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionOutputLayer.h b/paddle/gserver/layers/DetectionOutputLayer.h
new file mode 100644
index 0000000000..a232af0a69
--- /dev/null
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <map>
+#include <vector>
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * The detection output layer for an SSD detection task. This layer applies
+ * non-maximum suppression to all predicted bounding boxes and keeps the
+ * top-K bounding boxes.
+ * - Input: This layer needs three kinds of input layers: the first input
+ *          layer is the priorbox layer; the remaining input layers are
+ *          convolution layers that generate the bbox location offsets and
+ *          the classification confidences.
+ * - Output: The predicted bounding box locations.
+ */
+
+class DetectionOutputLayer : public Layer {
+public:
+  explicit DetectionOutputLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[1 + index];
+  }
+
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[1 + inputNum_ + index];
+  }
+
+private:
+  size_t numClasses_;  // number of classes
+  size_t inputNum_;    // number of input layers
+  real nmsThreshold_;
+  real confidenceThreshold_;
+  size_t nmsTopK_;
+  size_t keepTopK_;
+  size_t backgroundId_;
+
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp
new file mode 100644
index 0000000000..d83674f45a
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.cpp
@@ -0,0 +1,576 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DetectionUtil.h"
+
+namespace paddle {
+
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  bool useGpu = inMatrix.useGpu();
+  if (permMode == kNCHWToNHWC) {
+    size_t inElementCnt = inMatrix.getElementCnt();
+    size_t channels = inElementCnt / (height * width * batchSize);
+    size_t imgSize = height * width;
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t offset = i * (outTotalSize / batchSize) + outOffset;
+      const MatrixPtr inTmp = Matrix::create(
+          const_cast<real*>(inMatrix.getData()) + i * channels * imgSize,
+          channels,
+          imgSize,
+          false,
+          useGpu);
+      MatrixPtr outTmp =
+          Matrix::create(const_cast<real*>(outMatrix.getData()) + offset,
+                         imgSize,
+                         channels,
+                         false,
+                         useGpu);
+      inTmp->transpose(outTmp, false);
+    }
+    return channels * imgSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t inTotalSize,
+                            size_t inOffset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode) {
+  CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu());
+  bool useGpu = inMatrix.useGpu();
+  if (permMode == kNHWCToNCHW) {
+    size_t outElementCnt = outMatrix.getElementCnt();
+    size_t channels = outElementCnt / (height * width * batchSize);
+    size_t imgSize = height * width;
+    for (size_t i = 0; i < batchSize; ++i) {
+      size_t offset = i * (inTotalSize / batchSize) + inOffset;
+      const MatrixPtr inTmp =
+          Matrix::create(const_cast<real*>(inMatrix.getData()) + offset,
+                         imgSize,
+                         channels,
+                         false,
+                         useGpu);
+      MatrixPtr outTmp = Matrix::create(
+          const_cast<real*>(outMatrix.getData()) + i * channels * imgSize,
+          channels,
+          imgSize,
+          false,
+          useGpu);
+      inTmp->transpose(outTmp, false);
+    }
+    return channels * imgSize;
+  } else {
+    LOG(FATAL) << "Unkown permute mode";
+  }
+}
+
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) {
+  if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin ||
+      bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) {
+    return 0.0;
+  } else {
+    real interXMin = std::max(bbox1.xMin, bbox2.xMin);
+    real interYMin = std::max(bbox1.yMin, bbox2.yMin);
+    real interXMax = std::min(bbox1.xMax, bbox2.xMax);
+    real interYMax = std::min(bbox1.yMax, bbox2.yMax);
+
+    real interWidth = interXMax - interXMin;
+    real interHeight = interYMax - interYMin;
+    real interArea = interWidth * interHeight;
+
+    real bboxArea1 = bbox1.getArea();
+    real bboxArea2 = bbox2.getArea();
+
+    return interArea / (bboxArea1 + bboxArea2 - interArea);
+  }
+}
+
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec) {
+  real priorBBoxWidth = priorBBox.getWidth();
+  real priorBBoxHeight = priorBBox.getHeight();
+  real priorBBoxCenterX = priorBBox.getCenterX();
+  real priorBBoxCenterY = priorBBox.getCenterY();
+
+  real gtBBoxWidth = gtBBox.getWidth();
+  real gtBBoxHeight = gtBBox.getHeight();
+  real gtBBoxCenterX = gtBBox.getCenterX();
+  real gtBBoxCenterY = gtBBox.getCenterY();
+
+  outVec.clear();
+  outVec.push_back((gtBBoxCenterX - priorBBoxCenterX) / priorBBoxWidth /
+                   priorBBoxVar[0]);
+  outVec.push_back((gtBBoxCenterY - priorBBoxCenterY) / priorBBoxHeight /
+                   priorBBoxVar[1]);
+  outVec.push_back(std::log(std::fabs(gtBBoxWidth / priorBBoxWidth)) /
+                   priorBBoxVar[2]);
+  outVec.push_back(std::log(std::fabs(gtBBoxHeight / priorBBoxHeight)) /
+                   priorBBoxVar[3]);
+}
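+
+// With prior p, groundtruth g, and variances v, the encoding above computes
+//   t_x = (cx_g - cx_p) / (w_p * v0),  t_y = (cy_g - cy_p) / (h_p * v1),
+//   t_w = log(w_g / w_p) / v2,         t_h = log(h_g / h_p) / v3;
+// decodeBBoxWithVar below applies the inverse mapping.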
+
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData) {
+  real priorBBoxWidth = priorBBox.getWidth();
+  real priorBBoxHeight = priorBBox.getHeight();
+  real priorBBoxCenterX = priorBBox.getCenterX();
+  real priorBBoxCenterY = priorBBox.getCenterY();
+
+  real decodedBBoxCenterX =
+      priorBBoxVar[0] * locPredData[0] * priorBBoxWidth + priorBBoxCenterX;
+  real decodedBBoxCenterY =
+      priorBBoxVar[1] * locPredData[1] * priorBBoxHeight + priorBBoxCenterY;
+  real decodedBBoxWidth =
+      std::exp(priorBBoxVar[2] * locPredData[2]) * priorBBoxWidth;
+  real decodedBBoxHeight =
+      std::exp(priorBBoxVar[3] * locPredData[3]) * priorBBoxHeight;
+
+  NormalizedBBox decodedBBox;
+  decodedBBox.xMin = decodedBBoxCenterX - decodedBBoxWidth / 2;
+  decodedBBox.yMin = decodedBBoxCenterY - decodedBBoxHeight / 2;
+  decodedBBox.xMax = decodedBBoxCenterX + decodedBBoxWidth / 2;
+  decodedBBox.yMax = decodedBBoxCenterY + decodedBBoxHeight / 2;
+
+  return decodedBBox;
+}
+
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  bboxVec.resize(bboxVec.size() + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    NormalizedBBox bbox;
+    bbox.xMin = *(priorData + i * 8);
+    bbox.yMin = *(priorData + i * 8 + 1);
+    bbox.xMax = *(priorData + i * 8 + 2);
+    bbox.yMax = *(priorData + i * 8 + 3);
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec) {
+  size_t outOffset = varVec.size();
+  varVec.resize(varVec.size() + num);
+  for (size_t i = 0; i < num; ++i) {
+    vector<real> var;
+    var.push_back(*(priorData + i * 8 + 4));
+    var.push_back(*(priorData + i * 8 + 5));
+    var.push_back(*(priorData + i * 8 + 6));
+    var.push_back(*(priorData + i * 8 + 7));
+    varVec[outOffset + i] = var;
+  }
+}
+
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  bboxVec.resize(bboxVec.size() + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    NormalizedBBox bbox;
+    bbox.xMin = *(labelData + i * 6 + 1);
+    bbox.yMin = *(labelData + i * 6 + 2);
+    bbox.xMax = *(labelData + i * 6 + 3);
+    bbox.yMax = *(labelData + i * 6 + 4);
+    real isDifficult = *(labelData + i * 6 + 5);
+    if (std::abs(isDifficult - 0.0) < 1e-6)
+      bbox.isDifficult = false;
+    else
+      bbox.isDifficult = true;
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec) {
+  size_t outOffset = bboxVec.size();
+  labelVec.resize(outOffset + numBBoxes);
+  scoreVec.resize(outOffset + numBBoxes);
+  bboxVec.resize(outOffset + numBBoxes);
+  for (size_t i = 0; i < numBBoxes; ++i) {
+    labelVec[outOffset + i] = *(detectData + i * 7 + 1);
+    scoreVec[outOffset + i] = *(detectData + i * 7 + 2);
+    NormalizedBBox bbox;
+    bbox.xMin = *(detectData + i * 7 + 3);
+    bbox.yMin = *(detectData + i * 7 + 4);
+    bbox.xMax = *(detectData + i * 7 + 5);
+    bbox.yMax = *(detectData + i * 7 + 6);
+    bboxVec[outOffset + i] = bbox;
+  }
+}
+
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps) {
+  map<size_t, map<size_t, real>> overlaps;
+  size_t numPriors = priorBBoxes.size();
+  size_t numGTs = gtBBoxes.size();
+
+  matchIndices->clear();
+  matchIndices->resize(numPriors, -1);
+  matchOverlaps->clear();
+  matchOverlaps->resize(numPriors, 0.0);
+
+  // Store the positive overlap between predictions and ground truth
+  for (size_t i = 0; i < numPriors; ++i) {
+    for (size_t j = 0; j < numGTs; ++j) {
+      real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]);
+      if (overlap > 1e-6) {
+        (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap);
+        overlaps[i][j] = overlap;
+      }
+    }
+  }
+  // Bipartite matching
+  vector<int> gtPool;
+  for (size_t i = 0; i < numGTs; ++i) {
+    gtPool.push_back(i);
+  }
+  while (gtPool.size() > 0) {
+    // Find the most overlapped gt and corresponding predictions
+    int maxPriorIdx = -1;
+    int maxGTIdx = -1;
+    real maxOverlap = -1.0;
+    for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+         it != overlaps.end();
+         ++it) {
+      size_t i = it->first;
+      if ((*matchIndices)[i] != -1) {
+        // The prediction already has matched ground truth or is ignored
+        continue;
+      }
+      for (size_t p = 0; p < gtPool.size(); ++p) {
+        int j = gtPool[p];
+        if (it->second.find(j) == it->second.end()) {
+          // No overlap between the i-th prediction and j-th ground truth
+          continue;
+        }
+        // Find the maximum overlapped pair
+        if (it->second[j] > maxOverlap) {
+          maxPriorIdx = (int)i;
+          maxGTIdx = (int)j;
+          maxOverlap = it->second[j];
+        }
+      }
+    }
+    if (maxPriorIdx == -1) {
+      break;
+    } else {
+      (*matchIndices)[maxPriorIdx] = maxGTIdx;
+      (*matchOverlaps)[maxPriorIdx] = maxOverlap;
+      gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx));
+    }
+  }
+
+  // Match the remaining prediction bboxes to their most overlapped ground
+  // truth
+  for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin();
+       it != overlaps.end();
+       ++it) {
+    size_t i = it->first;
+    if ((*matchIndices)[i] != -1) {
+      // The prediction already has matched ground truth or is ignored
+      continue;
+    }
+    int maxGTIdx = -1;
+    real maxOverlap = -1;
+    for (size_t j = 0; j < numGTs; ++j) {
+      if (it->second.find(j) == it->second.end()) {
+        // No overlap between the i-th prediction and j-th ground truth
+        continue;
+      }
+      // Find the maximum overlapped pair
+      real overlap = it->second[j];
+      if (overlap > maxOverlap && overlap >= overlapThreshold) {
+        maxGTIdx = j;
+        maxOverlap = overlap;
+      }
+    }
+    if (maxGTIdx != -1) {
+      (*matchIndices)[i] = maxGTIdx;
+      (*matchOverlaps)[i] = maxOverlap;
+    }
+  }
+}
+
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr) {
+  vector<NormalizedBBox> priorBBoxes;  // share same prior bboxes
+  getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, priorBBoxes);
+  size_t totalPos = 0;
+  size_t totalNeg = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    vector<int> matchIndices;
+    vector<int> negIndices;
+    vector<real> matchOverlaps;
+    matchIndices.resize(numPriorBBoxes, -1);
+    matchOverlaps.resize(numPriorBBoxes, 0.0);
+    size_t numGTBBoxes = 0;
+    if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n];
+    if (!numGTBBoxes) {
+      matchIndicesVecPtr->push_back(matchIndices);
+      negIndicesVecPtr->push_back(negIndices);
+      continue;
+    }
+    vector<NormalizedBBox> gtBBoxes;
+    getBBoxFromLabelData(
+        gtValue.getData() + gtStartPosPtr[n] * 6, numGTBBoxes, gtBBoxes);
+
+    matchBBox(
+        priorBBoxes, gtBBoxes, overlapThreshold, &matchIndices, &matchOverlaps);
+
+    size_t numPos = 0;
+    size_t numNeg = 0;
+    for (size_t i = 0; i < matchIndices.size(); ++i)
+      if (matchIndices[i] != -1) ++numPos;
+    totalPos += numPos;
+    vector<pair<real, size_t>> scoresIndices;
+    for (size_t i = 0; i < matchIndices.size(); ++i)
+      if (matchIndices[i] == -1 && matchOverlaps[i] < negOverlapThreshold) {
+        scoresIndices.push_back(std::make_pair(maxConfScore[n][i], i));
+        ++numNeg;
+      }
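+    // Hard negative mining: keep at most numPos * negPosRatio negatives,
+    // choosing the unmatched priors with the highest confidence scores.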
+    numNeg = std::min(static_cast<size_t>(numPos * negPosRatio), numNeg);
+    std::sort(scoresIndices.begin(),
+              scoresIndices.end(),
+              sortScorePairDescend<size_t>);
+    for (size_t i = 0; i < numNeg; ++i)
+      negIndices.push_back(scoresIndices[i].second);
+    totalNeg += numNeg;
+    matchIndicesVecPtr->push_back(matchIndices);
+    negIndicesVecPtr->push_back(negIndices);
+  }
+  return std::make_pair(totalPos, totalNeg);
+}
+
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr) {
+  maxConfScoreVecPtr->clear();
+  for (size_t i = 0; i < batchSize; ++i) {
+    vector<real> maxConfScore;
+    for (size_t j = 0; j < numPriorBBoxes; ++j) {
+      int offset = j * numClasses;
+      real maxVal = -FLT_MAX;
+      real maxPosVal = -FLT_MAX;
+      real maxScore = 0.0;
+      for (size_t c = 0; c < numClasses; ++c) {
+        maxVal = std::max<real>(confData[offset + c], maxVal);
+        if (c != backgroundId)
+          maxPosVal = std::max<real>(confData[offset + c], maxPosVal);
+      }
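+      // Softmax over the class scores, shifted by maxVal for numerical
+      // stability; maxScore is the probability of the best non-background
+      // class.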
+      real sum = 0.0;
+      for (size_t c = 0; c < numClasses; ++c)
+        sum += std::exp(confData[offset + c] - maxVal);
+      maxScore = std::exp(maxPosVal - maxVal) / sum;
+      maxConfScore.push_back(maxScore);
+    }
+    confData += numPriorBBoxes * numClasses;
+    maxConfScoreVecPtr->push_back(maxConfScore);
+  }
+}
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices) {
+  vector<pair<real, size_t>> scores;
+  for (size_t i = 0; i < numPriorBBoxes; ++i) {
+    size_t confOffset = i * numClasses + classIdx;
+    if (confScoreData[confOffset] > confThreshold)
+      scores.push_back(std::make_pair(confScoreData[confOffset], i));
+  }
+  std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>);
+  if (topK > 0 && topK < scores.size()) scores.resize(topK);
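+  // Greedy NMS: repeatedly accept the highest-scoring remaining bbox and
+  // reject any bbox overlapping an accepted one by more than nmsThreshold.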
+  while (scores.size() > 0) {
+    const size_t idx = scores.front().second;
+    bool keep = true;
+    for (size_t i = 0; i < indices->size(); ++i) {
+      if (keep) {
+        const size_t savedIdx = (*indices)[i];
+        real overlap = jaccardOverlap(bboxes[idx], bboxes[savedIdx]);
+        keep = overlap <= nmsThreshold;
+      } else {
+        break;
+      }
+    }
+    if (keep) indices->push_back(idx);
+    scores.erase(scores.begin());
+  }
+}
+
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const real confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices) {
+  size_t totalKeepNum = 0;
+  for (size_t n = 0; n < batchSize; ++n) {
+    const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+    size_t numDetected = 0;
+    map<size_t, vector<size_t>> indices;
+    size_t confOffset = n * numPriorBBoxes * numClasses;
+    for (size_t c = 0; c < numClasses; ++c) {
+      if (c == backgroundId) continue;
+      applyNMSFast(decodedBBoxes,
+                   confData + confOffset,
+                   c,
+                   nmsTopK,
+                   confThreshold,
+                   nmsThreshold,
+                   numPriorBBoxes,
+                   numClasses,
+                   &(indices[c]));
+      numDetected += indices[c].size();
+    }
+    if (keepTopK > 0 && numDetected > keepTopK) {
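+      // Too many detections: keep only the keepTopK highest-scoring
+      // detections across all classes.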
+      vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs;
+      for (size_t c = 0; c < numClasses; ++c) {
+        const vector<size_t>& labelIndices = indices[c];
+        for (size_t i = 0; i < labelIndices.size(); ++i) {
+          size_t idx = labelIndices[i];
+          scoreIndexPairs.push_back(
+              std::make_pair((confData + confOffset)[idx * numClasses + c],
+                             std::make_pair(c, idx)));
+        }
+      }
+      std::sort(scoreIndexPairs.begin(),
+                scoreIndexPairs.end(),
+                sortScorePairDescend<pair<size_t, size_t>>);
+      scoreIndexPairs.resize(keepTopK);
+      map<size_t, vector<size_t>> newIndices;
+      for (size_t i = 0; i < scoreIndexPairs.size(); ++i) {
+        size_t label = scoreIndexPairs[i].second.first;
+        size_t idx = scoreIndexPairs[i].second.second;
+        newIndices[label].push_back(idx);
+      }
+      allDetectionIndices->push_back(newIndices);
+      totalKeepNum += keepTopK;
+    } else {
+      allDetectionIndices->push_back(indices);
+      totalKeepNum += numDetected;
+    }
+  }
+  return totalKeepNum;
+}
+
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out) {
+  MatrixPtr outBuffer;
+  Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false);
+  real* bufferData = outBuffer->getData();
+  size_t count = 0;
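+  // Each output row is: image index | label | score | xMin | yMin | xMax |
+  // yMax.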
+  for (size_t n = 0; n < batchSize; ++n) {
+    for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin();
+         it != allIndices[n].end();
+         ++it) {
+      size_t label = it->first;
+      const vector<size_t>& indices = it->second;
+      const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n];
+      for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        size_t confOffset = n * numPriorBBoxes * numClasses + idx * numClasses;
+        bufferData[count * 7] = n;
+        bufferData[count * 7 + 1] = label;
+        bufferData[count * 7 + 2] = (confData + confOffset)[label];
+        NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]);
+        bufferData[count * 7 + 3] = clippedBBox.xMin;
+        bufferData[count * 7 + 4] = clippedBBox.yMin;
+        bufferData[count * 7 + 5] = clippedBBox.xMax;
+        bufferData[count * 7 + 6] = clippedBBox.yMax;
+        ++count;
+      }
+    }
+  }
+  out.copyFrom(bufferData, numKept * 7);
+}
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox) {
+  real realOne = static_cast<real>(1.0);
+  real realZero = static_cast<real>(0.0);
+  NormalizedBBox clippedBBox;
+  clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero);
+  clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero);
+  clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero);
+  clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero);
+  return clippedBBox;
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h
new file mode 100644
index 0000000000..641ed873b4
--- /dev/null
+++ b/paddle/gserver/layers/DetectionUtil.h
@@ -0,0 +1,307 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <float.h>
+#include <algorithm>
+#include <vector>
+#include "paddle/math/Matrix.h"
+
+using std::vector;
+using std::pair;
+using std::map;
+
+namespace paddle {
+
+template <typename T>
+struct BBoxBase {
+  BBoxBase(T xMin, T yMin, T xMax, T yMax)
+      : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {}
+
+  BBoxBase() {}
+
+  T getWidth() const { return xMax - xMin; }
+
+  T getHeight() const { return yMax - yMin; }
+
+  T getCenterX() const { return (xMin + xMax) / 2; }
+
+  T getCenterY() const { return (yMin + yMax) / 2; }
+
+  T getArea() const { return getWidth() * getHeight(); }
+
+  // coordinate of bounding box
+  T xMin;
+  T yMin;
+  T xMax;
+  T yMax;
+  // whether this is a difficult object (e.g. an object with heavy occlusion)
+  bool isDifficult;
+};
+
+struct NormalizedBBox : BBoxBase<real> {
+  NormalizedBBox() : BBoxBase<real>() {}
+};
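+// Coordinates of a NormalizedBBox are relative to the image size; valid
+// boxes lie in [0, 1] (see clipBBox below).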
+
+enum PermMode { kNCHWToNHWC, kNHWCToNCHW };
+
+/**
+ * @brief First permute the input matrix, then append it to the output matrix
+ */
+size_t appendWithPermute(const Matrix& inMatrix,
+                         size_t height,
+                         size_t width,
+                         size_t outTotalSize,
+                         size_t outOffset,
+                         size_t batchSize,
+                         Matrix& outMatrix,
+                         PermMode permMode);
+
+/**
+ * @brief First permute the input matrix, then decompose it into the output
+ */
+size_t decomposeWithPermute(const Matrix& inMatrix,
+                            size_t height,
+                            size_t width,
+                            size_t totalSize,
+                            size_t offset,
+                            size_t batchSize,
+                            Matrix& outMatrix,
+                            PermMode permMode);
+
+/**
+ * @brief Compute jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox
+ * @param bbox2 The second bbox
+ */
+real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2);
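+// Example: for A = (0, 0, 0.5, 0.5) and B = (0.25, 0.25, 0.75, 0.75), the
+// intersection area is 0.0625 and the union area is 0.4375, so the jaccard
+// overlap (IoU) is 0.0625 / 0.4375 = 1/7 (about 0.143).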
+
+/**
+ * @brief Compute offset parameters between prior bbox and ground truth bbox
+ * and variances of prior bbox are considered
+ * @param priorBBox Input prior bbox
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param gtBBox Groundtruth bbox
+ * @param outVec Output vector
+ */
+void encodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                       const vector<real>& priorBBoxVar,
+                       const NormalizedBBox& gtBBox,
+                       vector<real>& outVec);
+
+/**
+ * @brief Decode prior bbox with offset parameters
+ * and variances of prior bbox are considered
+ * @param priorBBox Prior bbox to be decoded
+ * @param priorBBoxVar Variance parameters of prior bbox
+ * @param locPredData Offset parameters
+ */
+NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox,
+                                 const vector<real>& priorBBoxVar,
+                                 const vector<real>& locPredData);
+
+/**
+ * @brief Extract bboxes from the prior matrix; the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromPriorData(const real* priorData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract labels, scores and bboxes from the detection matrix; the
+ * layout is
+ * imageId | label | score | xmin | ymin | xmax | ymax
+ * @param detectData Matrix of detection value
+ * @param numBBoxes Number of bbox to be extracted
+ * @param labelVec Label of bbox
+ * @param scoreVec Score of bbox
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromDetectData(const real* detectData,
+                           const size_t numBBoxes,
+                           vector<real>& labelVec,
+                           vector<real>& scoreVec,
+                           vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Extract variances from the prior matrix; the layout is
+ * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ...
+ * @param priorData Matrix of prior value
+ * @param num Number to be extracted
+ * @param varVec Append to the vector
+ */
+void getBBoxVarFromPriorData(const real* priorData,
+                             const size_t num,
+                             vector<vector<real>>& varVec);
+
+/**
+ * @brief Extract bboxes from the label matrix; the layout is
+ * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ...
+ * @param labelData Matrix of label value
+ * @param numBBoxes Number to be extracted
+ * @param bboxVec Append to the vector
+ */
+void getBBoxFromLabelData(const real* labelData,
+                          const size_t numBBoxes,
+                          vector<NormalizedBBox>& bboxVec);
+
+/**
+ * @brief Match prior bboxes to groundtruth bboxes. The strategy is:
+ * 1. Find the most overlapped (prior, groundtruth) bbox pair
+ * 2. For each remaining prior bbox, find the most overlapped groundtruth bbox
+ * @param priorBBoxes Prior bboxes
+ * @param gtBBoxes Groundtruth bboxes
+ * @param overlapThreshold Lower bound on overlap for a pair to be matched
+ * @param matchIndices For each prior bbox, the index of the matched
+ * groundtruth bbox, or -1 if unmatched
+ * @param matchOverlaps For each prior bbox, the maximum overlap with any
+ * groundtruth bbox
+ */
+void matchBBox(const vector<NormalizedBBox>& priorBBoxes,
+               const vector<NormalizedBBox>& gtBBoxes,
+               real overlapThreshold,
+               vector<int>* matchIndices,
+               vector<real>* matchOverlaps);
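+// Illustrative example: with priors P0, P1 and one groundtruth G where
+// IoU(P0, G) = 0.8 and IoU(P1, G) = 0.6, step 1 matches G to P0; with
+// overlapThreshold = 0.5, step 2 then also matches P1 to G, giving
+// matchIndices = {0, 0}.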
+
+/**
+ * @brief Generate positive and negative bboxes;
+ * |negative bboxes| / |positive bboxes| is at most negPosRatio
+ * @param priorValue Prior values
+ * @param numPriorBBoxes Number of prior bboxes
+ * @param gtValue Groundtruth values
+ * @param gtStartPosPtr Since groundtruth values are stored as a sequence,
+ * this parameter gives the start position of each record
+ * @param seqNum Number of sequences
+ * @param maxConfScore Classification scores for prior bboxes, used to mine
+ * negative examples
+ * @param batchSize Number of images
+ * @param overlapThreshold Lower bound on overlap for positive matches
+ * @param negOverlapThreshold Upper bound on overlap for negative examples
+ * @param negPosRatio Controls the number of negative bboxes
+ * @param matchIndicesVecPtr Output: indices of matched prior bboxes
+ * @param negIndicesVecPtr Output: indices of negative prior bboxes
+ */
+pair<size_t, size_t> generateMatchIndices(
+    const Matrix& priorValue,
+    const size_t numPriorBBoxes,
+    const Matrix& gtValue,
+    const int* gtStartPosPtr,
+    const size_t seqNum,
+    const vector<vector<real>>& maxConfScore,
+    const size_t batchSize,
+    const real overlapThreshold,
+    const real negOverlapThreshold,
+    const size_t negPosRatio,
+    vector<vector<int>>* matchIndicesVecPtr,
+    vector<vector<int>>* negIndicesVecPtr);
+
+/**
+ * @brief Get max confidence score for each prior bbox
+ * @param confData Confidence scores, layout is
+ * class1 score | class2 score | ... | classN score ...
+ * @param batchSize Image number
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Number of classes
+ * @param backgroundId Background id
+ * @param maxConfScoreVecPtr Output: max confidence score for each prior bbox
+ */
+void getMaxConfidenceScores(const real* confData,
+                            const size_t batchSize,
+                            const size_t numPriorBBoxes,
+                            const size_t numClasses,
+                            const size_t backgroundId,
+                            vector<vector<real>>* maxConfScoreVecPtr);
+
+template <typename T>
+bool sortScorePairDescend(const pair<real, T>& pair1,
+                          const pair<real, T>& pair2);
+
+template <>
+bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1,
+                          const pair<real, NormalizedBBox>& pair2);
+
+/**
+ * @brief Do NMS for bboxes to remove duplicated bboxes
+ * @param bboxes BBoxes to apply NMS
+ * @param confScoreData Confidence scores
+ * @param classIdx Class to do NMS
+ * @param topK Number to keep
+ * @param confThreshold Low boundary of confidence score
+ * @param nmsThreshold Threshold of overlap
+ * @param numPriorBBoxes Total number of prior bboxes
+ * @param numClasses Total class number
+ * @param indices Indices of high quality bboxes
+ */
+void applyNMSFast(const vector<NormalizedBBox>& bboxes,
+                  const real* confScoreData,
+                  size_t classIdx,
+                  size_t topK,
+                  real confThreshold,
+                  real nmsThreshold,
+                  size_t numPriorBBoxes,
+                  size_t numClasses,
+                  vector<size_t>* indices);
+
+/**
+ * @brief Get detection results which satisfy the requirements
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param backgroundId Background class
+ * @param batchSize Image number
+ * @param confThreshold Threshold of class confidence
+ * @param nmsTopK Used in NMS operation to keep top k bbox
+ * @param nmsThreshold Used in NMS, threshold of overlap
+ * @param keepTopK How many bboxes are kept per image
+ * @param allDecodedBBoxes Decoded bboxes for all images
+ * @param allDetectionIndices Save detection bbox indices
+ */
+size_t getDetectionIndices(
+    const real* confData,
+    const size_t numPriorBBoxes,
+    const size_t numClasses,
+    const size_t backgroundId,
+    const size_t batchSize,
+    const real confThreshold,
+    const size_t nmsTopK,
+    const real nmsThreshold,
+    const size_t keepTopK,
+    const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+    vector<map<size_t, vector<size_t>>>* allDetectionIndices);
+
+/**
+ * @brief Get detection results
+ * @param confData Confidence scores
+ * @param numPriorBBoxes Prior bbox number
+ * @param numClasses Class number
+ * @param batchSize Image number
+ * @param allIndices Indices of predicted bboxes
+ * @param allDecodedBBoxes BBoxes decoded
+ * @param out Output matrix
+ * image number | label | confidence score | xMin | yMin | xMax | yMax
+ */
+void getDetectionOutput(const real* confData,
+                        const size_t numKept,
+                        const size_t numPriorBBoxes,
+                        const size_t numClasses,
+                        const size_t batchSize,
+                        const vector<map<size_t, vector<size_t>>>& allIndices,
+                        const vector<vector<NormalizedBBox>>& allDecodedBBoxes,
+                        Matrix& out);
+
+NormalizedBBox clipBBox(const NormalizedBBox& bbox);
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/DotProdLayer.cpp b/paddle/gserver/layers/DotProdLayer.cpp
new file mode 100644
index 0000000000..9e2dbe3c3c
--- /dev/null
+++ b/paddle/gserver/layers/DotProdLayer.cpp
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief A layer for computing the dot product of two vectors.
+ * Input1: vector (batchSize * dim)
+ * Input2: vector (batchSize * dim)
+ * Output: a matrix of size (batchSize * 1)
+ */
+
+class DotProdLayer : public Layer {
+public:
+  explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~DotProdLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(dot_prod, DotProdLayer);
+
+bool DotProdLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2U);
+  CHECK_EQ(1UL, getSize())
+      << "The output dimensionality of this layer should be fixed to 1.";
+
+  return true;
+}
+
+void DotProdLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+
+  size_t batchSize = inV0->getHeight();
+  CHECK_EQ(inV1->getHeight(), batchSize);
+  CHECK_EQ(inV0->getWidth(), inV1->getWidth());
+
+  {
+    REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
+    reserveOutput(batchSize, 1);
+  }
+
+  MatrixPtr outV = getOutputValue();
+  {
+    REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
+    outV->sumOfProducts(*inV0, *inV1, 1, 0);
+  }
+}
+
+void DotProdLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV0 = getInputValue(0);
+  MatrixPtr inV1 = getInputValue(1);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr inG0 = getInputGrad(0);
+  MatrixPtr inG1 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
+
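+    // For out = sum_k in0[k] * in1[k], the gradients are
+    // d(in0) = outGrad * in1 and d(in1) = outGrad * in0; addRowScale
+    // scales each row of the other input by that row's scalar output grad.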
+    if (inG0) {
+      inG0->addRowScale(0, *inV1, *outG);
+    }
+
+    if (inG1) {
+      inG1->addRowScale(0, *inV0, *outG);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
deleted file mode 100644
index fdcf994cdb..0000000000
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ /dev/null
@@ -1,298 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvBaseLayer.h"
-
-#include "paddle/utils/Logging.h"
-namespace paddle {
-
-bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
-                               const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ConvBaseLayer::init(layerMap, parameterMap);
-
-  /* The class fields channels_ and numFilters_ are the same as in the config
-   * i.e., channels_ is the for the input and numFilters_ is for the output
-   *
-   * But in order for the variables in convTrans having the same semantic
-   * meaning as in conv, we need to swap channels_ and numFilters here for
-   * convTrans, and in other functions too.
-   * */
-
-  /* Initialize the projection */
-  for (auto &inputConfig : config_.inputs()) {
-    const ConvConfig &conf = inputConfig.conv_conf();
-    int numFilters = isDeconv_ ? conf.channels() : numFilters_;
-    subM_.push_back(numFilters / conf.groups());
-    subN_.push_back(conf.output_x() *
-                    (conf.has_output_y() ? conf.output_y() : conf.output_x()));
-    int channel = isDeconv_ ? numFilters_ : conf.channels();
-    subK_.push_back(
-        channel * conf.filter_size() *
-        (conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
-        conf.groups());
-    /* Consistent caffe mode for multiple input */
-    caffeMode_ = conf.caffe_mode();
-  }
-
-  getOutputSize();
-
-  return true;
-}
-
-size_t ExpandConvBaseLayer::getOutputSize() {
-  CHECK_NE(inputLayers_.size(), 0UL);
-  size_t layerSize = ConvBaseLayer::calOutputSize();
-  subN_.clear();
-  for (size_t i = 0; i < inputLayers_.size(); i++) {
-    subN_.push_back(outputH_[i] * outputW_[i]);
-  }
-  return layerSize;
-}
-
-void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) {
-  Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
-}
-
-void ExpandConvBaseLayer::addSharedBias() {
-  size_t mapW = getOutputSize() / numFilters_;
-  size_t mapH = getOutputValue()->getElementCnt() / mapW;
-  MatrixPtr out =
-      Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
-
-  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
-
-  out->transpose(transOutValue_, false);  // false means no memory allocation
-  transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_,
-                          numFilters_);
-
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  transOutValue_->addBias(*bias, 1.0f);
-
-  transOutValue_->reshape(mapW, mapH);
-  transOutValue_->transpose(out, false);  // false means no memory allocation
-
-  out->clear();
-  bias->clear();
-}
-
-void ExpandConvBaseLayer::addUnsharedBias() {
-  MatrixPtr outValue = getOutputValue();
-  MatrixPtr bias = Matrix::create(biases_->getW()->getData(),
-                                  1,
-                                  biases_->getW()->getElementCnt(),
-                                  false,
-                                  useGpu_);
-  outValue->addBias(*bias, 1.0f);
-}
-
-void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
-                                         size_t startIdx,
-                                         int inIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
-
-  resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
-
-  CHECK_EQ(image->getWidth(),
-           static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
-
-  real *imgData = image->getData() + startIdx * image->getWidth();
-  MatrixPtr imageTmp =
-      Matrix::create(imgData,
-                     1,
-                     imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel,
-                     false,
-                     useGpu_);
-  expandInput_->convExpand(*imageTmp,
-                           imgSizeH_[inIdx],
-                           imgSizeW_[inIdx],
-                           channel,
-                           filterSizeY_[inIdx],
-                           filterSize_[inIdx],
-                           strideY_[inIdx],
-                           stride_[inIdx],
-                           paddingY_[inIdx],
-                           padding_[inIdx],
-                           outputH_[inIdx],
-                           outputW_[inIdx]);
-  imageTmp->clear();
-}
-
-void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
-                                        MatrixPtr out,
-                                        int inIdx,
-                                        int startIdx) {
-  int subM = subM_[inIdx];
-  int subN = subN_[inIdx];
-  int subK = subK_[inIdx];
-
-  expandOneFrame(image, startIdx, inIdx);
-
-  int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_;
-
-  real *outData = out->getData() + startIdx * subN * numFilters;
-
-  real *wgtData = weights_[inIdx]->getW()->getData();
-  real *expInData = expandInput_->getData();
-  for (int g = 0; g < groups_[inIdx]; ++g) {
-    MatrixPtr A =
-        Matrix::create(wgtData, subM, subK, false, useGpu_);  // mark transpose
-    MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
-    MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
-    C->mul(*A, *B, 1, 1);
-
-    A->clear();
-    B->clear();
-    C->clear();
-    wgtData += subK * subM;
-    expInData += subK * subN;
-    outData += subM * subN;
-  }
-}
-
-void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
-                                    MatrixPtr image,
-                                    int inpIdx) {
-  int channel = isDeconv_ ? numFilters_ : channels_[inpIdx];
-
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-
-  /* reset the expand-grad memory */
-  resetExpandInput(subK * groups_[inpIdx], subN);
-
-  real *localGradData = out->getData();
-  real *tgtGradData = image->getData();
-  for (size_t n = 0; n < batchSize; n++) {
-    real *wgtData = weights_[inpIdx]->getW()->getData();
-    real *expandInData = expandInput_->getData();
-
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      // create temporary matrix
-      MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
-      MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
-      MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
-      C->mul(*A, *B);  // mul
-
-      // clear the temporary matrix
-      A->clear();
-      B->clear();
-      C->clear();
-
-      expandInData += subK * subN;
-      localGradData += subM * subN;
-      wgtData += subK * subM;
-    }
-
-    // shrink one frame outGrad
-    MatrixPtr oneGradTmp = Matrix::create(
-        expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
-    MatrixPtr vTmp =
-        Matrix::create(tgtGradData,
-                       1,
-                       imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel,
-                       false,
-                       useGpu_);
-    vTmp->convShrink(*oneGradTmp,
-                     imgSizeH_[inpIdx],
-                     imgSizeW_[inpIdx],
-                     channel,
-                     filterSizeY_[inpIdx],
-                     filterSize_[inpIdx],
-                     strideY_[inpIdx],
-                     stride_[inpIdx],
-                     paddingY_[inpIdx],
-                     padding_[inpIdx],
-                     outputH_[inpIdx],
-                     outputW_[inpIdx],
-                     1.0f,
-                     1.0f);
-    vTmp->clear();
-    oneGradTmp->clear();
-
-    // move the data-pointer
-    tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel;
-  }
-}
-
-void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
-                                       MatrixPtr out,
-                                       int inpIdx) {
-  MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
-
-  int subM = subM_[inpIdx];
-  int subN = subN_[inpIdx];
-  int subK = subK_[inpIdx];
-  size_t batchSize = image->getHeight();
-  resetExpandInput(subK * groups_[inpIdx], subN);
-
-  real *gradData = out->getData();
-
-  for (size_t n = 0; n < batchSize; n++) {  // frame by frame
-    // expand
-    expandOneFrame(image, n, inpIdx);
-    real *wGradData = weightGrad->getData();
-    real *expandInData = expandInput_->getData();
-
-    // expand-mul one-group by one
-    for (int g = 0; g < groups_[inpIdx]; g++) {
-      MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
-      MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
-      MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
-      C->mul(*B, *A, 1, 1);
-
-      A->clear();
-      B->clear();
-      C->clear();
-      gradData += subM * subN;
-      wGradData += subK * subM;
-      expandInData += subK * subN;
-    }
-  }
-}
-
-void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
-  size_t mapW = getOutputSize() / numFilters_;
-  size_t mapH = v->getElementCnt() / mapW;
-  MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
-
-  Matrix::resizeOrCreate(transOutValue_, mapW, mapH, false, useGpu_);
-
-  vTmp->transpose(transOutValue_, false);  // false means no memory allocation
-  transOutValue_->reshape(transOutValue_->getElementCnt() / numFilters_,
-                          numFilters_);
-  biases->collectBias(*transOutValue_, 1.0f);
-}
-
-void ExpandConvBaseLayer::bpropBiases(MatrixPtr v) {
-  MatrixPtr biases = Matrix::create(biases_->getWGrad()->getData(),
-                                    1,
-                                    biases_->getWGrad()->getElementCnt(),
-                                    false,
-                                    useGpu_);
-  if (sharedBiases_) {
-    bpropSharedBias(biases, v);
-  } else {
-    biases->collectBias(*v, 1.0f);
-  }
-  biases->clear();
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
deleted file mode 100644
index aabcdfc392..0000000000
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <vector>
-#include "ConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
-
-namespace paddle {
-
-/**
- * @brief A subclass of ConvBaseLayer that is a superclass of both
- * ExpandConvLayer and ExpandConvTransLayer
- */
-class ExpandConvBaseLayer : public ConvBaseLayer {
-protected:
-  /// For expand convolution.
-  /// subM_ = numFilters_ / groups_.
-  IntV subM_;
-  /// subN_ = outputH_ * outputW_.
-  IntV subN_;
-  /// subK_ = channels_ * filterPixels_ * groups_.
-  IntV subK_;
-
-  /*The expandInput_ and transOutValue_ are used for CPU expand conv calc
-   * Expand one sample at a time. shape:
-   * (numChannels * filterPixels_, outputSizeH * outputSizeW)
-   * */
-  MatrixPtr expandInput_;
-  /// The transpose of output, which is an auxiliary matrix.
-  MatrixPtr transOutValue_;
-
-public:
-  explicit ExpandConvBaseLayer(const LayerConfig& config)
-      : ConvBaseLayer(config) {}
-
-  ~ExpandConvBaseLayer() {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  size_t getOutputSize();
-  /**
-   * Create or resize expandInput_.
-   */
-  void resetExpandInput(size_t height, size_t width);
-
-  /**
-   * Add shared bias.
-   */
-  void addSharedBias();
-
-  /**
-   * Add unshared bias.
-   */
-  void addUnsharedBias();
-  /**
-   * Expand one input sample.
-   */
-  void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
-
-  /**
-   * Expand one input sample and perform matrix multiplication.
-   */
-  void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx);
-
-  void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
-  void bpropBiases(MatrixPtr v);
-  void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx);
-  void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx);
-};
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index f9267b81a7..7ff0c73721 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -16,41 +16,189 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
+DEFINE_bool(use_nnpack,
+            false,
+            "Whether to use nnpack for convolution calculation.");
+
 namespace paddle {
 
+/*
+ * The exconvt (convolution transpose, i.e. deconv) operation swaps the
+ * forward and backward computations of exconv.
+ */
 REGISTER_LAYER(exconv, ExpandConvLayer);
+REGISTER_LAYER(exconvt, ExpandConvLayer);
+
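+// A convolution is depthwise when each input channel forms its own group.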
+inline bool isDepthwiseConv(int channels, int groups) {
+  return channels == groups;
+}
 
 bool ExpandConvLayer::init(const LayerMap &layerMap,
                            const ParameterMap &parameterMap) {
   /* Initialize the basic convolutional parent class */
-  ExpandConvBaseLayer::init(layerMap, parameterMap);
+  ConvBaseLayer::init(layerMap, parameterMap);
+
+  int index = 0;
+  for (auto &inputConfig : config_.inputs()) {
+    const ConvConfig &conf = inputConfig.conv_conf();
+    /* Consistent caffe mode for multiple input */
+    caffeMode_ = conf.caffe_mode();
+
+    // create a new weight
+    size_t height, width;
+    height = filterPixels_[index] * filterChannels_[index];
+    width = (!isDeconv_) ? numFilters_ : channels_[index];
+    CHECK_EQ(parameters_[index]->getSize(), width * height);
+    Weight *w = new Weight(height, width, parameters_[index]);
+    weights_.emplace_back(w);
+    index++;
+  }
+
+  if (biasParameter_.get()) {
+    if (sharedBiases_) {
+      CHECK_EQ((size_t)numFilters_, biasParameter_->getSize());
+      biases_ = std::unique_ptr<Weight>(
+          new Weight(1, numFilters_, biasParameter_, 0));
+    } else {
+      biases_ =
+          std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_, 0));
+    }
+  }
+
+  getOutputSize();
+
+  size_t numInputs = config_.inputs_size();
+  inputShape_.resize(numInputs);
+  filterShape_.resize(numInputs);
+  outputShape_.resize(numInputs);
+
+  std::string convType;
+  std::string convGradInputType;
+  std::string convGradFilterType;
+
+  for (int i = 0; i < config_.inputs_size(); i++) {
+    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
+    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
+    std::vector<size_t> dilations = {(size_t)dilationY_[i],
+                                     (size_t)dilation_[i]};
+
+    bool useDilation = ((size_t)dilationY_[i] > 1 || (size_t)dilation_[i] > 1);
+
+    // Convolution Layer uses the GemmConv function by default.
+    convType = "GemmConv";
+    convGradInputType = "GemmConvGradInput";
+    convGradFilterType = "GemmConvGradFilter";
+
+    // Depthwise convolution (not deconv) on GPU: use the DepthwiseConv
+    // functions.
+    if (useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+      convType = "DepthwiseConv";
+      convGradInputType = "DepthwiseConvGradInput";
+      convGradFilterType = "DepthwiseConvGradFilter";
+    }
+
+    // Depthwise convolution (not deconv) on CPU: use the ARM NEON kernel
+    // when the filter size and stride are supported.
+    if (!useGpu_ && isDepthwiseConv(channels_[i], groups_[i]) && !isDeconv_) {
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      if ((filterSize_[i] == filterSizeY_[i]) &&
+          (filterSize_[i] == 3 || filterSize_[i] == 4) &&
+          (stride_[i] == strideY_[i]) && (stride_[i] == 1 || stride_[i] == 2) &&
+          !useDilation) {
+        convType = "NeonDepthwiseConv";
+      }
+#endif
+    }
+
+    if (FLAGS_use_nnpack && !isDeconv_ && !useDilation) {
+      createFunction(forward_,
+                     "NNPACKConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i])
+                         .set("algo", std::string("auto")));
+    } else {
+      createFunction(forward_,
+                     !isDeconv_ ? convType : convGradInputType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     !isDeconv_ ? convGradInputType : convType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     convGradFilterType,
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("dilations", dilations)
+                         .set("groups", (size_t)groups_[i]));
+    }
+  }
   return true;
 }
 
+size_t ExpandConvLayer::getOutputSize() {
+  CHECK_NE(inputLayers_.size(), 0UL);
+  size_t layerSize = ConvBaseLayer::calOutputSize();
+  return layerSize;
+}
+
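+// init() registers two backward functions per input: the input-gradient
+// function at backward_[2 * i] and the filter-gradient function at
+// backward_[2 * i + 1].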
+// i is the index of input layers
+#define BACKWARD_INPUT(i, inputs, outputs) \
+  backward_[2 * i]->calc(inputs, outputs)
+#define BACKWARD_FILTER(i, inputs, outputs) \
+  backward_[2 * i + 1]->calc(inputs, outputs)
+
 void ExpandConvLayer::forward(PassType passType) {
   Layer::forward(passType);
 
-  /* malloc memory for the output_ if necessary */
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+  size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
   resetOutput(batchSize, getOutputSize());
 
-  MatrixPtr image = nullptr;
-  MatrixPtr outV = getOutputValue();
+  // Calculate the shape of the input, output, and filter.
   for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
-    image = prevLayer->getOutputValue();
-    for (size_t off = 0; off < image->getHeight(); off++) {
-      REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str());
-      expandFwdOnce(image, outV, i, off);
-    }
+    inputShape_[i] = TensorShape({(size_t)batchSize,
+                                  (size_t)channels_[i],
+                                  (size_t)imgSizeH_[i],
+                                  (size_t)imgSizeW_[i]});
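+    // Filter shape: (groups, outChannels / groups, inChannels / groups,
+    // filterH, filterW); for deconv the channel roles are swapped.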
+    filterShape_[i] =
+        TensorShape({(size_t)groups_[i],
+                     !isDeconv_ ? (size_t)numFilters_ / groups_[i]
+                                : (size_t)channels_[i] / groups_[i],
+                     !isDeconv_ ? (size_t)channels_[i] / groups_[i]
+                                : (size_t)numFilters_ / groups_[i],
+                     (size_t)filterSizeY_[i],
+                     (size_t)filterSize_[i]});
+    outputShape_[i] = TensorShape({(size_t)batchSize,
+                                   (size_t)numFilters_,
+                                   (size_t)outputH_[i],
+                                   (size_t)outputW_[i]});
   }
+
+  // Calculate the output value.
+  for (size_t i = 0; i < inputLayers_.size(); ++i) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getInputValue(i), inputShape_[i]);
+    inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+    outputs.addArg(*getOutputValue(),
+                   outputShape_[i],
+                   !isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
+
+    forward_[i]->calc(inputs, outputs);
+  }
+
   /* add the bias-vector */
   if (biases_.get()) {
-    if (sharedBiases_) {
-      addSharedBias();
-    } else {
-      addUnsharedBias();
-    }
+    output_.value->addBias(*biases_->getW(), 1.0, sharedBiases_);
   }
 
   /* activation */
@@ -62,19 +210,35 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) {
 
   MatrixPtr outGrad = getOutputGrad();
   if (biases_ && biases_->getWGrad()) {
-    bpropBiases(outGrad);
+    biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBiases_);
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
+  // Calculate the input grad and filter grad.
   for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
-    if (getPrev(i)->getOutputGrad()) {
-      bpropActs(outGrad, getPrev(i)->getOutputGrad(), i);
+    if (getInputGrad(i)) {
+      BufferArgs inputs;
+      BufferArgs outputs;
+      inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
+      outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
+      BACKWARD_INPUT(i, inputs, outputs);
     }
+
     if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(getPrev(i)->getOutputValue(), outGrad, i);
+      BufferArgs inputs;
+      BufferArgs outputs;
+      if (!isDeconv_) {
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+      } else {
+        inputs.addArg(*getInputValue(i), inputShape_[i]);
+        inputs.addArg(*getOutputGrad(), outputShape_[i]);
+      }
+      outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
+      BACKWARD_FILTER(i, inputs, outputs);
+
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index 60681690e5..a0873de192 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "ExpandConvBaseLayer.h"
+#include "ConvBaseLayer.h"
 #include "paddle/math/Matrix.h"
 
 namespace paddle {
@@ -28,10 +28,9 @@ namespace paddle {
  * The config file api is img_conv_layer.
  */
 
-class ExpandConvLayer : public ExpandConvBaseLayer {
+class ExpandConvLayer : public ConvBaseLayer {
 public:
-  explicit ExpandConvLayer(const LayerConfig& config)
-      : ExpandConvBaseLayer(config) {}
+  explicit ExpandConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
 
   ~ExpandConvLayer() {}
 
@@ -40,6 +39,13 @@ public:
 
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
+
+  size_t getOutputSize();
+
+protected:
+  std::vector<TensorShape> inputShape_;
+  std::vector<TensorShape> filterShape_;
+  std::vector<TensorShape> outputShape_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp
deleted file mode 100644
index 520586b138..0000000000
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ /dev/null
@@ -1,90 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ExpandConvTransLayer.h"
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Stat.h"
-
-/* The implementation of the convTransLayer is basically a swap of forward and
- * backward of the original convLayer.
- * The variable naming follows the convention of the convLayer.
- * */
-
-namespace paddle {
-
-REGISTER_LAYER(exconvt, ExpandConvTransLayer);
-
-bool ExpandConvTransLayer::init(const LayerMap &layerMap,
-                                const ParameterMap &parameterMap) {
-  /* Initialize the basic convolutional parent class */
-  ExpandConvBaseLayer::init(layerMap, parameterMap);
-
-  return true;
-}
-
-void ExpandConvTransLayer::forward(PassType passType) {
-  Layer::forward(passType);
-
-  /* malloc memory for the output_ if necessary */
-  int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-  resetOutput(batchSize, getOutputSize());
-
-  MatrixPtr output = nullptr;
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    LayerPtr prevLayer = getPrev(i);
-    output = prevLayer->getOutputValue();
-    REGISTER_TIMER_INFO("shrinkFwd", getName().c_str());
-    bpropActs(output, getOutputValue(), i);
-  }
-
-  /* add the bias-vector */
-  if (biases_.get()) {
-    if (sharedBiases_) {
-      addSharedBias();
-    } else {
-      addUnsharedBias();
-    }
-  }
-
-  /* activation */
-  forwardActivation();
-}
-
-void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
-  backwardActivation();
-
-  MatrixPtr imageGrad = getOutputGrad();
-  if (biases_ && biases_->getWGrad()) {
-    bpropBiases(imageGrad);
-    /* Increasing the number of gradient */
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
-
-  for (size_t i = 0; i < inputLayers_.size(); ++i) {
-    /* First, calculate the input layers error */
-    for (size_t off = 0; off < imageGrad->getHeight(); off++) {
-      if (getPrev(i)->getOutputGrad()) {
-        expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off);
-      }
-    }
-    if (weights_[i]->getWGrad()) {
-      /* Then, calculate the W-gradient for the current layer */
-      bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i);
-      /* Increasing the number of gradient */
-      weights_[i]->getParameterPtr()->incUpdate(callback);
-    }
-  }
-}
-
-}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000..be26b9ba88
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
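
Editor's note: the forward pass above relies on the standard O(kn) rewriting of the pairwise sum — a well-known identity, restated here for reference rather than taken from the patch:

    \[
      y = \sum_{i<j} \langle v_i, v_j \rangle x_i x_j
        = \frac{1}{2} \sum_{f=1}^{k}
          \Bigl[ \bigl( \textstyle\sum_{i=1}^{n} v_{i,f} x_i \bigr)^{2}
               - \textstyle\sum_{i=1}^{n} v_{i,f}^{2} x_i^{2} \Bigr]
    \]

Here inputMulFactor_ is xV, so the first bracket is its elementwise square summed with sumRows(*tmpOut_, 0.5, 0), and the second is the product of the cached squares inputSquare_ (x^2) and latentVectorsSquare_ (V^2), subtracted via sumRows(*tmpOut_, -0.5, 1.0).
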
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
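
Editor's note: the backward pass can likewise be checked against the derivatives of the rewritten objective; each expression is additionally scaled by the incoming output gradient (oGrad) in the code:

    \[
      \frac{\partial y}{\partial v_{i,f}}
        = x_i \sum_{j=1}^{n} v_{j,f} x_j - v_{i,f} x_i^{2},
      \qquad
      \frac{\partial y}{\partial x_i}
        = \sum_{f=1}^{k} v_{i,f} \sum_{j=1}^{n} v_{j,f} x_j
        - x_i \sum_{f=1}^{k} v_{i,f}^{2} .
    \]

The first term of each corresponds to the mul() against inputMulFactor_; the second to the rowScale/colScale against the cached squares (with tmpSum_ holding the negated column sums).
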
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000..df20a49934
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as the inner product of learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions,
+ * especially when the input is sparse. While in principle FM can model
+ * higher-order feature interactions, in practice usually only order-2
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculations for the forward and backward passes can be found
+ * in this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the squared values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the squared values of the input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vectors matrix, reused in both the
+  // forward and backward steps
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // A one-row matrix of -1s, used to compute negated column sums of a
+  // sparse matrix
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index b3850f543a..8a2ae6b49f 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -40,6 +40,7 @@ namespace paddle {
 class FeatureMapExpandLayer : public Layer {
 private:
   int numFilters_;
+  bool asRowVector_;
 
 public:
   explicit FeatureMapExpandLayer(const LayerConfig& config) : Layer(config) {}
@@ -62,6 +63,7 @@ bool FeatureMapExpandLayer::init(const LayerMap& layerMap,
 
   CHECK_EQ(inputLayers_.size(), 1UL);
   numFilters_ = config_.num_filters();
+  asRowVector_ = config_.user_arg() != "as_col_vec";
   return true;
 }
 
@@ -76,16 +78,30 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 
   {
     AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outVTmp =
-          Matrix::create(outputV->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inVTmp = Matrix::create(
-          inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      outVTmp->addRowVector(*inVTmp);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        outVTmp->addRowVector(*inVTmp);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outVTmp =
+            Matrix::create(outputV->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inVTmp = Matrix::create(
+            inputV->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        outVTmp->addColVector(*inVTmp);
+      }
     }
   }
   /* activation */ {
@@ -102,24 +118,38 @@ void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outGrad = getOutputGrad();
   size_t batchSize = getInput(0).getBatchSize();
   int imgSize = inGrad->getWidth();
+  /* Do activation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
   {
     AsyncGpuBlock asyncGpuBlock;
-    for (size_t i = 0; i < batchSize; i++) {
-      MatrixPtr outGradTmp =
-          Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
-                         numFilters_,
-                         imgSize,
-                         false,
-                         useGpu_);
-      MatrixPtr inGradTmp = Matrix::create(
-          inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
-      inGradTmp->collectBias(*outGradTmp, 1);
+    if (asRowVector_) {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           numFilters_,
+                           imgSize,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, 1, imgSize, false, useGpu_);
+        inGradTmp->collectBias(*outGradTmp, 1);
+      }
+    } else {
+      for (size_t i = 0; i < batchSize; i++) {
+        MatrixPtr outGradTmp =
+            Matrix::create(outGrad->getData() + i * imgSize * numFilters_,
+                           imgSize,
+                           numFilters_,
+                           false,
+                           useGpu_);
+        MatrixPtr inGradTmp = Matrix::create(
+            inGrad->getData() + i * imgSize, imgSize, 1, false, useGpu_);
+        inGradTmp->sumRows(*outGradTmp, 1, 1);
+      }
     }
   }
-  /* Do derivation */ {
-    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
-    backwardActivation();
-  }
 }
 
 }  // namespace paddle.
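
Editor's note: the two branches above differ only in how each sample's expanded block is laid out. A tiny standalone illustration of the two memory layouts (plain C++, not Paddle types): with imgSize = 3 and numFilters = 2, row-vector mode tiles the input as the rows of a 2x3 block, while column-vector mode tiles it as the columns of a 3x2 block.

    #include <cstdio>

    int main() {
      const int imgSize = 3, numFilters = 2;
      const float in[imgSize] = {1, 2, 3};
      float rowMode[numFilters][imgSize];  // addRowVector: each row is a copy
      float colMode[imgSize][numFilters];  // addColVector: each column is a copy

      for (int f = 0; f < numFilters; ++f)
        for (int i = 0; i < imgSize; ++i) rowMode[f][i] = in[i];
      for (int i = 0; i < imgSize; ++i)
        for (int f = 0; f < numFilters; ++f) colMode[i][f] = in[i];

      // In memory: rowMode = {1,2,3, 1,2,3}; colMode = {1,1, 2,2, 3,3}.
      std::printf("%g %g\n", rowMode[1][0], colMode[0][1]);
      return 0;
    }
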
diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp
index 06907768e9..148516391c 100644
--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"
+#include "paddle/function/GruFunctor.h"
 #include "paddle/utils/Util.h"
 
 namespace paddle {
@@ -25,13 +26,13 @@ void GruCompute::init(LayerConfig &config) {
 
 template <>
 void GruCompute::forward<0>(hl_gru_value value, int frameSize, int batchSize) {
-  hl_cpu_gru_forward(hppl::forward::gru_resetOutput(),
-                     hppl::forward::gru_finalOutput(),
-                     value,
-                     frameSize,
-                     batchSize,
-                     activeNode_,
-                     activeGate_);
+  GruFunctor<DEVICE_TYPE_CPU, real>::compute(hppl::forward::gru_resetOutput(),
+                                             hppl::forward::gru_finalOutput(),
+                                             value,
+                                             frameSize,
+                                             batchSize,
+                                             activeNode_,
+                                             activeGate_);
 }
 
 template <>
@@ -39,14 +40,15 @@ void GruCompute::backward<0>(hl_gru_value value,
                              hl_gru_grad grad,
                              int frameSize,
                              int batchSize) {
-  hl_cpu_gru_backward(hppl::backward::gru_stateGrad(),
-                      hppl::backward::gru_resetGrad(),
-                      value,
-                      grad,
-                      frameSize,
-                      batchSize,
-                      activeNode_,
-                      activeGate_);
+  GruGradFunctor<DEVICE_TYPE_CPU, real>::compute(
+      hppl::backward::gru_stateGrad(),
+      hppl::backward::gru_resetGrad(),
+      value,
+      grad,
+      frameSize,
+      batchSize,
+      activeNode_,
+      activeGate_);
 }
 
 }  // namespace paddle
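
Editor's note: the change above swaps the free functions hl_cpu_gru_forward / hl_cpu_gru_backward for functors templated on device type (GruFunctor, GruGradFunctor), so the same call site can be shared with the function/ directory. A minimal sketch of that pattern with a hypothetical functor (not Paddle's actual GruFunctor signature):

    // Hypothetical sketch of the device-templated functor pattern.
    enum DeviceType { DEVICE_TYPE_CPU, DEVICE_TYPE_GPU };

    template <DeviceType Device, class T>
    struct AxpyFunctor;  // primary template: only specializations are defined

    template <class T>
    struct AxpyFunctor<DEVICE_TYPE_CPU, T> {
      static void compute(int n, T a, const T* x, T* y) {
        for (int i = 0; i < n; ++i) y[i] += a * x[i];  // plain CPU loop
      }
    };

    // A GPU specialization would live in a .cu file and launch a kernel;
    // callers pick the device statically, e.g.
    //   AxpyFunctor<DEVICE_TYPE_CPU, float>::compute(n, a, x, y);
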
diff --git a/paddle/gserver/layers/GruCompute.cu b/paddle/gserver/layers/GruCompute.cu
index d5e547dce3..b4f5c54b14 100644
--- a/paddle/gserver/layers/GruCompute.cu
+++ b/paddle/gserver/layers/GruCompute.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "GruCompute.h"
 
 #include "hl_recurrent_apply.cuh"
@@ -31,8 +30,10 @@ void GruCompute::forward<1>(hl_gru_value value, int frameSize, int batchSize) {
 }
 
 template <>
-void GruCompute::backward<1>(hl_gru_value value, hl_gru_grad grad,
-                            int frameSize, int batchSize) {
+void GruCompute::backward<1>(hl_gru_value value,
+                             hl_gru_grad grad,
+                             int frameSize,
+                             int batchSize) {
   hl_gpu_gru_backward(hppl::backward::gru_stateGrad(),
                       hppl::backward::gru_resetGrad(),
                       value,
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e..236f8096bd 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
 
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b167..7f896e61ca 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 
 }  // namespace paddle
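
Editor's note: the pattern used throughout this layer is worth naming — the bit-code kernels exist only on the CPU, so GPU tensors are staged through the CPU mirrors declared above, the kernel runs, and results written in place are copied back. A minimal sketch of that round-trip under hypothetical stand-in types (not Paddle's Matrix API):

    #include <vector>

    // Hypothetical stand-in for a matrix that may live on the GPU.
    struct DeviceBuf {
      std::vector<float> host;
      bool onGpu;
    };

    void copyToCpu(const DeviceBuf& src, std::vector<float>& dst) { dst = src.host; }
    void copyBack(const std::vector<float>& src, DeviceBuf& dst) { dst.host = src; }

    // Run a CPU-only kernel on possibly-GPU data: stage in, compute, stage out.
    template <class CpuKernel>
    void runOnCpu(DeviceBuf& buf, CpuKernel kernel) {
      if (buf.onGpu) {
        std::vector<float> cpuCopy;
        copyToCpu(buf, cpuCopy);  // mirrors cpuBias_->copyFrom(*biases_grad)
        kernel(cpuCopy);
        copyBack(cpuCopy, buf);   // mirrors biases_grad->copyFrom(*cpuBias_)
      } else {
        kernel(buf.host);         // CPU path aliases the original buffer
      }
    }
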
diff --git a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
new file mode 100644
index 0000000000..d5407555b2
--- /dev/null
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class KmaxSeqScoreLayer : public Layer {
+private:
+  MatrixPtr scores_;
+  size_t beamSize_;
+  void kmaxScorePerSeq(const real* score,
+                       real* sortedRes,
+                       const ICpuGpuVectorPtr seqStartPos);
+
+public:
+  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
+
+bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(1U, inputLayers_.size());
+
+  beamSize_ = config_.beam_size();
+  CHECK_GE(beamSize_, 1U);
+
+  setNeedSequenceInfo(false);
+  setNeedGradient(false);
+  return ret;
+}
+
+void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
+                                        real* sortedIds,
+                                        const ICpuGpuVectorPtr seqStartPos) {
+  int* starts = seqStartPos->getMutableData(false);
+  std::vector<real> indices;
+  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
+    int seqLen = starts[i + 1] - starts[i];
+    int k = std::min(static_cast<int>(beamSize_), seqLen);
+
+    indices.resize(seqLen, 0);
+    std::iota(begin(indices), end(indices), 0.);
+    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
+    std::partial_sort(
+        begin(indices),
+        begin(indices) + k,
+        end(indices),
+        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
+    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
+  }
+}
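
Editor's note: kmaxScorePerSeq above is the classic argsort-top-k idiom — sort indices rather than values with std::partial_sort. The same idiom in isolation (standalone C++; the layer stores the resulting indices as real only because Paddle matrices are real-valued):

    #include <algorithm>
    #include <cstdio>
    #include <numeric>
    #include <vector>

    int main() {
      std::vector<float> scores = {0.1f, 0.9f, 0.4f, 0.7f};
      const int k = 2;

      std::vector<int> idx(scores.size());
      std::iota(idx.begin(), idx.end(), 0);
      // Only the first k positions end up sorted; the rest are unspecified.
      std::partial_sort(idx.begin(), idx.begin() + k, idx.end(),
                        [&](int a, int b) { return scores[a] > scores[b]; });

      for (int i = 0; i < k; ++i) std::printf("%d ", idx[i]);  // prints: 1 3
      return 0;
    }
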
+
+void KmaxSeqScoreLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const MatrixPtr inputScore = getInputValue(0);
+
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "input of " << getName()
+      << " must be a sequence or a nested sequence.";
+  CHECK_EQ(input.value->getWidth(), 1UL)
+      << "input of " << getName() << " are scores over a sequence or "
+      << "a nested sequence, so its width must be 1.";
+
+  if (useGpu_) {
+    /*
+     * Currently, this layer only runs on the CPU. If the rest of the model
+     * runs on the GPU, copy this layer's input from GPU to CPU first.
+     */
+    Matrix::resizeOrCreate(scores_,
+                           inputScore->getHeight(),
+                           1,
+                           false /* trans */,
+                           false /* useGpu */);
+    scores_->copyFrom(*inputScore);
+  } else {
+    scores_ = inputScore;
+  }
+
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, all matrices are currently real-valued, but the output
+   * of this layer consists of selected indices of the given sequence, which
+   * are integers. Storing integer information in a real-valued matrix is
+   * dangerous, since the real values have to be converted to and from int
+   * types.
+   */
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
+  output_.value->one();
+  output_.value->mulScalar(-1.);
+
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
+}
+
+void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/L2DistanceLayer.cpp b/paddle/gserver/layers/L2DistanceLayer.cpp
new file mode 100644
index 0000000000..c71df1b92c
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.cpp
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "L2DistanceLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(l2_distance, L2DistanceLayer);
+
+bool L2DistanceLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
+                                     << "only two inputs.";
+  CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
+                           << "is fixed to be 1.";
+
+  return true;
+}
+
+void L2DistanceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const auto inV1 = getInputValue(0);
+  const auto inV2 = getInputValue(1);
+
+  CHECK(inV1 && inV2);
+  CHECK_EQ(inV1->getHeight(), inV2->getHeight())
+      << "The height of two inputs of this layer must be the same.";
+  CHECK_EQ(inV1->getWidth(), inV2->getWidth())
+      << "The width of two inputs of this layer must be the same.";
+
+  int batchSize = inV1->getHeight();
+  int output_dim = getSize();
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+    reserveOutput(batchSize, output_dim);
+    auto outV = getOutputValue();
+    CHECK(outV) << "The output matrix should not be null.";
+
+    Matrix::resizeOrCreate(
+        inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
+
+    inputSub_->assign(*inV1);
+    inputSub_->sub(*inV2);
+    outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
+    outV->sqrt2(*outV);
+  }
+}
+
+void L2DistanceLayer::backward(const UpdateCallback& callback) {
+  const auto outG = getOutputGrad();
+  const auto outV = getOutputValue();
+  CHECK(outG && outV);
+
+  auto inGrad1 = getInputGrad(0);
+  auto inGrad2 = getInputGrad(1);
+
+  {
+    REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
+
+    if (inGrad1 || inGrad2) {
+      outV->scalarDiv(*outV, 1.);
+      outV->dotMul(*outG, *outV);
+    }
+
+    if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
+
+    if (inGrad2) {
+      inputSub_->mulScalar(-1.);
+      inGrad2->addRowScale(0, *inputSub_, *outV);
+    }
+  }
+}
+
+}  // namespace paddle
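
Editor's note: to see why backward first forms outG / d, note that with d(x, y) = sqrt(sum_i (x_i - y_i)^2), the chain rule gives

    \[
      \frac{\partial d}{\partial x_i} = \frac{x_i - y_i}{d}, \qquad
      \frac{\partial d}{\partial y_i} = -\,\frac{x_i - y_i}{d} .
    \]

So scalarDiv followed by dotMul turns outV into outG / d; inGrad1 then accumulates the cached difference inputSub_ scaled by it, and inGrad2 the negated difference. (This assumes d > 0; the code does not special-case a zero distance.)
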
diff --git a/paddle/gserver/layers/L2DistanceLayer.h b/paddle/gserver/layers/L2DistanceLayer.h
new file mode 100644
index 0000000000..9b12847a10
--- /dev/null
+++ b/paddle/gserver/layers/L2DistanceLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief This layer calculates the L2 distance between two input vectors.
+ * \f[
+ * f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2}
+ * \f]
+ *
+ * - Input1: A vector (batchSize * dataDim)
+ * - Input2: A vector (batchSize * dataDim)
+ * - Output: A vector (batchSize * 1)
+ *
+ * The configuration api is: l2_distance_layer.
+ */
+
+class L2DistanceLayer : public Layer {
+public:
+  explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
+  ~L2DistanceLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  // Store the result of subtracting Input2 from Input1 in forward computation,
+  // which will be reused in backward computation.
+  MatrixPtr inputSub_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 125aaf947f..b55b86221c 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -14,25 +14,14 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
+#include "CostLayer.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
 
-#include "AddtoLayer.h"
-#include "CRFLayer.h"
-#include "CosSimLayer.h"
-#include "CostLayer.h"
-#include "DataLayer.h"
-#include "ExpandConvLayer.h"
-#include "FullyConnectedLayer.h"
-#include "HierarchicalSigmoidLayer.h"
-#include "MaxLayer.h"
-#include "MixedLayer.h"
-#include "NormLayer.h"
-#include "PoolLayer.h"
-#include "TensorLayer.h"
-#include "TransLayer.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "ValidationLayer.h"
+#endif
 
 DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
 
@@ -41,7 +30,7 @@ namespace paddle {
 Layer::Layer(const LayerConfig& config, bool useGpu)
     : config_(config),
       useGpu_(useGpu),
-      deviceId_(-1),
+      deviceId_(CPU_DEVICE),
       needSequenceInfo_(true) {}
 
 bool Layer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
@@ -109,6 +98,11 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
 LayerPtr Layer::create(const LayerConfig& config) {
   std::string type = config.type();
 
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOTE: The following layer types contain the character '-', which is
+  // illegal in identifiers, so they cannot be registered via REGISTER_LAYER.
+  // To stay compatible with old training models, '-' cannot simply be
+  // replaced with '_' either.
   if (type == "multi-class-cross-entropy")
     return LayerPtr(new MultiClassCrossEntropy(config));
   else if (type == "rank-cost")
@@ -117,8 +111,7 @@ LayerPtr Layer::create(const LayerConfig& config) {
     return LayerPtr(new AucValidation(config));
   else if (type == "pnpair-validation")
     return LayerPtr(new PnpairValidation(config));
-  // NOTE: stop adding "if" statements here.
-  // Instead, use REGISTER_LAYER to add more layer types
+#endif
 
   return LayerPtr(registrar_.createByType(config.type(), config));
 }
@@ -191,6 +184,11 @@ void Layer::addOutputArgument(int deviceId) {
 void Layer::copyOutputToOtherDevice() {
   for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
     SetDevice device(outputOtherDevice_[i].deviceId);
+    // If outputOtherDevice_[i].value is a CpuMatrix,
+    // copyFrom is a synchronous call.
+    // If outputOtherDevice_[i].value is a GpuMatrix, copyFrom can be
+    // asynchronous, since all subsequent calculations run on
+    // HPPL_STREAM_DEFAULT.
     outputOtherDevice_[i].value->copyFrom(*getOutputValue(),
                                           HPPL_STREAM_DEFAULT);
     outputOtherDevice_[i].sequenceStartPositions =
@@ -354,12 +352,11 @@ void Layer::backwardActivation() {
   /* Do error clipping */
   if (config_.error_clipping_threshold() > 0.0f) {
     if (FLAGS_log_error_clipping) {
-      CpuVector outGradVec(0, nullptr);
-      outGradVec.subVecFrom(
-          output_.grad->getData(), 0, output_.grad->getElementCnt());
-      real maxAbsGrad = outGradVec.getAbsMax();
+      VectorPtr outGradVec = Vector::create(
+          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
+      real maxAbsGrad = outGradVec->getAbsMax();
       if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize();
+        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
         LOG(INFO) << " layer=" << config_.name() << " need clipping,"
                   << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
       }
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 0ed482889d..9813a55607 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -49,6 +49,12 @@ struct LayerState {
 };
 typedef std::shared_ptr<LayerState> LayerStatePtr;
 
+/// Paddle device ID, MKLDNN is -2, CPU is -1
+enum PADDLE_DEVICE_ID {
+  MKLDNN_DEVICE = -2,
+  CPU_DEVICE = -1,
+};
+
 /**
  * @brief Base class for layer.
  * Define necessary variables and functions for every layer.
@@ -59,7 +65,7 @@ protected:
   LayerConfig config_;
   /// whether to use GPU
   bool useGpu_;
-  /// Device Id. CPU is -1, and GPU is 0, 1, 2 ...
+  /// Device Id. MKLDNN is -2, CPU is -1, and GPU is 0, 1, 2 ...
   int deviceId_;
   /// Input layers
   std::vector<LayerPtr> inputLayers_;
@@ -77,8 +83,10 @@ protected:
   Argument output_;
   /// Several outputs stored on different devices, used in 'parallel_nn' case,
   /// and record them by deviceId_.
+  /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge the output grad.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -172,6 +180,13 @@ protected:
     return inputLayer.getOutput(deviceId_);
   }
 
+  /**
+   * Get the argument of input layer with deviceId.
+   */
+  const Argument& getInput(size_t inputIndex, int deviceId) const {
+    return inputLayers_[inputIndex]->getOutput(deviceId);
+  }
+
   /**
    * Get the forward-input value.
    */
@@ -186,6 +201,13 @@ protected:
     return inputLayer.getOutput(deviceId_).value;
   }
 
+  /**
+   * Get the forward-input value with deviceId.
+   */
+  const MatrixPtr& getInputValue(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).value;
+  }
+
   /**
    * Get the forward-input grad.
    */
@@ -200,6 +222,13 @@ protected:
     return inputLayer.getOutput(deviceId_).grad;
   }
 
+  /**
+   * Get the forward-input grad with deviceId.
+   */
+  const MatrixPtr& getInputGrad(int inputIndex, int deviceId) {
+    return inputLayers_[inputIndex]->getOutput(deviceId).grad;
+  }
+
   /**
    * Get the forward-input label.
    */
@@ -297,6 +326,11 @@ public:
     outputMap_[name] = output;
   }
 
+  /**
+   * Get the size of the output map, for layers with multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index dc3dc15679..abaa1802b7 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -102,7 +102,6 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
 }
 
 void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
-  MatrixPtr matX = Matrix::create(x, length, numClasses_);
   Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
diff --git a/paddle/gserver/layers/LstmCompute.cu b/paddle/gserver/layers/LstmCompute.cu
index f75c0c40cc..d3f59b52a4 100644
--- a/paddle/gserver/layers/LstmCompute.cu
+++ b/paddle/gserver/layers/LstmCompute.cu
@@ -12,41 +12,62 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include "LstmCompute.h"
 #include "hl_recurrent_apply.cuh"
 
 namespace paddle {
 
 template <>
-void LstmCompute::forwardBatch<1>(hl_lstm_value value, int frameSize,
-                                 int batchSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value, frameSize,
-                      batchSize, activeNode_, activeGate_,
+void LstmCompute::forwardBatch<1>(hl_lstm_value value,
+                                  int frameSize,
+                                  int batchSize) {
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      batchSize,
+                      activeNode_,
+                      activeGate_,
                       activeState_);
 }
 
 template <>
-void LstmCompute::backwardBatch<1>(hl_lstm_value value, hl_lstm_grad grad,
-                                   int frameSize, int batchSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, batchSize, activeNode_,
-                       activeGate_, activeState_);
+void LstmCompute::backwardBatch<1>(hl_lstm_value value,
+                                   hl_lstm_grad grad,
+                                   int frameSize,
+                                   int batchSize) {
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       batchSize,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }
 
 template <>
 void LstmCompute::forwardOneSequence<1>(hl_lstm_value value, int frameSize) {
-  hl_gpu_lstm_forward(hppl::forward::lstm(), value,
-                      frameSize, /* batchSize */ 1,
-                      activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_forward(hppl::forward::lstm(),
+                      value,
+                      frameSize,
+                      /* batchSize */ 1,
+                      activeNode_,
+                      activeGate_,
+                      activeState_);
 }
 
 template <>
-void LstmCompute::backwardOneSequence<1>(hl_lstm_value value, hl_lstm_grad grad,
+void LstmCompute::backwardOneSequence<1>(hl_lstm_value value,
+                                         hl_lstm_grad grad,
                                          int frameSize) {
-  hl_gpu_lstm_backward(hppl::backward::lstm(), value, grad,
-                       frameSize, /* batchSize */ 1,
-                       activeNode_, activeGate_, activeState_);
+  hl_gpu_lstm_backward(hppl::backward::lstm(),
+                       value,
+                       grad,
+                       frameSize,
+                       /* batchSize */ 1,
+                       activeNode_,
+                       activeGate_,
+                       activeState_);
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.cpp b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
new file mode 100644
index 0000000000..39bffc26f7
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.cpp
@@ -0,0 +1,219 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNAddtoLayer.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_addto, MKLDNNAddtoLayer);
+
+bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  layerSize_ = getSize();
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize())
+        << "input sizes must be equal";
+  }
+  if (biasParameter_.get() != NULL) {
+    biases_ =
+        std::unique_ptr<Weight>(new Weight(1, layerSize_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNAddtoLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(layerSize_, getSize()) << "this layer's size cannot be changed";
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
+  for (size_t i = 0; i < inputLayers_.size(); i++) {
+    CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
+    CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
+  }
+
+  oc = ic;
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
+                                MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs, biasVal_, out);
+
+  std::shared_ptr<sum::primitive_desc> fwdPD;
+  std::shared_ptr<sum::primitive_desc> biasPD;
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
+}
+
+void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
+                                MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inputs, biasGrad_, out);
+
+  // backward only needs to share the output grad with the input grads
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
+    }
+  }
+
+  // backward bias
+  bwdBias_ = nullptr;
+  if (biasGrad_) {
+    std::vector<float> scales(bs_, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
+    std::vector<primitive::at> srcs;
+    for (size_t i = 0; i < grads_.size(); ++i) {
+      srcs.push_back(*(grads_[i]));
+    }
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
+    pipeline.push_back(*bwdBias_);
+  }
+}
+
+void MKLDNNAddtoLayer::updateWeights(const UpdateCallback& callback) {
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNAddtoLayer::prepareBias(MKLDNNMatrixPtr& bias,
+                                   const MatrixPtr& biasMat,
+                                   const MKLDNNMatrixPtr& out,
+                                   std::vector<MKLDNNMatrixPtr>& outs) {
+  auto pd = MKLDNNMatrix::createPrimitiveDesc(
+      {(int)layerSize_}, memory::format::x, engine_);
+  bias = MKLDNNMatrix::create(pd, biasMat);
+  outs.clear();
+  real* data = out->getData();
+  CHECK_EQ(bs_ * layerSize_, out->getElementCnt());
+  for (int i = 0; i < bs_; ++i) {
+    MatrixPtr tmp =
+        Matrix::create(data + i * layerSize_, 1, layerSize_, false, false);
+    outs.push_back(MKLDNNMatrix::create(bias->getPrimitiveDesc(), tmp));
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
+                                       MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i);
+    CHECK(inputs[i]);
+    inputs[i]->downSpatial();
+  }
+  for (size_t i = 1; i < inputs.size(); i++) {
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inputs[0]->getPrimitiveDesc());
+  }
+
+  resetOutValue(out, inputs[0]->getPrimitiveDesc());
+
+  if (biases_ && biases_->getW()) {
+    prepareBias(bias, biases_->getW(), out, vals_);
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdPD(std::shared_ptr<sum::primitive_desc>& pd,
+                                  std::shared_ptr<sum::primitive_desc>& biasPD,
+                                  std::vector<MKLDNNMatrixPtr>& inputs,
+                                  MKLDNNMatrixPtr bias,
+                                  MKLDNNMatrixPtr out) {
+  std::vector<float> scales(inputs.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+
+  biasPD = nullptr;
+  if (bias) {
+    std::vector<float> scales(2, 1.0);
+    std::vector<memory::primitive_desc> srcPDs(2, bias->getPrimitiveDesc());
+    biasPD.reset(
+        new sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs));
+    CHECK_PRIMITIVE_DESC_EQ(bias, biasPD->dst_primitive_desc());
+  }
+}
+
+void MKLDNNAddtoLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<sum::primitive_desc>& pd,
+    std::shared_ptr<sum::primitive_desc>& biasPD,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new sum(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+
+  fwdBias_.clear();
+  if (biasPD == nullptr || bias == nullptr) {
+    return;
+  }
+  fwdBias_.resize(vals_.size());
+  for (size_t i = 0; i < vals_.size(); ++i) {
+    std::vector<primitive::at> srcs;
+    srcs.push_back(*(vals_[i]));
+    srcs.push_back(*bias);
+    fwdBias_[i].reset(new sum(*biasPD, srcs, *vals_[i]));
+    pipeline.push_back(*fwdBias_[i]);
+  }
+}
+
+void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                       MKLDNNMatrixPtr& bias,
+                                       MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
+  }
+
+  if (biases_ && biases_->getWGrad()) {
+    prepareBias(bias, biases_->getWGrad(), out, grads_);
+  } else {
+    bias = nullptr;
+  }
+}
+
+}  // namespace paddle
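
Editor's note on the bias handling above: mkldnn's sum primitive takes no bias input, so prepareBias wraps each sample's slice of the output as its own one-row memory, and resetFwdPipeline creates one extra sum (slice + bias -> slice) per sample. A rough standalone sketch of the slicing idea (plain C++, without mkldnn):

    #include <vector>

    // Emulate prepareBias: view a (bs x layerSize) buffer as bs one-row
    // slices and add the same bias row to each slice in place.
    void addBiasPerSample(std::vector<float>& out,
                          const std::vector<float>& bias,
                          int bs, int layerSize) {
      for (int i = 0; i < bs; ++i) {
        float* row = out.data() + i * layerSize;  // one-row view per sample
        for (int j = 0; j < layerSize; ++j) row[j] += bias[j];
      }
    }
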
diff --git a/paddle/gserver/layers/MKLDNNAddtoLayer.h b/paddle/gserver/layers/MKLDNNAddtoLayer.h
new file mode 100644
index 0000000000..0ea3e208e5
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNAddtoLayer.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer implementing the Addto layer.
+ *
+ * The config file api is mkldnn_addto.
+ */
+class MKLDNNAddtoLayer : public MKLDNNLayer {
+protected:
+  // layer size == ic * ih * iw == oc * oh * ow, and cannot be changed
+  size_t layerSize_;
+
+  std::unique_ptr<Weight> biases_;
+
+  // buffers for adding bias
+  std::vector<MKLDNNMatrixPtr> vals_;
+  std::vector<MKLDNNMatrixPtr> grads_;
+  // primitives for adding bias
+  std::vector<std::shared_ptr<mkldnn::primitive>> fwdBias_;
+  std::shared_ptr<mkldnn::primitive> bwdBias_;
+
+public:
+  explicit MKLDNNAddtoLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNAddtoLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                  std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& pd,
+                        std::shared_ptr<mkldnn::sum::primitive_desc>& biasPD,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+
+  void prepareBias(MKLDNNMatrixPtr& bias,
+                   const MatrixPtr& biasMat,
+                   const MKLDNNMatrixPtr& out,
+                   std::vector<MKLDNNMatrixPtr>& outs);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
new file mode 100644
index 0000000000..af02a37cad
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+typedef enum {
+  MKLDNN_BASE = 1,   // basic info of MKLDNN
+  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
+  MKLDNN_ALL = 4,    // show all info of MKLDNN
+} MKLDNN_LOG_LEVEL;
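+
+// These levels plug into glog verbose logging: a VLOG(MKLDNN_SIZES) message
+// shows up when the verbosity is at least 3 (e.g. GLOG_v=3). A usage sketch,
+// assuming glog is initialized:
+//   VLOG(MKLDNN_BASE) << "basic info";
+//   VLOG(MKLDNN_FMTS) << "format info";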
+
+/**
+ * @brief MKLDNN CPU engine.
+ *
+ */
+class CPUEngine {
+public:
+  static CPUEngine& Instance() {
+    // Thread-safe in C++11.
+    static CPUEngine myInstance;
+    return myInstance;
+  }
+
+  // Disallow copy or move
+  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
+  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
+  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
+  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
+
+  mkldnn::engine& getEngine() { return cpuEngine_; }
+
+protected:
+  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
+  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
+  ~CPUEngine() {}
+
+private:
+  mkldnn::engine cpuEngine_;
+};
+
+/**
+ * @brief MKLDNN Stream.
+ *
+ */
+class MKLDNNStream {
+public:
+  MKLDNNStream() : ready_(false) { resetState(); }
+
+  virtual ~MKLDNNStream() {}
+
+  /**
+   * @brief Submit the primitives to the stream
+   * @param prims The primitives vector
+   * @param block Whether to wait for the stream to complete
+   */
+  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
+    resetState();
+    stream_->submit(prims).wait(block);
+    ready_ = false;
+  }
+
+  /**
+   * @brief Reset the mkldnn stream
+   */
+  void resetState() {
+    if (ready_) {
+      return;
+    }
+    // TODO(TJ): change me when mkldnn have method to reset this state
+    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
+    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+    ready_ = true;
+  }
+
+private:
+  bool ready_;
+  std::shared_ptr<mkldnn::stream> stream_;
+};
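+
+// A minimal usage sketch of MKLDNNStream, assuming `pipeline` was filled by
+// a layer's resetFwd/resetBwd:
+//   std::vector<mkldnn::primitive> pipeline;
+//   // ... primitives pushed here ...
+//   MKLDNNStream stream;
+//   stream.submit(pipeline);  // blocks until all primitives complete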
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
new file mode 100644
index 0000000000..7faca0f8b7
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.cpp
@@ -0,0 +1,306 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNBatchNormLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
+
+bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  // the first one is the input layer
+  // the other two are created in config_parser.py to save moving mean and var
+  CHECK_EQ(inputLayers_.size(), 3U);
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK_EQ(inputLayers_.size(), size_t(config_.inputs_size()));
+
+  const ImageConfig& conf = config_.inputs(0).image_conf();
+  ic_ = conf.channels();
+  ih_ = inputLayers_[0]->getOutput().getFrameHeight();
+  iw_ = inputLayers_[0]->getOutput().getFrameWidth();
+  if (iw_ == 0 && ih_ == 0) {
+    iw_ = conf.img_size();
+    ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  }
+  oc_ = ic_;
+  oh_ = ih_;
+  ow_ = iw_;
+  if (config_.has_use_global_stats()) {
+    useGlobalStats_ = config_.use_global_stats();
+  }
+  movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
+
+  VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
+                    << " --- global stats";
+  VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
+
+  initWeight();
+  movingMean_.reset(new Weight(oc_, 1, parameters_[1], 0));
+  movingVar_.reset(new Weight(oc_, 1, parameters_[2], 0));
+  return true;
+}
+
+void MKLDNNBatchNormLayer::initWeight() {
+  weight_.reset(new Weight(1, oc_, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+  }
+  CHECK_EQ(weight_ != nullptr, biases_ != nullptr)
+      << "only support have both weight and bias, or neither";
+  if (weight_ && weight_->getW()) {
+    CHECK(biases_ && biases_->getW());
+    valueScaleShift_ = Matrix::create(2, oc_, false, false);
+    valueScaleShift_->zeroMem();
+    VectorPtr scale(new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), 0));
+    VectorPtr shift(
+        new CpuVector(oc_, valueScaleShift_->getMemoryHandle(), oc_));
+    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_VALUE);
+    scale->copyFrom(*wgt);
+    shift->copyFrom(*bias);
+    wgt->setData(valueScaleShift_->getData());
+    bias->setData(valueScaleShift_->getData() + oc_);
+  }
+  if (weight_ && weight_->getWGrad()) {
+    CHECK(biases_ && biases_->getWGrad());
+    gradScaleShift_ = Matrix::create(2, oc_, false, false);
+    gradScaleShift_->zeroMem();
+    const VectorPtr& wgt = parameters_[0]->getBuf(PARAMETER_GRADIENT);
+    const VectorPtr& bias = biasParameter_->getBuf(PARAMETER_GRADIENT);
+    wgt->setData(gradScaleShift_->getData());
+    bias->setData(gradScaleShift_->getData() + oc_);
+  }
+}
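+
+// Note on initWeight above: mkldnn batch normalization consumes scale and
+// shift packed into one 2 x oc buffer (row 0 = scale, row 1 = shift), so the
+// paddle weight and bias are copied in once and the parameter data pointers
+// are redirected into valueScaleShift_ / gradScaleShift_, letting updates
+// touch the packed buffers directly.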
+
+void MKLDNNBatchNormLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+  // prepare mean and var if necessary
+  if (useGlobalStats_) {
+    CHECK(mean_);
+    CHECK(var_);
+    mean_->copyFrom(*(movingMean_->getW()));
+    var_->copyFrom(*(movingVar_->getW()));
+  }
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
+  // calculating and saving moving mean and variance
+  CHECK_EQ(useGlobalStats_, false);
+  movingMean_->getW()->add(
+      *mean_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+  // here var is v^2
+  movingVar_->getW()->add(*var_, movingAvgFraction_, 1.0 - movingAvgFraction_);
+}
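+
+// e.g. with movingAvgFraction_ = 0.9 this computes
+// moving = 0.9 * moving + 0.1 * local, so the running statistics track
+// roughly the last ten mini-batches.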
+
+void MKLDNNBatchNormLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  oh = ih;
+  ow = iw;
+  // ic_ and oc cannot be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel cannot be changed";
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
+                                    MKLDNNMatrixPtr& out) {
+  // In the training phase, mean and var are always calculated,
+  // so useGlobalStats must be false.
+  // In the scoring phase, it depends on the useGlobalStats setting.
+  if (passType_ != PASS_TEST && useGlobalStats_ == true) {
+    LOG(WARNING) << "use_global_stats is an invalid setting in training phase";
+    useGlobalStats_ = false;
+  }
+
+  resetFwdBuffers(inputs[0], wgtVal_, out);
+
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
+}
+
+void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
+                                    MKLDNNMatrixPtr& out) {
+  std::shared_ptr<bn_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], wgtGrad_, out);
+
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
+}
+
+void MKLDNNBatchNormLayer::forward(PassType passType) {
+  MKLDNNLayer::forward(passType);
+
+  // calculate and save moving mean and variance
+  if (passType_ != PASS_TEST) {
+    calMovingMeanAndVar();
+  }
+}
+
+void MKLDNNBatchNormLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNBatchNormLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                           MKLDNNMatrixPtr& wgt,
+                                           MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
+
+  if (valueScaleShift_) {
+    auto pd = MKLDNNMatrix::createPrimitiveDesc({2, oc_}, format::nc, engine_);
+    resetWithMatrix(wgt, valueScaleShift_, pd);
+  }
+  if (passType_ != PASS_TEST || useGlobalStats_) {
+    auto pd = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    mean_ = MKLDNNMatrix::create(pd);
+    var_ = MKLDNNMatrix::create(pd);
+  }
+}
+
+void MKLDNNBatchNormLayer::resetFwdPD(
+    std::shared_ptr<bn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr in,
+    MKLDNNMatrixPtr wgt,
+    MKLDNNMatrixPtr out) {
+  flags_ = 0u;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  if (useGlobalStats_) {
+    flags_ = (flags_ | batch_normalization_flag::use_global_stats);
+  }
+  if (wgt) {
+    flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
+  }
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
+  pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+  if (wgt) {
+    CHECK_PRIMITIVE_DESC_EQ(wgt, pd->weights_primitive_desc());
+  }
+  if (passType_ != PASS_TEST || useGlobalStats_) {
+    CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+    CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
+  }
+}
+
+void MKLDNNBatchNormLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<bn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  if (passType_ == PASS_TEST) {
+    if (useGlobalStats_) {
+      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd,
+                                             *in,
+                                             (const primitive::at)(*mean_),
+                                             (const primitive::at)(*var_),
+                                             *wgt,
+                                             *out)
+                                : new bn_fwd(*pd,
+                                             *in,
+                                             (const primitive::at)(*mean_),
+                                             (const primitive::at)(*var_),
+                                             *out));
+    } else {
+      fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out)
+                                : new bn_fwd(*pd, *in, *out));
+    }
+  } else {
+    CHECK_EQ(useGlobalStats_, false)
+        << "useGlobalStats should be false in training";
+    fwd_.reset(wgt != nullptr ? new bn_fwd(*pd, *in, *wgt, *out, *mean_, *var_)
+                              : new bn_fwd(*pd, *in, *out, *mean_, *var_));
+  }
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                           MKLDNNMatrixPtr& wgt,
+                                           MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+  if (gradScaleShift_) {
+    CHECK(wgtVal_);
+    resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
+  }
+}
+
+void MKLDNNBatchNormLayer::resetBwdPD(
+    std::shared_ptr<bn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
+  auto md = in->getMemoryDesc();
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
+  pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+  CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(mean_, pd->mean_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(var_, pd->variance_primitive_desc());
+}
+
+void MKLDNNBatchNormLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<bn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  bwdData_.reset(
+      wgt && wgtVal_
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNBatchNormLayer.h b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
new file mode 100644
index 0000000000..1cf33cb34f
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNBatchNormLayer.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::batch_normalization_forward bn_fwd;
+typedef mkldnn::batch_normalization_backward bn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer implementing the BatchNorm layer.
+ *
+ * The config file api is mkldnn_batch_norm.
+ */
+class MKLDNNBatchNormLayer : public MKLDNNLayer {
+protected:
+  // save the forward primitive_desc, which is reused in the backward pass
+  std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
+
+  // Epsilon value used in the batch normalization formula.
+  real epsilon_;
+
+  // weight and bias in paddle
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+  // mkldnn uses one large buffer to store both scale and shift,
+  // which correspond to weight and bias in paddle.
+  MatrixPtr valueScaleShift_;
+  MatrixPtr gradScaleShift_;
+  // Moving average of mean.
+  std::unique_ptr<Weight> movingMean_;
+  // Moving average of variance.
+  std::unique_ptr<Weight> movingVar_;
+
+  // if useGlobalStats_ is true, the loaded mean and variance are used;
+  // otherwise, mean and variance are calculated in every mini-batch.
+  bool useGlobalStats_;
+  // used in MKLDNN primitive desc
+  unsigned flags_;
+  // used to compute moving mean and variance.
+  real movingAvgFraction_;
+  // whether the weight has been initialized
+  bool hasInitedWgt_;
+
+  // local mean and variance
+  // when useGlobalStats_ is true they are loaded from moving mean and variance
+  // otherwise they are calculated from the current mini-batch
+  MKLDNNMatrixPtr mean_;
+  MKLDNNMatrixPtr var_;
+
+public:
+  explicit MKLDNNBatchNormLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), useGlobalStats_(true), hasInitedWgt_(false) {}
+
+  ~MKLDNNBatchNormLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+protected:
+  void initWeight();
+  /**
+   * calculate moving mean and variance.
+   * moving = moving * AvgFraction + local * (1 - AvgFraction)
+   */
+  void calMovingMeanAndVar();
+
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& wgt,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<bn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.cpp b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
new file mode 100644
index 0000000000..520ccc1a99
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.cpp
@@ -0,0 +1,186 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConcatLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
+
+bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_GT(inputLayers_.size(), 1UL);
+  CHECK(!biasParameter_);
+  return true;
+}
+
+void MKLDNNConcatLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  ic = inputLayers_[0]->getSize() / ih / iw;
+  CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);
+  CHECK_GT(inputLayers_.size(), 1UL);
+  channels_.resize(inputLayers_.size());
+  channels_[0] = ic;
+  oc = ic;
+  for (size_t i = 1; i < inputLayers_.size(); i++) {
+    int batchsize = 0, height = 0, width = 0;
+    reshapeInput(batchsize, height, width, i);
+    CHECK_EQ(bs, batchsize);
+    CHECK_EQ(ih, height);
+    CHECK_EQ(iw, width);
+
+    channels_[i] = inputLayers_[i]->getSize() / height / width;
+    CHECK_EQ((size_t)channels_[i] * height * width, inputLayers_[i]->getSize());
+    oc += channels_[i];
+  }
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs, out);
+
+  std::shared_ptr<concat::primitive_desc> fwdPD;
+  resetFwdPD(fwdPD, inputs, out);
+
+  resetFwdPipeline(pipeline, fwdPD, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
+                                 std::vector<MKLDNNMatrixPtr>& inputs,
+                                 MKLDNNMatrixPtr& out) {
+  resetBwdBuffers(inputs, out);
+
+  resetBwdPipeline(pipeline, bwds_, inputs, out);
+}
+
+void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  inputs.resize(inputLayers_.size());
+  bool has8c = false, has16c = false, hasnc = false;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    resetInValue(inputs[i], nullptr, i, channels_[i]);
+    inputs[i]->downSpatial();
+    CHECK(inputs[i]);
+    auto dm = inputs[i]->getDims();
+    // input formats can differ, but ndims must be equal
+    CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
+    CHECK_EQ(bs_, dm[0]);
+    CHECK_EQ(channels_[i], dm[1]);
+    if (dm.size() > 2) {
+      CHECK_EQ(ih_, dm[2]);
+      CHECK_EQ(iw_, dm[3]);
+    }
+    if (inputs[i]->getFormat() == format::nc) {
+      hasnc = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw8c) {
+      has8c = true;
+    }
+    if (inputs[i]->getFormat() == format::nChw16c) {
+      has16c = true;
+    }
+  }
+
+  format outFmt;
+  if (has16c && oc_ % 16 == 0) {
+    outFmt = format::nChw16c;
+  } else if (has8c && oc_ % 8 == 0) {
+    outFmt = format::nChw8c;
+  } else if (hasnc) {
+    CHECK(oh_ == 1 && ow_ == 1);
+    outFmt = format::nc;
+  } else {
+    outFmt = format::nchw;
+  }
+  memory::dims outDims =
+      hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
+  auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
+  resetOutValue(out, outPD);
+}
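+
+// Format choice above, by example: inputs {nChw8c, nchw} with oc_ = 24 yield
+// an nChw8c output since 24 % 8 == 0, while oc_ = 20 would fall back to
+// plain nchw; nc is only legal when the spatial size is 1x1.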
+
+void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
+                                   std::vector<MKLDNNMatrixPtr>& inputs,
+                                   MKLDNNMatrixPtr out) {
+  std::vector<memory::primitive_desc> srcPDs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcPDs.push_back(inputs[i]->getPrimitiveDesc());
+  }
+  CHECK(out);
+  pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
+  CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
+}
+
+void MKLDNNConcatLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<concat::primitive_desc>& pd,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  std::vector<primitive::at> srcs;
+  for (size_t i = 0; i < inputs.size(); i++) {
+    srcs.push_back(*(inputs[i]));
+  }
+  fwd_.reset(new concat(*pd, srcs, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                                        MKLDNNMatrixPtr& out) {
+  CHECK(outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  CHECK(out);
+
+  inputs.resize(inputLayers_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    CHECK(inVals_[i]);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
+    CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
+  }
+}
+
+void MKLDNNConcatLayer::resetBwdPipeline(
+    std::vector<mkldnn::primitive>& pipeline,
+    std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+    std::vector<MKLDNNMatrixPtr>& inputs,
+    MKLDNNMatrixPtr& out) {
+  // reset the backward primitives
+  memory::dims offsets = {0, 0, 0, 0};
+  prims.resize(inputs.size());
+  CHECK_EQ(inputs.size(), channels_.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    auto viewPD = view::primitive_desc(
+        out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
+    auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
+                                         inputs[i]->getPrimitiveDesc());
+    prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
+    offsets[axis_] += channels_[i];
+    // push to pipeline
+    pipeline.push_back(*prims[i]);
+  }
+}
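+
+// Backward concat above is one reorder per input: a view primitive_desc
+// selects that input's slice of the output diff at the running channel
+// offset, and a reorder copies the slice into the input grad; e.g. with
+// channels_ = {8, 16} and axis_ = 1 the offsets are 0 and then 8.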
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConcatLayer.h b/paddle/gserver/layers/MKLDNNConcatLayer.h
new file mode 100644
index 0000000000..37f3a26c5e
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConcatLayer.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer implementing the Concatenate layer.
+ *
+ * The config file api is mkldnn_concat.
+ */
+class MKLDNNConcatLayer : public MKLDNNLayer {
+protected:
+  std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
+  // input channel numbers
+  std::vector<int> channels_;
+
+  // concat dimension in MKLDNN
+  // if axis_ == 0, concatenate along the batch dimension
+  // if axis_ == 1, concatenate along the channel dimension (default)
+  int axis_;
+
+public:
+  explicit MKLDNNConcatLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), axis_(1) {}
+
+  ~MKLDNNConcatLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    CHECK_EQ(channels_.size(), inputLayers_.size());
+    for (size_t i = 0; i < channels_.size(); ++i) {
+      VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
+                         << ": " << bs_ << ", " << channels_[i] << ", " << ih_
+                         << ", " << iw_;
+    }
+    VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
+                       << ", " << ow_;
+  }
+
+  size_t keepCondition() {
+    // reset when the total element size of all inputs changed
+    size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
+    for (size_t i = 1; i < inputLayers_.size(); ++i) {
+      totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
+    }
+    return totalSize;
+  }
+
+protected:
+  void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                  std::vector<MKLDNNMatrixPtr>& inputs,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
new file mode 100644
index 0000000000..ab1d0f7b04
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -0,0 +1,388 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNConvLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_conv, MKLDNNConvLayer);
+
+bool MKLDNNConvLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(config_.shared_biases()) << "Only support shared biases yet";
+
+  oc_ = config_.num_filters();
+  const ConvConfig& conf = config_.inputs(0).conv_conf();
+  ic_ = conf.channels();
+  fw_ = conf.filter_size();
+  fh_ = conf.filter_size_y();
+  pw_ = conf.padding();
+  ph_ = conf.padding_y();
+  dw_ = conf.dilation();
+  dh_ = conf.dilation_y();
+  sw_ = conf.stride();
+  sh_ = conf.stride_y();
+  gp_ = conf.groups();
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  caffeMode_ = conf.caffe_mode();
+  CHECK(caffeMode_) << "Only support caffe mode yet";
+  CHECK(dh_ == 1 && dw_ == 1) << "Only support dilation 1 yet";
+  // check group setting
+  CHECK_EQ((oc_ / gp_) * gp_, oc_) << "oc must be divisible by group";
+  CHECK_EQ((ic_ / gp_) * gp_, ic_) << "ic must be divisible by group";
+
+  // create weight
+  size_t height = oc_ / gp_;
+  size_t width = ic_ * fh_ * fw_;
+  CHECK_EQ(parameters_[0]->getSize(), height * width);
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(height, width, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNConvLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  CHECK(wgtVal_) << "should have been initialized";
+  // the paddle weight format is oihw or goihw
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
+  hasInitedWgt_ = true;
+}
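+
+// Paddle stores conv weights as oihw (goihw with groups); the reorder above
+// converts the data once into whatever internal format the selected mkldnn
+// primitive prefers, and hasInitedWgt_ guards against doing it twice.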
+
+void MKLDNNConvLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = (gp_ == 1) ? memory::format::oihw : memory::format::goihw;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNConvLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  // calculate output sizes
+  // oc cannot be changed
+  int fh = (fh_ - 1) * dh_ + 1;
+  int fw = (fw_ - 1) * dw_ + 1;
+  oh = outputSize(ih, fh, ph_, sh_, caffeMode_);
+  ow = outputSize(iw, fw, pw_, sw_, caffeMode_);
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
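+
+// A worked example of the sizes above, assuming the usual caffe-mode formula
+// oh = (ih + 2 * ph - fh) / sh + 1: with ih = 28, fh_ = 3, dh_ = 1 (so the
+// effective fh is 3), ph_ = 1 and sh_ = 2, oh = (28 + 2 - 3) / 2 + 1 = 14.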
+
+void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdPD(fwdPD_);
+
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+}
+
+void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdWgtPD(bwdWgtPD);
+
+  resetBwdDataPD(bwdDataPD);
+
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+}
+
+void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNConvLayer::loadConvSettings(memory::dims& wgt,
+                                       memory::dims& bias,
+                                       memory::dims& stride,
+                                       memory::dims& dilation,
+                                       memory::dims& padL,
+                                       memory::dims& padR) {
+  wgt = (gp_ == 1) ? memory::dims{oc_, ic_, fh_, fw_}
+                   : memory::dims{gp_, oc_ / gp_, ic_ / gp_, fh_, fw_};
+  bias = memory::dims{oc_};
+  stride = memory::dims{sh_, sw_};
+  padL = memory::dims{ph_, pw_};
+  padR = getPaddingR();
+  // note: mkldnn dilation starts from 0
+  dilation = memory::dims{dh_ - 1, dw_ - 1};
+}
+
+void MKLDNNConvLayer::resetFwdPD(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd) {
+  // dims for conv
+  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  conv_fwd::desc fwdDesc =
+      biases_ && biases_->getW()
+          ? conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(biasDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind)
+          : conv_fwd::desc(pk,
+                           algo,
+                           MKLDNNMatrix::createMemoryDesc(inDims),
+                           MKLDNNMatrix::createMemoryDesc(wgtDims),
+                           MKLDNNMatrix::createMemoryDesc(outDims),
+                           strides,
+                           dilations,
+                           padL,
+                           padR,
+                           padKind);
+  pd.reset(new conv_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNConvLayer::resetFwdBuffers(
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(pd);
+  resetInValue(
+      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
+
+  resetOutValue(out, pd->dst_primitive_desc());
+
+  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
+
+  if (biases_ && biases_->getW()) {
+    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNConvLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  if (bias) {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
+  } else {
+    fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
+  }
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNConvLayer::resetBwdWgtPD(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& pd) {
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+
+  // create backward weight using input, output and weight value memory desc
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  CHECK(wgtVal_) << "Should have weight value";
+  algorithm algo = algorithm::convolution_direct;
+  padding_kind padKind = padding_kind::zero;
+  auto bwdWgtDesc = biasVal_ != nullptr
+                        ? conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            biasVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind)
+                        : conv_bwdWgt::desc(algo,
+                                            inVals_[0]->getMemoryDesc(),
+                                            wgtVal_->getMemoryDesc(),
+                                            outVal_->getMemoryDesc(),
+                                            strides,
+                                            padL,
+                                            padR,
+                                            padKind);
+  pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      pd->diff_weights_primitive_desc(),
+      "primitive desc of weight value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdDataPD(
+    std::shared_ptr<conv_bwdData::primitive_desc>& pd) {
+  pd = nullptr;
+  if (inputLayers_[0]->getOutput().grad == nullptr) {
+    return;
+  }
+
+  memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
+  loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
+  CHECK(inVals_[0]) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
+  // create backward data using input and output value memory desc
+  // but using weight memory desc with any format
+  auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
+                                        inVals_[0]->getMemoryDesc(),
+                                        MKLDNNMatrix::createMemoryDesc(wgtDims),
+                                        outVal_->getMemoryDesc(),
+                                        strides,
+                                        padL,
+                                        padR,
+                                        padding_kind::zero);
+  pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+  CHECK_PRIMITIVE_DESC_EQ(
+      inVals_[0],
+      pd->diff_src_primitive_desc(),
+      "primitive desc of in value and grad should be equal");
+  CHECK_PRIMITIVE_DESC_EQ(
+      outVal_,
+      pd->diff_dst_primitive_desc(),
+      "primitive desc of out value and grad should be equal");
+}
+
+void MKLDNNConvLayer::resetBwdBuffers(
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(wgtPD);
+  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
+
+  resetWithMatrix(
+      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(
+      wgtVal_,
+      wgt->getPrimitiveDesc(),
+      "primitive desc of weight grad and value should be equal");
+
+  bias = nullptr;
+  if (biases_ && biases_->getWGrad()) {
+    resetWithMatrix(
+        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
+    CHECK(bias);
+    CHECK_PRIMITIVE_DESC_EQ(
+        biasVal_,
+        bias->getPrimitiveDesc(),
+        "primitive desc of bias grad and value should be equal");
+  }
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  resetInGrad(in, dataPD->diff_src_primitive_desc());
+  resetWgtValBwdData(dataPD, wgtValBwdData_);
+}
+
+void MKLDNNConvLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  // add bwdWgt handle
+  if (bias) {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (dataPD == nullptr) {
+    return;
+  }
+  if (cvtWgtVal_) {
+    pipeline.push_back(*cvtWgtVal_);
+  }
+  // add bwdData handle
+  CHECK(wgtValBwdData_) << "Should have weight memory";
+  bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+void MKLDNNConvLayer::resetWgtValBwdData(
+    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+    MKLDNNMatrixPtr& wgt) {
+  if (dataPD == nullptr) {
+    return;
+  }
+
+  // create new weight value for backward data, and create reorder if necessary
+  // since the primitive_desc may differ from wgtVal_'s
+  CHECK(wgtVal_) << "should have weight value";
+  if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
+    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
+    cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
+    CHECK(cvtWgtVal_);
+  } else {
+    wgtValBwdData_ = wgtVal_;
+  }
+  VLOG(MKLDNN_FMTS) << "weight value format for backward data: "
+                    << wgtValBwdData_->getFormat();
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
new file mode 100644
index 0000000000..3e754a0e65
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::convolution_forward conv_fwd;
+typedef mkldnn::convolution_backward_weights conv_bwdWgt;
+typedef mkldnn::convolution_backward_data conv_bwdData;
+
+/**
+ * @brief A subclass of MKLDNNLayer implementing the conv layer.
+ *
+ * The config file api is mkldnn_conv.
+ */
+class MKLDNNConvLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // dilation height and width
+  int dh_, dw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+  // group number
+  int gp_;
+
+  // in resetBwdData, the format of wgtValBwdData_ may differ from wgtVal_'s
+  MKLDNNMatrixPtr wgtValBwdData_;
+  // convert handle from wgtVal_ to wgtValBwdData_
+  std::shared_ptr<mkldnn::reorder> cvtWgtVal_;
+
+  // save the forward primitive_desc, which is reused in the backward pass
+  std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
+
+  // whether the weight has been initialized
+  bool hasInitedWgt_;
+
+  // true by default, which impacts the calculation of the output image size.
+  // see MathUtils.h for details
+  bool caffeMode_;
+
+  // weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNConvLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), caffeMode_(true) {}
+
+  ~MKLDNNConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ", ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
+  }
+
+protected:
+  /**
+   * load the dims settings of this conv
+   */
+  void loadConvSettings(mkldnn::memory::dims& wgt,
+                        mkldnn::memory::dims& bias,
+                        mkldnn::memory::dims& stride,
+                        mkldnn::memory::dims& dilation,
+                        mkldnn::memory::dims& padL,
+                        mkldnn::memory::dims& padR);
+
+  void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
+  void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
+  void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
+  void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                       std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                       MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
+                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * reset the MKLDNNMatrix of the weight value for backward data,
+   * since its primitive_desc may differ from wgtVal_'s
+   */
+  void resetWgtValBwdData(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
+                          MKLDNNMatrixPtr& wgt);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_convolution_forward_common.hpp
+   * @note: mkldnn dilation starts from 0 while paddle starts from 1
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ - ((fh_ - 1) * dh_ + 1) + ph_ + padR[0]) / sh_ + 1 != oh_) {
+        ++padR[0];
+      }
+      if ((iw_ - ((fw_ - 1) * dw_ + 1) + pw_ + padR[1]) / sw_ + 1 != ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
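+
+  // With caffeMode_ the initial padR already satisfies the equality, so no
+  // increment occurs; the loop matters for ceil-mode output sizes (paddle's
+  // non-caffe formula rounds up), e.g. ih_ = 6, fh_ = 3, dh_ = 1, ph_ = 0,
+  // sh_ = 2 gives oh_ = 3 there, and padR[0] grows from 0 to 1 so that
+  // (6 - 3 + 0 + 1) / 2 + 1 == 3.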
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
new file mode 100644
index 0000000000..c8778bdd07
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -0,0 +1,262 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNFcLayer.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
+
+bool MKLDNNFcLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
+
+  // output size, cannot be changed
+  oc_ = getSize();
+  oh_ = 1;
+  ow_ = 1;
+  ih_ = 1;
+  iw_ = 1;
+
+  // input size cannot change in FC
+  iLayerSize_ = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
+
+  // create weight
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_, 0));
+  }
+  return true;
+}
+
+void MKLDNNFcLayer::convertWeightsFromPaddle() {
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto srcFmt = targetDim.size() == 2 ? format::io : format::ihwo;
+  wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNFcLayer::convertWeightsToPaddle() {
+  CHECK(wgtVal_) << "should have been initialized";
+  auto targetDim = wgtVal_->getDims();
+  auto dstFmt = targetDim.size() == 2 ? format::io : format::ihwo;
+  wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
+}
+
+void MKLDNNFcLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+
+  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
+  ic = iLayerSize_ / (ih * iw);
+  CHECK_EQ(size_t(ic * ih * iw), iLayerSize_) << "not divisible";
+  CHECK_EQ(size_t(oc), getSize());
+
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc);
+}
+
+void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
+                             MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
+}
+
+void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
+                             std::vector<MKLDNNMatrixPtr>& inputs,
+                             MKLDNNMatrixPtr& out) {
+  std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
+  std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
+
+  resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
+
+  resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
+
+  resetBwdDataPD(bwdDataPD, inputs[0], out);
+
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
+}
+
+void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
+  weight_->getParameterPtr()->incUpdate(callback);
+  if (biases_ && biases_->getWGrad()) {
+    biases_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  in->downSpatial();
+
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
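+  // the weight layout must match the input layout: when the input channels
+  // are blocked (nChw8c / nChw16c), the weight's input channels use the
+  // same blocking so the inner product can consume both directly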
+  format wgtFmt = format::oihw;
+  if (in->getFormat() == format::nChw8c) {
+    wgtFmt = format::oIhw8i;
+  } else if (in->getFormat() == format::nChw16c) {
+    wgtFmt = format::oIhw16i;
+  }
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
+  wgt->downSpatial();
+
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNFcLayer::resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                               MKLDNNMatrixPtr in,
+                               MKLDNNMatrixPtr wgt,
+                               MKLDNNMatrixPtr bias,
+                               MKLDNNMatrixPtr out) {
+  CHECK(in);
+  CHECK(wgt);
+  CHECK(out);
+  prop_kind pk = prop_kind::forward;
+  fc_fwd::desc fwdDesc = bias != nullptr ? fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        bias->getMemoryDesc(),
+                                                        out->getMemoryDesc())
+                                         : fc_fwd::desc(pk,
+                                                        in->getMemoryDesc(),
+                                                        wgt->getMemoryDesc(),
+                                                        out->getMemoryDesc());
+  pd.reset(new fc_fwd::primitive_desc(fwdDesc, engine_));
+}
+
+void MKLDNNFcLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  if (bias) {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
+  } else {
+    fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
+  }
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                    MKLDNNMatrixPtr& wgt,
+                                    MKLDNNMatrixPtr& bias,
+                                    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+
+  CHECK(wgtVal_);
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
+  }
+}
+
+void MKLDNNFcLayer::resetBwdWgtPD(
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
+  fc_bwdWgt::desc bwdWgtDesc =
+      bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             bias->getMemoryDesc(),
+                             out->getMemoryDesc())
+           : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
+                             wgt->getMemoryDesc(),
+                             out->getMemoryDesc());
+  pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNFcLayer::resetBwdDataPD(
+    std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  pd = nullptr;
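+  // a null input grad means the previous layer needs no gradient,
+  // so the backward-data primitive can be skipped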
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_);
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(
+      in->getMemoryDesc(), wgtVal_->getMemoryDesc(), out->getMemoryDesc());
+  pd.reset(new fc_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNFcLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+    std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& wgt,
+    MKLDNNMatrixPtr& bias,
+    MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0]);
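+  // bwdWgt computes the weight (and bias) grads from the saved forward
+  // input value and the output grad; bwdData below reuses the forward
+  // weight value to propagate the grad back to the input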
+  if (bias) {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
+  }
+  pipeline.push_back(*bwdWgt_);
+
+  if (bwdDataPD == nullptr) {
+    return;
+  }
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(*bwdDataPD, *out, *wgtVal_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
new file mode 100644
index 0000000000..283dc9b540
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::inner_product_forward fc_fwd;
+typedef mkldnn::inner_product_backward_weights fc_bwdWgt;
+typedef mkldnn::inner_product_backward_data fc_bwdData;
+
+/**
+ * @brief A subclass of MKLDNNLayer: fully connected (fc) layer.
+ *
+ * The config file api is mkldnn_fc
+ */
+class MKLDNNFcLayer : public MKLDNNLayer {
+protected:
+  // input layer size, cannot be changed after init
+  size_t iLayerSize_;  // == ic * ih * iw
+
+  // whether the weight has already been initialized
+  bool hasInitedWgt_;
+
+  // save the forward primitive_desc, which can be reused in backward
+  std::shared_ptr<fc_fwd::primitive_desc> fwdPD_;
+
+  // fc weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNFcLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false) {}
+
+  ~MKLDNNFcLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void updateWeights(const UpdateCallback& callback) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr wgt,
+                  MKLDNNMatrixPtr bias,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in,
+                       MKLDNNMatrixPtr& wgt,
+                       MKLDNNMatrixPtr& bias,
+                       MKLDNNMatrixPtr& out);
+  void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
+                     MKLDNNMatrixPtr& wgt,
+                     MKLDNNMatrixPtr& bias,
+                     MKLDNNMatrixPtr& out);
+  void resetBwdDataPD(std::shared_ptr<fc_bwdData::primitive_desc>& pd,
+                      MKLDNNMatrixPtr& in,
+                      MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<fc_bwdWgt::primitive_desc>& bwdWgtPD,
+                        std::shared_ptr<fc_bwdData::primitive_desc>& bwdDataPD,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& wgt,
+                        MKLDNNMatrixPtr& bias,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
new file mode 100644
index 0000000000..ac217f1363
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
@@ -0,0 +1,163 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLRNLayer.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  localSize_ = conf.size();
+  alpha_ = conf.scale();
+  beta_ = conf.pow();
+
+  ic_ = conf.channels();
+  oc_ = ic_;
+  iw_ = conf.img_size();
+  ow_ = conf.output_x();
+  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  CHECK_EQ(iw_, ow_);
+  CHECK_EQ(ih_, oh_);
+  return true;
+}
+
+void MKLDNNLRNLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc cannot be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel cannot be changed";
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  std::shared_ptr<lrn_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  resetOutValue(out, in->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr in,
+                                MKLDNNMatrixPtr out) {
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = lrn_fwd::desc(pk,
+                               algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
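+  // (training keeps intermediate results for the backward pass;
+  // inference, i.e. forward_scoring, can skip it)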
+  workspace_ =
+      passType_ != PASS_TEST
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNLRNLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               out->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNLRNLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  CHECK(workspace_);
+  bwdData_ = std::make_shared<lrn_bwd>(
+      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h
new file mode 100644
index 0000000000..cfe5621252
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer: LRN (Local Response Normalization) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, LRN needs a workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+
+public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNLRNLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
new file mode 100644
index 0000000000..2d0fff608c
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -0,0 +1,304 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+bool MKLDNNLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn. "
+                          << "Please set WITH_MKL=ON "
+                          << "and set use_mkldnn=True";
+  CHECK(!useGpu_) << "GPU is not supported yet";
+
+  // set device id before Layer::init
+  setDevice(MKLDNN_DEVICE);
+  // change param device to MKLDNN device
+  setParamsDevice(MKLDNN_DEVICE, parameterMap);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setOutputMap();
+  checkCPUOutputsNumber();
+
+  stream_.reset(new MKLDNNStream());
+  engine_ = CPUEngine::Instance().getEngine();
+  return true;
+}
+
+void MKLDNNLayer::forward(PassType passType) {
+  passType_ = passType;
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    CHECK(!inputLayers_.empty());
+    copySeqInfoToOutputs();
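+    // rebuild the whole forward pipeline only when the condition changes;
+    // by default this is the element count of the first input,
+    // see keepCondition()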
+    if (condition_ != keepCondition()) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+      condition_ = keepCondition();
+      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+      printSizeInfo();
+      // the output_.value and output_.grad are shared with CPU device
+      shareCPUDevice();
+      pipelineFwd_.clear();
+      inVals_.resize(inputLayers_.size(), nullptr);
+      extInVals_.resize(inputLayers_.size(), nullptr);
+      cvtInVals_.resize(inputLayers_.size(), nullptr);
+      resetFwd(pipelineFwd_, inVals_, outVal_);
+      prepareValueConversions(pipelineFwd_);
+      convertWeightsFromPaddle();
+      printValueFormat();
+      needResetBwd_ = true;
+    }
+
+    if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
+      // Update input value data when input layer is "data" type,
+      // since the input value data address might be changed.
+      CHECK(extInVals_[0]);
+      extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
+    }
+
+    if (!outputOnlyMKLDNN_) {
+      clearGrads();
+    }
+    stream_->submit(pipelineFwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNLayer::backward(const UpdateCallback& callback) {
+  if (needResetBwd_) {
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+    pipelineBwd_.clear();
+    inGrads_.resize(inputLayers_.size(), nullptr);
+    extInGrads_.resize(inputLayers_.size(), nullptr);
+    cvtInGrads_.resize(inputLayers_.size(), nullptr);
+    pipelineMergeGrad_.clear();
+    mergeGrad_ = nullptr;
+    resetBwd(pipelineBwd_, inGrads_, outGrad_);
+    prepareGradConversions(pipelineBwd_);
+    printGradFormat();
+    needResetBwd_ = false;
+  }
+
+  // grads must be merged before the backward activation
+  if (mergeGrad_) {
+    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+    stream_->submit(pipelineMergeGrad_);
+  }
+  {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    stream_->submit(pipelineBwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    updateWeights(callback);
+  }
+}
+
+void MKLDNNLayer::reshapeInput(int& batchsize,
+                               int& height,
+                               int& width,
+                               size_t idx) {
+  const Argument& input = inputLayers_[idx]->getOutput();
+  batchsize = input.getBatchSize();
+  int h = input.getFrameHeight();
+  int w = input.getFrameWidth();
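+  // frame height/width may be 0 for non-image input;
+  // keep the previous size in that case, and fall back to 1 if still unset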
+  if (h != 0) {
+    height = h;
+  }
+  if (w != 0) {
+    width = w;
+  }
+  height = height != 0 ? height : 1;
+  width = width != 0 ? width : 1;
+}
+
+void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
+  output_.setFrameHeight(height);
+  output_.setFrameWidth(width);
+  for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+    outputOtherDevice_[i].setFrameHeight(height);
+    outputOtherDevice_[i].setFrameWidth(width);
+  }
+}
+
+void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                                  const MatrixPtr& mat,
+                                  memory::primitive_desc pd) {
+  dnn = nullptr;
+  if (mat == nullptr) {
+    return;
+  }
+  dnn = MKLDNNMatrix::create(pd, mat);
+}
+
+void MKLDNNLayer::resetInValue(
+    MKLDNNMatrixPtr& in,
+    const std::shared_ptr<memory::primitive_desc>& intPD,
+    size_t idx,
+    int inputChannel) {
+  cvtInVals_[idx] = nullptr;
+  extInVals_[idx] = nullptr;
+  in = nullptr;
+  inputChannel = inputChannel == 0 ? ic_ : inputChannel;
+  CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
+  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+      {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
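+  // if the previous layer is also an mkldnn layer, its output value is
+  // already an MKLDNNMatrix; otherwise wrap the plain cpu matrix as nchw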
+  extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
+  if (extInVals_[idx] == nullptr ||
+      extInVals_[idx]->getFormat() == format::nc) {
+    extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
+  }
+  in = extInVals_[idx];
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;
+  }
+  // need create reorder
+  in = MKLDNNMatrix::create(*intPD);
+  cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
+  CHECK(cvtInVals_[idx]) << "should not be empty";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD,
+                              size_t idx) {
+  cvtInGrads_[idx] = nullptr;
+  extInGrads_[idx] = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[idx];
+  if (input->getOutputGrad() == nullptr) {
+    // input grad is not needed
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "the input must be an MKLDNN layer or have only one output layer";
+  // when the input is an mkldnn branch node,
+  // this layer will save the input grad to an internal buffer,
+  // and the mkldnn input layer will merge them into the actual
+  // prev->output_.grad
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
+  if (inputIsOnlyMKLDNN()) {
+    return;
+  }
+
+  extInGrads_[idx] = in;
+  if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extInVals_[idx] != nullptr &&
+        isPaddleFormat(extInVals_[idx]->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrads_[idx] =
+      MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
+  CHECK(cvtInGrads_[idx]);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);
+  if (outputIsOnlyMKLDNN()) {
+    return;
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "mixing with the cpu device is not supported";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;
+  }
+  // need create reorder
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK_PRIMITIVE_DESC_EQ(outVal_, intPD);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // do not merge when the outputs are not all MKLDNN or there is only one
+    return;
+  }
+  CHECK(out) << "should have reset internal ouput grad";
+  std::vector<float> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    auto dstDims = out->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
+                      << ", format " << src->getFormat();
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+
+  auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *out));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
new file mode 100644
index 0000000000..3ba39f18b6
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -0,0 +1,477 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "MKLDNNBase.h"
+#include "mkldnn.hpp"
+#include "paddle/math/MKLDNNMatrix.h"
+#include "paddle/utils/Stat.h"
+
+DECLARE_bool(use_mkldnn);
+
+namespace paddle {
+
+class MKLDNNLayer;
+typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
+
+/**
+ * @brief Base class of MKLDNN layers.
+ */
+class MKLDNNLayer : public Layer {
+protected:
+  // batch size
+  int bs_;
+  // their sizes are always from the first input layer
+  // input image channel, height and width
+  int ic_, ih_, iw_;
+  // output image channel, height and width
+  int oc_, oh_, ow_;
+
+  // the condition under which forward needs to be reset
+  size_t condition_;
+  // backward also needs a reset after the forward handles are reset
+  bool needResetBwd_;
+
+  // whether the output is only on mkldnn
+  bool outputOnlyMKLDNN_;
+
+  // mkldnn engine, stream and primitives
+  mkldnn::engine engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwdWgt_;
+  std::shared_ptr<mkldnn::primitive> bwdData_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+  /* Value and grad are separated into internal and external buffers.
+   * Each MKLDNNLayer must init or reset at least the internal buffer,
+   * and the external buffer format is always nchw or nc (when h == w == 1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data
+   * when mixed with a cpu device.
+   * When all layers are mkldnn layers, they can save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
+  std::vector<MKLDNNMatrixPtr> inVals_;
+  std::vector<MKLDNNMatrixPtr> inGrads_;
+  MKLDNNMatrixPtr outVal_;
+  MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  std::vector<MKLDNNMatrixPtr> extInVals_;
+  std::vector<MKLDNNMatrixPtr> extInGrads_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
+  std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
+  MKLDNNMatrixPtr wgtVal_;
+  MKLDNNMatrixPtr wgtGrad_;
+  MKLDNNMatrixPtr biasVal_;
+  MKLDNNMatrixPtr biasGrad_;
+
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // temporary input argument to save the input grad; only used to merge grads
+  Argument tmpInArg_;
+
+public:
+  explicit MKLDNNLayer(const LayerConfig& config)
+      : Layer(config),
+        ih_(0),
+        iw_(0),
+        condition_(0),
+        needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
+        engine_(mkldnn::engine::cpu, 0),
+        stream_(nullptr),
+        fwd_(nullptr),
+        bwdWgt_(nullptr),
+        bwdData_(nullptr) {}
+
+  ~MKLDNNLayer() {}
+
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  virtual void forward(PassType passType);
+  virtual void backward(const UpdateCallback& callback);
+
+  /**
+   * reshape the input and output channels and image sizes
+   * and reset output buffer size
+   */
+  virtual void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
+
+  /**
+   * reset the mkldnn forward primitive and memories;
+   * would only be called when the input size changes.
+   * weight and bias buffers should be covered by the child class itself
+   */
+  virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * reset the mkldnn backward primitive and memories;
+   * would only be called when needed.
+   * weight and bias buffers should be covered by the child class itself
+   */
+  virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                        std::vector<MKLDNNMatrixPtr>& inputs,
+                        MKLDNNMatrixPtr& out) = 0;
+
+  /**
+   * Update weights and biases if necessary.
+   */
+  virtual void updateWeights(const UpdateCallback& callback) {}
+
+  /**
+   * convert the weight from paddle format to mkldnn format;
+   * weight_ will be overwritten
+   */
+  virtual void convertWeightsFromPaddle() {}
+
+  /**
+   * convert mkldnn weight to paddle format
+   * weight_ will be override
+   */
+  virtual void convertWeightsToPaddle() {}
+
+  /**
+   * add this interface as public for unit test
+   */
+  void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
+
+protected:
+  /**
+   * Some layers may have a different condition for resetting the forward pass.
+   * The function returns the condition under which forward need not be reset.
+   */
+  inline virtual size_t keepCondition() {
+    // reset when the element count of the first input changes, not only the batch size
+    return inputLayers_[0]->getOutputValue()->getElementCnt();
+  }
+
+  /**
+   * reshape the input image sizes and input batchsize
+   */
+  void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
+
+  /**
+   * reshape output image sizes
+   */
+  void reshapeOutput(size_t height, size_t width);
+
+  /**
+   * reset the MKLDNNMatrix from a Matrix and an internal primitive desc;
+   * reset to nullptr if the matrix is empty
+   */
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd);
+
+  /**
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   * input channel may be different in concat.
+   */
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
+      size_t idx = 0,
+      int inputChannel = 0);
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in,
+                   mkldnn::memory::primitive_desc intPD,
+                   size_t idx = 0);
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad: when this layer has several outputs,
+   *       it cannot be mixed with a cpu device,
+   *       since it cannot get a memory desc from the cpu device.
+   */
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: grads mixed with a cpu device are not supported,
+   *       since it cannot get a memory desc from the cpu device.
+   */
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
+
+protected:
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
+
+  /**
+   * check the format is nchw or nc,
+   * which is supported by Paddle default memory layout
+   */
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Whether the input only has the MKLDNN device;
+   * otherwise, the previous layer is only supported on the CPU device.
+   */
+  bool inputIsOnlyMKLDNN(int index = 0) {
+    int prevDevice = getPrev(index)->getDeviceId();
+    if (prevDevice == MKLDNN_DEVICE) {
+      return true;
+    } else {
+      CHECK_EQ(prevDevice, CPU_DEVICE) << "Only CPU is supported yet";
+      return false;
+    }
+  }
+
+  /**
+   * Whether the output only has the MKLDNN device;
+   * otherwise, the other devices should only use the CPU device.
+   */
+  bool outputIsOnlyMKLDNN() {
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
+          << "Only support other device is CPU yet";
+    }
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
+  }
+
+  /**
+   * print info about sizes
+   */
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  /**
+   * print the mkldnn memory format of value
+   */
+  virtual void printValueFormat() {
+    for (size_t i = 0; i < inVals_.size(); ++i) {
+      if (!inVals_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
+                                                  : inVals_[i]->getFormat())
+                        << " >>> " << inVals_[i]->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
+                        << (extOutVal_ ? extOutVal_->getFormat()
+                                       : outVal_->getFormat());
+    }
+    if (wgtVal_) {
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
+    }
+  }
+
+  /**
+   * print the mkldnn memory format of grad
+   */
+  virtual void printGradFormat() {
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
+                        << (extOutGrad_ ? extOutGrad_->getFormat()
+                                        : outGrad_->getFormat());
+    }
+    for (size_t i = 0; i < inGrads_.size(); ++i) {
+      if (!inGrads_[i]) {
+        continue;
+      }
+      VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
+                        << ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
+                                                   : inGrads_[i]->getFormat())
+                        << " <<< " << inGrads_[i]->getFormat() << " <<<";
+    }
+    if (wgtGrad_) {
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
+    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
+    }
+  }
+
+private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    if (output_.grad) {
+      output_.grad->zeroMem();
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].grad) {
+        outputOtherDevice_[i].grad->zeroMem();
+      }
+    }
+  }
+
+  /**
+   * Set deviceId of the params used in this layer.
+   */
+  void setParamsDevice(int id, const ParameterMap& parameterMap) {
+    for (auto& inputConfig : config_.inputs()) {
+      if (inputConfig.has_input_parameter_name()) {
+        ParameterPtr parameter;
+        std::string name = inputConfig.input_parameter_name();
+        CHECK(mapGet(name, parameterMap, &parameter))
+            << "Cannot find input parameter " << name << " for layer "
+            << getName();
+        parameter->setDevice(id);
+      }
+    }
+    if (config_.has_bias_parameter_name()) {
+      ParameterPtr parameter;
+      std::string name = config_.bias_parameter_name();
+      CHECK(mapGet(name, parameterMap, &parameter))
+          << "Cannot find bias parameter " << name << " for layer "
+          << getName();
+      parameter->setDevice(id);
+    }
+  }
+
+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (size_t i = 0; i < inputLayers_.size(); ++i) {
+      inputLayers_[i]->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
+  /**
+   * if there is a cpu device, share the value and grad data with output_
+   */
+  void shareCPUDevice() {
+    if (outputIsOnlyMKLDNN()) {
+      return;
+    }
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].value = output_.value;
+      outputOtherDevice_[i].grad = output_.grad;
+    }
+  }
+
+  /**
+   * Check the number of cpu devices in outputOtherDevice_;
+   * there should be at most one.
+   */
+  void checkCPUOutputsNumber(int max = 1) {
+    int cnt = 0;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      if (outputOtherDevice_[i].deviceId == CPU_DEVICE) {
+        ++cnt;
+      }
+    }
+    CHECK_LE(cnt, max) << "too many CPU devices";
+  }
+
+  /**
+   * copy SeqInfo from input layer to this output and other output devices.
+   * @note: do not use getInput(0) since it uses this deviceId_,
+   *        use "inputLayers_[0]->getOutput()" instead.
+   */
+  void copySeqInfoToOutputs() {
+    if (inputLayers_.empty() || !needSequenceInfo_) {
+      return;
+    }
+    const Argument& input = inputLayers_[0]->getOutput();
+    output_.sequenceStartPositions = input.sequenceStartPositions;
+    output_.subSequenceStartPositions = input.subSequenceStartPositions;
+    output_.cpuSequenceDims = input.cpuSequenceDims;
+    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
+      outputOtherDevice_[i].sequenceStartPositions =
+          output_.sequenceStartPositions;
+      outputOtherDevice_[i].subSequenceStartPositions =
+          output_.subSequenceStartPositions;
+      outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
+    }
+  }
+
+  void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // the MKLDNNLayer output value should be an MKLDNNMatrix,
+    // so an external output value is necessary.
+    // An external input value is not necessary,
+    // since the input may be an mkldnn internal buffer.
+    CHECK(extOutVal_) << "external output value is necessary";
+    output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+    CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
+    for (size_t i = 0; i < cvtInVals_.size(); ++i) {
+      if (cvtInVals_[i]) {
+        pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
+      }
+    }
+    if (cvtOutVal_) {
+      pipeline.push_back(*cvtOutVal_);
+    }
+  }
+  void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
+    // the external output grad is not necessary,
+    // since the output may be an mkldnn internal buffer or merged directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {
+      pipeline.insert(pipeline.begin(), *cvtOutGrad_);
+    }
+    for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
+      if (cvtInGrads_[i]) {
+        pipeline.push_back(*cvtInGrads_[i]);
+      }
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
new file mode 100644
index 0000000000..a8252593c8
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -0,0 +1,195 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNPoolLayer.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_pool, MKLDNNPoolLayer);
+
+bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  /* the size of inputs for pool-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  ic_ = conf.channels();
+  ih_ = conf.img_size_y();
+  iw_ = conf.img_size();
+  oc_ = ic_;
+  oh_ = conf.output_y();
+  ow_ = conf.output_x();
+  fh_ = conf.size_y();
+  fw_ = conf.size_x();
+  ph_ = conf.padding_y();
+  pw_ = conf.padding();
+  sh_ = conf.stride_y();
+  sw_ = conf.stride();
+
+  const std::string& type = conf.pool_type();
+  if (type == "max-projection") {
+    poolAlgo_ = algorithm::pooling_max;
+  } else if (type == "avg-projection") {
+    // paddle only uses exclude_padding
+    poolAlgo_ = algorithm::pooling_avg_exclude_padding;
+  } else {
+    LOG(FATAL) << "unknow pooling type!";
+  }
+  return true;
+}
+
+void MKLDNNPoolLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc cannot be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel cannot be changed";
+
+  // calculate the output sizes
+  // paddle uses caffeMode = false for pooling
+  oh = outputSize(ih, fh_, ph_, sh_, false);
+  ow = outputSize(iw, fw_, pw_, sw_, false);
+  reshapeOutput(oh, ow);
+
+  resizeOutput(bs, oc * oh * ow);
+}
+
+void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+
+  resetFwdPD(fwdPD_, inputs[0], out);
+
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+
+void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
+                               MKLDNNMatrixPtr& out) {
+  std::shared_ptr<pool_bwd::primitive_desc> pd;
+
+  resetBwdBuffers(inputs[0], out);
+
+  resetBwdPD(pd, inputs[0], out);
+
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+
+void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+
+  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
+}
+
+void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr in,
+                                 MKLDNNMatrixPtr out) {
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  padding_kind padKind = padding_kind::zero;
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = pool_fwd::desc(pk,
+                                poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padKind);
+  pd.reset(new pool_fwd::primitive_desc(fwdDesc, engine_));
+
+  // prepare workspace if necessary
+  workspace_ =
+      (passType_ != PASS_TEST && poolAlgo_ == algorithm::pooling_max)
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+
+void MKLDNNPoolLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
+             : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+
+void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                      MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+
+void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                                 MKLDNNMatrixPtr& in,
+                                 MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  memory::dims kernels = memory::dims{fh_, fw_};
+  memory::dims strides = memory::dims{sh_, sw_};
+  memory::dims padL = memory::dims{ph_, pw_};
+  memory::dims padR = getPaddingR();
+  CHECK(out);
+  auto bwdDesc = pool_bwd::desc(poolAlgo_,
+                                in->getMemoryDesc(),
+                                out->getMemoryDesc(),
+                                strides,
+                                kernels,
+                                padL,
+                                padR,
+                                padding_kind::zero);
+  pd.reset(new pool_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+
+void MKLDNNPoolLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<pool_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+
+  bwdData_ =
+      workspace_
+          ? std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *workspace_, *in))
+          : std::make_shared<pool_bwd>(pool_bwd(*pd, *out, *in));
+  pipeline.push_back(*bwdData_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
new file mode 100644
index 0000000000..dad60156f0
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+typedef mkldnn::pooling_forward pool_fwd;
+typedef mkldnn::pooling_backward pool_bwd;
+
+/**
+ * @brief A subclass of MKLDNNLayer: pooling layer.
+ *
+ * The config file api is mkldnn_pool
+ */
+class MKLDNNPoolLayer : public MKLDNNLayer {
+protected:
+  // padding height and width
+  int ph_, pw_;
+  // stride height and width
+  int sh_, sw_;
+  // filter (kernel) height and width
+  int fh_, fw_;
+
+  // pooling_avg or pooling_max
+  mkldnn::algorithm poolAlgo_;
+
+  // save the forward primitive_desc, which can be reused in backward
+  std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_pooling_forward.cpp, pooling needs a workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+
+public:
+  explicit MKLDNNPoolLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+
+  ~MKLDNNPoolLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+
+  void printSizeInfo() override {
+    MKLDNNLayer::printSizeInfo();
+    VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
+                       << ": ph: " << ph_ << ", pw: " << pw_ << ", sh: " << sh_
+                       << ", sw: " << sw_;
+  }
+
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<pool_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+
+  /**
+   * get padding_r according to
+   * https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+   * test_pooling_forward.cpp
+   */
+  mkldnn::memory::dims getPaddingR() const {
+    mkldnn::memory::dims padR = {ph_, pw_};
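+    // grow the right/bottom padding until mkldnn's floor-based output
+    // size formula reaches the output size paddle expects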
+    for (int i = 0; i < 2; ++i) {
+      if ((ih_ + ph_ + padR[0] - fh_) / sh_ + 1 < oh_) {
+        ++padR[0];
+      }
+      if ((iw_ + pw_ + padR[1] - fw_) / sw_ + 1 < ow_) {
+        ++padR[1];
+      }
+    }
+    return padR;
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
new file mode 100644
index 0000000000..dd75555fae
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.cpp
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLPackedRecurrentLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(mkl_packed_recurrent, MKLPackedRecurrentLayer);
+
+bool MKLPackedRecurrentLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+  if (!RecurrentLayer::init(layerMap, parameterMap)) return false;
+  packed_weight_.reset(new MKLPackedWeight(weight_->getW()));
+  packed_weight_->pack();
+  if (needGradient_) {
+    packed_weightT_.reset(new MKLPackedWeight(weight_->getW(), true));
+    packed_weightT_->pack();
+  }
+  return true;
+}
+
+void MKLPackedRecurrentLayer::backward(const UpdateCallback& callback) {
+  RecurrentLayer::backward(callback);
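+  // the weights may have just been updated via the callback,
+  // so re-pack them for the next forward pass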
+  packed_weight_->pack();
+  if (needGradient_) {
+    packed_weightT_->pack();
+  }
+}
+
+void MKLPackedRecurrentLayer::forwardBatch(int batchSize,
+                                           size_t numSequences,
+                                           const int* starts) {
+  if (!batchValue_) {
+    batchValue_.reset(new SequenceToBatch(useGpu_));
+  }
+
+  batchValue_->resizeOrCreateBatch(batchSize, numSequences, starts, reversed_);
+
+  batchValue_->copyFromSeq(*output_.value);
+
+  {
+    REGISTER_TIMER_INFO("RecurrentFwBatch", getName().c_str());
+    /* forward one batch */
+    for (size_t n = 0; n < batchValue_->getNumBatch(); n++) {
+      MatrixPtr batchValue = batchValue_->getBatchValue(n);
+
+      if (n != 0) {
+        MatrixPtr preBatchValue =
+            batchValue_->getBatchValue(n - 1, batchValue->getHeight());
+
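+        // batchValue += preBatchValue * W, using the pre-packed weight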
+        packed_weight_->gemm_compute(preBatchValue, batchValue);
+      }
+      Argument arg;
+      arg.value = batchValue;
+      activation_->forward(arg).check();
+    }
+  }
+  batchValue_->copyBackSeq(*output_.value);
+}
+
+void MKLPackedRecurrentLayer::backwardBatch(int batchSize,
+                                            size_t numSequences,
+                                            const int* starts) {
+  if (!batchGrad_) {
+    batchGrad_.reset(new SequenceToBatch(useGpu_));
+  }
+  batchGrad_->shareIndexWith(*batchValue_);
+
+  size_t numBatch = batchGrad_->getNumBatch();
+  bool backwardByBatch = numBatch < numSequences;
+
+  batchGrad_->copyFromSeq(*output_.grad);
+  {
+    REGISTER_TIMER_INFO("RecurrentBwData", getName().c_str());
+    /* backward one batch */
+    for (int n = (int)numBatch - 1; n >= 0; n--) {
+      MatrixPtr batchGrad = batchGrad_->getBatchValue(n);
+      MatrixPtr batchValue =
+          batchValue_->getBatchValue(n, batchGrad->getHeight());
+
+      Argument arg;
+      arg.value = batchValue;
+      arg.grad = batchGrad;
+      activation_->backward(arg).check();
+
+      if (n != 0) {
+        batchValue = batchGrad_->getBatchValue(n - 1, batchGrad->getHeight());
+        packed_weightT_->gemm_compute(batchGrad, batchValue);
+      }
+
+      if (backwardByBatch && weight_->getWGrad()) {
+        if (n != 0) {
+          /* backward weight */
+          batchValue =
+              batchValue_->getBatchValue(n - 1, batchGrad->getHeight());
+          weight_->getWGrad()->mul(
+              *batchValue->getTranspose(), *batchGrad, 1, 1);
+        }
+      }
+    }
+  }
+
+  batchGrad_->copyBackSeq(*output_.grad);
+
+  if (!backwardByBatch && weight_->getWGrad()) {
+    REGISTER_TIMER_INFO("RecurrentBwWeight", getName().c_str());
+    for (size_t seq = 0; seq < numSequences; ++seq) {
+      int len = starts[seq + 1] - starts[seq];
+      weight_->getWGrad()->mul(
+          *output_.value
+               ->subMatrix(reversed_ ? starts[seq] + 1 : starts[seq], len - 1)
+               ->getTranspose(),
+          *output_.grad->subMatrix(reversed_ ? starts[seq] : starts[seq] + 1,
+                                   len - 1),
+          1,
+          1);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedRecurrentLayer.h b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
new file mode 100644
index 0000000000..bded523a8f
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedRecurrentLayer.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLPackedWeight.h"
+#include "RecurrentLayer.h"
+
+DECLARE_bool(rnn_use_batch);
+
+namespace paddle {
+
+/**
+ * @brief MKLPackedRecurrentLayer is almost the same as RecurrentLayer,
+ * but is optimized with MKL cblas packed gemm.
+ * More details:
+ * https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/mkl/mkl_packed.md
+ */
+
+class MKLPackedRecurrentLayer : public RecurrentLayer {
+public:
+  explicit MKLPackedRecurrentLayer(const LayerConfig& config)
+      : RecurrentLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  void forwardBatch(int batchSize,
+                    size_t numSequences,
+                    const int* starts) override;
+
+  void backwardBatch(int batchSize,
+                     size_t numSequences,
+                     const int* starts) override;
+
+protected:
+  /// packed_weight_ contains the same data as
+  /// RecurrentLayer::weight_, but in cblas packed format
+  std::unique_ptr<MKLPackedWeight> packed_weight_;
+  /// packed_weightT_ is the packed transpose of the same weight
+  std::unique_ptr<MKLPackedWeight> packed_weightT_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLPackedWeight.h b/paddle/gserver/layers/MKLPackedWeight.h
new file mode 100644
index 0000000000..15d5093beb
--- /dev/null
+++ b/paddle/gserver/layers/MKLPackedWeight.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/MathFunctions.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
+
+namespace paddle {
+
+class MKLPackedWeight {
+protected:
+  /// Pointer to the weight data
+  real *weight_;
+  /// Pointer to the cblas-packed copy of the weight
+  real *packedWeight_;
+  size_t height_;
+  size_t width_;
+  bool transW_;
+
+public:
+  explicit MKLPackedWeight(MatrixPtr weight, bool transW = false) {
+    packedWeight_ = nullptr;
+    weight_ = weight->getData();
+    height_ = weight->getHeight();
+    width_ = weight->getWidth();
+    transW_ = transW;
+  }
+
+  ~MKLPackedWeight() { free_(); }
+
+  void pack() { pack_(weight_); }
+
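+  /// Computes dst = src * W + dst, where W is the weight packed by pack()
+  /// (W is used transposed when transW_ is true)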
+  void gemm_compute(const MatrixPtr src, MatrixPtr dst) {
+    cblas_sgemm_compute(CblasRowMajor,
+                        CblasNoTrans,
+                        CblasPacked,
+                        src->getHeight(),
+                        transW_ ? height_ : width_,
+                        transW_ ? width_ : height_,
+                        src->getData(),
+                        src->getWidth(),
+                        packedWeight_,
+                        width_,
+                        1.0,
+                        dst->getData(),
+                        dst->getWidth());
+  }
+
+protected:
+  void pack_(real *src) {
+    if (!packedWeight_) {
+      packedWeight_ = cblas_sgemm_alloc(CblasBMatrix, 1, width_, height_);
+    }
+    cblas_sgemm_pack(CblasRowMajor,
+                     CblasBMatrix,
+                     transW_ ? CblasTrans : CblasNoTrans,
+                     1,
+                     transW_ ? height_ : width_,
+                     transW_ ? width_ : height_,
+                     1.0,
+                     src,
+                     width_,
+                     packedWeight_);
+  }
+
+  void free_() {
+    if (packedWeight_) {
+      cblas_sgemm_free(packedWeight_);
+    }
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index baa58ca2d7..fa536fce2b 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -26,6 +26,10 @@ namespace paddle {
  * If SequenceLevel = kNonSeq:
  *    Output: output size is the number of input sequences (NOT input instances)
  *    output[i] = max_{for each instance in this sequence}{input[i]}
+ *    If stride_ > 0:
+ *      Output: a shortened sequence. Stride is the step size by which we slide a
+ *              window over the input sequence, and the max pooling operation is
+ *              then applied to each interval independently.
  * If SequenceLevel = kSeq:
  *    Check that the input sequence contains sub-sequences
  *    Output: output size is the number of input sub-sequences
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
new file mode 100644
index 0000000000..d810a58d9a
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MaxPoolWithMaskLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+bool MaxPoolWithMaskLayer::init(const LayerMap& layerMap,
+                                const ParameterMap& parameterMap) {
+  PoolLayer::init(layerMap, parameterMap);
+  setOutput("mask", &mask_);
+  return true;
+}
+
+size_t MaxPoolWithMaskLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t layerSize = 0;
+
+  outputY_ = outputSize(imgSizeY_,
+                        sizeY_,
+                        confPaddingY_,
+                        strideY_,
+                        /* caffeMode */ false);
+  outputX_ = outputSize(imgSize_,
+                        sizeX_,
+                        confPadding_,
+                        stride_,
+                        /* caffeMode */ false);
+
+  layerSize = outputX_ * outputY_ * channels_;
+  getOutput().setFrameHeight(outputY_);
+  getOutput().setFrameWidth(outputX_);
+
+  return layerSize;
+}
+
+void MaxPoolWithMaskLayer::forward(PassType passType) {
+  size_t size = getSize();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  int batchSize = inputV->getHeight();
+  resetOutput(batchSize, size);
+
+  MatrixPtr outV = getOutputValue();
+  CHECK_EQ(size, outV->getWidth());
+
+  resetSpecifyOutput(mask_,
+                     batchSize,
+                     size,
+                     /* isValueClean */ false,
+                     /* isGradClean */ true);
+
+  MatrixPtr maskV = mask_.value;
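+  // maskV records, for each output element, the index of the input
+  // element that produced the maximum value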
+  outV->maxPoolForward(*inputV,
+                       imgSizeY_,
+                       imgSize_,
+                       channels_,
+                       sizeX_,
+                       sizeY_,
+                       strideY_,
+                       stride_,
+                       outputY_,
+                       outputX_,
+                       confPaddingY_,
+                       confPadding_,
+                       maskV);
+}
+
+void MaxPoolWithMaskLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (NULL == getInputGrad(0)) {
+    return;
+  }
+
+  MatrixPtr outGrad = getOutputGrad();
+  MatrixPtr inputV = inputLayers_[0]->getOutputValue();
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr inputGrad = inputLayers_[0]->getOutputGrad();
+
+  inputGrad->maxPoolBackward(*inputV,
+                             imgSizeY_,
+                             imgSize_,
+                             *outGrad,
+                             *outV,
+                             sizeX_,
+                             sizeY_,
+                             strideY_,
+                             stride_,
+                             outputY_,
+                             outputX_,
+                             1,
+                             1,
+                             confPaddingY_,
+                             confPadding_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MaxPoolWithMaskLayer.h b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
new file mode 100644
index 0000000000..e0174add9d
--- /dev/null
+++ b/paddle/gserver/layers/MaxPoolWithMaskLayer.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "PoolLayer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * @brief Max pooling layer that additionally outputs a mask recording
+ * the locations of the maximum values
+ */
+class MaxPoolWithMaskLayer : public PoolLayer {
+protected:
+  Argument mask_;
+
+public:
+  explicit MaxPoolWithMaskLayer(const LayerConfig& config)
+      : PoolLayer(config) {}
+
+  size_t getSize();
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.cpp b/paddle/gserver/layers/MultiBoxLossLayer.cpp
new file mode 100644
index 0000000000..bbf1166dce
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
@@ -0,0 +1,376 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MultiBoxLossLayer.h"
+#include <float.h>
+#include <vector>
+#include "DataLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(multibox_loss, MultiBoxLossLayer);
+
+bool MultiBoxLossLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  numClasses_ = layerConf.num_classes();
+  inputNum_ = layerConf.input_num();
+  overlapThreshold_ = layerConf.overlap_threshold();
+  negPosRatio_ = layerConf.neg_pos_ratio();
+  negOverlap_ = layerConf.neg_overlap();
+  backgroundId_ = layerConf.background_id();
+  return true;
+}
+
+void MultiBoxLossLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  resetOutput(batchSize, 1);
+
+  // total size of all location data and confidence score data
+  locSizeSum_ = 0;
+  confSizeSum_ = 0;
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    locSizeSum_ += inLoc->getElementCnt();
+    confSizeSum_ += inConf->getElementCnt();
+  }
+
+  // locBuffer layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin2 ......
+  Matrix::resizeOrCreate(locTmpBuffer_, 1, locSizeSum_, false, useGpu_);
+  locBuffer_ = locTmpBuffer_;
+
+  // confBuffer layout:
+  // | class1 score | class2 score | ... |classN score | class1 score | ......
+  Matrix::resizeOrCreate(confTmpBuffer_, 1, confSizeSum_, false, useGpu_);
+  confBuffer_ = confTmpBuffer_;
+
+  // concatenate location data and confidence score data
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto& layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLoc = getInputValue(*getLocInputLayer(n));
+    const MatrixPtr inConf = getInputValue(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+    locOffset += appendWithPermute(*inLoc,
+                                   height,
+                                   width,
+                                   locSizeSum_,
+                                   locOffset,
+                                   batchSize,
+                                   *locBuffer_,
+                                   kNCHWToNHWC);
+    confOffset += appendWithPermute(*inConf,
+                                    height,
+                                    width,
+                                    confSizeSum_,
+                                    confOffset,
+                                    batchSize,
+                                    *confBuffer_,
+                                    kNCHWToNHWC);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+
+  // priorValue layout:
+  // | xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var
+  // | xmin2 | ......
+  MatrixPtr priorValue;
+
+  // labelValue layout:
+  // | class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ......
+  MatrixPtr labelValue;
+
+  // Copy data from GPU to CPU when using GPU
+  if (useGpu_) {
+    Matrix::resizeOrCreate(locCpuBuffer_, 1, locSizeSum_, false, false);
+    Matrix::resizeOrCreate(confCpuBuffer_, 1, confSizeSum_, false, false);
+    MatrixPtr priorTmpValue = getInputValue(*getPriorBoxLayer());
+    Matrix::resizeOrCreate(
+        priorCpuValue_, 1, priorTmpValue->getElementCnt(), false, false);
+    MatrixPtr labelTmpValue = getInputValue(*getLabelLayer());
+    Matrix::resizeOrCreate(labelCpuValue_,
+                           labelTmpValue->getHeight(),
+                           labelTmpValue->getWidth(),
+                           false,
+                           false);
+
+    locCpuBuffer_->copyFrom(*locTmpBuffer_);
+    confCpuBuffer_->copyFrom(*confTmpBuffer_);
+    priorCpuValue_->copyFrom(*priorTmpValue);
+    labelCpuValue_->copyFrom(*labelTmpValue);
+
+    locBuffer_ = locCpuBuffer_;
+    confBuffer_ = confCpuBuffer_;
+    priorValue = priorCpuValue_;
+    labelValue = labelCpuValue_;
+  } else {
+    priorValue = getInputValue(*getPriorBoxLayer());
+    labelValue = getInputValue(*getLabelLayer());
+  }
+
+  // Get the max score for each prior bbox; used in negative mining
+  std::vector<std::vector<real>> allMaxConfScore;
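+  // each prior bbox occupies 8 values: 4 coordinates followed by 4 variances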
+  numPriors_ = priorValue->getElementCnt() / 8;
+  getMaxConfidenceScores(confBuffer_->getData(),
+                         batchSize,
+                         numPriors_,
+                         numClasses_,
+                         backgroundId_,
+                         &allMaxConfScore);
+
+  // Match prior bbox to groundtruth bbox
+  Argument label = getInput(*getLabelLayer());
+  const int* labelIndex = label.sequenceStartPositions->getData(false);
+  size_t seqNum = label.getNumSequences();
+  numMatches_ = 0;
+  numNegs_ = 0;
+  allMatchIndices_.clear();
+  allNegIndices_.clear();
+
+  std::pair<size_t, size_t> retPair = generateMatchIndices(*priorValue,
+                                                           numPriors_,
+                                                           *labelValue,
+                                                           labelIndex,
+                                                           seqNum,
+                                                           allMaxConfScore,
+                                                           batchSize,
+                                                           overlapThreshold_,
+                                                           negOverlap_,
+                                                           negPosRatio_,
+                                                           &allMatchIndices_,
+                                                           &allNegIndices_);
+  numMatches_ = retPair.first;
+  numNegs_ = retPair.second;
+
+  // BBox location L1 smooth loss
+  locLoss_ = 0.0;
+  if (numMatches_ >= 1) {
+    size_t count = 0;
+    MatrixPtr locLossOutput;
+    Matrix::resizeOrCreate(locLossOutput, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locGTData_, numMatches_ * 4, 1, false, false);
+    Matrix::resizeOrCreate(locDiff_, numMatches_ * 4, 1, false, false);
+    locDiff_->zeroMem();
+    std::vector<real> locGTData;
+
+    real* locDiffData = locDiff_->getData();
+    const real* locBufferData = locBuffer_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;  // match none
+        size_t locOffset =
+            n * (locBuffer_->getElementCnt() / batchSize) + i * 4;
+        std::copy(locBufferData + locOffset,
+                  locBufferData + locOffset + 4,
+                  locDiffData + count);
+        count += 4;
+        const int gtIdx = allMatchIndices_[n][i];
+        size_t priorOffset = i * 8;
+        std::vector<NormalizedBBox> priorBBoxVec;
+        getBBoxFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVec);
+        std::vector<std::vector<real>> priorBBoxVar;
+        getBBoxVarFromPriorData(
+            priorValue->getData() + priorOffset, 1, priorBBoxVar);
+        size_t labelOffset = (labelIndex[n] + gtIdx) * 6;
+        std::vector<NormalizedBBox> gtBBoxVec;
+        getBBoxFromLabelData(labelValue->getData() + labelOffset, 1, gtBBoxVec);
+        std::vector<real> gtEncode;
+        encodeBBoxWithVar(
+            priorBBoxVec[0], priorBBoxVar[0], gtBBoxVec[0], gtEncode);
+        locGTData.insert(locGTData.end(), gtEncode.begin(), gtEncode.end());
+      }
+    }
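+    // smooth L1 loss between the predicted offsets and the encoded
+    // ground-truth offsets, averaged over the number of matches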
+    locGTData_->copyFrom(&locGTData[0], numMatches_ * 4);
+    locLossOutput->smoothL1(*locDiff_, *locGTData_, 0.0);
+    locLoss_ = locLossOutput->getSum() / numMatches_;
+  }
+
+  // BBox confidence softmax loss
+  confLoss_ = 0;
+  numConf_ = numMatches_ + numNegs_;
+  if (numConf_ >= 1) {
+    Matrix::resizeOrCreate(confProb_, numConf_, numClasses_, false, false);
+    IVector::resizeOrCreate(confGTData_, numConf_, false);
+    confProb_->zeroMem();
+    size_t count = 0;
+
+    std::vector<real> confPredData;
+    real* confProbData = confProb_->getData();
+    const real* confBufferData = confBuffer_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        size_t labelOffset = (labelIndex[n] + allMatchIndices_[n][i]) * 6;
+        const int gtLabel = (labelValue->getData() + labelOffset)[0];
+        confGTData_->getData()[count] = gtLabel;
+        size_t confOffset = n * numPriors_ * numClasses_ + i * numClasses_;
+        std::copy(confBufferData + confOffset,
+                  confBufferData + confOffset + numClasses_,
+                  confProbData + count * numClasses_);
+        confPredData.reserve(confPredData.size() + numClasses_);
+        confPredData.insert(confPredData.end(),
+                            confBufferData + confOffset,
+                            confBufferData + confOffset + numClasses_);
+        ++count;
+      }
+      // Negative mining samples
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        confGTData_->getData()[count] = backgroundId_;
+        size_t confOffset =
+            n * numPriors_ * numClasses_ + allNegIndices_[n][i] * numClasses_;
+        std::copy(confBufferData + confOffset,
+                  confBufferData + confOffset + numClasses_,
+                  confProbData + count * numClasses_);
+        confPredData.reserve(confPredData.size() + numClasses_);
+        confPredData.insert(confPredData.end(),
+                            confBufferData + confOffset,
+                            confBufferData + confOffset + numClasses_);
+        ++count;
+      }
+    }
+    CHECK_EQ(numConf_, count);
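+    // softmax over classes followed by cross-entropy against the
+    // ground-truth labels, normalized by the number of matched priors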
+    confProb_->softmax(*confProb_);
+    MatrixPtr confLossOutput;
+    Matrix::resizeOrCreate(confLossOutput, numConf_, 1, false, false);
+    confLossOutput->oneHotCrossEntropy(*confProb_, *confGTData_);
+    confLoss_ = confLossOutput->getSum() / numMatches_;
+  }
+  real loss = locLoss_ + confLoss_;
+  MatrixPtr outV = getOutputValue();
+  outV->assign(loss);
+}
+
+void MultiBoxLossLayer::backward(const UpdateCallback& callback) {
+  size_t batchSize = getInputValue(*getLocInputLayer(0))->getHeight();
+  locBuffer_->zeroMem();
+  confBuffer_->zeroMem();
+
+  // Back propagate on location prediction
+  if (numMatches_ >= 1) {
+    MatrixPtr locDiffBuffer;
+    Matrix::resizeOrCreate(locDiffBuffer, numMatches_ * 4, 1, false, false);
+    locDiffBuffer->smoothL1Bp(*locDiff_, *locGTData_, 0.0);
+    locDiff_->copyFrom(*locDiffBuffer);
+    // scale gradient
+    for (size_t i = 0; i < numMatches_ * 4; ++i)
+      locDiff_->getData()[i] *= (1. / numMatches_);
+    // Copy gradient back
+    size_t count = 0;
+    const real* locDiffData = locDiff_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* locBufferData =
+            locBuffer_->getData() + n * numPriors_ * 4 + i * 4;
+        std::copy(locDiffData + count * 4,
+                  locDiffData + (count + 1) * 4,
+                  locBufferData);
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numMatches_);
+  }
+
+  if (numConf_ >= 1) {
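+    // gradient of softmax cross-entropy: prob - one_hot(label),
+    // scaled by 1 / numMatches_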
+    for (size_t i = 0; i < numConf_; ++i)
+      confProb_->getData()[i * numClasses_ + confGTData_->getData()[i]] -= 1;
+    for (size_t i = 0; i < numConf_ * numClasses_; ++i)
+      confProb_->getData()[i] *= (1. / numMatches_);
+    size_t count = 0;
+    const real* confProbData = confProb_->getData();
+    for (size_t n = 0; n < batchSize; ++n) {
+      for (size_t i = 0; i < numPriors_; ++i) {
+        if (allMatchIndices_[n][i] == -1) continue;
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + i * numClasses_;
+        std::copy(confProbData + count * numClasses_,
+                  confProbData + (count + 1) * numClasses_,
+                  confDiffData);
+        ++count;
+      }
+      for (size_t i = 0; i < allNegIndices_[n].size(); ++i) {
+        int idx = allNegIndices_[n][i];
+        real* confDiffData = confBuffer_->getData() +
+                             n * numPriors_ * numClasses_ + idx * numClasses_;
+        std::copy(confProbData + count * numClasses_,
+                  confProbData + (count + 1) * numClasses_,
+                  confDiffData);
+        ++count;
+      }
+    }
+    CHECK_EQ(count, numConf_);
+  }
+  if (useGpu_) {
+    locTmpBuffer_->copyFrom(*locCpuBuffer_);
+    confTmpBuffer_->copyFrom(*confCpuBuffer_);
+    locBuffer_ = locTmpBuffer_;
+    confBuffer_ = confTmpBuffer_;
+  }
+  // copy back
+  size_t locOffset = 0;
+  size_t confOffset = 0;
+  auto layerConf = config_.inputs(0).multibox_loss_conf();
+  for (size_t n = 0; n < inputNum_; ++n) {
+    const MatrixPtr inLocG = getInputGrad(*getLocInputLayer(n));
+    const MatrixPtr inConfG = getInputGrad(*getConfInputLayer(n));
+    size_t height = getInput(*getLocInputLayer(n)).getFrameHeight();
+    // only for unittest: there is no width/height information when
+    // constructing matrices in the unittest, so the shape must be
+    // taken from the configuration
+    if (!height) height = layerConf.height();
+    size_t width = getInput(*getLocInputLayer(n)).getFrameWidth();
+    if (!width) width = layerConf.width();
+
+    // NHWC to NCHW
+    MatrixPtr locGBuffer;
+    Matrix::resizeOrCreate(
+        locGBuffer, inLocG->getHeight(), inLocG->getWidth(), false, useGpu_);
+    MatrixPtr confGBuffer;
+    Matrix::resizeOrCreate(
+        confGBuffer, inConfG->getHeight(), inConfG->getWidth(), false, useGpu_);
+
+    locOffset += decomposeWithPermute(*locBuffer_,
+                                      height,
+                                      width,
+                                      locSizeSum_,
+                                      locOffset,
+                                      batchSize,
+                                      *locGBuffer,
+                                      kNHWCToNCHW);
+    inLocG->add(*locGBuffer);
+    confOffset += decomposeWithPermute(*confBuffer_,
+                                       height,
+                                       width,
+                                       confSizeSum_,
+                                       confOffset,
+                                       batchSize,
+                                       *confGBuffer,
+                                       kNHWCToNCHW);
+    inConfG->add(*confGBuffer);
+  }
+  CHECK_EQ(locOffset, locSizeSum_ / batchSize);
+  CHECK_EQ(confOffset, confSizeSum_ / batchSize);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
new file mode 100644
index 0000000000..9935da5644
--- /dev/null
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "CostLayer.h"
+#include "DataLayer.h"
+#include "DetectionUtil.h"
+#include "Layer.h"
+
+using std::vector;
+using std::pair;
+
+namespace paddle {
+
+/**
+ * The multibox loss layer for an SSD detection task.
+ * The loss is composed of the location loss and the confidence loss.
+ * The location loss is a smooth L1 loss and the confidence loss is
+ * a softmax loss.
+ * - Input: This layer needs four input layers: the first input layer
+ *          is the priorbox layer and the second is the label layer.
+ *          The remaining two input layers are convolution layers that
+ *          generate the bbox location offsets and classification confidences.
+ * - Output: The Single Shot Multibox Detection loss value.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+
+class MultiBoxLossLayer : public CostLayer {
+public:
+  explicit MultiBoxLossLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+  void forward(PassType passType);
+
+  void backward(const UpdateCallback& callback = nullptr);
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) {}
+
+  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad) {}
+
+protected:
+  inline LayerPtr getPriorBoxLayer() { return inputLayers_[0]; }
+  inline LayerPtr getLabelLayer() { return inputLayers_[1]; }
+  inline LayerPtr getLocInputLayer(size_t index) {
+    return inputLayers_[2 + index];
+  }
+  inline LayerPtr getConfInputLayer(size_t index) {
+    return inputLayers_[2 + inputNum_ + index];
+  }
+
+protected:
+  size_t numClasses_;
+  real overlapThreshold_;
+  real negPosRatio_;
+  real negOverlap_;
+  size_t inputNum_;
+  size_t backgroundId_;
+
+  real locLoss_;
+  real confLoss_;
+
+  size_t numPriors_;
+  size_t numMatches_;
+  size_t numNegs_;
+  size_t numConf_;
+  size_t locSizeSum_;
+  size_t confSizeSum_;
+
+  vector<vector<int>> allMatchIndices_;
+  vector<vector<int>> allNegIndices_;
+  MatrixPtr locGTData_;
+  IVectorPtr confGTData_;
+
+  MatrixPtr locBuffer_;
+  MatrixPtr confBuffer_;
+  MatrixPtr locDiff_;
+  MatrixPtr confProb_;
+
+  MatrixPtr labelCpuValue_;
+  MatrixPtr priorCpuValue_;
+  MatrixPtr locCpuBuffer_;
+  MatrixPtr confCpuBuffer_;
+  MatrixPtr locTmpBuffer_;
+  MatrixPtr confTmpBuffer_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index e094078bfe..caef710092 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -56,14 +56,4 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-bool CrossChannelNormLayer::init(const LayerMap& layerMap,
-                                 const ParameterMap& parameterMap) {
-  Layer::init(layerMap, parameterMap);
-  CHECK(parameters_[0]);
-  const NormConfig& conf = config_.inputs(0).norm_conf();
-  channels_ = conf.channels();
-  scale_.reset(new Weight(channels_, 1, parameters_[0]));
-  return true;
-}
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/Pool3DLayer.cpp b/paddle/gserver/layers/Pool3DLayer.cpp
new file mode 100644
index 0000000000..199f21adb1
--- /dev/null
+++ b/paddle/gserver/layers/Pool3DLayer.cpp
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Pool3DLayer.h"
+#include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pool3d, Pool3DLayer);
+
+bool Pool3DLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  /* the number of inputs for the pool layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+
+  const PoolConfig& conf = config_.inputs(0).pool_conf();
+  poolType_ = conf.pool_type();
+  channels_ = conf.channels();
+
+  sizeX_ = conf.size_x();
+  sizeY_ = conf.size_y();
+  sizeZ_ = conf.size_z();
+
+  strideW_ = conf.stride();
+  strideH_ = conf.stride_y();
+  strideD_ = conf.stride_z();
+
+  imgSizeW_ = conf.img_size();
+  imgSizeH_ = conf.img_size_y();
+  imgSizeD_ = conf.img_size_z();
+
+  paddingW_ = conf.padding();
+  paddingH_ = conf.padding_y();
+  paddingD_ = conf.padding_z();
+
+  outputW_ = conf.output_x();
+  outputH_ = conf.output_y();
+  outputD_ = conf.output_z();
+
+  return true;
+}
+
+size_t Pool3DLayer::getSize() {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+
+  size_t layerSize = 0;
+  outputD_ = outputSize(imgSizeD_, sizeZ_, paddingD_, strideD_, false);
+  outputH_ = outputSize(imgSizeH_, sizeY_, paddingH_, strideH_, false);
+  outputW_ = outputSize(imgSizeW_, sizeX_, paddingW_, strideW_, false);
+
+  layerSize = outputD_ * outputH_ * outputW_ * channels_;
+  getOutput().setFrameHeight(outputH_);
+  getOutput().setFrameWidth(outputW_);
+  getOutput().setFrameDepth(outputD_);
+  return layerSize;
+}
+
+void Pool3DLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  size_t batchSize = inMat->getHeight();
+  size_t outWidth = getSize();
+  resetOutput(batchSize, outWidth);
+  Matrix::resizeOrCreate(maxPoolIdx_, batchSize, outWidth, false, useGpu_);
+  const MatrixPtr outMat = getOutputValue();
+
+  if (poolType_ == "avg") {
+    outMat->avgPool3DForward(*inMat,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else if (poolType_ == "max") {
+    outMat->maxPool3DForward(*inMat,
+                             *maxPoolIdx_,
+                             channels_,
+                             imgSizeD_,
+                             imgSizeH_,
+                             imgSizeW_,
+                             outputD_,
+                             outputH_,
+                             outputW_,
+                             sizeZ_,
+                             sizeY_,
+                             sizeX_,
+                             strideD_,
+                             strideH_,
+                             strideW_,
+                             paddingD_,
+                             paddingH_,
+                             paddingW_);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+  forwardActivation();
+}
+
+void Pool3DLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+
+  (void)callback;
+  if (NULL == getInputGrad(0)) return;
+  MatrixPtr inMat = inputLayers_[0]->getOutputValue();
+  MatrixPtr inGradMat = inputLayers_[0]->getOutputGrad();
+  MatrixPtr outMat = getOutputValue();
+  MatrixPtr outGradMat = getOutputGrad();
+
+  if (poolType_ == "avg") {
+    inGradMat->avgPool3DBackward(*outGradMat,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else if (poolType_ == "max") {
+    inGradMat->maxPool3DBackward(*outGradMat,
+                                 *maxPoolIdx_,
+                                 imgSizeD_,
+                                 imgSizeH_,
+                                 imgSizeW_,
+                                 outputD_,
+                                 outputH_,
+                                 outputW_,
+                                 sizeZ_,
+                                 sizeY_,
+                                 sizeX_,
+                                 strideD_,
+                                 strideH_,
+                                 strideW_,
+                                 paddingD_,
+                                 paddingH_,
+                                 paddingW_,
+                                 1.0,
+                                 1.0);
+  } else {
+    LOG(FATAL) << "Unknown pool type: " << poolType_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/Pool3DLayer.h
similarity index 63%
rename from paddle/gserver/layers/ExpandConvTransLayer.h
rename to paddle/gserver/layers/Pool3DLayer.h
index 00b8f24188..8329a02f57 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/Pool3DLayer.h
@@ -15,30 +15,35 @@ limitations under the License. */
 #pragma once
 
 #include <vector>
-#include "ExpandConvBaseLayer.h"
+#include "Layer.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 
 namespace paddle {
 
 /**
- * @brief A subclass of convolution layer.
- * This layer expands input and use matrix multiplication to
- * calculate convolution transpose (deconv) operation.
- *
- * The config file api is img_conv_layer with flag trans=True.
+ * @brief 3D pooling layer.
+ * Pools the input within 3D regions
  */
-class ExpandConvTransLayer : public ExpandConvBaseLayer {
+class Pool3DLayer : public Layer {
 public:
-  explicit ExpandConvTransLayer(const LayerConfig& config)
-      : ExpandConvBaseLayer(config) {}
-
-  ~ExpandConvTransLayer() {}
+  explicit Pool3DLayer(const LayerConfig& config) : Layer(config) {}
+  ~Pool3DLayer() {}
 
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
-
   void forward(PassType passType) override;
   void backward(const UpdateCallback& callback) override;
+  size_t getSize();
+
+protected:
+  int channels_;
+  int sizeX_, sizeY_, sizeZ_;
+  int strideW_, strideH_, strideD_;
+  int paddingW_, paddingH_, paddingD_;
+  int imgSizeW_, imgSizeH_, imgSizeD_;
+  int outputW_, outputH_, outputD_;
+  std::string poolType_;
+  MatrixPtr maxPoolIdx_;
 };
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 96d5c54acc..fceb389d06 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -13,9 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PoolLayer.h"
+#include "MaxPoolWithMaskLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnPoolLayer.h"
 #endif
 namespace paddle {
@@ -45,6 +46,7 @@ bool PoolLayer::init(const LayerMap& layerMap,
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
 
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
   return true;
 }
 
@@ -53,10 +55,12 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
   if (pool == "max-projection" || pool == "avg-projection") {
     return new PoolProjectionLayer(config);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
+  } else if (pool == "max-pool-with-mask") {
+    return new MaxPoolWithMaskLayer(config);
   } else {
     LOG(FATAL) << "Unknown pool type: " << pool;
     return nullptr;
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index d43292ad2d..9df672a935 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
 
   std::string poolType_;
 
+  bool excludeMode_;
+
 public:
   explicit PoolLayer(const LayerConfig& config) : Layer(config) {}
 
diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp
index d90b438448..6a9de394ce 100644
--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
   strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
   confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
   outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
 }
 
 size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
                        outputY_,
                        outputX_,
                        confPaddingY_,
-                       confPadding_);
+                       confPadding_,
+                       excludeMode_);
 }
 
 void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
                              1,
                              1,
                              confPaddingY_,
-                             confPadding_);
+                             confPadding_,
+                             excludeMode_);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h
index 9a75f465f6..a0412714bc 100644
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
   int confPaddingY_, confPadding_;
   size_t channels_;
   std::string poolType_;
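+  /// if true, padded positions are excluded from the denominator
+  /// when computing the average pooling result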
+  bool excludeMode_;
 
 public:
   PoolProjection(const ProjectionConfig& config,
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index de198af111..e83ae34bbe 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -22,10 +22,42 @@ public:
 
   void forward(PassType passType) override {
     Layer::forward(passType);
+    std::vector<std::string> vals;
     for (size_t i = 0; i != inputLayers_.size(); ++i) {
-      getInput(i).printValueString(LOG(INFO),
-                                   "layer=" + inputLayers_[i]->getName() + " ");
+      std::ostringstream s;
+      getInput(i).printValueString(s, "");
+      vals.push_back(s.str());
     }
+    size_t pos = 0;
+    size_t i = 0;
+    std::ostringstream s;
+    const std::string& format = config_.user_arg();
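+    // substitute each %s in the user-supplied format string with the
+    // printed value of the corresponding input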
+    while (true) {
+      size_t pos1 = format.find("%s", pos);
+      if (pos1 == std::string::npos) break;
+      if (i >= vals.size()) {
+        break;
+      }
+      s << format.substr(pos, pos1 - pos) << vals[i];
+      pos = pos1 + 2;
+      ++i;
+    }
+    if (i != inputLayers_.size()) {
+      LOG(ERROR) << "Number of values in the format (" << format
+                 << ") does not match the number of inputs ("
+                 << inputLayers_.size() << ") at " << getName();
+    }
+    s << format.substr(pos);
+
+    const std::string delimiter("\n");
+    std::string content = s.str();
+    std::string::size_type foundPos = 0;
+    std::string::size_type prevPos = 0;
+    while ((foundPos = content.find(delimiter, prevPos)) != std::string::npos) {
+      LOG(INFO) << content.substr(prevPos, foundPos - prevPos);
+      prevPos = foundPos + delimiter.size();
+    }
+    LOG(INFO) << content.substr(prevPos);
   }
 
   void backward(const UpdateCallback& callback) override {}
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 331bc7672e..8faf032f55 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -65,14 +65,19 @@ bool PriorBoxLayer::init(const LayerMap& layerMap,
   std::copy(pbConf.aspect_ratio().begin(),
             pbConf.aspect_ratio().end(),
             std::back_inserter(tmp));
-  // flip
-  int inputRatioLength = tmp.size();
-  for (int index = 0; index < inputRatioLength; index++) {
-    aspectRatio_.push_back(tmp[index]);
-    aspectRatio_.push_back(1 / tmp[index]);
+
+  if (maxSize_.size() > 0) CHECK_EQ(minSize_.size(), maxSize_.size());
+
+  // flip aspect ratios
+  for (unsigned index = 0; index < tmp.size(); index++) {
+    real ar = tmp[index];
+    if (fabs(ar - 1.) < 1e-6) continue;
+    aspectRatio_.push_back(ar);
+    aspectRatio_.push_back(1. / ar);
   }
-  numPriors_ = aspectRatio_.size();
-  if (maxSize_.size() > 0) numPriors_++;
+
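+  // one prior per (min size, aspect ratio) pair, plus one extra square
+  // prior for each max size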
+  numPriors_ = aspectRatio_.size() * minSize_.size() + maxSize_.size();
+
   return true;
 }
 
@@ -99,50 +104,39 @@ void PriorBoxLayer::forward(PassType passType) {
     for (int w = 0; w < layerWidth; ++w) {
       real centerX = (w + 0.5) * stepW;
       real centerY = (h + 0.5) * stepH;
-      real minSize = 0;
       for (size_t s = 0; s < minSize_.size(); s++) {
-        // first prior.
-        minSize = minSize_[s];
+        real minSize = minSize_[s];
         real boxWidth = minSize;
         real boxHeight = minSize;
-        // xmin, ymin, xmax, ymax.
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+
+        // priors with different aspect ratios
+        for (size_t r = 0; r < aspectRatio_.size(); r++) {
+          real ar = aspectRatio_[r];
+          boxWidth = minSize * sqrt(ar);
+          boxHeight = minSize / sqrt(ar);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
+        }
 
         if (maxSize_.size() > 0) {
-          CHECK_EQ(minSize_.size(), maxSize_.size());
-          // second prior.
-          for (size_t s = 0; s < maxSize_.size(); s++) {
-            real maxSize = maxSize_[s];
-            boxWidth = boxHeight = sqrt(minSize * maxSize);
-            tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-            tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-            tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-            tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-            // set the variance.
-            for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-          }
+          // square prior with size sqrt(minSize * maxSize)
+          real maxSize = maxSize_[s];
+          boxWidth = boxHeight = sqrt(minSize * maxSize);
+          tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
+          tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
+          tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
+          // set the variance.
+          for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
         }
       }
-      // rest of priors.
-      for (size_t r = 0; r < aspectRatio_.size(); r++) {
-        real ar = aspectRatio_[r];
-        if (fabs(ar - 1.) < 1e-6) continue;
-        real boxWidth = minSize * sqrt(ar);
-        real boxHeight = minSize / sqrt(ar);
-        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
-        tmpPtr[idx++] = (centerX + boxWidth / 2.) / imageWidth;
-        tmpPtr[idx++] = (centerY + boxHeight / 2.) / imageHeight;
-        // set the variance.
-        for (int t = 0; t < 4; t++) tmpPtr[idx++] = variance_[t];
-      }
     }
   }
+
   // clip the prior's coordinate such that it is within [0, 1]
   for (int d = 0; d < dim * 2; ++d)
     if ((d % 8) < 4)
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
new file mode 100644
index 0000000000..7d7c30b4d8
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -0,0 +1,233 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ROIPoolLayer.h"
+#include <cfloat>
+
+namespace paddle {
+
+REGISTER_LAYER(roi_pool, ROIPoolLayer);
+
+bool ROIPoolLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  pooledWidth_ = layerConf.pooled_width();
+  pooledHeight_ = layerConf.pooled_height();
+  spatialScale_ = layerConf.spatial_scale();
+
+  return true;
+}
+
+void ROIPoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const ROIPoolConfig& layerConf = config_.inputs(0).roi_pool_conf();
+  height_ = getInput(0).getFrameHeight();
+  if (!height_) height_ = layerConf.height();
+  width_ = getInput(0).getFrameWidth();
+  if (!width_) width_ = layerConf.width();
+  channels_ = getInputValue(0)->getWidth() / width_ / height_;
+
+  size_t batchSize = getInput(0).getBatchSize();
+  size_t numROIs = getInput(1).getBatchSize();
+
+  MatrixPtr dataValue = getInputValue(0);
+  MatrixPtr roiValue = getInputValue(1);
+  resetOutput(numROIs, channels_ * pooledHeight_ * pooledWidth_);
+  MatrixPtr outputValue = getOutputValue();
+
+  if (useGpu_) {  // TODO(guosheng): implement on GPU later
+    MatrixPtr dataCpuBuffer;
+    Matrix::resizeOrCreate(dataCpuBuffer,
+                           dataValue->getHeight(),
+                           dataValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    dataCpuBuffer->copyFrom(*dataValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    dataValue = dataCpuBuffer;
+    roiValue = roiCpuBuffer;
+    MatrixPtr outputCpuBuffer;
+    Matrix::resizeOrCreate(outputCpuBuffer,
+                           outputValue->getHeight(),
+                           outputValue->getWidth(),
+                           false,
+                           false);
+    outputCpuBuffer->copyFrom(*outputValue);
+    outputValue = outputCpuBuffer;
+  }
+
+  real* bottomData = dataValue->getData();
+  size_t batchOffset = dataValue->getWidth();
+  size_t channelOffset = height_ * width_;
+  real* bottomROIs = roiValue->getData();
+  size_t roiOffset = roiValue->getWidth();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+
+  real* outputData = outputValue->getData();
+  real* argmaxData = nullptr;
+  if (passType != PASS_TEST) {
+    Matrix::resizeOrCreate(maxIdxs_,
+                           numROIs,
+                           channels_ * pooledHeight_ * pooledWidth_,
+                           false,
+                           false);
+    argmaxData = maxIdxs_->getData();
+  }
+
+  for (size_t n = 0; n < numROIs; ++n) {
+    // the first five elements of each RoI should be:
+    // batch_idx, roi_x_start, roi_y_start, roi_x_end, roi_y_end
+    size_t roiBatchIdx = bottomROIs[0];
+    size_t roiStartW = round(bottomROIs[1] * spatialScale_);
+    size_t roiStartH = round(bottomROIs[2] * spatialScale_);
+    size_t roiEndW = round(bottomROIs[3] * spatialScale_);
+    size_t roiEndH = round(bottomROIs[4] * spatialScale_);
+    CHECK_GE(roiBatchIdx, 0UL);
+    CHECK_LT(roiBatchIdx, batchSize);
+    size_t roiHeight =
+        std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
+    size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
+    real binSizeH =
+        static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
+    real binSizeW =
+        static_cast<real>(roiWidth) / static_cast<real>(pooledWidth_);
+    real* batchData = bottomData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t hstart = static_cast<size_t>(std::floor(ph * binSizeH));
+          size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
+          size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
+          size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
+          hstart = std::min(
+              std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
+          wstart = std::min(
+              std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
+          hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
+                          height_);
+          wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
+                          width_);
+
+          bool isEmpty = (hend <= hstart) || (wend <= wstart);
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+          if (argmaxData) {
+            argmaxData[poolIndex] = -1;
+          }
+
+          for (size_t h = hstart; h < hend; ++h) {
+            for (size_t w = wstart; w < wend; ++w) {
+              size_t index = h * width_ + w;
+              if (batchData[index] > outputData[poolIndex]) {
+                outputData[poolIndex] = batchData[index];
+                if (argmaxData) {
+                  argmaxData[poolIndex] = index;
+                }
+              }
+            }
+          }
+        }
+      }
+      batchData += channelOffset;
+      outputData += poolChannelOffset;
+      if (argmaxData) {
+        argmaxData += poolChannelOffset;
+      }
+    }
+    bottomROIs += roiOffset;
+  }
+  if (useGpu_) {
+    getOutputValue()->copyFrom(*outputValue);
+  }
+}
+
+void ROIPoolLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inGradValue = getInputGrad(0);
+  MatrixPtr outGradValue = getOutputGrad();
+  MatrixPtr roiValue = getInputValue(1);
+
+  if (useGpu_) {
+    MatrixPtr inGradCpuBuffer;
+    Matrix::resizeOrCreate(inGradCpuBuffer,
+                           inGradValue->getHeight(),
+                           inGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr outGradCpuBuffer;
+    Matrix::resizeOrCreate(outGradCpuBuffer,
+                           outGradValue->getHeight(),
+                           outGradValue->getWidth(),
+                           false,
+                           false);
+    MatrixPtr roiCpuBuffer;
+    Matrix::resizeOrCreate(roiCpuBuffer,
+                           roiValue->getHeight(),
+                           roiValue->getWidth(),
+                           false,
+                           false);
+    inGradCpuBuffer->copyFrom(*inGradValue);
+    outGradCpuBuffer->copyFrom(*outGradValue);
+    roiCpuBuffer->copyFrom(*roiValue);
+    inGradValue = inGradCpuBuffer;
+    outGradValue = outGradCpuBuffer;
+    roiValue = roiCpuBuffer;
+  }
+
+  real* bottomROIs = roiValue->getData();
+  size_t numROIs = getInput(1).getBatchSize();
+  size_t roiOffset = getInputValue(1)->getWidth();
+
+  real* inDiffData = inGradValue->getData();
+  size_t batchOffset = getInputValue(0)->getWidth();
+  size_t channelOffset = height_ * width_;
+
+  real* outDiffData = outGradValue->getData();
+  size_t poolChannelOffset = pooledHeight_ * pooledWidth_;
+  real* argmaxData = maxIdxs_->getData();
+
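+  // Scatter each output gradient back to the input position recorded in
+  // maxIdxs_ during forward; bins that selected no input propagate nothing.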
+  for (size_t n = 0; n < numROIs; ++n) {
+    size_t roiBatchIdx = bottomROIs[0];
+    real* batchDiffData = inDiffData + batchOffset * roiBatchIdx;
+    for (size_t c = 0; c < channels_; ++c) {
+      for (size_t ph = 0; ph < pooledHeight_; ++ph) {
+        for (size_t pw = 0; pw < pooledWidth_; ++pw) {
+          size_t poolIndex = ph * pooledWidth_ + pw;
+          // maxIdxs_ stores -1 for empty bins; index 0 is a valid argmax.
+          if (argmaxData[poolIndex] >= 0) {
+            size_t index = static_cast<size_t>(argmaxData[poolIndex]);
+            batchDiffData[index] += outDiffData[poolIndex];
+          }
+        }
+      }
+      batchDiffData += channelOffset;
+      outDiffData += poolChannelOffset;
+      argmaxData += poolChannelOffset;
+    }
+    bottomROIs += roiOffset;
+  }
+
+  if (useGpu_) {
+    getInputGrad(0)->copyFrom(*inGradValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.h b/paddle/gserver/layers/ROIPoolLayer.h
new file mode 100644
index 0000000000..4f07e49d6f
--- /dev/null
+++ b/paddle/gserver/layers/ROIPoolLayer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+ * feature map.
+ * - Input: This layer needs two input layers: the first is a convolution
+ *          layer providing the feature map; the second contains the ROI data,
+ *          e.g. the output of the ProposalLayer in Faster R-CNN. The pooled
+ *          features are typically consumed by layers that generate the bbox
+ *          location offsets and the classification confidences.
+ * - Output: The ROIs' feature maps.
+ * Reference:
+ *    Shaoqing Ren, Kaiming He, Ross Girshick, and Jian Sun.
+ *    Faster R-CNN: Towards Real-Time Object Detection with Region Proposal
+ * Networks
+ */
+
+class ROIPoolLayer : public Layer {
+protected:
+  size_t channels_;
+  size_t width_;
+  size_t height_;
+  size_t pooledWidth_;
+  size_t pooledHeight_;
+  real spatialScale_;
+
+  // Since there is no int matrix type, use a real-valued matrix instead.
+  MatrixPtr maxIdxs_;
+
+public:
+  explicit ROIPoolLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index e4c2b483d2..6bd42c06ca 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -12,119 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gflags/gflags.h>
-#include "Layer.h"
-#include "SequenceToBatch.h"
-#include "paddle/utils/Stat.h"
+#include "RecurrentLayer.h"
 
 DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
-/**
- * @brief RecurrentLayer takes 1 input layer. The output size is the same with
- * input layer.
- * For each sequence [start, end] it performs the following computation:
- * \f[
- *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
- *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
- *
- * \f]
- * If reversed is true, the order is reversed:
- * \f[
- *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
- *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
- * \f]
- * There are two methods to calculate rnn. One way is to compute rnn one
- * sequence by one sequence. The other way is to reorganize the input
- * into batches, then compute rnn one batch by one batch. Users can select
- * them by rnn_use_batch flag.
- */
-
-class RecurrentLayer : public Layer {
-public:
-  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
-
-  bool init(const LayerMap& layerMap,
-            const ParameterMap& parameterMap) override;
-
-  void forward(PassType passType) override;
-
-  void backward(const UpdateCallback& callback) override;
-
-  void resetState() override;
-
-  void setState(LayerStatePtr state) override;
-
-  LayerStatePtr getState() override;
-
-protected:
-  /**
-   * @brief If user do not set --rnn_use_batch=true, it will
-   * compute rnn forward one sequence by one sequence in default.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn forward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void forwardOneSequence(int start, int length);
-  /**
-   * @brief Compute rnn backward one sequence by onesequence.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
-  /**
-   * @brief Compute rnn backward by one sequence.
-   * @param start The start position of this sequence (or sample).
-   * @param length The length of this sequence (or sample), namely the words
-   * number of this sequence.
-   */
-  void backwardOneSequence(int start, int length);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch. It will convert batch shape to sequence after finishing forward.
-   * The batch info can refer to SequenceToBatch class.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void forwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-  /**
-   * @brief Reorganize input into batches and compute rnn forward batch
-   * by batch.
-   * @param batchSize Total words number of all samples in this batch.
-   * @param numSequences The sample number.
-   * @param starts Each start position of each samples.
-   */
-  void backwardBatch(int batchSize, size_t numSequences, const int* starts);
-
-protected:
-  std::unique_ptr<Weight> weight_;
-  std::unique_ptr<Weight> bias_;
-
-  /// frameOutput_[i] is used to hold the i-th sample of output_
-  std::vector<Argument> frameOutput_;
-  MatrixPtr prevOutput_;
-  /// Whether compute rnn by reverse.
-  bool reversed_;
-  /// If compute batch by batch, batchValue_ will be used to save the
-  /// reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchValue_;
-  /// If compute batch by batch, batchGrad_ will be used to save the
-  /// gradient with respect to reorganized input value.
-  std::unique_ptr<SequenceToBatch> batchGrad_;
-};
-
 REGISTER_LAYER(recurrent, RecurrentLayer);
 
 bool RecurrentLayer::init(const LayerMap& layerMap,
@@ -260,7 +153,6 @@ void RecurrentLayer::backward(const UpdateCallback& callback) {
     bias_->getWGrad()->collectBias(*output_.grad, 1);
     bias_->getParameterPtr()->incUpdate(callback);
   }
-
   weight_->getParameterPtr()->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/RecurrentLayer.h b/paddle/gserver/layers/RecurrentLayer.h
new file mode 100644
index 0000000000..f40dbe150f
--- /dev/null
+++ b/paddle/gserver/layers/RecurrentLayer.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <gflags/gflags.h>
+#include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+/**
+ * @brief RecurrentLayer takes 1 input layer. The output size is the same as
+ * that of the input layer.
+ * For each sequence [start, end] it performs the following computation:
+ * \f[
+ *    out_{i} = act(in_{i})     \      \      \text{for} \ i = start \\
+ *    out_{i} = act(in_{i} + out_{i-1} * W) \ \ \text{for} \ start < i <= end
+ *
+ * \f]
+ * If reversed is true, the order is reversed:
+ * \f[
+ *   out_{i} = act(in_{i})           \    \   \text{for} \ i = end  \\
+ *   out_{i} = act(in_{i} + out_{i+1} * W) \ \ \text{for} \ start <= i < end
+ * \f]
+ * There are two methods to calculate the rnn: computing it one sequence at a
+ * time, or reorganizing the input into batches and computing it one batch at
+ * a time. Users can select between them with the rnn_use_batch flag.
+ */
+
+class RecurrentLayer : public Layer {
+public:
+  explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+  void resetState() override;
+
+  void setState(LayerStatePtr state) override;
+
+  LayerStatePtr getState() override;
+
+protected:
+  /**
+   * @brief If the user does not set --rnn_use_batch=true, the rnn forward
+   * pass is computed one sequence at a time by default.
+   * @param batchSize Total number of words of all samples in this batch.
+   * @param numSequences The number of samples.
+   * @param starts The start position of each sample.
+   */
+  void forwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn forward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the number
+   * of words in this sequence.
+   */
+  void forwardOneSequence(int start, int length);
+  /**
+   * @brief Compute rnn backward one sequence at a time.
+   * @param batchSize Total number of words of all samples in this batch.
+   * @param numSequences The number of samples.
+   * @param starts The start position of each sample.
+   */
+  void backwardSequence(int batchSize, size_t numSequences, const int* starts);
+  /**
+   * @brief Compute rnn backward by one sequence.
+   * @param start The start position of this sequence (or sample).
+   * @param length The length of this sequence (or sample), namely the number
+   * of words in this sequence.
+   */
+  void backwardOneSequence(int start, int length);
+
+  /**
+   * @brief Reorganize the input into batches and compute the rnn forward
+   * pass batch by batch. The result is converted back from batch shape to
+   * sequence shape after the forward pass finishes. See the SequenceToBatch
+   * class for details of the batch layout.
+   * @param batchSize Total number of words of all samples in this batch.
+   * @param numSequences The number of samples.
+   * @param starts The start position of each sample.
+   */
+  virtual void forwardBatch(int batchSize,
+                            size_t numSequences,
+                            const int* starts);
+
+  /**
+   * @brief Reorganize the input into batches and compute the rnn backward
+   * pass batch by batch.
+   * @param batchSize Total number of words of all samples in this batch.
+   * @param numSequences The number of samples.
+   * @param starts The start position of each sample.
+   */
+  virtual void backwardBatch(int batchSize,
+                             size_t numSequences,
+                             const int* starts);
+
+protected:
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> bias_;
+
+  /// frameOutput_[i] is used to hold the i-th sample of output_
+  std::vector<Argument> frameOutput_;
+  MatrixPtr prevOutput_;
+  /// Whether to compute the rnn in reverse order.
+  bool reversed_;
+  /// If compute batch by batch, batchValue_ will be used to save the
+  /// reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchValue_;
+  /// If compute batch by batch, batchGrad_ will be used to save the
+  /// gradient with respect to reorganized input value.
+  std::unique_ptr<SequenceToBatch> batchGrad_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/gserver/layers/RowConvLayer.cpp
new file mode 100644
index 0000000000..54d77999ad
--- /dev/null
+++ b/paddle/gserver/layers/RowConvLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(row_conv, RowConvLayer);
+
+bool RowConvLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
+  createFunction(forward_, "RowConv", FuncConfig());
+  createFunction(backward_, "RowConvGrad", FuncConfig());
+
+  return true;
+}
+
+void RowConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  size_t height = input->getHeight();
+  size_t width = input->getWidth();
+  CHECK_EQ(width, getSize());
+  resetOutput(height, width);
+
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  MatrixPtr w = weight_->getW();
+  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
+
+  MatrixPtr outV = getOutputValue();
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*w, wDims_);
+  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
+    forward_[0]->calc(inputs, outputs);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void RowConvLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), *startPos);
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
+
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr wGrad = weight_->getWGrad();
+  size_t h = getInputValue(0)->getHeight();
+  size_t w = getInputValue(0)->getWidth();
+  outputs.addArg(
+      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
+      *startPos,
+      ADD_TO);
+  outputs.addArg(
+      wGrad ? (*wGrad)
+            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
+      wDims_,
+      ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
+    backward_[0]->calc(inputs, outputs);
+  }
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h
new file mode 100644
index 0000000000..b3bdda2f35
--- /dev/null
+++ b/paddle/gserver/layers/RowConvLayer.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief Row Convolution Layer.
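+ *
+ * A sketch of the lookahead (row) convolution this layer computes: with
+ * C = contexLength_, input sequence x and weight W of shape (C, dim),
+ * \f[
+ *    out(t, d) = \sum_{k=0}^{C-1} W(k, d) \cdot x(t + k, d)
+ * \f]
+ * where time steps beyond the end of a sequence contribute nothing.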
+ */
+class RowConvLayer : public Layer {
+public:
+  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~RowConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  // Row convolution weight of shape contexLength_ * fan_out, where fan_out
+  // is the size of the output feature.
+  std::unique_ptr<Weight> weight_;
+
+  // The number of steps to look ahead, plus one, equals contexLength_.
+  size_t contexLength_;
+  TensorShape wDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
new file mode 100644
index 0000000000..0d609be43b
--- /dev/null
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for L2 normalization in each row,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where the size of \f$in\f$ is (batchSize x dataDim),
+ * and the size of \f$out\f$ is (batchSize x dataDim).
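+ *
+ * For reference, the gradient used in backward follows from the quotient
+ * rule (writing \f$r = 1 / \|in\|_2\f$):
+ * \f[
+ *   \frac{\partial L}{\partial in[i]} = r \frac{\partial L}{\partial out[i]}
+ *     - r^{3} in[i] \sum_{k} \frac{\partial L}{\partial out[k]} in[k]
+ * \f]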
+ */
+
+class RowL2NormLayer : public Layer {
+protected:
+  MatrixPtr inSquare_;
+  MatrixPtr l2NormReciprocal_;
+  MatrixPtr dotSum_;
+
+public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
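+  // Compute the per-row L2 norm, then invert it so each row can be scaled
+  // by 1 / ||in||_2 (scalarDiv with argument 1.0 forms the reciprocal).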
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
+  outV->rowScale(0, *inV, *l2NormReciprocal_);
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] * l2NormReciprocal
+  //            - inV[ij] * l2NormReciprocal^3 * DotMul(outG[i], inV[i])
+  // Below this is computed via DotMul(outG[i], outV[i]) * l2NormReciprocal^2,
+  // which is equivalent since outV = inV * l2NormReciprocal.
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleShiftLayer.cpp b/paddle/gserver/layers/ScaleShiftLayer.cpp
new file mode 100644
index 0000000000..35fd038ab4
--- /dev/null
+++ b/paddle/gserver/layers/ScaleShiftLayer.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer that applies a linear transformation to each element in each row
+ * of the input matrix. Each element is first re-scaled and then shifted by
+ * a bias.
+ *
+ * \f[
+ *    y = wx + b
+ * \f]
+ *
+ * Here, w is the scale and b is the bias. Both w and b are trainable scalars.
+ *
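+ * In backward, the corresponding gradients are
+ *   dL/dw = sum_ij(x_ij * dy_ij),  dL/db = sum_ij(dy_ij),
+ *   dL/dx_ij = w * dy_ij,
+ * which is what the backward pass of this layer accumulates.
+ *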
+ */
+
+class ScaleShiftLayer : public Layer {
+protected:
+  std::unique_ptr<Weight> scale_;
+  std::unique_ptr<Weight> offset_;
+
+public:
+  explicit ScaleShiftLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(scale_shift, ScaleShiftLayer);
+
+bool ScaleShiftLayer::init(const LayerMap& layerMap,
+                           const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(inputLayers_.size(), 1U);
+  scale_.reset(new Weight(1, 1, parameters_[0]));
+  if (biasParameter_.get() != NULL) {
+    offset_ = std::unique_ptr<Weight>(new Weight(1, 1, biasParameter_));
+  }
+  return true;
+}
+
+void ScaleShiftLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  real scaleValue = scale_->getW()->getElement(0, 0);
+  outV->mulScalar(*inV, scaleValue);
+  if (offset_) {
+    real offsetValue = offset_->getW()->getElement(0, 0);
+    outV->add(offsetValue);
+  }
+}
+
+void ScaleShiftLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+
+  /* Calculate the parameter gradient for the current layer */
+  if (scale_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ij} * c_{ij}
+    rowSumMtx->sumOfProducts(
+        /* b= */ *inV, /* c= */ *outG, /* scaleSum= */ 1, /* scaleDest= */ 0.);
+    // this_i = scaleDest * this_i + scaleSum * \sum_j b_{ji}
+    scale_->getWGrad()->sumCols(
+        /* b= */ *rowSumMtx, /* scaleSum= */ 1., /* scaleDest= */ 1.);
+    scale_->getParameterPtr()->incUpdate(callback);
+  }
+  if (offset_ && offset_->getWGrad()) {
+    MatrixPtr rowSumMtx;
+    Matrix::resizeOrCreate(rowSumMtx, outG->getHeight(), 1, false, useGpu_);
+    rowSumMtx->sumRows(*outG, 1., 0.);
+    offset_->getWGrad()->sumCols(*rowSumMtx, 1., 1.);
+    offset_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers error */
+  if (inG) {
+    real scaleValue = scale_->getW()->getElement(0, 0);
+    inG->add(*outG, scaleValue);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.cpp b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
new file mode 100644
index 0000000000..aa6778aef4
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.cpp
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ScaleSubRegionLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(scale_sub_region, ScaleSubRegionLayer);
+
+bool ScaleSubRegionLayer::init(const LayerMap& layerMap,
+                               const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 2);
+  auto& conf = config_.inputs(0).scale_sub_region_conf();
+  value_ = conf.value();
+
+  createFunction(forward_, "ScaleSubRegion", FuncConfig().set("value", value_));
+  createFunction(
+      backward_, "ScaleSubRegionGrad", FuncConfig().set("value", value_));
+
+  return true;
+}
+
+void ScaleSubRegionLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  auto in0 = getInput(0);
+  imgH_ = in0.getFrameHeight();
+  imgW_ = in0.getFrameWidth();
+  if (imgH_ == 0 || imgW_ == 0) {
+    auto& conf = config_.inputs(0).scale_sub_region_conf();
+    imgH_ = conf.image_conf().img_size_y();
+    imgW_ = conf.image_conf().img_size();
+  }
+  MatrixPtr imgV = in0.value;
+  size_t batchSize = imgV->getHeight();
+  size_t spatialSize = imgH_ * imgW_;
+  channelsNum_ = imgV->getWidth() / spatialSize;
+  shape_ = TensorShape({batchSize, channelsNum_, imgH_, imgW_});
+
+  resetOutput(batchSize, imgV->getWidth());
+  auto& out = getOutput();
+  out.setFrameHeight(imgH_);
+  out.setFrameWidth(imgW_);
+
+  MatrixPtr indicesV = getInputValue(1);
+  indicesShape_ = TensorShape({batchSize, 6});
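+  // Each row of the indices input holds six values: the start and end
+  // indices of the region along C, H and W (see the layer comment in
+  // ScaleSubRegionLayer.h).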
+
+  REGISTER_TIMER_INFO("ScaleSubRegionForward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*imgV, shape_);
+  inArgs.addArg(*indicesV, indicesShape_);
+  outArgs.addArg(*out.value, shape_, ASSIGN_TO);
+  forward_[0]->calc(inArgs, outArgs);
+}
+
+void ScaleSubRegionLayer::backward(const UpdateCallback& callback) {
+  REGISTER_TIMER_INFO("ScaleSubRegionBackward", getName().c_str());
+  BufferArgs inArgs;
+  BufferArgs outArgs;
+  inArgs.addArg(*getOutputGrad(), shape_);
+  inArgs.addArg(*getInputValue(1), indicesShape_);
+  outArgs.addArg(*getInputGrad(0), shape_, ADD_TO);
+  backward_[0]->calc(inArgs, outArgs);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ScaleSubRegionLayer.h b/paddle/gserver/layers/ScaleSubRegionLayer.h
new file mode 100644
index 0000000000..a27c56de93
--- /dev/null
+++ b/paddle/gserver/layers/ScaleSubRegionLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  For each instance, this layer can be used to multiply the values
+ *         in a specified continuous sub-region by a scalar. By providing
+ *         start and end indices for C/H/W, you can specify the location and
+ *         shape of the region.
+ *
+ *         input_0: Input value.
+ *         input_1: Indices specifying the location and shape of the
+ *                  region.
+ */
+class ScaleSubRegionLayer : public Layer {
+public:
+  explicit ScaleSubRegionLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~ScaleSubRegionLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  TensorShape shape_;
+  TensorShape indicesShape_;
+  size_t imgH_;
+  size_t imgW_;
+  size_t channelsNum_;
+  real value_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 944c705166..323cc47df1 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -26,10 +26,9 @@ namespace paddle {
  * If SequenceLevel = kNonseq:
  *   Output: a sequence containing only the last instance of the input sequence
  *   If stride_ > 0:
- *      Output: a shorten sequence. The operation of getting last instance of a
- *              sequence is independently performed on every slice of the input
- *              sequence, which is obtained by sliding a window with the window
- *              size set to stride_.
+ *      Output: a shortened sequence. stride_ is the step size by which we
+ *              slide a window over the input sequence; the last-instance
+ *              operation is then applied to each window independently.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: a sequence containing only the last instance of each sub-sequence
@@ -73,8 +72,7 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);
 
-  auto starts = (stride_ > 0) ? stridePositions_->getData()
-                              : startPositions_->getData(false);
+  auto starts = startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 235d9a9b0f..2a693b110a 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -46,6 +46,9 @@ void SequencePoolLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   const Argument& input = getInput(0);
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "Input should be a sequence or subsequence for layer " << getName();
+
   newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
   size_t dim = getSize();
   // check
@@ -69,9 +72,8 @@ void SequencePoolLayer::forward(PassType passType) {
   if (stride_ > 0) {
     CHECK_EQ(input.hasSubseq(), 0UL)
         << "sequence stride pooling is invalid for hasSubseq now";
-    output_.poolSequenceWithStride(
-        input, stride_, &stridePositions_, reversed_);
-    newBatchSize_ = stridePositions_->getSize() - 1;
+    output_.poolSequenceWithStride(input, stride_, &startPositions_, reversed_);
+    newBatchSize_ = startPositions_->getSize() - 1;
   }
 
   resetOutput(newBatchSize_, dim);
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index 293d1bf278..e207afd1dc 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -28,8 +28,9 @@ namespace paddle {
  * sequence}{input[i]}
  *    If stride_ > 0:
  *        Check input sequence must not have sub-sequence
- *        Output: a shorten sequence, pooling is performed upon a small local
- *                area
+ *        Output: a shortened sequence. stride_ is the step size by which we
+ *                slide a window over the input sequence; the pooling
+ *                operation is then applied to each window independently.
  * If SequenceLevel = kSeq:
  *    Check input sequence must has sub-sequence
  *    Output: output size is the number of input sub-sequences
@@ -47,8 +48,6 @@ protected:
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
   int stride_;
-  // Store the start position of each window.
-  IVectorPtr stridePositions_;
   // Whether the input sequence is reversed or not.
   bool reversed_ = false;
 
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 433592953b..8229744072 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -70,11 +70,23 @@ void SequenceReshapeLayer::forward(PassType passType) {
   size_t outDim = getSize();
 
   size_t numSequences = input.getNumSequences();
-  auto startPositions = input.sequenceStartPositions->getVector(false);
-  const int* starts = startPositions->getData();
 
-  CHECK_EQ(starts[numSequences], input.getBatchSize());
-  CHECK_EQ(numSequences, startPositions->getSize() - 1);
+  // By default, treat each instance as its own sequence of length one.
+  IVectorPtr seqStarts;
+  IVector::resizeOrCreate(seqStarts, input.getBatchSize() + 1, false);
+  int* startsData = seqStarts->getData();
+  for (int i = 0; i < input.getBatchSize() + 1; i++) {
+    startsData[i] = i;
+  }
+  const int* starts = startsData;
+
+  // If sequence information is present, use the actual start positions.
+  if (input.sequenceStartPositions) {
+    auto startPositions = input.sequenceStartPositions->getVector(false);
+    starts = startPositions->getData();
+    CHECK_EQ(starts[numSequences], input.getBatchSize());
+    CHECK_EQ(numSequences, startPositions->getSize() - 1);
+  }
 
   for (size_t seqID = 0; seqID < numSequences; seqID++) {
     size_t inNumIns = starts[seqID + 1] - starts[seqID];
diff --git a/paddle/gserver/layers/SequenceSliceLayer.cpp b/paddle/gserver/layers/SequenceSliceLayer.cpp
new file mode 100644
index 0000000000..ce68ca4494
--- /dev/null
+++ b/paddle/gserver/layers/SequenceSliceLayer.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
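+/**
+ * A layer that slices each (sub-)sequence of its first input according to
+ * the per-sequence start and/or end indices given by its second (and
+ * optional third) input, then packs the selected rows into a new
+ * (sub-)sequence output.
+ */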
+class SequenceSliceLayer : public Layer {
+public:
+  explicit SequenceSliceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, currently all matrices are real-valued, but the second
+   * and the (optional) third input, which hold selected indices of the given
+   * sequence used to trim it, are conceptually integer-valued. Storing
+   * integer information in real-valued matrices is dangerous, since the real
+   * numbers will be converted to integers; if a user fills such a matrix
+   * directly, invalid data may occur.
+   */
+
+  MatrixPtr startIdsOnCpu_;
+  MatrixPtr endIdsOnCpu_;
+
+  std::vector<int> selectedRows_;
+  IVectorPtr rowIndice_;
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+  std::vector<int> outSubSeqStartPos_;
+  std::vector<int> outSeqStartPos_;
+
+  void checkInputs();
+  void copySliceIdsToCpu();
+  void calSelectedRows(const MatrixPtr starts, const MatrixPtr ends);
+};
+
+REGISTER_LAYER(seq_slice, SequenceSliceLayer);
+
+bool SequenceSliceLayer::init(const LayerMap& layerMap,
+                              const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_GE(inputLayers_.size(), 2U);
+  CHECK_LE(inputLayers_.size(), 3U);
+
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SequenceSliceLayer::checkInputs() {
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSeq()) << "The first input of sequence slice layer "
+                           << "must be a sequence.";
+  const MatrixPtr indices1 = getInputValue(1);
+  CHECK_EQ(
+      indices1->getHeight(),
+      static_cast<size_t>(inputSeq.hasSubseq() ? inputSeq.getNumSubSequences()
+                                               : inputSeq.getNumSequences()))
+      << "Height of the second input should be equal to number of sequence "
+      << "in the first input.";
+  if (inputLayers_.size() == 3) {
+    const MatrixPtr indices2 = getInputValue(2);
+    CHECK_EQ(indices2->getHeight(), indices1->getHeight())
+        << "start indices and end indices should have the same height.";
+    CHECK_EQ(indices2->getWidth(), indices1->getWidth())
+        << "start indices and end indices should have the same Width.";
+  }
+}
+
+void SequenceSliceLayer::copySliceIdsToCpu() {
+  const MatrixPtr indices1 = getInputValue(1);
+  if (inputLayers_.size() == 2U) {
+    if (config_.select_first()) {
+      Matrix::resizeOrCreate(startIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      startIdsOnCpu_->copyFrom(*indices1);
+      endIdsOnCpu_ = nullptr;
+    } else {
+      Matrix::resizeOrCreate(endIdsOnCpu_,
+                             indices1->getHeight(),
+                             indices1->getWidth(),
+                             false /* trans */,
+                             false /* useGpu */);
+      endIdsOnCpu_->copyFrom(*indices1);
+      startIdsOnCpu_ = nullptr;
+    }
+  } else if (inputLayers_.size() == 3U) {
+    Matrix::resizeOrCreate(startIdsOnCpu_,
+                           indices1->getHeight(),
+                           indices1->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    startIdsOnCpu_->copyFrom(*indices1);
+
+    const MatrixPtr indices2 = getInputValue(2);
+    Matrix::resizeOrCreate(endIdsOnCpu_,
+                           indices2->getHeight(),
+                           indices2->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    endIdsOnCpu_->copyFrom(*indices2);
+  }
+}
+
+void SequenceSliceLayer::calSelectedRows(const MatrixPtr starts,
+                                         const MatrixPtr ends) {
+  CHECK(starts || ends) << "At least one of the start or end indices "
+                        << "should be given.";
+
+  bool hasSubseq = getInput(0).hasSubseq();
+
+  outSeqStartPos_.resize(1, 0);
+  outSubSeqStartPos_.resize(1, 0);
+  selectedRows_.clear();
+
+  size_t beamSize = starts ? starts->getWidth() : ends->getWidth();
+  size_t rowIdx = 0;
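+  // For every (sub-)sequence, each of the beamSize start/end pairs selects a
+  // contiguous run of rows; an index of -1 terminates the selections for
+  // that row.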
+  for (size_t i = 0; i < inputSeqInfoVec_.size(); ++i) {
+    for (size_t j = 0; j < inputSeqInfoVec_[i].size() - 1; ++j) {
+      for (size_t k = 0; k < beamSize; ++k) {
+        if (starts && starts->getElement(rowIdx, k) == -1.) break;
+        if (ends && ends->getElement(rowIdx, k) == -1.) break;
+
+        int begPos = inputSeqInfoVec_[i][j];
+        if (starts) begPos += starts->getElement(rowIdx, k);
+
+        int endPos = inputSeqInfoVec_[i][j + 1] - 1;
+        if (ends) endPos = inputSeqInfoVec_[i][j] + ends->getElement(rowIdx, k);
+
+        int seqLen = endPos - begPos + 1;
+        CHECK_GT(seqLen, 0);
+        for (int m = begPos; m <= endPos; ++m) selectedRows_.push_back(m);
+        hasSubseq
+            ? outSubSeqStartPos_.push_back(outSubSeqStartPos_.back() + seqLen)
+            : outSeqStartPos_.push_back(outSeqStartPos_.back() + seqLen);
+      }
+      rowIdx++;
+    }
+    if (hasSubseq) outSeqStartPos_.push_back(outSubSeqStartPos_.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartPos_.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartPos_.data(), outSeqStartPos_.size(), false);
+
+  if (hasSubseq) {
+    ICpuGpuVector::resizeOrCreate(
+        output_.subSequenceStartPositions, outSubSeqStartPos_.size(), false);
+    output_.subSequenceStartPositions->copyFrom(
+        outSubSeqStartPos_.data(), outSubSeqStartPos_.size(), false);
+  }
+}
+
+void SequenceSliceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  checkInputs();
+
+  const Argument& inputSeq = getInput(0);
+  inputSeqInfoVec_.clear();
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  if (!useGpu_) {
+    if (inputLayers_.size() == 2U) {
+      startIdsOnCpu_ = config_.select_first() ? getInputValue(1) : nullptr;
+      endIdsOnCpu_ = config_.select_first() ? nullptr : getInputValue(1);
+    } else if (inputLayers_.size() == 3U) {
+      startIdsOnCpu_ = getInputValue(1);
+      endIdsOnCpu_ = getInputValue(2);
+    }
+  } else {
+    copySliceIdsToCpu();
+  }
+
+  /*
+   * calculate the selected row indices in a batch, and build the output
+   * sequence information.
+   */
+  calSelectedRows(startIdsOnCpu_, endIdsOnCpu_);
+
+  resetOutput(selectedRows_.size(), getSize());
+
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SequenceSliceLayer::backward(const UpdateCallback& callback) {
+  getOutputGrad()->addToRows(*getInputGrad(0), *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index 5fa7b6f488..6b769378d2 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -171,12 +171,31 @@ void SequenceToBatch::sequence2BatchCopy(Matrix &batch,
     hl_sequence2batch_copy(
         batchData, seqData, idxData, seqWidth, batchCount, seq2batch);
   } else {
-    for (int i = 0; i < batchCount; ++i) {
-      if (seq2batch) {
+    if (seq2batch) {
+#ifdef PADDLE_USE_MKLML
+      const int blockMemSize = 8 * 1024;
+      const int blockSize = blockMemSize / sizeof(real);
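+      // Copy rows in fixed-size blocks so the two collapsed loops expose
+      // enough independent work for OpenMP even when batchCount is small.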
+#pragma omp parallel for collapse(2)
+      for (int i = 0; i < batchCount; ++i) {
+        for (int j = 0; j < seqWidth; j += blockSize) {
+          memcpy(batch.rowBuf(i) + j,
+                 sequence.rowBuf(idxData[i]) + j,
+                 (j + blockSize > seqWidth) ? (seqWidth - j) * sizeof(real)
+                                            : blockMemSize);
+        }
+      }
+#else
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(batch.rowBuf(i),
                sequence.rowBuf(idxData[i]),
                seqWidth * sizeof(real));
-      } else {
+      }
+#endif
+    } else {
+#ifdef PADDLE_USE_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < batchCount; ++i) {
         memcpy(sequence.rowBuf(idxData[i]),
                batch.rowBuf(i),
                seqWidth * sizeof(real));
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
new file mode 100644
index 0000000000..267dd6154b
--- /dev/null
+++ b/paddle/gserver/layers/SliceProjection.cpp
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * SliceProjection can slice the input value into multiple parts,
+ * and then select some of them to merge into a new output.
+ *
+ * First, calculate the slices that need to be merged into the output.
+ * slices = input.slices().for_output()
+ *
+ * Second, merge each slice into the output.
+ * for(auto slice: slices) {
+ *   out.addAtOffset(slice, offset);
+ * }
+ *
+ * Input slices as output: s0, s1, ...:
+ *   -----------------------
+ *   |///|   |//////|      |
+ *   |/s0|   |//s1//|      |
+ *   |///|   |//////|      |
+ *   -----------------------
+ * Output, merge s0, s1, ... into one output:
+ *   ----------------
+ *   |///|//////|   |
+ *   |/s0|//s1//|...|
+ *   |///|//////|   |
+ *   ----------------
+ *
+ * The config file api is slice_projection.
+ */
+class SliceProjection : public Projection {
+public:
+  SliceProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+protected:
+  std::vector<std::pair<size_t, size_t>> slices_;
+};
+
+REGISTER_PROJECTION(slice, SliceProjection);
+
+/**
+ * Constructed function.
+ * @note SliceProjection should not have any parameter.
+ */
+SliceProjection::SliceProjection(const ProjectionConfig& config,
+                                 const ParameterPtr& parameter,
+                                 bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'slice' projection should not have any parameter";
+
+  slices_.reserve(config.slices_size());
+  for (const auto& slice : config.slices()) {
+    slices_.push_back(std::make_pair(slice.start(), slice.end()));
+  }
+}
+
+void SliceProjection::forward() {
+  size_t offset = 0;
+  for (auto& slice : slices_) {
+    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
+    out_->value->addAtOffset(*slice_out, offset);
+    offset += slice_out->getWidth();
+  }
+}
+
+void SliceProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    size_t offset = 0;
+    for (auto& slice : slices_) {
+      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
+      slice_out->addAtOffset(*out_->grad, offset);
+      offset += slice_out->getWidth();
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SubNestedSequenceLayer.cpp b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
new file mode 100644
index 0000000000..e9bee77212
--- /dev/null
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+class SubNestedSequenceLayer : public Layer {
+public:
+  explicit SubNestedSequenceLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  /*
+   * This function generates the indices of rows in a batch according to the
+   * indices of selected sub-sequence in each sequence.
+   *
+   * Examples:
+   * selectedIndices:
+   *   [
+   *     [0, 1, -1],
+   *     [0, 1, 2],
+   *     [0, -1, -1],
+   *     [0, 2, 3],
+   *   ]
+   * inputSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   *
+   * The output is saved to the private member rowIndice_:
+   * [0,1,2,3,4,5,6,7,8,9,15,16,17,18,19,20,21,23,24,25,26,27]
+   */
+
+  void calSelectedRows(const MatrixPtr selectedIndices,
+                       const std::vector<std::vector<int>>& inputSeqInfo);
+
+  /*
+   * TODO(caoying)
+   * In PaddlePaddle, currently all matrices are real-valued, but the second
+   * input, which holds selected indices of the given sequence used to trim
+   * the nested sequence, is conceptually integer-valued. Storing integer
+   * information in real-valued matrices is dangerous, since the real numbers
+   * will be converted to integers; if a user fills such a matrix directly,
+   * invalid data may occur.
+   *
+   * If the second input of this layer is in GPU memory, copy it to CPU
+   * memory.
+   */
+  MatrixPtr selIdsCpu_;
+
+  /*
+   * reorganize sequenceStartPositions and subSequenceStartPositions
+   * into a 2d vector to facilitate the sequence selection process.
+   */
+  std::vector<std::vector<int>> inputSeqInfoVec_;
+
+  /* store the final selected row indices in a batch */
+  IVectorPtr rowIndice_;
+  /* rowIndice_ and selectedRows_ actually share the same memory. */
+  std::vector<int> selectedRows_;
+};
+
+REGISTER_LAYER(sub_nested_seq, SubNestedSequenceLayer);
+
+bool SubNestedSequenceLayer::init(const LayerMap& layerMap,
+                                  const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_EQ(2U, inputLayers_.size());
+  setNeedSequenceInfo(false);
+  return true;
+}
+
+void SubNestedSequenceLayer::calSelectedRows(
+    const MatrixPtr selectedIndices,
+    const std::vector<std::vector<int>>& inputSeqInfo) {
+  selectedRows_.clear();
+
+  std::vector<int> outSeqStartInfo(1, 0);
+  std::vector<int> outSubSeqStartInfo(1, 0);
+
+  size_t seqNum = selectedIndices->getHeight();
+  size_t beamSize = selectedIndices->getWidth();
+  for (size_t i = 0; i < seqNum; ++i) {
+    for (size_t j = 0; j < beamSize; ++j) {
+      if (selectedIndices->getElement(i, j) == -1.) break;
+      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
+      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);
+
+      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
+                         inputSeqInfoVec_[i][selSubSeqIdx];
+      for (size_t k = 0; k < subSeqLen; ++k)
+        selectedRows_.push_back(inputSeqInfoVec_[i][selSubSeqIdx] + k);
+      outSubSeqStartInfo.push_back(outSubSeqStartInfo.back() + subSeqLen);
+    }
+    outSeqStartInfo.push_back(outSubSeqStartInfo.back());
+  }
+
+  if (useGpu_) {
+    rowIndice_ = IVector::create(selectedRows_.size(), useGpu_);
+    rowIndice_->copyFrom(selectedRows_.data(), selectedRows_.size());
+  } else {
+    rowIndice_ =
+        IVector::create(selectedRows_.data(), selectedRows_.size(), useGpu_);
+  }
+
+  // create the sequence information for the output.
+  ICpuGpuVector::resizeOrCreate(
+      output_.sequenceStartPositions, outSeqStartInfo.size(), false);
+  output_.sequenceStartPositions->copyFrom(
+      outSeqStartInfo.data(), outSeqStartInfo.size(), false);
+
+  ICpuGpuVector::resizeOrCreate(
+      output_.subSequenceStartPositions, outSubSeqStartInfo.size(), false);
+  output_.subSequenceStartPositions->copyFrom(
+      outSubSeqStartInfo.data(), outSubSeqStartInfo.size(), false);
+}
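+
+/* Illustrative sketch (editor's addition, not part of the patch): for a
+ * single sequence, the row selection above reduces to expanding each selected
+ * sub-sequence index j into the contiguous row range [starts[j], starts[j+1]),
+ * with -1 acting as end-of-selection padding:
+ *
+ *   std::vector<int> rows;
+ *   for (int j : selected) {
+ *     if (j < 0) break;  // -1 padding ends the valid selections
+ *     for (int r = starts[j]; r < starts[j + 1]; ++r) rows.push_back(r);
+ *   }
+ *
+ * Applied to the example in the class comment, sequence 0 with starts
+ * [0,3,4] and selection [0,1,-1] yields rows 0,1,2,3.
+ */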
+
+void SubNestedSequenceLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& inputSeq = getInput(0);
+  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestedSequence layer "
+                              << "must be a nested sequence.";
+  const MatrixPtr selectedIndices = getInputValue(1);
+  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());
+
+  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
+    /*
+     * Currently, the second input for this layer is generated by
+     * kmax_sequence_score_layer whose output is always stored on CPU,
+     * or a data_layer which can be on GPU.
+     *
+     * If the second input is on GPU, copy it to CPU memory, because this
+     * input always uses very little memory, and operations related to it are
+     * all logic control, not computations.
+     */
+    Matrix::resizeOrCreate(selIdsCpu_,
+                           selectedIndices->getHeight(),
+                           selectedIndices->getWidth(),
+                           false /* trans */,
+                           false /* useGpu */);
+    selIdsCpu_->copyFrom(*selectedIndices);
+  } else {
+    selIdsCpu_ = selectedIndices;
+  }
+
+  Argument::reorganizeSeqInfo(inputSeq.sequenceStartPositions,
+                              inputSeq.subSequenceStartPositions,
+                              inputSeqInfoVec_);
+  calSelectedRows(selIdsCpu_, inputSeqInfoVec_);
+
+  resetOutput(selectedRows_.size(), getSize());
+  getOutputValue()->selectRows(*getInputValue(0), *rowIndice_);
+}
+
+void SubNestedSequenceLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inputSeqGrad = getInputGrad(0);
+  MatrixPtr outputGrad = getOutputGrad();
+
+  if (inputSeqGrad) outputGrad->addToRows(*inputSeqGrad, *rowIndice_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 19b7ad1869..00d8ce017a 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -98,8 +98,19 @@ void SubSequenceLayer::forward(PassType passType) {
   CHECK_EQ(numSequences2, numSequences3);
 
   MatrixPtr inputValue = input.value;
-  IVectorPtr offsetValue = offsetSeq.ids;
-  IVectorPtr sizeValue = sizeSeq.ids;
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
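+  // Editor's note (assumption): the id vectors are later read through
+  // getData(), which requires host-resident memory, hence the explicit
+  // GPU-to-CPU copy above.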
 
   CHECK_EQ(offsetValue->getSize(), numSequences1);
   CHECK_EQ(sizeValue->getSize(), numSequences1);
@@ -176,8 +187,21 @@ void SubSequenceLayer::backward(const UpdateCallback& callback) {
   size_t numSequences1 = startPositions1->getSize() - 1;
   const int* starts1 = startPositions1->getData();
 
-  IVectorPtr offsetValue = getInput(1).ids;
-  IVectorPtr sizeValue = getInput(2).ids;
+  const Argument& offsetSeq = getInput(1);
+  const Argument& sizeSeq = getInput(2);
+  IVectorPtr offsetValue;
+  IVectorPtr sizeValue;
+
+  if (useGpu_) {
+    // copy to cpu
+    IVector::resizeOrCreate(offsetValue, offsetSeq.ids->getSize(), false);
+    IVector::resizeOrCreate(sizeValue, sizeSeq.ids->getSize(), false);
+    offsetValue->copyFrom(*offsetSeq.ids);
+    sizeValue->copyFrom(*sizeSeq.ids);
+  } else {
+    offsetValue = offsetSeq.ids;
+    sizeValue = sizeSeq.ids;
+  }
 
   int* offsets = offsetValue->getData();
   int* sizes = sizeValue->getData();
diff --git a/paddle/gserver/layers/SwitchOrderLayer.cpp b/paddle/gserver/layers/SwitchOrderLayer.cpp
new file mode 100644
index 0000000000..e97809141a
--- /dev/null
+++ b/paddle/gserver/layers/SwitchOrderLayer.cpp
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "SwitchOrderLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(switch_order, SwitchOrderLayer);
+
+bool SwitchOrderLayer::init(const LayerMap& layerMap,
+                            const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  auto& img_conf = config_.inputs(0).image_conf();
+  size_t inD = img_conf.img_size_z();
+  size_t inH =
+      img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size();
+  size_t inW = img_conf.img_size();
+  size_t inC = img_conf.channels();
+  inH = inH * inD;
+  inDims_ = TensorShape({0, inC, inH, inW});
+  outDims_ = TensorShape(4);
+
+  auto& reshape_conf = config_.reshape_conf();
+  for (int i = 0; i < reshape_conf.height_axis_size(); i++) {
+    heightAxis_.push_back(reshape_conf.height_axis(i));
+  }
+  for (int i = 0; i < reshape_conf.width_axis_size(); i++) {
+    widthAxis_.push_back(reshape_conf.width_axis(i));
+  }
+  createFunction(nchw2nhwc_, "NCHW2NHWC", FuncConfig());
+  createFunction(nhwc2nchw_, "NHWC2NCHW", FuncConfig());
+  return true;
+}
+
+void SwitchOrderLayer::setOutDims() {
+  outDims_.setDim(0, inDims_[0]);
+  outDims_.setDim(1, inDims_[2]);
+  outDims_.setDim(2, inDims_[3]);
+  outDims_.setDim(3, inDims_[1]);
+  reshapeHeight_ = 1;
+  for (size_t i = 0; i < heightAxis_.size(); i++) {
+    reshapeHeight_ *= outDims_[heightAxis_[i]];
+  }
+  output_.setFrameHeight(reshapeHeight_);
+  reshapeWidth_ = 1;
+  for (size_t i = 0; i < widthAxis_.size(); i++) {
+    reshapeWidth_ *= outDims_[widthAxis_[i]];
+  }
+  output_.setFrameWidth(reshapeWidth_);
+}
+
+void SwitchOrderLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+  int d = inputLayers_[0]->getOutput().getFrameDepth();
+  d = (d == 0 ? 1 : d);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h * d);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+  int totalCount = input->getElementCnt();
+  int channels = totalCount / (inDims_[0] * inDims_[2] * inDims_[3]);
+  if (channels != 0) inDims_.setDim(1, channels);
+}
+
+void SwitchOrderLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  resetOutput(outDims_[0], outDims_[1] * outDims_[2] * outDims_[3]);
+  if (heightAxis_.size() > 0) {
+    resetOutput(reshapeHeight_, reshapeWidth_);
+  }
+
+  // switch NCHW to NHWC
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_);
+  nchw2nhwc_[0]->calc(inputs, outputs);
+  forwardActivation();
+}
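+
+/* Illustrative sketch (editor's addition): under the assumption of dense
+ * row-major storage, the NCHW2NHWC function applied above is a plain 4-D
+ * transpose mapping element (n, c, h, w) to (n, h, w, c):
+ *
+ *   for (size_t n = 0; n < N; ++n)
+ *     for (size_t c = 0; c < C; ++c)
+ *       for (size_t h = 0; h < H; ++h)
+ *         for (size_t w = 0; w < W; ++w)
+ *           dst[((n * H + h) * W + w) * C + c] =
+ *               src[((n * C + c) * H + h) * W + w];
+ *
+ * The backward pass applies the inverse permutation (NHWC2NCHW) to the
+ * gradients.
+ */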
+
+void SwitchOrderLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  backwardActivation();
+
+  // switch NHWC to NCHW
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  nhwc2nchw_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SwitchOrderLayer.h b/paddle/gserver/layers/SwitchOrderLayer.h
new file mode 100644
index 0000000000..47b1f7f73e
--- /dev/null
+++ b/paddle/gserver/layers/SwitchOrderLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer switches the order of the input data from NCHW to NHWC.
+ */
+class SwitchOrderLayer : public Layer {
+public:
+  explicit SwitchOrderLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~SwitchOrderLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void setInDims();
+  void setOutDims();
+
+protected:
+  std::vector<std::shared_ptr<FunctionBase>> nchw2nhwc_;
+  std::vector<std::shared_ptr<FunctionBase>> nhwc2nchw_;
+  TensorShape inDims_;
+  TensorShape outDims_;
+  std::vector<int> heightAxis_;
+  std::vector<int> widthAxis_;
+  size_t reshapeHeight_;
+  size_t reshapeWidth_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 3c4128b5b8..b578a906c2 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,125 +1,97 @@
 # gserver package unittests
-
-################### test_ProtoDataProvider ############
-add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp)
-
-# test_ProtoDataProvider will mkdir as same name,
-# so if WORKING_DIRECTORY is default directory, then
-# mkdir will get error.
-add_test(NAME test_ProtoDataProvider
-    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
-
-################# test_LayerGrad #######################
-add_unittest_without_exec(test_LayerGrad
-    test_LayerGrad.cpp
-    LayerGradUtil.cpp)
-add_test(NAME test_LayerGrad
-    COMMAND test_LayerGrad)
-
-################ test_CRFLayerGrad ####################
-add_unittest_without_exec(test_CRFLayerGrad
-    test_CRFLayerGrad.cpp
-    LayerGradUtil.cpp)
-add_test(NAME test_CRFLayerGrad
-    COMMAND test_CRFLayerGrad)
-
-
-add_unittest_without_exec(test_ActivationGrad
-    test_ActivationGrad.cpp
-    LayerGradUtil.cpp)
-add_test(NAME test_ActivationGrad
-    COMMAND test_ActivationGrad)
-################# test_ConvTrans #######################
-add_unittest_without_exec(test_ConvTrans
-    test_ConvTrans.cpp
-    LayerGradUtil.cpp)
-
-add_test(NAME test_ConvTrans
-    COMMAND test_ConvTrans)
-################# test_PriorBox #######################
-add_unittest_without_exec(test_PriorBox
-    test_PriorBox.cpp
-    LayerGradUtil.cpp)
-
-add_test(NAME test_PriorBox
-    COMMAND test_PriorBox)
-################# test_ConvUnify #######################
-add_unittest_without_exec(test_ConvUnify
-    test_ConvUnify.cpp
-    LayerGradUtil.cpp)
-    
-add_test(NAME test_ConvUnify
-    COMMAND test_ConvUnify)
-################# test_BatchNorm #######################
-add_unittest_without_exec(test_BatchNorm
-    test_BatchNorm.cpp
-    LayerGradUtil.cpp)
-
-add_test(NAME test_BatchNorm
-    COMMAND test_BatchNorm)
-################## test_Evaluator #######################
-add_unittest(test_Evaluator
-    test_Evaluator.cpp)
-
-################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
+add_simple_unittest(test_RecurrentLayer)
 
-############## test_MultinomialSampler ###################
-add_simple_unittest(test_MultinomialSampler)
+if(NOT MOBILE_INFERENCE)
+  add_simple_unittest(test_MultinomialSampler)
+endif()
 
-############## test_PyDataProvider ########################
+function(gserver_test TARGET)
+  add_unittest_without_exec(${TARGET}
+      ${TARGET}.cpp
+      LayerGradUtil.cpp)
+  add_test(NAME ${TARGET}
+      COMMAND ${TARGET})
+endfunction()
+
+gserver_test(test_LayerGrad)
+gserver_test(test_CRFLayerGrad)
+gserver_test(test_CrossEntropyOverBeamGrad)
+gserver_test(test_SeqSliceLayerGrad)
+gserver_test(test_ActivationGrad)
+gserver_test(test_ConvTrans)
+gserver_test(test_PriorBox)
+gserver_test(test_DetectionOutput)
+gserver_test(test_ConvUnify)
+gserver_test(test_BatchNorm)
+gserver_test(test_KmaxSeqScore)
+gserver_test(test_Expand)
+gserver_test(test_MaxPoolingWithMaskOutput)
+
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+
+gserver_test_with_python(test_PyDataProvider2)
 if(WITH_PYTHON)
-    add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp)
-
-    add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+    gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+    gserver_test_with_python(test_CompareTwoNets)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine; I will fix it.
+    gserver_test_with_python(test_RecurrentGradientMachine)
 endif()
 
-############### test_RecurrentLayer #######################
-add_simple_unittest(test_RecurrentLayer)
+########## test_MKLDNN layers and activations ##########
+if(WITH_MKLDNN)
+    add_unittest_without_exec(test_MKLDNN
+        test_MKLDNN.cpp
+        MKLDNNTester.cpp
+        LayerGradUtil.cpp)
+    add_test(NAME test_MKLDNN
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+endif()
 
 ############### test_WarpCTCLayer #######################
-if(NOT WITH_DOUBLE)
+if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
     add_unittest_without_exec(test_WarpCTCLayer
         test_WarpCTCLayer.cpp)
-
     add_test(NAME test_WarpCTCLayer
         COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
-############### test_RecurrentGradientMachine ###############
-# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
-# I will fix it.
-add_unittest_without_exec(test_RecurrentGradientMachine
-    test_RecurrentGradientMachine.cpp)
-add_test(NAME test_RecurrentGradientMachine
-    COMMAND .set_python_path.sh -d
-            ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests
-            ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
-
-add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp)
-if(WITH_GPU)
-    add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
-else()
+if(NOT MOBILE_INFERENCE)
+    ################## test_Evaluator #############
+    add_unittest(test_Evaluator
+        test_Evaluator.cpp)
+      
+    ########### test_NetworkCompare ###############
+    add_unittest_without_exec(test_NetworkCompare
+        test_NetworkCompare.cpp)
+    if(WITH_GPU)
+        set(use_gpu true)
+    else()
+        set(use_gpu false)
+    endif()
     add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+
+    ############ test_CompareSparse ################
+    add_unittest_without_exec(test_CompareSparse
+        test_CompareSparse.cpp)
+    if(NOT ON_TRAVIS)
+      add_test(NAME test_CompareSparse
+        COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
+                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    endif()
 endif()
-
-
-add_unittest_without_exec(test_PyDataProvider2
-        test_PyDataProvider2.cpp)
-
-add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PROJ_ROOT}/paddle/gserver/tests:${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-)
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a0b1cd471d..cd957c7c0b 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
 
     std::vector<Argument> args;
     args.push_back(out);
-    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
+    ASSERT_NEAR(0, Argument::sum(args), 1e-5) << "testBatchState failed";
     for (size_t seqId = 0; seqId < numSequences; ++seqId) {
       start[seqId] += seqLens[seqId];
     }
@@ -387,6 +387,52 @@ void initDataLayer(TestConfig testConf,
         data.value->sigmoid(*data.value);
         data.grad->zeroMem();
         break;
+      case INPUT_SELF_DEFINE_DATA: {
+        if (testConf.inputDefs[i].ids.size()) {
+          data.ids = IVector::create(testConf.inputDefs[i].ids.size(), useGpu);
+          data.ids->copyFrom(testConf.inputDefs[i].ids.data(),
+                             testConf.inputDefs[i].ids.size());
+        } else if (testConf.inputDefs[i].selfDefinedData) {
+          size_t height = testConf.inputDefs[i].selfDefinedData->getHeight();
+          size_t width = testConf.inputDefs[i].selfDefinedData->getWidth();
+          CHECK_GT(static_cast<int>(height), 0);
+          CHECK_GT(static_cast<int>(width), 0);
+          data.value = Matrix::create(height, width, false, useGpu);
+          data.grad = Matrix::create(height, width, false, useGpu);
+          data.value->copyFrom(*testConf.inputDefs[i].selfDefinedData);
+          data.grad->zeroMem();
+        } else {
+          LOG(FATAL) << "No self-defined data are given.";
+          return;
+        }
+
+        const std::vector<int>& labelSeqStartPositions =
+            testConf.inputDefs[i].labelSeqStartPositions;
+        if (labelSeqStartPositions.size() != 0) {
+          CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
+
+          sequenceStartPositions =
+              ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
+          sequenceStartPositions->copyFrom(labelSeqStartPositions.data(),
+                                           labelSeqStartPositions.size(),
+                                           useGpu);
+          data.sequenceStartPositions = sequenceStartPositions;
+        }
+
+        const std::vector<int>& labelSubSeqStartPositions =
+            testConf.inputDefs[i].labelSubSeqStartPositions;
+        if (labelSubSeqStartPositions.size() != 0) {
+          CHECK_GE(static_cast<int>(labelSubSeqStartPositions.size()), 2);
+
+          subSequenceStartPositions =
+              ICpuGpuVector::create(labelSubSeqStartPositions.size(), useGpu);
+          subSequenceStartPositions->copyFrom(labelSubSeqStartPositions.data(),
+                                              labelSubSeqStartPositions.size(),
+                                              useGpu);
+          data.subSequenceStartPositions = subSequenceStartPositions;
+        }
+        break;
+      }
       default:
         LOG(FATAL) << " unknown inputType ";
         return;
@@ -440,7 +486,6 @@ void initTestLayer(TestConfig testConf,
                            ParameterConfig paraConfig) {
     paraConfig.set_name(paraName);
     paraConfig.set_size(paraSize);
-    paraConfig.set_initial_std(1);
     paraConfig.set_is_static(isStatic);
     auto para =
         std::make_shared<Parameter>(paraConfig, FLAGS_use_gpu, initialize);
@@ -474,6 +519,9 @@ void initTestLayer(TestConfig testConf,
         paraConfig.add_dims((*layerMap)[input.input_layer_name()]->getSize());
         paraConfig.add_dims(testConf.layerConfig.size());
       }
+      CHECK_GE(testConf.paramInitialStd, 0);
+      paraConfig.set_initial_mean(testConf.paramInitialMean);
+      paraConfig.set_initial_std(testConf.paramInitialStd);
       initParameter(paraName, paraSize, inputDef.isStatic, false, paraConfig);
     }
   }
@@ -626,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
                          bool useGpu,
                          bool useWeight,
                          float epsilon) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 9f68eb64d0..e10a27eedf 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
@@ -31,7 +30,8 @@ enum InputType {
   INPUT_SEQUENCE_LABEL,
   INPUT_SPARSE_NON_VALUE_DATA,
   INPUT_SPARSE_FLOAT_VALUE_DATA,
-  INPUT_DENSE_DIM_DATA,  // using sequence length to init dense data
+  INPUT_DENSE_DIM_DATA,    // using sequence length to init dense data
+  INPUT_SELF_DEFINE_DATA,  // support customizing for input value
 };
 
 struct ParaSparse {
@@ -66,6 +66,9 @@ struct InputDef {
   bool isStatic;
   std::vector<int> labelInitValue;
   std::vector<int> labelSeqStartPositions;
+  std::vector<int> labelSubSeqStartPositions;
+  std::vector<int> ids;
+  MatrixPtr selfDefinedData;
 
   InputDef(InputType type, string nameIn, size_t dimIn, size_t sizeIn) {
     inputType = type;
@@ -76,6 +79,39 @@ struct InputDef {
     isStatic = false;
   }
 
+  InputDef(InputType type,
+           string nameIn,
+           MatrixPtr selfDefinedData,
+           std::vector<int> selfDefinedSeqStartPos = {},
+           std::vector<int> selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        selfDefinedData(selfDefinedData) {
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
+  InputDef(InputType type,
+           string nameIn,
+           const std::vector<int>& ids,
+           const std::vector<int>& selfDefinedSeqStartPos = {},
+           const std::vector<int>& selfDefinedSubSeqStartPos = {})
+      : labelSeqStartPositions(selfDefinedSeqStartPos),
+        labelSubSeqStartPositions(selfDefinedSubSeqStartPos),
+        ids(ids) {
+    selfDefinedData = nullptr;
+    inputType = type;
+    name = nameIn;
+    dim = 0;
+    sparse = {""};
+    paraSize = 0;
+    isStatic = false;
+  }
+
   InputDef(InputType type,
            string nameIn,
            size_t dimIn,
@@ -109,12 +145,16 @@ struct TestConfig {
   LayerConfig layerConfig;
   std::vector<InputDef> inputDefs;
   size_t biasSize;
+  real paramInitialMean;
+  real paramInitialStd;
   bool testAccumulate;
   bool testState;
   bool staticBias;
   bool testBatchState;
   TestConfig()
       : biasSize(0),
+        paramInitialMean(0.0),
+        paramInitialStd(1.0),
         testAccumulate(true),
         testState(false),
         staticBias(false),
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
new file mode 100644
index 0000000000..afe1608eab
--- /dev/null
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -0,0 +1,580 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNTester.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+#include "paddle/trainer/Trainer.h"
+
+namespace paddle {
+
+// init data layer and test layer of both dnn and reference
+void MKLDNNTester::reset(const TestConfig& dnn,
+                         const TestConfig& ref,
+                         size_t batchSize) {
+  const bool trans = false;
+  const bool useGpu = false;
+
+  // clear
+  configs_.clear();
+  layerNames_.clear();
+  dataLayers_.clear();
+  datas_.clear();
+  layerMaps_.clear();
+  parameters_.clear();
+  testLayers_.clear();
+
+  // resize
+  configs_.resize(NUM);
+  layerNames_.resize(NUM);
+  dataLayers_.resize(NUM);
+  datas_.resize(NUM);
+  layerMaps_.resize(NUM);
+  parameters_.resize(NUM);
+  testLayers_.resize(NUM);
+
+  // reset configs and layer names
+  configs_[DNN] = dnn;
+  configs_[REF] = ref;
+  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
+  layerNames_[REF] = "reference";  // second is reference layer
+
+  // reset others
+  for (size_t i = 0; i < NUM; ++i) {
+    configs_[i].layerConfig.set_name(layerNames_[i]);
+    initDataLayer(configs_[i],
+                  &(dataLayers_[i]),
+                  &(datas_[i]),
+                  &(layerMaps_[i]),
+                  layerNames_[i],
+                  batchSize,
+                  trans,
+                  useGpu);
+    initTestLayer(
+        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
+  }
+  refLayer_ = testLayers_[REF];
+  dnnLayer_ = testLayers_[DNN];
+  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  setInputImgSize();
+
+  // for comparison with the Paddle reference results, we need to manually
+  // add a cpu device output for the test
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->addOutputArgument(CPU_DEVICE);
+  }
+}
+
+void MKLDNNTester::setInputImgSize() {
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+      // TODO(TJ): fix me when concat and elewise ready
+      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
+      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
+    }
+  }
+}
+
+// init random parameters of ref, and copy them to mkldnn
+void MKLDNNTester::randomWgtDatas() {
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
+    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    parameters_[REF][i]->randomize();
+    if (isBN && i == 2) {
+      // this param is the moving average in batch norm, which must be
+      // larger than 0
+      real offset = fabs(refValue->getMin()) + 1.0;
+      refValue->add(offset);
+    }
+    dnnValue->copyFrom(*refValue);
+
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
+    printVector(dnnValue);
+  }
+}
+
+// randomize bottom data of the ref layer and copy the same to mkldnn
+void MKLDNNTester::randomBotDatas() {
+  CHECK_EQ(dataLayers_.size(), NUM);
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
+    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
+        *(dataLayers_[REF][i]->getOutputValue()));
+    VLOG(MKLDNN_TESTS) << "Random Foward, InputValue " << i;
+    printMatrix(dataLayers_[REF][i]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::randomTopDiffs() {
+  refLayer_->getOutputGrad()->randomizeUniform();
+  dnnLayer_->getOutput(CPU_DEVICE)
+      .grad->copyFrom(*(refLayer_->getOutputGrad()));
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
+  printMatrix(refLayer_->getOutputGrad());
+}
+
+void MKLDNNTester::checkForward() {
+  VLOG(MKLDNN_TESTS) << "Check Forward";
+  printTopDatas();
+  double delta =
+      compareMatrix(refLayer_->getOutputValue(), dnnLayer_->getOutputValue());
+  EXPECT_LE(fabs(delta), eps_);
+}
+
+void MKLDNNTester::checkBackwardData() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
+  const bool isBN = refLayer_->getType() == "batch_norm";
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
+    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
+    printMatrix(dnnDiff);
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
+    printMatrix(refDiff);
+
+    double delta = compareMatrix(refDiff, dnnDiff);
+    EXPECT_LE(fabs(delta), eps_);
+    if (isBN) {
+      // the other two inputs in batch norm are for moving mean and var
+      // do not have grad to compare
+      break;
+    }
+  }
+}
+
+void MKLDNNTester::checkBackwardWgts() {
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  vector<VectorPtr> dnnWgts;  // used to temporarily save mkldnn weights
+  saveWgt(parameters_[DNN], dnnWgts);
+
+  MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  if (dnnLayer) {
+    dnnLayer->convertWeightsToPaddle();
+  }
+  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
+    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value"
+                     << parameters_[DNN][i]->getName();
+    printVector(dnn);
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
+    printVector(ref);
+
+    double delta = compareVector(ref, dnn);
+    EXPECT_LE(fabs(delta), eps_);
+  }
+
+  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
+  restoreWgt(dnnWgts, parameters_[DNN]);
+}
+
+void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
+                           vector<VectorPtr>& to) {
+  const bool useGpu = false;
+  to.resize(from.size());
+  for (size_t i = 0; i < to.size(); ++i) {
+    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
+    to[i] = Vector::create(wgt->getSize(), useGpu);
+    to[i]->copyFrom(*wgt);
+  }
+}
+
+void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
+                              vector<ParameterPtr>& to) {
+  CHECK_EQ(from.size(), to.size());
+  for (size_t i = 0; i < from.size(); ++i) {
+    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
+    wgt->copyFrom(*from[i]);
+  }
+}
+
+// clear parameters grad
+void MKLDNNTester::clearWgtDiffs(size_t id) {
+  CHECK_LE(id, parameters_.size());
+  for (size_t n = 0; n < parameters_.size(); ++n) {
+    if (id == n || id == parameters_.size()) {
+      for (size_t i = 0; i < parameters_[n].size(); ++i) {
+        const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
+        if (grad) {
+          grad->zeroMem();
+        }
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs(size_t id) {
+  CHECK_LE(id, dataLayers_.size());
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    if (id == n || id == dataLayers_.size()) {
+      // clear inputs layers of this specific layer
+      for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+        dataLayers_[n][i]->getOutputGrad()->zeroMem();
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearTopDatas(size_t id) {
+  CHECK_LE(id, testLayers_.size());
+  for (size_t i = 0; i < testLayers_.size(); ++i) {
+    if (id == i || id == testLayers_.size()) {
+      testLayers_[i]->getOutputValue()->zeroMem();
+    }
+  }
+}
+
+void MKLDNNTester::printTopDatas() {
+  if (!log_) {
+    return;
+  }
+
+  for (int n = 0; n < NUM; ++n) {
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
+    printMatrix(testLayers_[n]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::printMatrix(const MatrixPtr& m) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  m->print(ostr);
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+void MKLDNNTester::printVector(const VectorPtr& v) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  v->print(ostr, v->getSize());
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
+}
+
+double MKLDNNTester::getDelta(const real* refer,
+                              const real* value,
+                              size_t len,
+                              const float failRate,
+                              const float thres) {
+  double delta = 0, sum = 0;
+  int failCnt = 0;
+  const double eps = 1e-5;
+  double maxRatio = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double ref = fabs(refer[i]);
+    double val = fabs(value[i]);
+    double diff = fabs(refer[i] - value[i]);
+    delta += diff;
+    sum += ref;
+    if (ref < eps && val < eps) {  // both values are very small
+      continue;
+    }
+    double ratio = diff / ref;
+    if (ratio > thres) {
+      maxRatio = std::max(maxRatio, ratio);
+      failCnt++;
+    }
+  }
+  EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(sum));
+  EXPECT_FALSE(std::isnan(delta));
+  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
+                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
+  double res = sum > eps ? delta / sum : eps;
+  return (failCnt / (float)len) > failRate ? maxRatio : res;
+}
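+
+// Illustrative example (editor's addition): with refer = {1.0, 2.0} and
+// value = {1.0, 2.1}, delta = 0.1 and sum = 3.0. The per-element ratio of
+// the second point is 0.1 / 2.0 = 0.05 < thres, so no point fails and the
+// averaged ratio 0.1 / 3.0 ~= 0.033 is returned; it must stay below eps_
+// for the comparison to pass.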
+
+double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
+  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
+  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
+}
+
+double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
+  CHECK_EQ(v1->getSize(), v2->getSize());
+  return getDelta(v1->getData(), v2->getData(), v1->getSize());
+}
+
+void MKLDNNTester::runOnce() {
+  // test forward
+  randomBotDatas();
+  dnnLayer_->forward(passType_);
+  refLayer_->forward(passType_);
+  checkForward();
+
+  if (passType_ == PASS_TEST) {
+    return;
+  }
+
+  // test backward
+  // simple updater
+  UpdateCallback updateCallback = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+  randomTopDiffs();
+  dnnLayer_->backward(updateCallback);
+  refLayer_->backward(updateCallback);
+  checkBackwardData();
+  checkBackwardWgts();
+
+  // clear buffers
+  // the ref code will add to the diff while the dnn code will write to it,
+  // and clearTopDatas(REF) should be covered by the ref layers
+  clearBotDiffs(REF);
+  clearWgtDiffs(REF);
+  // it is necessary to clear bottom diffs when only the activation is dnn type
+  if (configs_[DNN].layerConfig.active_type().compare(0, 7, "mkldnn_") == 0) {
+    clearBotDiffs(DNN);
+  }
+}
+
+void MKLDNNTester::run(const TestConfig& dnn,
+                       const TestConfig& ref,
+                       size_t batchSize,
+                       size_t inputImgH,
+                       size_t inputImgW,
+                       PassType passType,
+                       bool printDetails,
+                       size_t iter,
+                       float epsilon) {
+  CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
+        dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
+      << "should be MKLDNN layer or MKLDNN activation";
+  if (dnn.layerConfig.type() == ref.layerConfig.type()) {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.active_type() << " vs "
+                       << ref.layerConfig.active_type();
+  } else {
+    VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: "
+                       << dnn.layerConfig.type() << " vs "
+                       << ref.layerConfig.type();
+  }
+
+  ih_ = inputImgH;
+  iw_ = inputImgW;
+  passType_ = passType;
+  log_ = printDetails;
+  iter_ = iter;
+  eps_ = epsilon;
+
+  // First, test mkldnn init from PARAM_FORMAT_ORIGINAL weight
+  reset(dnn, ref, batchSize);
+  randomWgtDatas();
+  clearWgtDiffs();
+  clearBotDiffs();
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+
+  if (parameters_[DNN].empty()) {
+    // has no parameters
+    return;
+  }
+
+  // After running some iterations, the mkldnn weights have been stored in
+  // the dnnLayer, and we can also get the mkldnn weight parameter header
+  // format. The weight parameter should always be index 0 (and bias index 1).
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  int dnnWgtFmt = parameters_[DNN][0]->getHeaderFormat();
+  int refWgtFmt = parameters_[REF][0]->getHeaderFormat();
+  if (dnnWgtFmt == refWgtFmt) {
+    // weight formats are equal, so no need to check more
+    return;
+  }
+
+  // then save the weights and restart again
+  vector<VectorPtr> dnnWgts, refWgts;
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  saveWgt(parameters_[DNN], dnnWgts);
+  saveWgt(parameters_[REF], refWgts);
+
+  // restart again with dnn weight format
+  reset(dnn, ref, batchSize);
+  // TODO(TJ): should also consider mean and var format when batchnorm ready
+  parameters_[DNN][0]->setHeaderFormat(dnnWgtFmt);
+
+  // restore wgt
+  restoreWgt(dnnWgts, parameters_[DNN]);
+  restoreWgt(refWgts, parameters_[REF]);
+  clearWgtDiffs();
+  clearBotDiffs();
+
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+}
+
+void MKLDNNTester::initArgument(DataIn& data,
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);
+  data.outGrads.resize(iter);
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);
+    data.paraValues.push_back(value);
+  }
+}
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();
+  };
+
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "runing iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save forward result
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      const MatrixPtr& src = outArgs[k].value;
+      MatrixPtr dst =
+          Matrix::create(src->getHeight(), src->getWidth(), false, false);
+      if (typeid(*src) == typeid(MKLDNNMatrix)) {
+        MKLDNNMatrixPtr dnnSrc = std::dynamic_pointer_cast<MKLDNNMatrix>(src);
+        dnnSrc->copyTo(*dst);
+      } else {
+        dst->copyFrom(*src);
+      }
+      out.outValues.push_back(dst);
+    }
+
+    // random backward input
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save param value
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare value index: " << i;
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    VLOG(MKLDNN_TESTS) << "compare param index: " << i;
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runNetTest(const std::string& configPath,
+                              size_t iter,
+                              float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);
+  DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "runing cpu network";
+  getOutResult(configPath, in, outCpu, false, iter);
+  VLOG(MKLDNN_TESTS) << "runing mkldnn network";
+  getOutResult(configPath, in, outDnn, true, iter);
+
+  compareResult(outCpu, outDnn, eps);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
new file mode 100644
index 0000000000..9d61533c0b
--- /dev/null
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "LayerGradUtil.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+
+namespace paddle {
+
+/**
+ * @brief test the functionality of MKLDNN layers and MKLDNN activations
+ * by comparing their results with the paddle reference implementations
+ */
+class MKLDNNTester {
+  enum {
+    DNN = 0,  // MKLDNN layer
+    REF = 1,  // Reference layer
+    NUM = 2,  // Total number of layers
+  };
+
+  struct DataIn {
+    std::vector<std::vector<Argument>> inArgs;
+    std::vector<std::vector<MatrixPtr>> outGrads;
+    std::vector<VectorPtr> paraValues;
+  };
+
+  struct DataOut {
+    std::vector<MatrixPtr> outValues;
+    std::vector<VectorPtr> paraValues;
+  };
+
+protected:
+  std::vector<TestConfig> configs_;
+  vector<string> layerNames_;
+  vector<vector<DataLayerPtr>> dataLayers_;
+  vector<vector<Argument>> datas_;
+  vector<LayerMap> layerMaps_;
+  vector<vector<ParameterPtr>> parameters_;
+  vector<LayerPtr> testLayers_;
+  LayerPtr refLayer_, dnnLayer_;
+
+  /// run some iterations, all the results should pass
+  size_t iter_;
+  /// whether to print out the details
+  bool log_;
+  /// epsilon
+  float eps_;
+  /// input image size, default 1
+  size_t ih_, iw_;
+  /// passType, PASS_TRAIN, PASS_TEST or PASS_GC (Gradient Check pass)
+  PassType passType_;
+
+public:
+  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
+    iter_ = iter;
+    eps_ = epsilon;
+    log_ = false;
+    passType_ = PASS_TRAIN;
+  }
+
+  ~MKLDNNTester() {}
+
+public:
+  void run(const TestConfig& dnn,
+           const TestConfig& ref,
+           size_t batchSize,
+           size_t inputImgH = 1,
+           size_t inputImgW = 1,
+           PassType passType = PASS_TRAIN,
+           bool printDetails = false,
+           size_t iter = 3,
+           float epsilon = 1e-4);
+  static void runNetTest(const std::string& configPath,
+                         size_t iter = 2,
+                         float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 2);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 2);
+
+private:
+  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
+  void setInputImgSize();
+  void runOnce();
+
+  void randomWgtDatas();
+  void randomBotDatas();
+  void randomTopDiffs();
+
+  void checkForward();
+  void checkBackwardData();
+  void checkBackwardWgts();
+
+  // clear a specific layer's buffers; clear all when id equals NUM
+  void clearWgtDiffs(size_t id = NUM);
+  void clearBotDiffs(size_t id = NUM);
+  void clearTopDatas(size_t id = NUM);
+
+  void printTopDatas();
+  void printMatrix(const MatrixPtr& m);
+  void printVector(const VectorPtr& v);
+
+  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
+  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
+
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
+
+  /**
+   * Get delta percent
+   * if many(>failRate) wrong(abs(val-ref)/abs(ref) > thres) points
+   * return the max(diff/ref)
+   * else return sum(abs(diff)) / sum(abs(ref))
+   * The return value should be smaller than eps when passing.
+   */
+  static double getDelta(const real* refer,
+                         const real* value,
+                         size_t len,
+                         const float failRate = 1e-3,
+                         const float thres = 0.1);
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/gserver/tests/concat_slice_a.conf
new file mode 100644
index 0000000000..dccf911089
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_a.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
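+
+# Editor's note (illustrative): proj1's slices (0, 4) and (4, 12) together
+# cover exactly the range (0, 12), and proj2's (1, 5) and (5, 15) cover
+# (1, 15), so this config is expected to compute the same output as
+# concat_slice_b.conf, which uses the single slices (0, 12) and (1, 15);
+# the pair presumably serves as an equivalence check for slice_projection.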
+
diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/gserver/tests/concat_slice_b.conf
new file mode 100644
index 0000000000..29686ef281
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_b.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/gserver/tests/img_conv_cudnn.py
index 3934607fa4..0ea6d6bae6 100644
--- a/paddle/gserver/tests/img_conv_cudnn.py
+++ b/paddle/gserver/tests/img_conv_cudnn.py
@@ -1,17 +1,16 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/gserver/tests/img_conv_exconv.py
index ad5a8ba2bd..c618cdab27 100644
--- a/paddle/gserver/tests/img_conv_exconv.py
+++ b/paddle/gserver/tests/img_conv_exconv.py
@@ -1,17 +1,16 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/mkldnn_branch_net.conf b/paddle/gserver/tests/mkldnn_branch_net.conf
new file mode 100644
index 0000000000..8d5146abb0
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branch_net.conf
@@ -0,0 +1,142 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+def two_conv(input, group_name):
+  out1 = img_conv_layer(input=input,
+              name=group_name+'_conv1_',
+              filter_size=1,
+              num_filters=channels,
+              padding=0,
+              shared_biases=True,
+              act=ReluActivation())
+
+  out2 = img_conv_layer(input=input,
+              name=group_name+'_conv2_',
+              filter_size=3,
+              num_filters=channels,
+              padding=1,
+              shared_biases=True,
+              act=ReluActivation())
+  return out1, out2
+
+def two_conv_bn(input, group_name):
+  out1, out2 = two_conv(input, group_name)
+  out1 = batch_norm_layer(input=out1,
+              name=group_name+'_bn1_',
+              use_global_stats=False,
+              act=ReluActivation())
+
+  out2 = batch_norm_layer(input=out2,
+              name=group_name+'_bn2_',
+              use_global_stats=False,
+              act=ReluActivation())
+  return out1, out2
+
+def two_conv_pool(input, group_name):
+  out1, out2 = two_conv(input, group_name)
+  out1 = img_pool_layer(input=out1,
+              name=group_name+'_pool1_',
+              pool_size=3,
+              stride=2,
+              padding=0,
+              pool_type=MaxPooling())
+
+  out2 = img_pool_layer(input=out2,
+              name=group_name+'_pool2_',
+              pool_size=5,
+              stride=2,
+              padding=1,
+              pool_type=MaxPooling())
+  return out1, out2
+
+def two_fc(input, group_name):
+  out1 = fc_layer(input=input,
+            name=group_name+'_fc1_',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+
+  out2 = fc_layer(input=input,
+            name=group_name+'_fc2_',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+  return out1, out2
+
+data = data_layer(name="input", size=channels*16*16)
+
+tmp = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1, a2 = two_conv(tmp, 'conv_branch')
+tmp = addto_layer(input=[a1, a2],
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+b1, b2 = two_conv_pool(tmp, 'pool_branch')
+tmp = concat_layer(input=[b1, b2])
+
+tmp = img_pool_layer(input=tmp,
+            num_channels=channels*2,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            stride=2,
+            shared_biases=True,
+            act=LinearActivation(),
+            bias_attr=False)
+
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
+            act=ReluActivation())
+
+c1, c2 = two_conv_bn(tmp, 'bn_branch')
+tmp = addto_layer(input=[c1, c2],
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = fc_layer(input=tmp, size=channels,
+            bias_attr=True,
+            act=ReluActivation())
+
+d1, d2 = two_fc(tmp, 'fc_branch')
+tmp = addto_layer(input=[d1, d2])
+
+out = fc_layer(input=tmp, size=10,
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+outputs(out)
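
The branch net above merges parallel branches in two different ways: addto_layer requires its inputs to share a shape and sums them elementwise, while concat_layer stacks them along the feature dimension, which is why the pooling layer that follows the concat declares num_channels=channels*2. A minimal NumPy sketch of the difference, assuming exactly those semantics:

    import numpy as np

    b1 = np.random.rand(16, 8)  # batch of 16, 8 features from branch 1
    b2 = np.random.rand(16, 8)  # same shape from branch 2

    added = b1 + b2                             # addto_layer analogue
    merged = np.concatenate([b1, b2], axis=1)   # concat_layer analogue

    assert added.shape == (16, 8)    # shape preserved
    assert merged.shape == (16, 16)  # feature dimension doubled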
diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf
new file mode 100644
index 0000000000..0e9d6b31fa
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -0,0 +1,66 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)
+
+data = data_layer(name="input", size=channels*16*16)
+
+tmp = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=1,
+            padding=0,
+            pool_type=AvgPooling())
+
+tmp = img_conv_layer(input=tmp,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation(),
+            bias_attr=False)
+
+tmp = batch_norm_layer(input=tmp,
+            use_global_stats=False,
+            act=ReluActivation())
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
+
+tmp = fc_layer(input=tmp,
+            size=channels,
+            bias_attr=False,
+            act=ReluActivation())
+
+out = fc_layer(input=tmp,
+            size=10,
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+outputs(out)
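
img_cmrnorm_layer is a cross-map (local) response normalization. Assuming the textbook LRN form b_c = a_c / (1 + scale * sum over the channel window of a^2) ** power, with a window of `size` channels centred on c (the exact constant handling inside PaddlePaddle's kernel may differ), a NumPy sketch:

    import numpy as np

    def cmr_norm(a, size=5, scale=0.0001, power=0.75):
        # a: (channels, height, width); normalize each channel by a window
        # of neighbouring channels at every spatial position.
        c = a.shape[0]
        half = size // 2
        out = np.empty_like(a)
        for i in range(c):
            lo, hi = max(0, i - half), min(c, i + half + 1)
            denom = (1.0 + scale * np.sum(a[lo:hi] ** 2, axis=0)) ** power
            out[i] = a[i] / denom
        return out

    x = np.random.rand(8, 4, 4)
    assert cmr_norm(x).shape == x.shape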
diff --git a/paddle/gserver/tests/proto_files.txt b/paddle/gserver/tests/proto_files.txt
deleted file mode 100644
index 691b38c794..0000000000
--- a/paddle/gserver/tests/proto_files.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin
-./test_ProtoDataProvider/data2.bin
diff --git a/paddle/gserver/tests/proto_files_compressed.txt b/paddle/gserver/tests/proto_files_compressed.txt
deleted file mode 100644
index 7413c81e18..0000000000
--- a/paddle/gserver/tests/proto_files_compressed.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-./test_ProtoDataProvider/data1.bin.gz
-./test_ProtoDataProvider/data2.bin.gz
diff --git a/paddle/gserver/tests/pyDataProvider.py b/paddle/gserver/tests/pyDataProvider.py
index 7235a23943..d2ad5888b5 100644
--- a/paddle/gserver/tests/pyDataProvider.py
+++ b/paddle/gserver/tests/pyDataProvider.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import numpy
 import struct
 import traceback
diff --git a/paddle/gserver/tests/rnn_data_provider.py b/paddle/gserver/tests/rnn_data_provider.py
index 3afd45c72f..063a4127e5 100644
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from paddle.trainer.PyDataProvider2 import *
 
 # Note that each config should have an independent provider
@@ -95,3 +94,22 @@ def process_unequalength_seq(settings, file_name):
         words1 = reduce(lambda x, y: x + y, d[0])
         words2 = reduce(lambda x, y: x + y, d[1])
         yield words1, words2, d[2]
+
+
+###########################################################
+data3 = [
+    [[[1, 2], [4, 5, 2]], [1, 2], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
+]
+
+
+# Used by sequence_rnn_matched_inputs.py and sequence_rnn_mixed_inputs.py
+@provider(
+    input_types=[
+        integer_value_sub_sequence(10), integer_value_sequence(10),
+        integer_value(2)
+    ],
+    should_shuffle=False)
+def process_mixed(settings, file_name):
+    for d in data3:
+        yield d
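
For readers unfamiliar with the PyDataProvider2 conventions used here: each sample yielded by process_mixed pairs a sub-sequence input (a list of lists of word ids), a flat sequence, and an integer label, matching the three input_types in the decorator. A stand-alone sketch of what the generator emits, with no PaddlePaddle dependency:

    data3 = [
        [[[1, 2], [4, 5, 2]], [1, 2], 0],
        [[[0, 2], [2, 5], [0, 1, 2]], [2, 3, 0], 1],
    ]

    def process_mixed(data):
        # one (sub_sequence, sequence, label) tuple per sample
        for sub_seq, seq, label in data:
            yield sub_seq, seq, label

    for sub_seq, seq, label in process_mixed(data3):
        print(len(sub_seq), "sub-sequences,", len(seq), "words, label", label)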
diff --git a/paddle/gserver/tests/sequenceGen.py b/paddle/gserver/tests/sequenceGen.py
index fd725727c0..04a1732d61 100644
--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import os
 import sys
 
diff --git a/paddle/gserver/tests/sequence_lstm.conf b/paddle/gserver/tests/sequence_lstm.conf
new file mode 100644
index 0000000000..f49a827f22
--- /dev/null
+++ b/paddle/gserver/tests/sequence_lstm.conf
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 256
+label_dim = 3
+sparse_update = get_config_arg("sparse_update", bool, False)
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data,
+    size=word_dim,
+    param_attr=ParamAttr(sparse_update=sparse_update))
+
+with mixed_layer(size=hidden_dim * 4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+
+lstm = lstmemory(
+    input=lstm_input,
+    act=TanhActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=TanhActivation())
+
+lstm_last = last_seq(input=lstm)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
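
The hidden_dim * 4 in the mixed_layer above is not arbitrary: lstmemory expects its projected input to pack the four LSTM pre-activations (the cell candidate plus the input, forget, and output gates). A self-contained NumPy sketch of one step under that packing; the gate ordering and the placement of the recurrent weights are assumptions here, not PaddlePaddle's internals:

    import numpy as np

    def sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def lstm_step(packed, c_prev, h_prev, W_h):
        # packed: projected input of size 4*hidden (the mixed_layer output);
        # W_h: recurrent weights of shape (hidden, 4*hidden).
        z = packed + h_prev @ W_h
        a, i, f, o = np.split(z, 4)           # assumed gate ordering
        c = sigmoid(f) * c_prev + sigmoid(i) * np.tanh(a)  # state_act = tanh
        h = sigmoid(o) * np.tanh(c)                        # act = tanh
        return c, h

    hidden = 256
    c, h = np.zeros(hidden), np.zeros(hidden)
    W_h = 0.01 * np.random.randn(hidden, 4 * hidden)
    for _ in range(3):                        # a 3-step sequence
        packed = np.random.randn(4 * hidden)  # full_matrix_projection output
        c, h = lstm_step(packed, c, h, W_h)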
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
index ad14a2c927..afdacfffd7 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                         test_list=None,
                         module='rnn_data_provider',
-                        obj='process_subseq2')
+                        obj='process_subseq')
 
 
 settings(batch_size=2, learning_rate=0.01)
@@ -57,7 +57,7 @@ def outer_step(wid, x):
     last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
 
     # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it, and will report error: In hierachical RNN, all out 
+    # does not handle it, and will report an error: In hierarchical RNN, all out
     # links should be from sequences now.
     return inner_rnn_output
 
diff --git a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
index 7303d08804..aeaaa221f9 100644
--- a/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
@@ -1,18 +1,16 @@
-# edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from paddle.trainer_config_helpers import *
 
 ######################## data source ################################
diff --git a/paddle/gserver/tests/sequence_recurrent.py b/paddle/gserver/tests/sequence_recurrent.py
new file mode 100644
index 0000000000..8786a5465d
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent to sequence_recurrent_group.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
diff --git a/paddle/gserver/tests/sequence_recurrent_group.py b/paddle/gserver/tests/sequence_recurrent_group.py
new file mode 100644
index 0000000000..8b5a3d4983
--- /dev/null
+++ b/paddle/gserver/tests/sequence_recurrent_group.py
@@ -0,0 +1,68 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
+dict_file = dict()
+for line_count, line in enumerate(open(dict_path, "r")):
+    dict_file[line.strip()] = line_count
+
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/train.list',
+    test_list=None,
+    module='sequenceGen',
+    obj='process',
+    args={"dict_file": dict_file})
+
+settings(batch_size=5)
+######################## network configure ################################
+dict_dim = len(open(dict_path, 'r').readlines())
+word_dim = 128
+hidden_dim = 128
+label_dim = 3
+
+# This config is designed to be equivalent to sequence_recurrent.py
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(
+    input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
+
+
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    with mixed_layer(
+            name="rnn_state",
+            size=hidden_dim,
+            bias_attr=False,
+            act=SoftmaxActivation()) as out:
+        out += identity_projection(input=y)
+        out += full_matrix_projection(
+            input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
+    return out
+
+
+recurrent = recurrent_group(name="rnn", step=step, input=emb)
+
+recurrent_last = last_seq(input=recurrent)
+
+with mixed_layer(
+        size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
+    output += full_matrix_projection(input=recurrent_last)
+
+outputs(
+    classification_cost(
+        input=output, label=data_layer(
+            name="label", size=1)))
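
sequence_recurrent.py and sequence_recurrent_group.py are intended to compute the same network, once with the fused recurrent_layer and once with a hand-written recurrent_group step; reusing the parameter name "___recurrent_layer_0__" presumably makes the group version share the very weight matrix the fused layer would create. Reading the step function, the recurrence is h_t = softmax(x_t + h_{t-1} W) with no bias; a NumPy sketch of that reading:

    import numpy as np

    def softmax(x):
        e = np.exp(x - x.max())
        return e / e.sum()

    def recurrent(xs, W):
        # h_t = softmax(x_t + h_{t-1} @ W), with h_0 = 0
        h = np.zeros(W.shape[0])
        hs = []
        for x in xs:
            h = softmax(x + h @ W)
            hs.append(h)
        return hs

    dim = 128
    W = 0.01 * np.random.randn(dim, dim)
    xs = np.random.randn(5, dim)   # a 5-step embedded sequence
    hs = recurrent(xs, W)
    last = hs[-1]                  # what last_seq(...) would select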
diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
new file mode 100644
index 0000000000..0c55f2cf9d
--- /dev/null
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -0,0 +1,84 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+subseq = embedding_layer(input=data1, size=word_dim)
+seq = embedding_layer(input=data2, size=word_dim)
+nonseq = embedding_layer(input=label, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_mixed_inputs.py
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(subseq, seq, nonseq):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner', input=[subseq, seq, nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[
+        subseq, expand_layer(
+            seq, expand_as=subseq,
+            expand_level=ExpandLevel.FROM_SEQUENCE), expand_layer(
+                nonseq,
+                expand_as=subseq,
+                expand_level=ExpandLevel.FROM_NO_SEQUENCE),
+        StaticInput(encoding)
+    ])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
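
The two expand_layer calls above align inputs of different nesting levels before they enter the outer recurrent_group: FROM_SEQUENCE repeats each per-sequence row across that sequence's timesteps, and FROM_NO_SEQUENCE repeats a single per-sample row across all of them. A small sketch of that broadcasting, assuming expand_as simply tiles rows according to the target layout:

    import numpy as np

    def expand_from_no_sequence(row, n_steps):
        # repeat one per-sample vector across all timesteps
        return np.tile(row, (n_steps, 1))

    def expand_from_sequence(rows, seq_lens):
        # repeat row i across the seq_lens[i] timesteps of sequence i
        return np.concatenate(
            [np.tile(r, (n, 1)) for r, n in zip(rows, seq_lens)])

    word_dim = 2
    seq_lens = [2, 3, 2]                       # three sub-sequences
    nonseq = np.ones(word_dim)                 # one row for the sample
    seq = np.arange(3.0 * word_dim).reshape(3, word_dim)  # one row each

    assert expand_from_no_sequence(nonseq, sum(seq_lens)).shape == (7, 2)
    assert expand_from_sequence(seq, seq_lens).shape == (7, 2)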
diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
new file mode 100644
index 0000000000..22b376b91a
--- /dev/null
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(
+    train_list='gserver/tests/Sequence/dummy.list',
+    test_list=None,
+    module='rnn_data_provider',
+    obj='process_mixed')
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 2
+hidden_dim = 2
+label_dim = 2
+
+data1 = data_layer(name="word1", size=dict_dim)
+data2 = data_layer(name="word2", size=dict_dim)
+label = data_layer(name="label", size=label_dim)
+
+encoding = embedding_layer(input=data2, size=word_dim)
+
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn_matched_inputs.py
+def outer_step(subseq, seq, nonseq, encoding):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+
+    def inner_step(data1, data2, label):
+        inner_mem = memory(
+            name="inner_rnn_state", size=hidden_dim, boot_layer=outer_mem)
+
+        subseq = embedding_layer(input=data1, size=word_dim)
+        seq = embedding_layer(input=data2, size=word_dim)
+        nonseq = embedding_layer(input=label, size=word_dim)
+
+        print_layer(input=[data1, seq, label, inner_mem])
+        out = fc_layer(
+            input=[subseq, seq, nonseq, inner_mem],
+            size=hidden_dim,
+            act=TanhActivation(),
+            bias_attr=True,
+            name='inner_rnn_state')
+        return out
+
+    decoder = recurrent_group(
+        step=inner_step, name='inner',
+        input=[subseq, StaticInput(seq), nonseq])
+    last = last_seq(name="outer_rnn_state", input=decoder)
+    context = simple_attention(
+        encoded_sequence=encoding, encoded_proj=encoding, decoder_state=last)
+    return context
+
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[data1, data2, StaticInput(label), StaticInput(encoding)])
+
+rep = last_seq(input=out)
+prob = fc_layer(
+    size=label_dim, input=rep, act=SoftmaxActivation(), bias_attr=True)
+
+outputs(classification_cost(input=prob, label=label))
diff --git a/paddle/gserver/tests/sequence_rnn_multi_input.conf b/paddle/gserver/tests/sequence_rnn_multi_input.conf
index 40d0317415..9fae974f30 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                         test_list=None,
                         module='rnn_data_provider',
-                        obj='process_seq2')
+                        obj='process_seq')
 
 
 settings(batch_size=2, learning_rate=0.01)
diff --git a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
index 786a0c6d78..3ce87490bb 100644
--- a/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
@@ -1,17 +1,16 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 from paddle.trainer_config_helpers import *
 
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index b201ba8a5a..f4c2a07c44 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
@@ -57,6 +56,39 @@ TEST(Activation, activation) {
   }
 }
 
+void testSequenceSoftmaxAct(bool hasSubseq) {
+  LOG(INFO) << "test activation: sequence softmax";
+
+  const size_t size = 1;
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("addto");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sequence_softmax");
+  config.inputDefs.push_back(
+      {hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA,
+       "layer_0",
+       1,
+       0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sequence_softmax",
+                  100,
+                  /* trans= */ false,
+                  useGpu,
+                  /* useWeight */ true);
+  }
+}
+
+TEST(SequenceSoftmaxActivation, activation) {
+  for (auto hasSubseq : {false, true}) {
+    LOG(INFO) << "hasSubseq = " << hasSubseq;
+    testSequenceSoftmaxAct(hasSubseq);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
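
The new test covers the sequence_softmax activation, which normalizes over the steps of each sequence rather than over a feature vector, which is why the layer size is 1 and the inputs are (possibly nested) sequence data. A NumPy sketch written from the test's setup, using Paddle's flat-values-plus-start-positions encoding:

    import numpy as np

    def sequence_softmax(values, seq_start_pos):
        # softmax over each [start, end) slice of a flat value array
        out = np.empty_like(values)
        for s, e in zip(seq_start_pos[:-1], seq_start_pos[1:]):
            seg = values[s:e]
            ex = np.exp(seg - seg.max())
            out[s:e] = ex / ex.sum()
        return out

    vals = np.random.rand(7)
    starts = [0, 3, 7]             # two sequences of lengths 3 and 4
    sm = sequence_softmax(vals, starts)
    assert np.isclose(sm[0:3].sum(), 1.0) and np.isclose(sm[3:7].sum(), 1.0)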
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index d07299bfe3..41116f4809 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -17,11 +17,11 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
+#include "paddle/cuda/include/hl_batch_norm.h"
+#include "paddle/math/tests/TensorCheck.h"
 #include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
@@ -118,6 +118,74 @@ TEST(Layer, batchNorm) {
   CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
+#ifdef PADDLE_WITH_CUDA
+void batchNormInference(int n, int c, int h, int w) {
+  MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudaOut = std::make_shared<GpuMatrix>(n, c * h * w);
+  MatrixPtr cudnnCheck = std::make_shared<CpuMatrix>(n, c * h * w);
+  MatrixPtr cudaCheck = std::make_shared<CpuMatrix>(n, c * h * w);
+  input->randomizeUniform();
+  cudnnOut->zeroMem();
+  cudaOut->zeroMem();
+
+  MatrixPtr scale = std::make_shared<GpuMatrix>(1, c);
+  scale->randomizeUniform();
+  MatrixPtr bias = std::make_shared<GpuMatrix>(1, c);
+  bias->randomizeUniform();
+
+  MatrixPtr movingMean = std::make_shared<GpuMatrix>(1, c);
+  movingMean->randomizeUniform();
+
+  MatrixPtr movingVar = std::make_shared<GpuMatrix>(1, c);
+  movingVar->randomizeUniform();
+  movingVar->clip(0.01, 50);
+
+  hl_tensor_descriptor ioDesc;
+  hl_tensor_descriptor bnDesc;
+  hl_create_tensor_descriptor(&ioDesc);
+  hl_create_tensor_descriptor(&bnDesc);
+  hl_tensor_reshape(ioDesc, n, c, h, w);
+  hl_tensor_reshape(bnDesc, 1, c, 1, 1);
+
+  double EPS = 1E-5;
+  hl_batch_norm_forward_inference(ioDesc,
+                                  input->getData(),
+                                  ioDesc,
+                                  cudnnOut->getData(),
+                                  bnDesc,
+                                  scale->getData(),
+                                  bias->getData(),
+                                  movingMean->getData(),
+                                  movingVar->getData(),
+                                  EPS);
+
+  hl_batch_norm_cuda_inference(input->getData(),
+                               cudaOut->getData(),
+                               scale->getData(),
+                               bias->getData(),
+                               movingMean->getData(),
+                               movingVar->getData(),
+                               EPS,
+                               n,
+                               c,
+                               h,
+                               w);
+
+  cudnnCheck->copyFrom(*cudnnOut);
+  cudaCheck->copyFrom(*cudaOut);
+  autotest::TensorCheckErr(*cudnnCheck, *cudaCheck);
+
+  hl_destroy_tensor_descriptor(ioDesc);
+  hl_destroy_tensor_descriptor(bnDesc);
+}
+
+TEST(BatchNorm, Inference) {
+  batchNormInference(33, 267, 1, 1);
+  batchNormInference(19, 105, 4, 4);
+}
+#endif
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
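
Both GPU paths compared above implement the standard batch-norm inference transform, y = scale * (x - moving_mean) / sqrt(moving_var + eps) + bias, applied per channel with the running statistics (the clip on movingVar keeps the denominator well away from zero). A reference NumPy version of that formula:

    import numpy as np

    def batch_norm_inference(x, scale, bias, mean, var, eps=1e-5):
        # x: (n, c, h, w); scale/bias/mean/var: (c,)
        shape = (1, -1, 1, 1)                  # broadcast over n, h, w
        inv_std = 1.0 / np.sqrt(var.reshape(shape) + eps)
        return ((x - mean.reshape(shape)) * inv_std * scale.reshape(shape)
                + bias.reshape(shape))

    n, c, h, w = 19, 105, 4, 4                 # mirrors the second test case
    x = np.random.rand(n, c, h, w)
    y = batch_norm_inference(x, np.random.rand(c), np.random.rand(c),
                             np.random.rand(c), 0.01 + np.random.rand(c))
    assert y.shape == x.shape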
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
index df14449291..f010066ebc 100644
--- a/paddle/gserver/tests/test_CRFLayerGrad.cpp
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/LinearChainCRF.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/gserver/tests/test_CompareSparse.cpp
similarity index 98%
rename from paddle/trainer/tests/test_CompareSparse.cpp
rename to paddle/gserver/tests/test_CompareSparse.cpp
index a7000eb77e..c6e07650fc 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/gserver/tests/test_CompareSparse.cpp
@@ -22,8 +22,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-static const string& configFile1 =
-    "trainer/tests/sample_trainer_config_qb_rnn.conf";
+static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
 
 DECLARE_bool(use_gpu);
 DECLARE_string(config);
@@ -174,7 +173,7 @@ TEST(compareSparse, multiGradientMachine) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
@@ -198,7 +197,7 @@ TEST(compareSparse, NeuralNetwork) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/gserver/tests/test_CompareTwoNets.cpp
similarity index 95%
rename from paddle/trainer/tests/test_CompareTwoNets.cpp
rename to paddle/gserver/tests/test_CompareTwoNets.cpp
index 94f65e545d..801d960756 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/gserver/tests/test_CompareTwoNets.cpp
@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
 DECLARE_string(config);
 DECLARE_string(nics);
 
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
 DEFINE_bool(need_high_accuracy,
             false,
             "whether need to run in double accuracy");
@@ -42,6 +40,10 @@ DEFINE_double(
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_int32(seed);
 
+static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
+static const string& config_file_b =
+    "gserver/tests/sequence_recurrent_group.py";
+
 struct ComData {
   vector<Argument> outArgs;
   vector<ParameterPtr> parameters;
@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
   DataBatch dataBatch;
   int32_t batchSize = trainer.getConfig().opt_config().batch_size();
 
+  trainer.getDataProvider()->reset();
   trainer.getDataProvider()->setSkipShuffle();
   trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
 
@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
 
 TEST(Trainer, create) {
   ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
+  calcGradient(dataA, config_file_a);
   LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
 
   ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
+  calcGradient(dataB, config_file_b);
   LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
 
   compareGradient(dataA, dataB);
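
With the command-line config flags replaced by the two hard-coded, supposedly equivalent configs, the test reduces to running both networks on the same batch and bounding their gradient disagreement; given the need_high_accuracy flag, that bound is presumably a relative one. A sketch of one common relative-difference check (the actual compareGradient may differ in detail):

    def max_relative_diff(xs, ys, eps=1e-12):
        # largest symmetric relative difference between two gradient vectors
        return max(abs(x - y) / (abs(x) + abs(y) + eps)
                   for x, y in zip(xs, ys))

    grad_a = [0.123, -0.456, 0.789]
    grad_b = [0.123, -0.456, 0.789001]
    assert max_relative_diff(grad_a, grad_b) < 1e-3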
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index 40bb1e2d73..5f2f966547 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -17,9 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index 54b72375b7..8634355b52 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -17,9 +17,7 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
@@ -118,7 +116,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
 }
 
 TEST(Layer, convParaUnified) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   MatrixPtr input, resultCpu, resultGpu;
 
   /// TEST1 for conv ///
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
new file mode 100644
index 0000000000..477638426f
--- /dev/null
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -0,0 +1,352 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include <sstream>
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const size_t MAX_SEQ_NUM = 23;
+const size_t MAX_SEQ_LEN = 50;
+const size_t MAX_BEAM_SIZE = 27;
+
+const size_t SEED = (size_t)(time(NULL));
+
+struct SingleBeamExpansion {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<real> candidateScores;
+
+  // TODO(caoying): store this into Argument.ids
+  vector<real> selectedIndices;
+
+  vector<int> groundTruth;
+  vector<size_t> inBeam;
+  vector<int> rowIdxInBeam;
+  vector<int> colIdxInBeam;
+
+  void resetGroundTruth(size_t n) {
+    groundTruth.clear();
+    groundTruth.resize(n, -1);
+
+    inBeam.clear();
+    inBeam.resize(n, 0);
+
+    rowIdxInBeam.clear();
+    rowIdxInBeam.resize(n, -1);
+
+    colIdxInBeam.clear();
+    colIdxInBeam.resize(n, -1);
+  }
+};
+
+inline float randFloat() {
+  return static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
+}
+
+void genRand(real* numbers, size_t n) {
+  default_random_engine generator;
+  uniform_real_distribution<real> distribution(0.0, 1.0);
+  for (size_t i = 0; i < n; ++i) numbers[i] = distribution(generator);
+}
+
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+void genCandidateScores(bool hasSubseq,
+                        size_t beamSize,
+                        SingleBeamExpansion& prevBeam,
+                        SingleBeamExpansion& curBeam) {
+  vector<int>& seqStartPos = curBeam.seqStartPos;
+  seqStartPos.resize(1, 0);
+  vector<int>& subSeqStartPos = curBeam.subSeqStartPos;
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  if (prevBeam.selectedIndices.size()) {
+    if (prevBeam.subSeqStartPos.size() > 1) {
+      int seqIdx = 1;
+      // samples in previous beam are nested sequences.
+      for (size_t i = 1; i < prevBeam.subSeqStartPos.size(); ++i) {
+        for (size_t j = 0; j < beamSize; ++j) {
+          if (prevBeam.selectedIndices[(i - 1) * beamSize + j] == -1.) break;
+          subSeqStartPos.push_back(1 + (rand() % MAX_SEQ_LEN) +
+                                   subSeqStartPos.back());
+        }
+        if (prevBeam.seqStartPos[seqIdx] == prevBeam.subSeqStartPos[i]) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          seqIdx++;
+        }
+      }
+    } else {
+      for (size_t i = 0; i <= prevBeam.selectedIndices.size(); ++i) {
+        if (i && i % beamSize == 0) {
+          seqStartPos.push_back(subSeqStartPos.back());
+          if (i == prevBeam.selectedIndices.size()) break;
+        }
+        if (prevBeam.selectedIndices[i] == -1.) continue;
+        subSeqStartPos.push_back(subSeqStartPos.back() +
+                                 (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  } else {
+    // the first beam expansion
+    int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int i = 0; i < seqNum; ++i) {
+      if (hasSubseq) {
+        for (size_t j = 0; j < 1 + (rand() % MAX_SEQ_NUM); ++j)
+          subSeqStartPos.push_back(subSeqStartPos.back() +
+                                   (1 + (rand() % MAX_SEQ_LEN)));
+        seqStartPos.push_back(subSeqStartPos.back());
+      } else {
+        seqStartPos.push_back(seqStartPos.back() +
+                              (1 + (rand() % MAX_SEQ_LEN)));
+      }
+    }
+  }
+
+  size_t totalSeqNum = hasSubseq ? subSeqStartPos.back() : seqStartPos.back();
+  curBeam.candidateScores.resize(totalSeqNum, 0.);
+  genRand(curBeam.candidateScores.data(), totalSeqNum);
+}
+
+void genSelectedIndices(size_t beamSize,
+                        vector<int>& seqStartPos,
+                        vector<real>& selectedIndices) {
+  size_t selectedIdsCount = beamSize * (seqStartPos.size() - 1);
+  selectedIndices.resize(selectedIdsCount, -1.);
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    int n = min(seqLen, static_cast<int>(beamSize));
+    vector<real> ids = randSampling(seqLen, n);
+    memcpy(selectedIndices.data() + i * beamSize,
+           ids.data(),
+           sizeof(real) * ids.size());
+  }
+}
+
+void genGroundTruth(vector<SingleBeamExpansion>& beamExpansions,
+                    size_t beamSize) {
+  SingleBeamExpansion& beam = beamExpansions[1];
+  size_t seqNum = beam.seqStartPos.size() - 1;
+  for (size_t i = 2; i < beamExpansions.size(); ++i)
+    CHECK_EQ(seqNum, beamExpansions[i].seqStartPos.size() - 1);
+
+  srand(SEED);
+
+  // initialize the first beam.
+  beam.resetGroundTruth(seqNum);
+  for (size_t i = 0; i < seqNum; ++i) {
+    if (randFloat() > 0.5) {
+      /*
+       * Force the randomly generated label to fall inside the beam with
+       * probability 0.5. Otherwise, when the sequence is relatively long
+       * and the beam is relatively small, the gold sequence falls off
+       * the beam in the first search.
+       */
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      beam.colIdxInBeam[i] =
+          rand() % count_if(begPos, begPos + beamSize, [](const real& val) {
+            return val != -1.;
+          });
+      beam.groundTruth[i] =
+          beam.selectedIndices[i * beamSize + beam.colIdxInBeam[i]];
+      beam.inBeam[i] = 1;
+    } else {
+      int label = rand() % (beam.seqStartPos[i + 1] - beam.seqStartPos[i]);
+      beam.groundTruth[i] = label;
+
+      real* begPos = beam.selectedIndices.data() + i * beamSize;
+      real* endPos = begPos + beamSize;
+      real* lblPos = find(begPos, endPos, real(label));
+      if (lblPos != endPos) {
+        beam.inBeam[i] = 1;
+        beam.colIdxInBeam[i] = lblPos - begPos;
+      }
+    }
+    beam.rowIdxInBeam[i] = i;
+  }
+
+  // iterate over each beam expansion
+  for (size_t i = 2; i < beamExpansions.size(); ++i) {
+    SingleBeamExpansion& curBeam = beamExpansions[i];
+    SingleBeamExpansion& prevBeam = beamExpansions[i - 1];
+    curBeam.resetGroundTruth(seqNum);
+
+    // iterate over each sequence
+    for (size_t j = 0; j < seqNum; ++j) {
+      if (!prevBeam.inBeam[j]) continue;
+
+      // The gold sequence fell inside the beam in the previous search.
+      real* begPos = prevBeam.selectedIndices.data();
+      int offset =
+          prevBeam.rowIdxInBeam[j] * beamSize + prevBeam.colIdxInBeam[j];
+      curBeam.rowIdxInBeam[j] = count_if(
+          begPos, begPos + offset, [](const real& val) { return val != -1.; });
+
+      if (randFloat() > 0.5) {
+        // Force the randomly generated label into the beam with probability 0.5.
+
+        real* start =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        int n = rand() % count_if(start, start + beamSize, [](const real& val) {
+                  return val != -1.;
+                });
+        curBeam.colIdxInBeam[j] = n;
+        curBeam.groundTruth[j] = *(start + n);
+        curBeam.inBeam[j] = 1;
+      } else {
+        CHECK_LE((size_t)curBeam.rowIdxInBeam[j] + 1,
+                 curBeam.subSeqStartPos.size() - 1);
+        int start = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j]];
+        int end = curBeam.subSeqStartPos[curBeam.rowIdxInBeam[j] + 1];
+        CHECK_GT(size_t(end), size_t(start));
+        int label = rand() % (end - start);
+
+        curBeam.groundTruth[j] = label;
+        real* findBeg =
+            curBeam.selectedIndices.data() + curBeam.rowIdxInBeam[j] * beamSize;
+        real* lblPos =
+            find(findBeg, findBeg + beamSize, static_cast<real>(label));
+        if (lblPos != (findBeg + beamSize)) {
+          curBeam.inBeam[j] = 1;
+          curBeam.colIdxInBeam[j] = lblPos - findBeg;
+        }
+      }
+    }
+  }
+}
+
+void genOneBeam(size_t beamSize,
+                bool hasSubseq,
+                SingleBeamExpansion& prevBeam,
+                SingleBeamExpansion& curBeam) {
+  genCandidateScores(hasSubseq, beamSize, prevBeam, curBeam);
+  genSelectedIndices(beamSize,
+                     hasSubseq ? curBeam.subSeqStartPos : curBeam.seqStartPos,
+                     curBeam.selectedIndices);
+}
+
+void genRandomBeamExpansion(size_t expansionCount,
+                            size_t beamSize,
+                            vector<SingleBeamExpansion>& beamExpansions) {
+  beamExpansions.clear();
+  beamExpansions.resize(expansionCount + 1);
+
+  // beamExpansions[0] is reserved.
+  for (size_t i = 1; i <= expansionCount; ++i)
+    genOneBeam(beamSize, bool(i - 1), beamExpansions[i - 1], beamExpansions[i]);
+  genGroundTruth(beamExpansions, beamSize);
+}
+
+void testCrossEntropyOverBeam(bool useGpu,
+                              size_t beamSize,
+                              vector<SingleBeamExpansion>& beams) {
+  TestConfig config;
+  config.layerConfig.set_type("cross_entropy_over_beam");
+
+  size_t seqNum = 0;
+  for (size_t i = 1; i < beams.size(); ++i) {
+    const SingleBeamExpansion& beam = beams[i];
+    // create scores for all the candidates
+    MatrixPtr candidateScorePtr =
+        Matrix::create(beam.candidateScores.size(), 1, false, false);
+    candidateScorePtr->copyFrom(beam.candidateScores.data(),
+                                beam.candidateScores.size());
+
+    ostringstream paramName;
+    paramName << "candidate_scores_" << i;
+
+    if (beam.subSeqStartPos.size() > 1) {
+      seqNum = beam.subSeqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos,
+                                  beam.subSeqStartPos});
+    } else {
+      seqNum = beam.seqStartPos.size() - 1;
+      config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                  paramName.str(),
+                                  candidateScorePtr,
+                                  beam.seqStartPos});
+    }
+    config.layerConfig.add_inputs();
+
+    // create indices for the selected candidates
+    MatrixPtr selectedCandidates =
+        Matrix::create(seqNum, beamSize, false, false);
+    selectedCandidates->copyFrom(beam.selectedIndices.data(),
+                                 beam.selectedIndices.size());
+    paramName.clear();
+    paramName << "selected_candidates_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), selectedCandidates});
+    config.layerConfig.add_inputs();
+
+    // create the ground truth
+    paramName.clear();
+    paramName << "label_" << i;
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, paramName.str(), beam.groundTruth});
+    config.layerConfig.add_inputs();
+  }
+
+  testLayerGrad(
+      config, "cross_entropy_over_beam", seqNum, false, useGpu, false);
+}
+
+TEST(Layer, CrossEntropyOverBeam) {
+  LOG(INFO) << "SEED = " << SEED;
+  const size_t beamSize = 1 + rand() % MAX_BEAM_SIZE;
+  LOG(INFO) << "beamSize = " << beamSize;
+
+  // TODO(caoying): test with random beam expansions.
+  const size_t expansionCount = 3;
+  vector<SingleBeamExpansion> beams;
+  genRandomBeamExpansion(expansionCount, beamSize, beams);
+
+  for (bool useGpu : {false, true})
+    testCrossEntropyOverBeam(useGpu, beamSize, beams);
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(SEED);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
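
Much of the generator above manipulates Paddle's flat sequence encoding: a batch is one contiguous array of values plus a seqStartPos vector of boundaries, with a second subSeqStartPos level for nested sequences. A stand-alone sketch of decoding that layout, which may make resetGroundTruth and the index bookkeeping easier to follow:

    def split_by_starts(flat, starts):
        # slice a flat batch into sequences given boundary offsets
        return [flat[s:e] for s, e in zip(starts[:-1], starts[1:])]

    flat = list(range(10))
    seq_start_pos = [0, 4, 10]             # two top-level sequences
    sub_seq_start_pos = [0, 2, 4, 7, 10]   # four nested sub-sequences

    print(split_by_starts(flat, seq_start_pos))
    # [[0, 1, 2, 3], [4, 5, 6, 7, 8, 9]]
    print(split_by_starts(flat, sub_seq_start_pos))
    # [[0, 1], [2, 3], [4, 5, 6], [7, 8, 9]]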
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
new file mode 100644
index 0000000000..dc39c97a87
--- /dev/null
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -0,0 +1,194 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Do one forward pass of the detection_output layer and check whether its
+// output matches the given result.
+void doOneDetectionOutputTest(MatrixPtr& inputLoc,
+                              MatrixPtr& inputConf,
+                              MatrixPtr& inputPriorBox,
+                              size_t feature_map_width,
+                              size_t feature_map_height,
+                              real nms_threshold,
+                              bool use_gpu,
+                              MatrixPtr& result) {
+  // Setting up the detection output layer
+  TestConfig configt;
+  configt.layerConfig.set_type("detection_output");
+  LayerInputConfig* input = configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+  configt.layerConfig.add_inputs();
+
+  DetectionOutputConfig* detOutput = input->mutable_detection_output_conf();
+  detOutput->set_width(feature_map_width);
+  detOutput->set_height(feature_map_height);
+  detOutput->set_nms_threshold(nms_threshold);
+  detOutput->set_num_classes(2);
+  detOutput->set_nms_top_k(20);
+  detOutput->set_keep_top_k(10);
+  detOutput->set_background_id(0);
+  detOutput->set_confidence_threshold(0.01);
+  detOutput->set_input_num(1);
+  configt.inputDefs.push_back({INPUT_DATA_TARGET, "priorbox", 32, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_loc", 16, 0});
+  configt.inputDefs.push_back({INPUT_DATA, "input_conf", 8, 0});
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      configt, &dataLayers, &datas, &layerMap, "priorbox", 1, false, use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputPriorBox);
+  dataLayers[1]->getOutputValue()->copyFrom(*inputLoc);
+  dataLayers[2]->getOutputValue()->copyFrom(*inputConf);
+
+  // test layer initialize
+  bool store_FLAGS_use_gpu = FLAGS_use_gpu;
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr detectionOutputLayer;
+  initTestLayer(configt, &layerMap, &parameters, &detectionOutputLayer);
+  FLAGS_use_gpu = store_FLAGS_use_gpu;
+  detectionOutputLayer->forward(PASS_GC);
+  checkMatrixEqual(detectionOutputLayer->getOutputValue(), result);
+}
+
+TEST(Layer, detectionOutputLayerFwd) {
+  bool useGpu = false;
+  // CPU case 1.
+  MatrixPtr inputLoc;
+  MatrixPtr inputConf;
+  MatrixPtr inputPriorBox;
+  MatrixPtr result, result2, result3, result4;
+  real nmsTreshold = 0.01;
+  real inputLocData[] = {0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1,
+                         0.1};
+  real inputConfData[] = {0.1, 0.9, 0.2, 0.8, 0.3, 0.7, 0.4, 0.6};
+  real inputPriorBoxData[] = {0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2,
+                              0.2, 0.2, 0.6, 0.6, 0.1, 0.1, 0.2, 0.2,
+                              0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2,
+                              0.4, 0.4, 0.8, 0.8, 0.1, 0.1, 0.2, 0.2};
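+  // Each prior box spans 8 values: its corner coordinates (xmin, ymin, xmax,
+  // ymax) followed by four variances used to decode the predicted offsets,
+  // matching the priorbox layer's output layout.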
+  real resultData[] = {
+      0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031};
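+  // A detection row is the 7-tuple (image_id, label, confidence, xmin, ymin,
+  // xmax, ymax), hence the 1 x 7 result matrix below.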
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  result = Matrix::create(1, 7, false, useGpu);
+  inputLoc->setData(inputLocData);
+  inputConf->setData(inputConfData);
+  inputPriorBox->setData(inputPriorBoxData);
+  result->setData(resultData);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result);
+
+  // CPU case 2.
+  nmsThreshold = 0.2;
+  result2 = Matrix::create(2, 7, false, useGpu);
+  real resultData2[] = {0,
+                        1,
+                        0.68997443,
+                        0.099959746,
+                        0.099959746,
+                        0.50804031,
+                        0.50804031,
+                        0,
+                        1,
+                        0.59868765,
+                        0.29995975,
+                        0.29995975,
+                        0.70804024,
+                        0.70804024};
+  result2->setData(resultData2);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result2);
+
+#ifdef PADDLE_WITH_CUDA
+  // GPU case 1.
+  useGpu = true;
+  inputLoc = Matrix::create(1, 16, false, useGpu);
+  inputConf = Matrix::create(1, 8, false, useGpu);
+  inputPriorBox = Matrix::create(1, 32, false, useGpu);
+  inputLoc->copyFrom(inputLocData, 16);
+  inputConf->copyFrom(inputConfData, 8);
+  inputPriorBox->copyFrom(inputPriorBoxData, 32);
+
+  nmsThreshold = 0.01;
+  result3 = Matrix::create(1, 7, false, useGpu);
+  result3->copyFrom(resultData, 7);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result3);
+
+  // GPU case 2.
+  nmsThreshold = 0.2;
+  result4 = Matrix::create(2, 7, false, useGpu);
+  result4->copyFrom(resultData2, 14);
+  doOneDetectionOutputTest(inputLoc,
+                           inputConf,
+                           inputPriorBox,
+                           /* feature_map_width */ 1,
+                           /* feature_map_height */ 1,
+                           nmsThreshold,
+                           useGpu,
+                           result4);
+#endif
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 4f5fdbb37c..62a131171f 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
                    string testEvaluatorName,
                    size_t batchSize,
                    bool useGpu) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
@@ -138,6 +138,23 @@ void testEvaluatorAll(TestConfig testConf,
   testEvaluator(testConf, testEvaluatorName, batchSize, false);
 }
 
+TEST(Evaluator, detection_map) {
+  TestConfig config;
+  config.evaluatorConfig.set_type("detection_map");
+  config.evaluatorConfig.set_overlap_threshold(0.5);
+  config.evaluatorConfig.set_background_id(0);
+  config.evaluatorConfig.set_ap_type("Integral");
+
+  config.inputDefs.push_back({INPUT_DATA, "output", 7});
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6});
+  config.evaluatorConfig.set_evaluate_difficult(false);
+  testEvaluatorAll(config, "detection_map", 100);
+
+  config.evaluatorConfig.set_evaluate_difficult(true);
+  testEvaluatorAll(config, "detection_map", 100);
+}
+
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
diff --git a/paddle/gserver/tests/test_Expand.cpp b/paddle/gserver/tests/test_Expand.cpp
new file mode 100644
index 0000000000..d32bf0152f
--- /dev/null
+++ b/paddle/gserver/tests/test_Expand.cpp
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+// Do one forward pass of the expand layer and check whether its output
+// matches the given result. (Only the CPU path is tested currently.)
+void doOneExpandTest(string trans_type,
+                     bool hasSubseq,
+                     bool useGpu,
+                     Argument& input1,
+                     Argument& input2,
+                     Argument& result) {
+  FLAGS_use_gpu = false;
+  // Setting up the expand layer
+  TestConfig config;
+  config.layerConfig.set_type("expand");
+
+  auto inputType1 =
+      trans_type == "non-seq" ? INPUT_DENSE_DIM_DATA : INPUT_SEQUENCE_DATA;
+  config.inputDefs.push_back({inputType1, "layer0", 1, 0});
+  auto inputType2 =
+      hasSubseq ? INPUT_HASSUB_SEQUENCE_DATA : INPUT_SEQUENCE_DATA;
+
+  config.inputDefs.push_back({inputType2, "layer1", 1, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.set_trans_type(trans_type);
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(
+      config, &dataLayers, &datas, &layerMap, "expand", 1, false, useGpu);
+  dataLayers[0]->getOutput() = input1;
+  dataLayers[1]->getOutput() = input2;
+
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr expandLayer;
+  initTestLayer(config, &layerMap, &parameters, &expandLayer);
+  expandLayer->forward(PASS_GC);
+  checkMatrixEqual(expandLayer->getOutputValue(), result.value);
+}
+
+TEST(Layer, ExpandLayerFwd) {
+  bool useGpu = false;
+
+  // Assume batch_size = 3 in all cases.
+
+  // CPU case 1. non-seq expand to seq
+  // input1 = 1,2,3
+  // input2 = [4,5],[6],[7,8,9]
+  // result = [1,1],[2],[3,3,3]
+  Argument input1, input2, result;
+  input1.value = Matrix::create(3, 1, false, useGpu);
+  real input1Data[] = {1, 2, 3};
+  input1.value->setData(input1Data);
+
+  input2.value = Matrix::create(6, 1, false, useGpu);
+  real input2Data[] = {4, 5, 6, 7, 8, 9};
+  input2.value->setData(input2Data);
+  input2.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
+  int input2Seq[] = {0, 2, 3, 6};
+  input2.sequenceStartPositions->copyFrom(input2Seq, 4, useGpu);
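+  // The start positions {0, 2, 3, 6} delimit the three sequences [4,5], [6]
+  // and [7,8,9] of input2.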
+
+  result.value = Matrix::create(6, 1, false, useGpu);
+  real resultData[] = {1, 1, 2, 3, 3, 3};
+  result.value->setData(resultData);
+
+  doOneExpandTest("non-seq", false, useGpu, input1, input2, result);
+
+  // CPU case 2. non-seq expand to sub-seq
+  // NOTE: input1's batch size equals input2's number of sequences here,
+  // i.e., input1 is expanded along input2's sequence structure.
+  // input1 = 1,2,3
+  // input2 = [[4,5]],[[6]],[[7],[8,9]]
+  // result = [[1,1]],[[2]],[[3],[3,3]]
+  input2.subSequenceStartPositions = ICpuGpuVector::create(5, useGpu);
+  int input2SubSeq[] = {0, 2, 3, 4, 6};
+  input2.subSequenceStartPositions->copyFrom(input2SubSeq, 5, useGpu);
+
+  doOneExpandTest("non-seq", true, useGpu, input1, input2, result);
+
+  // CPU case 3. seq expand to sub-seq
+  // input1 = [1,2],[3],[4]
+  // input2 = [[4,5]],[[6]],[[7],[8,9]]
+  // result = [[1,1]],[[2]],[[3],[4,4]]
+  Matrix::resizeOrCreate(input1.value, 4, 1, false, useGpu);
+  real input1Data_case3[] = {1, 2, 3, 4};
+  input1.value->setData(input1Data_case3);
+
+  input1.sequenceStartPositions = ICpuGpuVector::create(4, useGpu);
+  int input1Seq[] = {0, 2, 3, 4};
+  input1.sequenceStartPositions->copyFrom(input1Seq, 4, useGpu);
+
+  real resultData_case3[] = {1, 1, 2, 3, 4, 4};
+  result.value->setData(resultData_case3);
+
+  doOneExpandTest("seq", true, useGpu, input1, input2, result);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
new file mode 100644
index 0000000000..ffe5cfb8db
--- /dev/null
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -0,0 +1,164 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
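+// Returns n distinct integers sampled without replacement from [0, range).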
+vector<int> randSampling(int range, int n) {
+  CHECK_GE(range, n);
+  vector<int> num(range);
+  iota(begin(num), end(num), 0);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  return num;
+}
+
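+// Generates random nested-sequence offsets: for every top-level sequence a
+// random subSeqLen is drawn, and subSeqLen sub-sequences of length subSeqLen
+// each are appended.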
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // generate random start position information
+  int seqNum = 1 + (rand() % maxSeqNum);
+  seqStartPosition.resize(seqNum + 1, 0);
+  subSeqStartPosition.resize(1, 0);
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
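+// For each sequence, randomly picks min(beamSize, seqLen) positions, records
+// them as ground truth, and raises the corresponding scores to 1.0 so that
+// they become the k largest entries.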
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
+  for (size_t i = 0; i < startPos.size() - 1; ++i) {
+    int seqLen = startPos[i + 1] - startPos[i];
+    vector<int> pos =
+        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
+    for (size_t j = 0; j < pos.size(); ++j) {
+      groundTruth[i][j] = pos[j];
+      values[startPos[i] + pos[j]] = 1.;
+    }
+  }
+}
+
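+// The layer is expected to output, per sequence, the positions of its top-k
+// scores; since their order is not fixed, both sides are sorted before the
+// element-wise comparison.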
+void checkLayerOut(vector<vector<int>> groundTruth,
+                   real* layerOut,
+                   size_t beamSize) {
+  for (size_t i = 0; i < groundTruth.size(); ++i) {
+    int begPos = i * beamSize;
+    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
+    sort(begin(tmp), end(tmp));
+    sort(begin(groundTruth[i]), end(groundTruth[i]));
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
+  }
+}
+
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  size_t beamSize = 1 + (rand() % maxBeamSize);
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    for (auto useGpu : mode) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* this parameter is unused when the input is self-defined */,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand((size_t)(time(NULL)));
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index e1e8e7fae7..aab02f1684 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,13 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#ifdef PADDLE_WITH_CUDA
+#include <cudnn.h>
+#endif
 #include <gtest/gtest.h>
 #include <string>
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
@@ -51,7 +53,7 @@ TEST(Operator, dot_mul) {
 TEST(Projection, context) {
   for (auto contextStart : {-5, -3, -1, 0, 3}) {
     for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto batchSize : {1, 2, 5, 20, 50}) {
+      for (auto batchSize : {1, 2, 5, 20}) {
         for (auto trainablePadding : {false, true}) {
           LOG(INFO) << " contextStart=" << contextStart
                     << " contextLength=" << contextLength
@@ -152,6 +154,26 @@ TEST(Projection, identity) {
   }
 }
 
+TEST(Projection, slice) {
+  ProjectionConfig conf;
+  conf.set_type("slice");
+  conf.set_input_size(100);
+  SliceConfig& slice1 = *conf.add_slices();
+  slice1.set_start(10);
+  slice1.set_end(20);
+  SliceConfig& slice2 = *conf.add_slices();
+  slice2.set_start(50);
+  slice2.set_end(70);
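+  // output_size must equal the total sliced width: (20 - 10) + (70 - 50) = 30.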
+  conf.set_output_size(30);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 10,
+                       useGpu);
+  }
+}
+
 TEST(Projection, scaling) {
   ProjectionConfig conf;
   conf.set_type("scaling");
@@ -169,10 +191,16 @@ TEST(Projection, scaling) {
 void testProjectionConv(size_t groups, bool isDeconv) {
   const int NUM_FILTERS = 18;
   const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 4;
+  const int FILTER_SIZE_Y = 2;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
 
+#if CUDNN_VERSION >= 6000
+  const int DILATION = 2;
+#else
+  const int DILATION = 1;
+#endif
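+  // A filter of size k with dilation d covers an effective span of
+  // (k - 1) * d + 1 inputs; the output sizes below use this effective size.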
+
   ProjectionConfig conf;
   if (isDeconv) {
     conf.set_type("convt");
@@ -189,6 +217,8 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(DILATION);
+  conv->set_dilation_y(DILATION);
   conv->set_groups(groups);
   if (isDeconv) {
     conv->set_filter_channels(NUM_FILTERS / conv->groups());
@@ -197,20 +227,35 @@ void testProjectionConv(size_t groups, bool isDeconv) {
   }
   conv->set_img_size(IMAGE_SIZE);
   int output_x = outputSize(conv->img_size(),
-                            conv->filter_size(),
+                            (conv->filter_size() - 1) * DILATION + 1,
                             conv->padding(),
                             conv->stride(),
                             /* caffeMode */ true);
   int output_y = outputSize(conv->img_size(),
-                            conv->filter_size_y(),
+                            (conv->filter_size_y() - 1) * DILATION + 1,
                             conv->padding_y(),
                             conv->stride_y(),
                             /* caffeMode */ true);
   conv->set_output_x(output_x);
   conv->set_output_y(output_y);
+  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+            << "; output_y: " << output_y;
   if (isDeconv) {
+    int deconv_image_x = imageSize(output_x,
+                                   (conv->filter_size() - 1) * DILATION + 1,
+                                   conv->padding(),
+                                   conv->stride(),
+                                   /* caffeMode */ true);
+    int deconv_image_y = imageSize(output_y,
+                                   (conv->filter_size_y() - 1) * DILATION + 1,
+                                   conv->padding_y(),
+                                   conv->stride_y(),
+                                   /* caffeMode */ true);
+
+    LOG(INFO) << " deconv_image_x: " << deconv_image_x
+              << "; deconv_image_y: " << deconv_image_y;
     conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
   } else {
     conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
     conf.set_output_size(output_x * output_y * NUM_FILTERS);
@@ -227,7 +272,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                      true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Projection, conv) {
   /// test ConvProjection
   testProjectionConv(1, false);
@@ -347,6 +392,55 @@ TEST(Layer, CosSimVecMatLayer) {
   }
 }
 
+void testDepthwiseConvLayer(const string& type, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 32;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(32);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(16);
+  conv->set_filter_channels(conv->channels() / conv->groups());
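+  // groups == channels yields filter_channels = 1, i.e. each filter sees a
+  // single input channel (a depthwise convolution).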
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
+}
+
+TEST(Layer, depthwiseConvLayer) {
+  //  'depthwise_conv' is a special case of 'exconv' whose
+  //  group number equals the number of input channels.
+  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
+#endif
+}
+
 void testConvLayer(const string& type, bool trans, bool useGpu) {
   TestConfig config;
   config.biasSize = 16;
@@ -355,27 +449,38 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 384, 288});
+  int dilation = 2;
+  if (type == "cudnn_conv") {
+#if CUDNN_VERSION >= 6000
+    dilation = 2;
+#else
+    dilation = 1;
+#endif
+  }
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 768, 192});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
+  conv->set_filter_size_y(2);
   conv->set_channels(3);
   conv->set_padding(0);
   conv->set_padding_y(1);
   conv->set_stride(2);
   conv->set_stride_y(2);
+  conv->set_dilation(dilation);
+  conv->set_dilation_y(dilation);
   conv->set_groups(1);
   conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(16);
-  conv->set_img_size_y(8);
+  conv->set_img_size_y(16);
   conv->set_output_x(outputSize(conv->img_size(),
-                                conv->filter_size(),
+                                (conv->filter_size() - 1) * dilation + 1,
                                 conv->padding(),
                                 conv->stride(),
                                 /* caffeMode */ true));
   conv->set_output_y(outputSize(conv->img_size_y(),
-                                conv->filter_size_y(),
+                                (conv->filter_size_y() - 1) * dilation + 1,
                                 conv->padding_y(),
                                 conv->stride_y(),
                                 /* caffeMode */ true));
@@ -389,7 +494,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, convLayer) {
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -434,7 +539,7 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -493,16 +598,17 @@ TEST(Layer, maxoutLayer) {
     testLayerGrad(config, "maxout", 10, false, useGpu);
   }
 }
+
 void testFcLayer(string format, size_t nnz) {
   TestConfig config;
-  config.biasSize = 4096;
+  config.biasSize = 1024;
   config.layerConfig.set_type("fc");
-  config.layerConfig.set_size(4096);
+  config.layerConfig.set_size(1024);
   config.layerConfig.set_active_type("sigmoid");
   config.layerConfig.set_drop_rate(0.1);
 
   config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+      {INPUT_DATA, "layer_0", 2048, nnz, ParaSparse(format)});
   config.layerConfig.add_inputs();
 
   LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
@@ -519,9 +625,9 @@ void testFcLayer(string format, size_t nnz) {
 }
 
 TEST(Layer, fcLayer) {
-  testFcLayer("", 4096 * 4096 * 2);
-  testFcLayer("csc", 4096 * 40);
-  testFcLayer("csr", 4096 * 40);
+  testFcLayer("", 1024 * 1024 * 2);
+  testFcLayer("csc", 1024 * 10);
+  testFcLayer("csr", 1024 * 10);
 }
 
 TEST(Layer, SelectiveFullyConnectedLayer) {
@@ -547,7 +653,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
                 /* trans= */ false,
                 /* useGup= */ false,
                 false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testLayerGrad(config,
                 "selective_fc",
                 100,
@@ -590,12 +696,13 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
   config.layerConfig.add_inputs();
 
-  // Not support GPU now
-  testLayerGrad(config,
-                "hsigmoid",
-                100,
-                /* trans */ false, /* useGpu */
-                false);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "hsigmoid",
+                  100,
+                  /* trans */ false,
+                  /* useGpu */ useGpu);
+  }
 }
 
 TEST(Layer, multi_cross) {
@@ -759,9 +866,27 @@ TEST(Layer, square_error_weighted) {
   }
 }
 
+TEST(Layer, huber_regression_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("huber_regression");
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    for (auto delta : {1, 3, 5}) {
+      config.layerConfig.set_delta(delta);
+      testLayerGrad(config, "huber_regression", 100, /* trans */ false, useGpu);
+    }
+  }
+}
+
 TEST(Layer, huber_two_class) {
   TestConfig config;
-  config.layerConfig.set_type("huber");
+  config.layerConfig.set_type("huber_classification");
   config.biasSize = 0;
 
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
@@ -770,7 +895,7 @@ TEST(Layer, huber_two_class) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "huber", 100, /* trans */ false, useGpu);
+    testLayerGrad(config, "huber_two_class", 100, /* trans */ false, useGpu);
   }
 }
 
@@ -845,8 +970,12 @@ void testDegradeLayer(bool hasSubseq,
 
 TEST(Layer, MaxLayer) {
   testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
+  testDegradeLayer(false,
+                   "max",
+                   "non-seq",
+                   5);  // seq max to a shorten seq, stride window = 5
+  testDegradeLayer(true, "max", "non-seq", -1);  // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);      // hasSubseq max to seq
 }
 
 TEST(Layer, SequenceLastInstanceLayer) {
@@ -862,14 +991,22 @@ TEST(Layer, SequenceLastInstanceLayer) {
                    "seqlastins",
                    "non-seq",
                    -1);  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(
-      true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
+  testDegradeLayer(true,
+                   "seqlastins",
+                   "seq",
+                   -1);  // hasSubseq seqlastins to seq
 }
 
 TEST(Layer, AverageLayer) {
   testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
-  testDegradeLayer(
-      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
+  testDegradeLayer(false,
+                   "average",
+                   "non-seq",
+                   5);  // seq average to a shorten seq, stride window = 5
+  testDegradeLayer(true,
+                   "average",
+                   "non-seq",
+                   -1);                          // hasSubseq average to non-seq
   testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
 }
 
@@ -965,6 +1102,21 @@ TEST(Layer, InterpolationLayer) {
   }
 }
 
+TEST(Layer, DotProdLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("dot_prod");
+  config.layerConfig.set_size(1);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "dot_prod", 10, false, useGpu);
+  }
+}
+
 TEST(Layer, OuterProdLayer) {
   TestConfig config;
   config.layerConfig.set_type("out_prod");
@@ -1078,7 +1230,10 @@ void setPoolConfig(TestConfig* config,
   pool->set_output_y(oh);
 }
 
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+void testPoolLayer(const string& poolType,
+                   bool trans,
+                   bool useGpu,
+                   bool excludeMode = true) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
   LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1086,6 +1241,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
 
   pool->set_img_size(14);
   pool->set_img_size_y(14);
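+  // exclude_mode presumably selects the average-pooling divisor: only the
+  // in-image elements when true, the full window (padding included) when
+  // false.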
+  pool->set_exclude_mode(excludeMode);
   setPoolConfig(&config, pool, poolType);
   config.layerConfig.set_size(pool->output_x() * pool->output_y() *
                               pool->channels());
@@ -1093,7 +1249,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
   testLayerGrad(config, "pool", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
@@ -1117,15 +1273,97 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 
 TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2("cudnn-avg-incl-pad-pool",
+                 /* trans= */ false,
+                 /* useGpu= */ true);
+  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void setPool3DConfig(TestConfig* config,
+                     PoolConfig* pool,
+                     const string& poolType) {
+  // filter size
+  const int NUM_FILTERS = 16;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+  const int CHANNELS = 16;
+
+  (*config).biasSize = 0;
+  (*config).layerConfig.set_type("pool3d");
+  (*config).layerConfig.set_num_filters(NUM_FILTERS);
+
+  int kw = FILTER_SIZE, kh = FILTER_SIZE_Y, kd = FILTER_SIZE_Z;
+  int pw = 0, ph = 0, pd = 0;
+  int sw = 2, sh = 2, sd = 2;
+
+  pool->set_pool_type(poolType);
+  pool->set_pool_type("avg");
+  pool->set_channels(CHANNELS);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_size_z(kd);
+  pool->set_padding(0);
+  pool->set_padding_y(0);
+  pool->set_padding_z(0);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+  pool->set_stride_z(sd);
+  pool->set_start(0);
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  int od = outputSize(pool->img_size_z(), kd, pd, sd, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+  pool->set_output_z(od);
+}
+
+void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 11664, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  pool->set_img_size(IMAGE_SIZE);
+  pool->set_img_size_y(IMAGE_SIZE_Y);
+  pool->set_img_size_z(IMAGE_SIZE_Z);
+
+  setPool3DConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  testLayerGrad(config, "pool3d", 100, trans, useGpu);
+}
+
+TEST(Layer, Pool3DLayer) {
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
+  testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
 
@@ -1239,7 +1477,8 @@ TEST(Layer, RecurrentLayer) {
     for (auto reversed : {false, true}) {
       config.layerConfig.set_reversed(reversed);
       config.testState = !reversed;
-      testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
     }
   }
 }
@@ -1261,7 +1500,8 @@ TEST(Layer, LstmLayer) {
     for (auto reversed : {false, true}) {
       config.layerConfig.set_reversed(reversed);
       config.testState = !reversed;
-      testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+      testLayerGrad(
+          config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
     }
   }
   for (auto useGpu : {true}) {
@@ -1509,7 +1749,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, BatchNormalizationLayer) {
   testBatchNormLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNormLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNormLayer("cudnn_batch_norm", false, true);
@@ -1517,6 +1757,55 @@ TEST(Layer, BatchNormalizationLayer) {
 #endif
 }
 
+void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
+  TestConfig config;
+  const int CHANNELS = 10;
+  const int IMG_SIZE = 16;
+  const int IMG_SIZE_Y = 8;
+  const int IMG_SIZE_Z = 8;
+  size_t size = CHANNELS * IMG_SIZE * IMG_SIZE_Y * IMG_SIZE_Z;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.biasSize = CHANNELS;
+  config.inputDefs.push_back({INPUT_DATA,
+                              "layer_0",
+                              /* dim= */ size,
+                              /* paraSize= */ CHANNELS});
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+  config.inputDefs.push_back({INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  config.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+  img_conf->set_img_size_y(IMG_SIZE_Y);
+  img_conf->set_img_size_z(IMG_SIZE_Z);
+
+  testLayerGrad(config,
+                "batch_norm",
+                64,
+                /* trans= */ trans,
+                useGpu,
+                /* useWeight */ true);
+}
+
+TEST(Layer, testBatchNorm3DLayer) {
+  testBatchNorm3DLayer("batch_norm", false, false);
+#ifdef PADDLE_WITH_CUDA
+  testBatchNorm3DLayer("batch_norm", false, true);
+  if (hl_get_cudnn_lib_version() >= int(4000)) {
+    testBatchNorm3DLayer("cudnn_batch_norm", false, true);
+  }
+#endif
+}
+
 void testConvOperator(bool isDeconv) {
   TestConfig config;
   const int NUM_FILTERS = 16;
@@ -1598,12 +1887,15 @@ TEST(Layer, FeatureMapExpandLayer) {
                               /* paraSize= */ 0});
   config.layerConfig.add_inputs();
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config,
-                  "featmap_expand",
-                  /*batch_size*/ 100,
-                  /* trans= */ false,
-                  useGpu,
-                  /* useWeight */ true);
+    for (auto asRowVec : {false, true}) {
+      config.layerConfig.set_user_arg(asRowVec ? "as_row_vec" : "as_col_vec");
+      testLayerGrad(config,
+                    "featmap_expand",
+                    /*batch_size*/ 100,
+                    /* trans= */ false,
+                    useGpu,
+                    /* useWeight */ true);
+    }
   }
 }
 
@@ -1658,6 +1950,8 @@ TEST(Layer, PadLayer) {
 
 TEST(Layer, CrossChannelNormLayer) {
   TestConfig config;
+  config.paramInitialMean = 1.;
+  config.paramInitialStd = 0.;
   config.layerConfig.set_type("norm");
   config.layerConfig.set_size(100);
   LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1671,7 +1965,7 @@ TEST(Layer, CrossChannelNormLayer) {
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false);
   }
 }
 
@@ -1689,10 +1983,74 @@ TEST(Layer, smooth_l1) {
   }
 }
 
+TEST(Layer, multibox_loss) {
+  TestConfig config;
+  config.layerConfig.set_type("multibox_loss");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  MultiBoxLossConfig* multiboxLoss = input->mutable_multibox_loss_conf();
+  multiboxLoss->set_num_classes(21);
+  multiboxLoss->set_input_num(1);
+  multiboxLoss->set_overlap_threshold(0.5);
+  multiboxLoss->set_neg_pos_ratio(3);
+  multiboxLoss->set_neg_overlap(0.5);
+  multiboxLoss->set_background_id(0);
+  multiboxLoss->set_height(3);
+  multiboxLoss->set_width(3);
+
+  size_t gtNum = 1;
+  MatrixPtr labelValue = Matrix::create(gtNum, 6, false, false);
+  labelValue->randomizeUniform();
+  labelValue->add(-0.5);
+  labelValue->sigmoid(*labelValue);
+  real* labelData = labelValue->getData();
+  size_t labelWidth = labelValue->getWidth();
+  for (size_t i = 0; i < gtNum; ++i) {
+    *(labelData + i * labelWidth) = std::rand() % 20 + 1;
+    *(labelData + i * labelWidth + 1) = 0.400259;
+    *(labelData + i * labelWidth + 2) = 0.377857;
+    *(labelData + i * labelWidth + 3) = 0.525712;
+    *(labelData + i * labelWidth + 4) = 0.519368;
+  }
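+  // Each ground-truth row is presumably (label, xmin, ymin, xmax, ymax,
+  // difficult); only the first five fields are set explicitly here.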
+  vector<int> seqStartPositions(gtNum + 1, 0);
+  for (size_t i = 1; i <= gtNum; ++i) {
+    seqStartPositions[i] = i;
+  }
+
+  // Ensure at least one matched bbox
+  MatrixPtr priorValue = Matrix::create(1, 72, false, false);
+  priorValue->randomizeUniform();
+  priorValue->add(-0.5);
+  priorValue->sigmoid(*priorValue);
+  real* priorData = priorValue->getData();
+  *(priorData) = 0.424811;
+  *(priorData + 1) = 0.397059;
+  *(priorData + 2) = 0.538905;
+  *(priorData + 3) = 0.447091;
+  *(priorData + 4) = 0.425720;
+  *(priorData + 5) = 0.515228;
+  *(priorData + 6) = 0.519452;
+  *(priorData + 7) = 0.591065;
+
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "priorbox", priorValue, {}});
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "label", labelValue, seqStartPositions});
+  config.inputDefs.push_back({INPUT_DATA, "locPred", 36, 0});
+  config.inputDefs.push_back({INPUT_DATA, "confPred", 189, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "multibox_loss", 1, false, useGpu, false);
+  }
+}
+
 TEST(Layer, TransLayer) {
   TestConfig config;
   const int height = 128;
-  const int width = 1028;
+  const int width = 256;
   config.layerConfig.set_type("trans");
   config.layerConfig.set_size(width);
 
@@ -1705,6 +2063,466 @@ TEST(Layer, TransLayer) {
   }
 }
 
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, CropLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  config.layerConfig.set_axis(2);
+  config.layerConfig.add_offset(0);
+  config.layerConfig.add_offset(0);
+
+  // config input_1
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
+  input = config.layerConfig.add_inputs();
+  img = input->mutable_image_conf();
+  img->set_channels(2);
+  img->set_img_size(8);
+
+  // config crop layer
+  config.layerConfig.set_type("crop");
+  config.layerConfig.set_name("cropLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "crop", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, roi_pool) {
+  TestConfig config;
+  config.layerConfig.set_type("roi_pool");
+  config.biasSize = 0;
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ROIPoolConfig* roiPoolConf = input->mutable_roi_pool_conf();
+  roiPoolConf->set_pooled_width(7);
+  roiPoolConf->set_pooled_height(7);
+  roiPoolConf->set_spatial_scale(1. / 16);
+  roiPoolConf->set_width(14);
+  roiPoolConf->set_height(14);
+
+  const size_t roiNum = 10;
+  const size_t roiDim = 10;
+  const size_t batchSize = 5;
+  MatrixPtr roiValue = Matrix::create(roiNum, roiDim, false, false);
+  roiValue->zeroMem();
+  real* roiData = roiValue->getData();
+  for (size_t i = 0; i < roiNum; ++i) {
+    roiData[i * roiDim + 0] = std::rand() % batchSize;
+    roiData[i * roiDim + 1] = std::rand() % 224;  // xMin
+    roiData[i * roiDim + 2] = std::rand() % 224;  // yMin
+    size_t xMin = static_cast<size_t>(roiData[i * roiDim + 1]);
+    size_t yMin = static_cast<size_t>(roiData[i * roiDim + 2]);
+    roiData[i * roiDim + 3] = xMin + std::rand() % (224 - xMin);  // xMax
+    roiData[i * roiDim + 4] = yMin + std::rand() % (224 - yMin);  // yMax
+  }
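+  // Each ROI row holds (batch_index, xMin, yMin, xMax, yMax); the remaining
+  // columns of the 10-wide matrix stay zeroed.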
+
+  config.inputDefs.push_back({INPUT_DATA, "input", 3 * 14 * 14, {}});
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "rois", roiValue, {}});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "roi_pool", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, SwitchOrderLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  img->set_img_size_y(16);
+
+  ReshapeConfig* reshape = config.layerConfig.mutable_reshape_conf();
+  reshape->add_height_axis(0);
+  reshape->add_height_axis(1);
+  reshape->add_height_axis(2);
+  reshape->add_width_axis(3);
+
+  // config the switch_order layer
+  config.layerConfig.set_type("switch_order");
+  config.layerConfig.set_name("switchOrderLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "switch_order", 100, false, useGpu, true);
+  }
+}
+
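+// Overload of randSampling returning sorted real-valued indices; range is
+// expected to hold an integral value so it can size the candidate vector.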
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(static_cast<size_t>(range));
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
+TEST(Layer, SubNestedSequenceLayer) {
+  // The layer size is not crucial for this layer,
+  // so use a small size in this unittest.
+  const int layerSize = 4;
+
+  const int maxSeqNum = 50;
+  const int maxSeqLen = 50;
+  const int maxBeamSize = 32;
+
+  srand((size_t)(time(NULL)));
+  int beamSize = 1 + (rand() % maxBeamSize);
+
+  TestConfig config;
+  config.layerConfig.set_type("sub_nested_seq");
+  config.layerConfig.set_name("sub_nested_seq_layer");
+  config.layerConfig.set_size(layerSize);
+
+  int seqNum = 1 + (rand() % maxSeqNum);
+
+  // sequence information for the first input, it is a nested sequence
+  vector<int> seqStartPos(seqNum + 1, 0);
+  vector<int> subSeqStartPos(1, 0);
+
+  // selected indices
+  MatrixPtr selectedIndices = Matrix::create(seqNum, beamSize, false, false);
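+  // Fill the matrix with -1 first; slots that are not overwritten below mark
+  // sequences having fewer than beamSize sub-sequences.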
+  selectedIndices->one();
+  selectedIndices->mulScalar(-1.);
+  real* indicesData = selectedIndices->getData();
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqNum; ++j) {
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % maxSeqLen)));
+    }
+    vector<real> selSeqs =
+        randSampling(static_cast<real>(subSeqNum), min(beamSize, subSeqNum));
+    memcpy(indicesData + (i * beamSize),
+           selSeqs.data(),
+           selSeqs.size() * sizeof(real));
+    seqStartPos[i + 1] = subSeqStartPos.back();
+  }
+
+  MatrixPtr seqInputPtr =
+      Matrix::create(seqStartPos.back(), layerSize, false, false);
+  seqInputPtr->randomizeUniform();
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                              "nested_seq_input",
+                              seqInputPtr,
+                              seqStartPos,
+                              subSeqStartPos});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SELF_DEFINE_DATA, "selected_indices", selectedIndices});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config,
+                  "sub_nested_seq",
+                  /* batchSize */ seqNum,
+                  /* trans */ false,
+                  /* useGpu*/ useGpu,
+                  /* useWeight */ false);
+  }
+}
+
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
+void test3DConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 9;
+  const int IMAGE_SIZE_Y = 9;
+  const int IMAGE_SIZE_Z = 9;
+
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  // Setting up the conv3D layer
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_img_size(IMAGE_SIZE);
+  conv->set_img_size_y(IMAGE_SIZE_Y);
+  conv->set_img_size_z(IMAGE_SIZE_Z);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /*  caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /*  caffeMode */ true));
+  conv->set_output_z(outputSize(conv->img_size_z(),
+                                conv->filter_size_z(),
+                                conv->padding_z(),
+                                conv->stride_z(),
+                                /*  caffeMode */ true));
+
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              conv->output_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "conv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "conv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DConvLayer) {
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
+  // filter size
+  const int NUM_FILTERS = 6;
+  const int FILTER_SIZE = 3;
+  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Z = 3;
+
+  // input image
+  const int CHANNELS = 3;
+  const int IMAGE_SIZE = 4;
+  const int IMAGE_SIZE_Y = 6;
+  const int IMAGE_SIZE_Z = 6;
+
+  // Setting up the 3D conv-transpose layer
+  TestConfig config;
+  config.biasSize = NUM_FILTERS;
+  config.layerConfig.set_type("deconv3d");
+  config.layerConfig.set_num_filters(NUM_FILTERS);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+
+  conv->set_channels(CHANNELS);
+  conv->set_filter_size(FILTER_SIZE);
+  conv->set_filter_size_y(FILTER_SIZE_Y);
+  conv->set_filter_size_z(FILTER_SIZE_Z);
+  conv->set_padding(0);
+  conv->set_padding_y(0);
+  conv->set_padding_z(0);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_stride_z(2);
+  conv->set_output_x(IMAGE_SIZE);
+  conv->set_output_y(IMAGE_SIZE_Y);
+  conv->set_output_z(IMAGE_SIZE_Z);
+
+  conv->set_img_size(imageSize(conv->output_x(),
+                               conv->filter_size(),
+                               conv->padding(),
+                               conv->stride(),
+                               true));
+  conv->set_img_size_y(imageSize(conv->output_y(),
+                                 conv->filter_size_y(),
+                                 conv->padding_y(),
+                                 conv->stride_y(),
+                                 true));
+  conv->set_img_size_z(imageSize(conv->output_z(),
+                                 conv->filter_size_z(),
+                                 conv->padding_z(),
+                                 conv->stride_z(),
+                                 true));
+  config.layerConfig.set_size(conv->img_size() * conv->img_size_y() *
+                              conv->img_size_z() * NUM_FILTERS);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  config.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       CHANNELS * IMAGE_SIZE * IMAGE_SIZE_Y * IMAGE_SIZE_Z,
+       conv->filter_channels() * FILTER_SIZE * FILTER_SIZE_Y * FILTER_SIZE_Z *
+           NUM_FILTERS});
+
+  testLayerGrad(config, "deconv3D", 10, trans, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "deconv3D", 2, trans, useGpu, true, 0.02);
+}
+
+TEST(Layer, test3DDeConvLayer) {
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
+#ifdef PADDLE_WITH_CUDA
+  test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
+#endif
+}
+
+TEST(Layer, ScaleShiftLayer) {
+  // FIXME: Disable ScaleShiftLayer because it is not stable.
+  // https://github.com/PaddlePaddle/Paddle/issues/7781
+  return;
+  //  const size_t batchSize = 16;
+  //  const size_t size = 32;
+  //  TestConfig config;
+  //  config.layerConfig.set_type("scale_shift");
+  //  config.layerConfig.set_size(size);
+  //  config.biasSize = 1;
+  //  config.inputDefs.push_back(
+  //      {INPUT_DATA, "input", /* dim= */ size, /* paraSize= */ 1});
+  //  config.layerConfig.add_inputs();
+  //  for (auto useGpu : {false, true}) {
+  //    testLayerGrad(config, "scale_shift", batchSize, false, useGpu, false);
+  //  }
+}
+
+TEST(Layer, ScaleSubRegionLayer) {
+  const size_t batchSize = 64;
+  const size_t size = 4096;
+  TestConfig config;
+  config.layerConfig.set_type("scale_sub_region");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  MatrixPtr indicesV = Matrix::create(batchSize, 6, false, false);
+  auto* data = indicesV->getData();
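+  // Each row apparently selects the sub-region to scale as (channel_begin,
+  // channel_end, row_begin, row_end, col_begin, col_end).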
+  for (size_t i = 0; i < batchSize; ++i) {
+    data[i * 6] = 2;
+    data[i * 6 + 1] = 4;
+    data[i * 6 + 2] = 16;
+    data[i * 6 + 3] = 32;
+    data[i * 6 + 4] = 16;
+    data[i * 6 + 5] = 32;
+  }
+  config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "indices", indicesV, {}});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ScaleSubRegionConfig* scaleSubRegionConf =
+      input->mutable_scale_sub_region_conf();
+  ImageConfig* imgConf = scaleSubRegionConf->mutable_image_conf();
+  imgConf->set_img_size(32);
+  imgConf->set_img_size_y(32);
+  imgConf->set_channels(4);
+  scaleSubRegionConf->set_value(2.0);
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "scale_sub_region", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, L2DistanceLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("l2_distance");
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+
+  const size_t input_dim = 27;
+  const size_t batch_size = 11;
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
+  }
+}
+
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
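Here factor_size is the latent dimension k of the factorization machine, and the 1280-element parameter block holds one k-vector per input dimension (128 x 10). Assuming the standard formulation (Rendle, 2010), the scalar output such a layer produces is the pairwise-interaction term

    \hat{y}(x) = \sum_{i<j} \langle v_i, v_j \rangle x_i x_j
               = \tfrac{1}{2} \sum_{f=1}^{k} \Big[ \Big( \sum_i v_{i,f} x_i \Big)^2 - \sum_i v_{i,f}^2 x_i^2 \Big], \quad v_i \in \mathbb{R}^k,

whose right-hand side is the O(kn) form implementations typically use.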
+
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
new file mode 100644
index 0000000000..ad1dbc3ee2
--- /dev/null
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -0,0 +1,448 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/utils/PythonUtil.h>
+#include <string>
+#include <vector>
+#include "MKLDNNTester.h"
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/activations/MKLDNNActivation.h"
+#include "paddle/math/MathUtils.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
+
+#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)         \
+  MKLDNNTester tester;                                        \
+  for (auto bs : {DESC.bs, 1}) {                              \
+    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw); \
+  }
+
+#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
+  TestConfig ref = DNN_CONFIG;                            \
+  ref.layerConfig.set_type(REF_TYPE);                     \
+  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
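RUN_MKLDNN_TEST_LAYER copies the MKL-DNN config, swaps in the reference layer type, and compares the pair at both the configured batch size and batch size 1. A call such as RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm) expands to roughly:

    TestConfig ref = dnnConfig;
    ref.layerConfig.set_type("fc");
    MKLDNNTester tester;
    for (auto bs : {pm.bs, 1}) {
      tester.run(dnnConfig, ref, bs, pm.ih, pm.iw);
    }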
+
+struct testFcDesc {
+  int bs;
+  int ic;
+  int ih, iw;  // oh == ow == 1
+  int oc;
+};
+
+static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_size(pm.oc);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
+  cfg.layerConfig.add_inputs();
+}
+
+void testFcLayer(const testFcDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNFcConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
+  }
+}
+
+TEST(MKLDNNLayer, FcLayer) {
+  /* bs, ic, ih, iw, oc */
+  testFcLayer({2, 2, 1, 1, 3});
+  testFcLayer({3, 7, 1, 1, 19});
+  testFcLayer({8, 16, 13, 13, 32});
+  testFcLayer({4, 12, 13, 13, 18});
+  testFcLayer({2, 64, 16, 16, 32});
+  testFcLayer({15, 3, 16, 16, 6});
+}
+
+struct testConvDesc {
+  int bs, gp;
+  int ic, ih, iw;
+  int oc, oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+  int dh, dw;
+};
+
+static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_num_filters(pm.oc);
+  cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
+  cfg.layerConfig.set_shared_biases(true);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.fh * pm.fw / pm.gp)});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_groups(pm.gp);
+  conv->set_img_size(pm.iw);
+  conv->set_img_size_y(pm.ih);
+  conv->set_output_x(pm.ow);
+  conv->set_output_y(pm.oh);
+  conv->set_filter_size(pm.fw);
+  conv->set_filter_size_y(pm.fh);
+  conv->set_channels(pm.ic);
+  conv->set_padding(pm.pw);
+  conv->set_padding_y(pm.ph);
+  conv->set_stride(pm.sw);
+  conv->set_stride_y(pm.sh);
+  conv->set_dilation(pm.dw);
+  conv->set_dilation_y(pm.dh);
+  conv->set_caffe_mode(true);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  CHECK_EQ(conv->filter_channels() * pm.gp, conv->channels())
+      << "it is indivisible";
+
+  int fh = (pm.fh - 1) * pm.dh + 1;
+  int fw = (pm.fw - 1) * pm.dw + 1;
+  int ow = outputSize(pm.iw, fw, pm.pw, pm.sw, true);
+  int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
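The two CHECK_EQs validate the descriptor against the caffeMode=true output-size formula, with dilation folded into an effective filter size. As a worked example, the {16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1} case from the test below gives

    fh' = (4 - 1) * 1 + 1 = 4,  oh = (42 + 2 * 3 - 4) / 2 + 1 = 23
    fw' = (5 - 1) * 1 + 1 = 5,  ow = (31 + 2 * 2 - 5) / 3 + 1 = 11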
+
+void testConvLayer(const testConvDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNConvConfig(dnnConfig, pm);
+  for (auto biasSize : {pm.oc, 0}) {
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
+  }
+}
+
+TEST(MKLDNNLayer, ConvLayer) {
+  /* bs, gp, ic, ih, iw, oc, oh, ow, fh, fw, ph, pw, sh, sw, dh, dw */
+  testConvLayer({2, 1, 3, 32, 32, 16, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 16, 16, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({3, 1, 16, 32, 32, 3, 32, 32, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({8, 1, 16, 18, 18, 32, 18, 18, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({16, 1, 1, 42, 31, 32, 23, 11, 4, 5, 3, 2, 2, 3, 1, 1});
+  testConvLayer({2, 1, 8, 16, 16, 8, 8, 8, 3, 3, 1, 1, 2, 2, 1, 1});
+  testConvLayer({3, 1, 8, 13, 13, 8, 7, 7, 3, 3, 1, 1, 2, 2, 1, 1});
+  // with groups
+  testConvLayer({2, 2, 4, 5, 5, 8, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({2, 3, 3, 5, 5, 3, 5, 5, 3, 3, 1, 1, 1, 1, 1, 1});
+  testConvLayer({4, 4, 16, 3, 3, 16, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1});
+}
+
+struct testPoolDesc {
+  int bs, ic;  // input and output channel counts are the same
+  int ih, iw;
+  int oh, ow;
+  int fh, fw;
+  int ph, pw;
+  int sh, sw;
+};
+
+static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+  pool->set_pool_type("avg-projection");
+  pool->set_channels(pm.ic);
+  pool->set_img_size(pm.iw);
+  pool->set_img_size_y(pm.ih);
+  pool->set_output_x(pm.ow);
+  pool->set_output_y(pm.oh);
+  pool->set_size_x(pm.fw);
+  pool->set_size_y(pm.fh);
+  pool->set_padding(pm.pw);
+  pool->set_padding_y(pm.ph);
+  pool->set_stride(pm.sw);
+  pool->set_stride_y(pm.sh);
+
+  int oh = outputSize(pm.ih, pm.fh, pm.ph, pm.sh, false);
+  int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
+  CHECK_EQ(ow, pm.ow) << "output size check failed";
+  CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
+
+void testPoolLayer(const testPoolDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNPoolConfig(dnnConfig, pm);
+  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
+  PoolConfig* pool = input->mutable_pool_conf();
+  for (auto type : {"max-projection", "avg-projection"}) {
+    pool->set_pool_type(type);
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
+  }
+}
+
+TEST(MKLDNNLayer, PoolLayer) {
+  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
+  testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
+  testPoolLayer({8, 16, 56, 56, 28, 28, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({8, 16, 14, 14, 7, 7, 3, 3, 0, 0, 2, 2});
+  testPoolLayer({4, 16, 7, 7, 1, 1, 7, 7, 0, 0, 1, 1});
+  testPoolLayer({4, 2, 5, 5, 3, 3, 5, 5, 1, 1, 1, 1});
+  testPoolLayer({2, 8, 56, 56, 29, 29, 3, 3, 1, 1, 2, 2});
+}
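These shapes follow the caffeMode=false convention used in getMKLDNNPoolConfig, which rounds up: out = ceil((in + 2 * pad - filter) / stride) + 1. For the last case above, oh = ceil((56 + 2 - 3) / 2) + 1 = 28 + 1 = 29, so a 56x56 image pools to 29x29 rather than 28x28.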
+
+struct testBatchNormDesc {
+  int bs;
+  int ic;
+  int ih, iw;
+};
+
+static void getMKLDNNBatchNormConfig(TestConfig& cfg,
+                                     const testBatchNormDesc& pm) {
+  cfg.layerConfig.set_size(pm.ic * pm.ih * pm.iw);
+  cfg.layerConfig.set_type("mkldnn_batch_norm");
+  cfg.biasSize = pm.ic;
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.ic)});
+  cfg.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_moving_mean", 1, size_t(pm.ic)});
+  cfg.inputDefs.back().isStatic = true;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_2_moving_var", 1, size_t(pm.ic)});
+  cfg.inputDefs.back().isStatic = true;
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  cfg.layerConfig.set_active_type("relu");
+  cfg.layerConfig.add_inputs();
+  cfg.layerConfig.add_inputs();
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(pm.ic);
+  img_conf->set_img_size_y(pm.ih);
+  img_conf->set_img_size(pm.iw);
+}
+
+void testBatchNormLayer(const testBatchNormDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNBatchNormConfig(dnnConfig, pm);
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("batch_norm");
+  // For PASS_TRAIN, use_global_stats must always be false and the batch size must not be 1.
+  VLOG(MKLDNN_TESTS) << "check train phase";
+  dnnConfig.layerConfig.set_use_global_stats(false);
+  refConfig.layerConfig.set_use_global_stats(false);
+  MKLDNNTester tester;
+  tester.run(dnnConfig, refConfig, pm.bs, pm.ih, pm.iw, PASS_TRAIN);
+  // For PASS_TEST, check use_global_stats both true and false, including batch size 1.
+  VLOG(MKLDNN_TESTS) << "check test phase";
+  for (auto useGS : {false, true}) {
+    dnnConfig.layerConfig.set_use_global_stats(useGS);
+    refConfig.layerConfig.set_use_global_stats(useGS);
+    MKLDNNTester tester;
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(dnnConfig, refConfig, bs, pm.ih, pm.iw, PASS_TEST);
+    }
+  }
+}
+
+TEST(MKLDNNLayer, BatchNormLayer) {
+  testBatchNormLayer({4, 10, 6, 6});
+  testBatchNormLayer({16, 32, 16, 16});
+  testBatchNormLayer({4, 16, 8, 10});
+}
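The moving mean and variance enter as static inputs because they are running averages, not gradient targets. Assuming the conventional batch-norm transform

    y = \gamma \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta

training estimates \mu and \sigma^2 from the current minibatch (hence use_global_stats=false and a batch size above 1, so the variance is well defined), while inference may substitute the stored moving averages, which is the use_global_stats=true path exercised for PASS_TEST.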
+
+struct testLRNDesc {
+  int bs, ic, ih, iw;
+  float scale, pow;
+  int localSize;
+};
+
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_lrn");
+  cfg.layerConfig.set_active_type("relu");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_channels(pm.ic);
+  norm->set_size(pm.localSize);
+  norm->set_scale(pm.scale);
+  norm->set_pow(pm.pow);
+  norm->set_blocked(0);
+  norm->set_img_size(pm.iw);
+  norm->set_img_size_y(pm.ih);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  cfg.layerConfig.set_size(layerSize);
+  cfg.biasSize = 0;
+}
+
+void testLRNLayer(const testLRNDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNLRNConfig(dnnConfig, pm);
+  // mkldnn_lrn <==> norm with cmrnorm-projection type
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("norm");
+  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cmrnorm-projection");
+  norm->set_scale(norm->scale() / norm->size());
+  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
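The scale adjustment reconciles two parameterizations of the same cross-map response. Assuming the conventional LRN form

    b_c = a_c \Big/ \Big(k + \frac{\alpha}{n} \sum_{c'} a_{c'}^2\Big)^{\beta}

the MKL-DNN primitive behind mkldnn_lrn appears to divide \alpha by the window size n internally, while cmrnorm-projection applies its scale coefficient directly, hence set_scale(scale() / size()) on the reference config.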
+
+TEST(MKLDNNLayer, LRNLayer) {
+  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
+
+struct testImageDesc {
+  int bs, ic, ih, iw;
+};
+
+static void getAddtoConfig(TestConfig& cfg,
+                           const testImageDesc& pm,
+                           const size_t nInputs = 1) {
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("addto");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.layerConfig.set_size(layerSize);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < nInputs; ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back({INPUT_DATA, ss.str(), layerSize, 0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(pm.ic);
+    img_conf->set_img_size_y(pm.ih);
+    img_conf->set_img_size(pm.iw);
+  }
+}
+
+void testAddtoLayer(const testImageDesc& pm, const size_t nInputs) {
+  CHECK_GE(nInputs, 1UL);
+  TestConfig dnnConfig;
+  getAddtoConfig(dnnConfig, pm, nInputs);
+  dnnConfig.layerConfig.set_type("mkldnn_addto");
+  for (auto withBias : {false, true}) {
+    dnnConfig.biasSize = withBias ? pm.ic * pm.ih * pm.iw : 0;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "addto", pm)
+  }
+}
+
+TEST(MKLDNNLayer, AddtoLayer) {
+  testAddtoLayer({16, 5, 14, 14}, 1);
+  testAddtoLayer({8, 10, 8, 8}, 2);
+  testAddtoLayer({4, 12, 1, 1}, 3);
+}
+
+static void getMKLDNNConcatConfig(TestConfig& cfg,
+                                  const std::vector<testImageDesc>& inputs) {
+  CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
+  int oc = inputs[0].ic;
+  for (size_t i = 1; i < inputs.size(); ++i) {
+    CHECK_EQ(inputs[i].bs, inputs[0].bs);
+    CHECK_EQ(inputs[i].ih, inputs[0].ih);
+    CHECK_EQ(inputs[i].iw, inputs[0].iw);
+    oc += inputs[i].ic;
+  }
+  cfg.biasSize = 0;
+  cfg.layerConfig.set_type("mkldnn_concat");
+  cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
+  cfg.layerConfig.set_active_type("relu");
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    std::stringstream ss;
+    ss << "layer_" << i;
+    cfg.inputDefs.push_back(
+        {INPUT_DATA,
+         ss.str(),
+         (size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
+         0});
+    LayerInputConfig* input = cfg.layerConfig.add_inputs();
+    ImageConfig* img_conf = input->mutable_image_conf();
+    img_conf->set_channels(inputs[i].ic);
+    img_conf->set_img_size_y(inputs[i].ih);
+    img_conf->set_img_size(inputs[i].iw);
+  }
+}
+
+void testConcatLayer(const std::vector<testImageDesc>& inputs) {
+  TestConfig dnnConfig;
+  getMKLDNNConcatConfig(dnnConfig, inputs);
+  RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
+}
+
+TEST(MKLDNNLayer, ConcatLayer) {
+  testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
+  testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
+}
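mkldnn_concat stacks its inputs along the channel axis, so the first case yields 128 + 32 + 64 = 224 output channels (layer size 224 * 1 * 1) and the second 100 + 10 = 110 channels over an 8x8 image (layer size 110 * 8 * 8 = 7040), matching the size computed in getMKLDNNConcatConfig.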
+
+void testActivation(std::string actType, const testImageDesc& pm) {
+  // TODO(TJ): remove me when paddle supports elu activation
+  if (actType == "mkldnn_elu") {
+    return;
+  }
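+  // actType.erase(0, 7) strips the leading "mkldnn_" (7 characters); braced
+  // initializer lists evaluate left to right, so compareTypes holds e.g.
+  // {"mkldnn_relu", "relu"}: the MKL-DNN type and its reference counterpart.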
+  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
+  TestConfig cfg;
+  getAddtoConfig(cfg, pm);
+  TestConfig ref = cfg;
+  cfg.layerConfig.set_active_type(compareTypes[0]);
+  ref.layerConfig.set_active_type(compareTypes[1]);
+  RUN_MKLDNN_TEST(cfg, ref, pm)
+}
+
+TEST(MKLDNNActivation, Activations) {
+  auto types = MKLDNNActivation::getAllRegisteredTypes();
+  for (auto type : types) {
+    /* bs, c, h, w*/
+    testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
+  }
+}
+
+DECLARE_string(config_args);
+TEST(MKLDNNNet, net) {
+  std::vector<std::string> cases = {"simple", "branch"};
+  for (auto name : cases) {
+    std::string config = "./gserver/tests/mkldnn_" + name + "_net.conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runNetTest(config);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = true;
+  initMain(argc, argv);
+  initPython(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
new file mode 100644
index 0000000000..16438886df
--- /dev/null
+++ b/paddle/gserver/tests/test_MaxPoolingWithMaskOutput.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+
+#include "LayerGradUtil.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+void setPoolConfig(TestConfig* config,
+                   PoolConfig* pool,
+                   const string& poolType) {
+  config->biasSize = 0;
+  config->layerConfig.set_type("pool");
+  config->layerConfig.set_num_filters(1);
+
+  int kw = 3, kh = 3;
+  int pw = 0, ph = 0;
+  int sw = 2, sh = 2;
+  pool->set_pool_type(poolType);
+  pool->set_channels(1);
+  pool->set_size_x(kw);
+  pool->set_size_y(kh);
+  pool->set_start(0);
+  pool->set_padding(pw);
+  pool->set_padding_y(ph);
+  pool->set_stride(sw);
+  pool->set_stride_y(sh);
+
+  int ow = outputSize(pool->img_size(), kw, pw, sw, /* caffeMode */ false);
+  int oh = outputSize(pool->img_size_y(), kh, ph, sh, /* caffeMode */ false);
+  pool->set_output_x(ow);
+  pool->set_output_y(oh);
+}
+
+void doOneMaxPoolingWithMaskOutputTest(MatrixPtr& inputMat,
+                                       const string& poolType,
+                                       bool use_gpu,
+                                       MatrixPtr& maskMat) {
+  TestConfig config;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 25, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PoolConfig* pool = input->mutable_pool_conf();
+
+  pool->set_img_size(5);
+  pool->set_img_size_y(5);
+  setPoolConfig(&config, pool, poolType);
+  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
+                              pool->channels());
+
+  config.layerConfig.set_name("MaxPoolWithMask");
+
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+
+  initDataLayer(config,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "MaxPoolWithMask",
+                1,
+                false,
+                use_gpu);
+
+  dataLayers[0]->getOutputValue()->copyFrom(*inputMat);
+
+  FLAGS_use_gpu = use_gpu;
+  std::vector<ParameterPtr> parameters;
+  LayerPtr maxPoolingWithMaskOutputLayer;
+  initTestLayer(config, &layerMap, &parameters, &maxPoolingWithMaskOutputLayer);
+  maxPoolingWithMaskOutputLayer->forward(PASS_GC);
+
+  checkMatrixEqual(maxPoolingWithMaskOutputLayer->getOutput("mask").value,
+                   maskMat);
+}
+
+TEST(Layer, maxPoolingWithMaskOutputLayerFwd) {
+  bool useGpu = false;
+  MatrixPtr inputMat;
+  MatrixPtr maskMat;
+  real inputData[] = {0.1, 0.1, 0.5, 0.5, 1.1, 0.2, 0.2, 0.6, 0.1,
+                      0.1, 0.3, 0.3, 0.7, 0.1, 0.1, 0.4, 0.4, 0.8,
+                      0.8, 0.1, 1.0, 2.0, 3.0, 0.0, 9.0};
+  real maskData[] = {12, 4, 22, 24};
+
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->setData(inputData);
+  maskMat->setData(maskData);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#ifdef PADDLE_WITH_CUDA
+  useGpu = true;
+  inputMat = Matrix::create(1, 25, false, useGpu);
+  maskMat = Matrix::create(1, 4, false, useGpu);
+  inputMat->copyFrom(inputData, 25);
+  maskMat->copyFrom(maskData, 4);
+  doOneMaxPoolingWithMaskOutputTest(
+      inputMat, "max-pool-with-mask", useGpu, maskMat);
+#endif
+}
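The expected mask can be checked by hand: a 5x5 input with a 3x3 window, stride 2, no padding, and caffeMode=false pools to 2x2, and each mask entry records the flat row-major index of its window's maximum. Laying inputData out as a 5x5 grid:

    0.1  0.1  0.5  0.5  1.1
    0.2  0.2  0.6  0.1  0.1
    0.3  0.3  0.7  0.1  0.1
    0.4  0.4  0.8  0.8  0.1
    1.0  2.0  3.0  0.0  9.0

the four window maxima are 0.7 (index 12), 1.1 (index 4), 3.0 (index 22), and 9.0 (index 24), exactly the maskData above.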
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 40e662b22b..2b92211936 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -237,7 +237,13 @@ TEST(Compare, concat_table) {
   compareNetwork(config_file_a, config_file_b);
 }
 
-#ifndef PADDLE_ONLY_CPU
+TEST(Compare, concat_slice) {
+  std::string config_file_a = "./gserver/tests/concat_slice_a.conf";
+  std::string config_file_b = "./gserver/tests/concat_slice_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
+#ifdef PADDLE_WITH_CUDA
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
   std::string config_file_b = "./gserver/tests/img_pool_b.conf";
@@ -263,7 +269,8 @@ TEST(Compare, img_conv2) {
   bool useGpu = FLAGS_use_gpu;
   double eps = FLAGS_checkgrad_eps;
   FLAGS_use_gpu = true;
-  FLAGS_checkgrad_eps = 1e-2;
+  // This unit test sometimes fails when the tolerance is 1e-2
+  FLAGS_checkgrad_eps = 4e-2;
   compareNetwork(config_file_a, config_file_b);
   FLAGS_use_gpu = useGpu;
   FLAGS_checkgrad_eps = eps;
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index ae0e3bc3d2..8dc5568784 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
                     useGpu,
                     result);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   // reset the input parameters
   variance[1] = 0.1;
   variance[3] = 0.2;
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
deleted file mode 100644
index e11bf402c2..0000000000
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ /dev/null
@@ -1,732 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <string>
-
-#include <gtest/gtest.h>
-
-#include "paddle/gserver/dataproviders/ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-
-#include "paddle/testing/TestUtil.h"
-
-using namespace std;  // NOLINT
-
-std::vector<string> protoFiles{
-    "./test_ProtoDataProvider/data1.bin", "./test_ProtoDataProvider/data2.bin",
-};
-std::vector<string> protoFilesCompressed{
-    "./test_ProtoDataProvider/data1.bin.gz",
-    "./test_ProtoDataProvider/data2.bin.gz",
-};
-
-const char* kTestDir = "./test_ProtoDataProvider";
-const char kProtoFileList[] = "gserver/tests/proto_files.txt";
-const char kProtoFileListCompressed[] =
-    "gserver/tests/proto_files_compressed.txt";
-const int kSpraseMatrixDim = 1024;
-
-using namespace paddle;  // NOLINT
-
-void prepareData(DataBatch* batch,
-                 const int* numPerSlotType,
-                 bool iid,
-                 bool useGpu) {
-  batch->clear();
-  int64_t size = uniformRandom(100) + 10;
-  batch->setSize(size);
-
-  ICpuGpuVectorPtr sequenceStartPositions;
-  ICpuGpuVectorPtr subSequenceStartPositions;
-  if (!iid) {
-    int numSeqs = uniformRandom(10) + 1;
-    sequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* buf = sequenceStartPositions->getMutableData(false);
-    subSequenceStartPositions =
-        ICpuGpuVector::create(numSeqs + 1, /* useGpu= */ false);
-    int* subBuf = subSequenceStartPositions->getMutableData(false);
-    int64_t pos = 0;
-    int maxLen = 2 * size / numSeqs;
-    for (int i = 0; i < numSeqs; ++i) {
-      int len =
-          uniformRandom(min<int64_t>(maxLen, size - pos - numSeqs + i)) + 1;
-      buf[i] = pos;
-      subBuf[i] = pos;
-      pos += len;
-      VLOG(1) << " len=" << len;
-    }
-    buf[numSeqs] = size;
-    subBuf[numSeqs] = size;
-  }
-
-  vector<Argument>& arguments = batch->getStreams();
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_DENSE]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    MatrixPtr mat = Matrix::create(size, dim, /* trans= */ false, false);
-    mat->randomizeUniform();
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, false, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arg.subSequenceStartPositions = subSequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE]; ++i) {
-    MatrixPtr mat =
-        makeRandomSparseMatrix(size, kSpraseMatrixDim, true, useGpu);
-    Argument arg;
-    arg.value = mat;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::STRING]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    SVectorPtr vec = std::make_shared<std::vector<std::string>>();
-    for (int j = 0; j < size; ++j) {
-      vec->push_back(randStr(dim));
-    }
-    Argument arg;
-    arg.strs = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-  for (int i = 0; i < numPerSlotType[SlotDef::INDEX]; ++i) {
-    int64_t dim = rand() % 10 + 4;  // NOLINT rand_r
-    IVectorPtr vec = IVector::create(size, /* useGpu= */ false);
-    int* buf = vec->getData();
-    for (int j = 0; j < size; ++j) {
-      buf[j] = uniformRandom(dim);
-    }
-    Argument arg;
-    arg.ids = vec;
-    arg.sequenceStartPositions = sequenceStartPositions;
-    arguments.push_back(arg);
-  }
-}
-
-inline int getSlotDim(const Argument& arg) {
-  if (arg.value) {
-    return arg.value->getWidth();
-  } else if (arg.ids) {
-    return arg.ids->getMax() + 1;
-  } else if (arg.strs) {
-    return 1;
-  }
-  LOG(FATAL) << "Invalid argument";
-  return 0;
-}
-
-inline SlotDef::SlotType getSlotType(const Argument& arg) {
-  if (arg.value) {
-    auto& m = *arg.value;
-    auto& type = typeid(m);
-    if (type == typeid(CpuMatrix) || type == typeid(GpuMatrix)) {
-      return SlotDef::VECTOR_DENSE;
-    }
-    if (type == typeid(CpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<CpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-    if (type == typeid(GpuSparseMatrix)) {
-      auto valueType =
-          std::dynamic_pointer_cast<GpuSparseMatrix>(arg.value)->getValueType();
-      if (NO_VALUE == valueType) {
-        return SlotDef::VECTOR_SPARSE_NON_VALUE;
-      } else {
-        return SlotDef::VECTOR_SPARSE_VALUE;
-      }
-    }
-
-    LOG(FATAL) << "Unknown matrix type";
-  }
-  if (arg.ids) return SlotDef::INDEX;
-  if (arg.strs) return SlotDef::STRING;
-  LOG(FATAL) << "Invalid argument";
-  return SlotDef::VECTOR_DENSE;
-}
-
-void getColRow(const Argument& arg,
-               int64_t pos,
-               bool useGpu,
-               int* colNum,
-               const int** rowCols,
-               const real** rowValues) {
-  SlotDef::SlotType type = getSlotType(arg);
-  GpuSparseMatrixPtr matGpu;
-  CpuSparseMatrixPtr matCpu;
-  if (useGpu) {
-    matGpu = dynamic_pointer_cast<GpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matGpu != NULL);
-  } else {
-    matCpu = dynamic_pointer_cast<CpuSparseMatrix>(arg.value);
-    ASSERT_TRUE(matCpu != NULL);
-  }
-  *colNum = useGpu ? matGpu->getColNum(pos) : matCpu->getColNum(pos);
-  *rowCols = useGpu ? matGpu->getRowCols(pos) : matCpu->getRowCols(pos);
-  if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-    *rowValues = useGpu ? matGpu->getRowValues(pos) : matCpu->getRowValues(pos);
-  } else {
-    *rowValues = NULL;
-  }
-}
-
-void makeSample(const vector<Argument>& arguments,
-                int64_t pos,
-                bool isBeginning,
-                DataSample* sample,
-                bool useGpu) {
-  sample->set_is_beginning(isBeginning);
-  int slotid = 0;
-  for (auto& arg : arguments) {
-    SlotDef::SlotType type = getSlotType(arg);
-    int64_t dim = getSlotDim(arg);
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        values->Reserve(dim);
-        for (int i = 0; i < dim; ++i) {
-          values->AddAlreadyReserved(
-              static_cast<float>(arg.value->getElement(pos, i)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        sample->add_id_slots(arg.ids->get(pos));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;  // nullptr
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-        }
-        SubseqSlot* subseqSlot = sample->add_subseq_slots();  // subseq
-        subseqSlot->set_slot_id(slotid);
-        auto lens = subseqSlot->mutable_lens();
-        lens->Add(colNum);
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        auto values = vecSlot->mutable_values();
-        auto ids = vecSlot->mutable_ids();
-        int colNum;
-        const int* rowCols;
-        const real* rowValues;
-        getColRow(arg, pos, useGpu, &colNum, &rowCols, &rowValues);
-        ids->Reserve(colNum);
-        values->Reserve(colNum);
-        for (int i = 0; i < colNum; ++i) {
-          ids->AddAlreadyReserved(rowCols[i]);
-          values->AddAlreadyReserved(rowValues[i]);
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        VectorSlot* vecSlot = sample->add_vector_slots();
-        vecSlot->add_strs((*arg.strs)[pos]);
-        break;
-      }
-    }
-    slotid++;
-  }
-}
-
-void writeData(const DataBatch& batch, bool useGpu, bool dataCompression) {
-  DataHeader header;
-  const vector<Argument>& arguments = batch.getStreams();
-  for (auto& argument : arguments) {
-    SlotDef* slotDef = header.add_slot_defs();
-    slotDef->set_type(getSlotType(argument));
-    slotDef->set_dim(getSlotDim(argument));
-  }
-  VLOG(1) << "header=" << header.DebugString();
-
-  int64_t totalSeqs = batch.getNumSequences();
-  int64_t seq = 0;
-  ICpuGpuVectorPtr sequenceStartPositions = arguments[0].sequenceStartPositions;
-  int64_t numWritten = 0;
-  vector<string> curProtoFiles =
-      dataCompression ? protoFilesCompressed : protoFiles;
-  for (size_t i = 0; i < curProtoFiles.size(); ++i) {
-    int64_t numSeqs = totalSeqs * (i + 1) / curProtoFiles.size() -
-                      totalSeqs * i / curProtoFiles.size();
-    ofstream os(curProtoFiles[i]);
-    CHECK(os) << "Fail to open " << curProtoFiles[i];
-    unique_ptr<ProtoWriter> writer(new ProtoWriter(&os, dataCompression));
-    CHECK(writer->write(header));
-    for (int j = 0; j < numSeqs; ++j, ++seq) {
-      int64_t begin = seq;
-      int64_t end = seq + 1;
-      if (sequenceStartPositions) {
-        begin = sequenceStartPositions->getElement(seq);
-        end = sequenceStartPositions->getElement(seq + 1);
-      }
-      for (int pos = begin; pos < end; ++pos) {
-        DataSample sample;
-        makeSample(arguments, pos, pos == begin, &sample, useGpu);
-        CHECK(writer->write(sample));
-        ++numWritten;
-      }
-    }
-
-    writer.reset(nullptr);
-    os.close();
-  }
-  CHECK_EQ(arguments[0].getBatchSize(), numWritten);
-}
-
-// check that the sample at pos1 in args1 is same as the sample at pos2 in args2
-void checkSample(const vector<Argument>& args1,
-                 int64_t pos1,
-                 const vector<Argument>& args2,
-                 int64_t pos2,
-                 bool useGpu) {
-  EXPECT_EQ(args1.size(), args2.size());
-  VLOG(1) << " pos1=" << pos1 << " pos2=" << pos2;
-
-  for (size_t i = 0; i < args1.size(); ++i) {
-    auto type = getSlotType(args1[i]);
-    int dim = getSlotDim(args1[i]);
-    EXPECT_EQ(type, getSlotType(args2[i]));
-    if (type == SlotDef::INDEX) {
-      EXPECT_GE(dim, getSlotDim(args2[i]));
-    } else {
-      EXPECT_EQ(dim, getSlotDim(args2[i]));
-    }
-    switch (type) {
-      case SlotDef::VECTOR_DENSE: {
-        for (int j = 0; j < dim; ++j) {
-          EXPECT_EQ(static_cast<float>(args1[i].value->getElement(pos1, j)),
-                    static_cast<float>(args2[i].value->getElement(pos2, j)));
-        }
-        break;
-      }
-      case SlotDef::INDEX: {
-        EXPECT_EQ(args1[i].ids->get(pos1), args2[i].ids->get(pos2));
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE:
-      case SlotDef::VECTOR_SPARSE_VALUE: {
-        int colNum1, colNum2;
-        const int *rowCols1, *rowCols2;
-        const real *rowValues1, *rowValues2;
-        getColRow(args1[i], pos1, useGpu, &colNum1, &rowCols1, &rowValues1);
-        getColRow(args2[i], pos2, useGpu, &colNum2, &rowCols2, &rowValues2);
-        EXPECT_EQ(colNum1, colNum2);
-        for (int j = 0; j < colNum1; ++j) {
-          EXPECT_EQ(rowCols1[j], rowCols2[j]);
-          if (type == SlotDef::VECTOR_SPARSE_VALUE) {
-            EXPECT_EQ(rowValues1[j], rowValues2[j]);
-          }
-        }
-        break;
-      }
-      case SlotDef::VAR_MDIM_DENSE:
-      case SlotDef::VAR_MDIM_INDEX: {
-        LOG(FATAL) << "Not implemented";
-        break;
-      }
-      case SlotDef::STRING: {
-        EXPECT_EQ((*args1[i].strs)[pos1], (*args2[i].strs)[pos2]);
-        break;
-      }
-    }
-  }
-}
-
-void testProtoDataProvider(int* numPerSlotType,
-                           bool iid,
-                           bool async,
-                           bool useGpu,
-                           bool dataCompression,
-                           int numConstantSlots = 0) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data, numPerSlotType, iid, useGpu);
-  writeData(data, useGpu, dataCompression);
-
-  DataConfig config;
-  config.set_type("proto");
-  config.set_files(dataCompression ? kProtoFileListCompressed : kProtoFileList);
-  config.set_async_load_data(async);
-
-  for (int i = 0; i < numConstantSlots; ++i) {
-    config.add_constant_slots(i + 11);
-    MatrixPtr w = Matrix::create(data.getSize(),
-                                 1,
-                                 /* trans= */ false,
-                                 /* useGpu= */ false);
-    w->assign(config.constant_slots(i));
-    data.appendData(w);
-  }
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  size_t seq1 = 0;
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args2) {
-      EXPECT_EQ(iid, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    VLOG(1) << "numSeqs=" << numSeqs;
-    for (size_t seq2 = 0; seq2 < numSeqs; ++seq1, ++seq2) {
-      int64_t begin1 = seq1;
-      int64_t end1 = seq1 + 1;
-      if (sequenceStartPositions1) {
-        begin1 = sequenceStartPositions1->getElement(seq1);
-        end1 = sequenceStartPositions1->getElement(seq1 + 1);
-        EXPECT_LT(seq1, sequenceStartPositions1->getSize() - 1);
-      }
-
-      int64_t begin2 = seq2;
-      int64_t end2 = seq2 + 1;
-      if (sequenceStartPositions2) {
-        begin2 = sequenceStartPositions2->getElement(seq2);
-        end2 = sequenceStartPositions2->getElement(seq2 + 1);
-      }
-      VLOG(1) << " begin1=" << begin1 << " end1=" << end1
-              << " begin2=" << begin2 << " end2=" << end2;
-      EXPECT_EQ(end1 - begin1, end2 - begin2);
-      for (int i = 0; i < end1 - begin1; ++i) {
-        checkSample(args1, begin1 + i, args2, begin2 + i, useGpu);
-      }
-    }
-  }
-
-  EXPECT_EQ(seq1, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  int numSlotsArraySize = sizeof(numSlotsArray) / sizeof(numSlotsArray[0]);
-  const int numSlot = 5;
-  int combination[numSlot] = {0};
-  int k = numSlot - 1;
-  while (k >= 0) {
-    int numDenseVecSlots = numSlotsArray[combination[0]];
-    int numSparseNonValueVecSlots = numSlotsArray[combination[1]];
-    int numSparseValueVectorSlots = numSlotsArray[combination[2]];
-    int numStrSlots = numSlotsArray[combination[3]];
-    int numIdSlots = numSlotsArray[combination[4]];
-    // while loop : traverse all cases
-    k = numSlot - 1;
-    while (k >= 0) {
-      if (combination[k] < (numSlotsArraySize - 1)) {
-        ++combination[k];
-        break;
-      } else {
-        combination[k] = 0;
-        --k;
-      }
-    }
-    if (numDenseVecSlots + numSparseNonValueVecSlots +
-            numSparseValueVectorSlots + numStrSlots + numIdSlots <
-        1)
-      continue;
-    for (int iid : numTwoArray) {
-      for (int async : numTwoArray) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifdef PADDLE_ONLY_CPU
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numSparseValueVectorSlots="
-                      << numSparseValueVectorSlots
-                      << " numStrSlots=" << numStrSlots
-                      << " numIdSlots=" << numIdSlots << " iid=" << iid
-                      << " async=" << async << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] =
-                numSparseValueVectorSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            numPerSlotType[SlotDef::STRING] = numStrSlots;
-            testProtoDataProvider(
-                numPerSlotType, iid, async, useGpu, dataCompression);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int async : numTwoArray)
-    }        // end for (int iid : numTwoArray)
-  }          // end for (while, traverse all slots)
-}
-
-TEST(ProtoDataProvider, constant_slots) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numDenseVecSlots : numSlotsArray) {
-    for (int numSparseNonValueVecSlots : numSlotsArray) {
-      if (numDenseVecSlots + numSparseNonValueVecSlots < 1) continue;
-      for (int numConstantSlots : {1, 2}) {
-        for (int useGpu : numTwoArray) {
-          for (int dataCompression : numTwoArray) {
-#ifdef PADDLE_ONLY_CPU
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numConstantSlogs=" << numConstantSlots
-                      << " useGpu=" << useGpu
-                      << " dataCompression=" << dataCompression;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_VALUE] = 1;
-            numPerSlotType[SlotDef::INDEX] = 1;
-            testProtoDataProvider(numPerSlotType,
-                                  /* iid= */ true,
-                                  /* async= */ false,
-                                  useGpu,
-                                  dataCompression,
-                                  numConstantSlots);
-          }  // end for (int dataCompression : numTwoArray)
-        }    // end for (int useGpu : numTwoArray)
-      }      // end for (int numConstantSlots : {1, 2})
-    }        // end for (int numSparseNonValueVecSlots : numSlotsArray)
-  }          // end for (int numDenseVecSlots : numSlotsArray)
-}
-
-void checkSampleSequence(const vector<Argument>& args1,
-                         const vector<Argument>& args2,
-                         int64_t offset,
-                         int64_t numSeqs,
-                         bool useGpu) {
-  // check slot num are equal
-  EXPECT_EQ(args1.size(), args2.size());
-  for (size_t i = 0; i < args1.size(); i++) {
-    auto type = getSlotType(args1[i]);
-    // check for args2: sequenceStartPositions vs numSeqs
-    // (1) size
-    EXPECT_EQ(args2[i].sequenceStartPositions->getSize(), (size_t)numSeqs + 1);
-    // (2) content
-    auto checkArgContent = [&](const Argument& args, int numSeqs) {
-      for (int j = 0; j <= numSeqs; j++) {
-        int start_pos = args.sequenceStartPositions->getElement(j);
-        EXPECT_EQ(start_pos, j);
-      }
-    };
-    switch (type) {
-      case SlotDef::INDEX: {
-        // args1: for label
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: ids are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].ids->getSize(), (size_t)numSeqs);
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          EXPECT_EQ(args2[i].ids->get(j), args1[i].ids->get(offset + j));
-        }
-        break;
-      }
-      case SlotDef::VECTOR_SPARSE_NON_VALUE: {
-        // args1: for sparse_non_value
-        // args2 should put sparse indexes in ids
-        int colNum1;
-        const int* rowCols1;
-        const real* rowValues1;  // nullptr
-        int totalLength = 0;
-        for (int j = 0; j < numSeqs; j++) {
-          getColRow(
-              args1[i], offset + j, useGpu, &colNum1, &rowCols1, &rowValues1);
-          // (1) lengths
-          EXPECT_EQ(totalLength,
-                    args2[i].sequenceStartPositions->getElement(j));
-          EXPECT_EQ(totalLength,
-                    args2[i].subSequenceStartPositions->getElement(j));
-          // (2) content
-          for (int k = 0; k < colNum1; k++) {
-            EXPECT_EQ(rowCols1[k], args2[i].ids->get(totalLength + k));
-          }
-          totalLength += colNum1;
-          if (colNum1 == 0) {
-            // special case here: we will put a "-1" into ids when column num is
-            // zero. see ProtoSequenceDataProvider::getNextBatchInternal.
-            EXPECT_EQ(-1, args2[i].ids->get(totalLength));
-            totalLength++;
-          }
-        }
-        EXPECT_EQ(totalLength,
-                  args2[i].sequenceStartPositions->getElement(numSeqs));
-        EXPECT_EQ(totalLength,
-                  args2[i].subSequenceStartPositions->getElement(numSeqs));
-        break;
-      }
-      case SlotDef::VECTOR_DENSE: {
-        // args1: for dense vector
-        checkArgContent(args2[i], numSeqs);
-        // check for args2: values are equal to args1[offset]
-        // (1) size
-        EXPECT_EQ(args2[i].value->getHeight(), (size_t)numSeqs);
-        EXPECT_EQ(args2[i].value->getWidth(), (size_t)getSlotDim(args1[i]));
-        // (2) content
-        for (int j = 0; j < numSeqs; j++) {
-          for (size_t k = 0; k < args2[i].value->getWidth(); k++) {
-            EXPECT_EQ(
-                static_cast<float>(args1[i].value->getElement(j + offset, k)),
-                static_cast<float>(args2[i].value->getElement(j, k)));
-          }
-        }
-        break;
-      }
-      default: { EXPECT_EQ(true, false) << "should not reach here"; }
-    }
-  }
-}
-
-void testProtoSequenceDataProvider(int* numPerSlotType,
-                                   bool async,
-                                   bool useGpu) {
-  mkDir(kTestDir);
-  DataBatch data;
-
-  prepareData(&data,
-              numPerSlotType,
-              /* iid */ true,
-              useGpu);
-  writeData(data, useGpu, /* dataCompression */ false);
-
-  DataConfig config;
-  config.set_type("proto_sequence");
-  config.set_files(kProtoFileList);
-  config.set_async_load_data(async);
-
-  unique_ptr<DataProvider> dataProvider(DataProvider::create(config, useGpu));
-  dataProvider->setSkipShuffle();
-
-  EXPECT_EQ(data.getSize(), dataProvider->getSize());
-
-  int64_t batchSize = 10;
-  DataBatch batch;
-
-  vector<Argument>& args1 = data.getStreams();
-  ICpuGpuVectorPtr sequenceStartPositions1 = args1[0].sequenceStartPositions;
-
-  dataProvider->reset();
-
-  size_t args1Offset = 0;
-  while (dataProvider->getNextBatch(batchSize, &batch) > 0) {
-    CHECK_EQ(data.getNumStreams(), batch.getNumStreams());
-    vector<Argument>& args2 = batch.getStreams();
-    ICpuGpuVectorPtr sequenceStartPositions2 = args2[0].sequenceStartPositions;
-    for (auto& arg : args1) {
-      // args1 should not has sequence
-      EXPECT_EQ(true, !arg.sequenceStartPositions);
-    }
-    for (auto& arg : args2) {
-      // args2 should has sequence
-      EXPECT_NE(true, !arg.sequenceStartPositions);
-    }
-    size_t numSeqs = batch.getNumSequences();
-    checkSampleSequence(args1, args2, args1Offset, numSeqs, useGpu);
-    args1Offset += numSeqs;
-  }
-
-  EXPECT_EQ(args1Offset, (size_t)data.getNumSequences());
-  rmDir(kTestDir);
-}
-
-TEST(ProtoSequenceDataProvider, test) {
-  int numSlotsArray[] = {0, 3};
-  int numTwoArray[] = {0, 1};
-  for (int numSparseNonValueVecSlots : numSlotsArray) {
-    for (int numIdSlots : numSlotsArray) {
-      for (int numDenseVecSlots : numSlotsArray) {
-        if (numDenseVecSlots + numSparseNonValueVecSlots + numIdSlots < 1)
-          continue;
-        for (int async : numTwoArray) {
-          for (int useGpu : numTwoArray) {
-            if (async && useGpu) {
-              // Currently in async mode, useGpu is not supported
-              continue;
-            }
-#ifdef PADDLE_ONLY_CPU
-            if (useGpu) {
-              continue;
-            }
-#endif
-            LOG(INFO) << " numDenseVecSlots=" << numDenseVecSlots
-                      << " numSparseNonValueVecSlots="
-                      << numSparseNonValueVecSlots
-                      << " numIdSlots=" << numIdSlots << " async=" << async
-                      << " useGpu=" << useGpu;
-            int numPerSlotType[SlotDef::SlotType_ARRAYSIZE] = {0};
-            numPerSlotType[SlotDef::VECTOR_DENSE] = numDenseVecSlots;
-            numPerSlotType[SlotDef::VECTOR_SPARSE_NON_VALUE] =
-                numSparseNonValueVecSlots;
-            numPerSlotType[SlotDef::INDEX] = numIdSlots;
-            testProtoSequenceDataProvider(numPerSlotType, async, useGpu);
-          }  // end for (int useGpu : numTwoArray)
-        }    // end for (int async : numTwoArray)
-      }      // end for (int numDenseVecSlots : numSlotsArray)
-    }        // end for (int numIdSlots : numSlotsArray)
-  }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index db883543c3..fe54799259 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
   config.clear_files();
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
   EXPECT_EQ(config.IsInitialized(), true);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 2e6225519f..044aede98e 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -1,17 +1,16 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 #
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
 #
-#     http://www.apache.org/licenses/LICENSE-2.0
+#    http://www.apache.org/licenses/LICENSE-2.0
 #
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import random
 
 from paddle.trainer.PyDataProvider2 import *
@@ -51,7 +50,10 @@ def test_sparse_non_value_no_seq(setting, filename):
         yield [(i + 1) * (j + 1) for j in xrange(10)]
 
 
-@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
+@provider(input_types=[
+    sparse_float_vector(
+        30000, seq_type=SequenceType.NO_SEQUENCE)
+])
 def test_sparse_value_no_seq(setting, filename):
     for i in xrange(200):
         yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 4a846397e6..6b19eb0ce5 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -155,6 +155,15 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
   }
 }
 
+TEST(RecurrentGradientMachine, rnn_mixed_input) {
+  for (bool useGpu : {false, true}) {
+    test("gserver/tests/sequence_rnn_mixed_inputs.py",
+         "gserver/tests/sequence_rnn_matched_inputs.py",
+         1e-6,
+         useGpu);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
 
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 16ab0e6aec..0e13084333 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -222,6 +222,7 @@ TEST(Layer, RecurrentLayer) {
 #define protected public
 #include "paddle/gserver/layers/GatedRecurrentLayer.h"
 #include "paddle/gserver/layers/LstmLayer.h"
+#include "paddle/gserver/layers/RecurrentLayer.h"
 template <class T>
 class TestRecurrentLayer {
 public:
@@ -420,12 +421,151 @@ TEST(Layer, LstmLayer) {
   }
 }
 
+#ifdef PADDLE_WITH_MKLML
+
+#include "paddle/gserver/layers/MKLPackedRecurrentLayer.h"
+
+LayerPtr initMKLPackedLayer(LayerConfig layerConfig,
+                            bool reversed,
+                            int layerSize,
+                            LayerPtr dataLayer,
+                            ParameterPtr para,
+                            ParameterPtr bias = nullptr) {
+  LayerMap layerMap;
+  ParameterMap parameterMap;
+  layerMap[dataLayer->getName()] = dataLayer;
+  parameterMap[para->getName()] = para;
+  if (bias) {
+    parameterMap[bias->getName()] = bias;
+    layerConfig.set_bias_parameter_name("bias_0");
+  }
+
+  layerConfig.set_size(layerSize);
+  layerConfig.set_reversed(reversed);
+  layerConfig.add_inputs();
+  LayerInputConfig& input = *(layerConfig.mutable_inputs(0));
+  input.set_input_layer_name("layer_0");
+  input.set_input_parameter_name("para_0");
+
+  LayerPtr testLayer = Layer::create(layerConfig);
+  layerMap[testLayer->getName()] = testLayer;
+
+  testLayer->init(layerMap, parameterMap);
+  testLayer->setNeedGradient(true);
+
+  return testLayer;
+}
+
+void checkMKLPackedLayer(LayerConfig layerConfig1,
+                         LayerConfig layerConfig2,
+                         bool reversed,
+                         int layerSize,
+                         int batchSize,
+                         bool useBatch1,
+                         bool useBatch2) {
+  LayerPtr dataLayer;
+  ParameterPtr para, bias;
+
+  if (layerConfig1.type() == "recurrent") {
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize, false);
+    bias = nullptr;
+  } else if (layerConfig1.type() == "gated_recurrent") {
+    dataLayer = creatDataLayer("layer_0", batchSize, layerSize * 3, false);
+    para = creatParameter("para_0", 0, layerSize * layerSize * 3, false);
+    bias = creatParameterBias("bias_0", 1, layerSize * 3, false);
+  }
+
+  LayerPtr testLayer1 = initMKLPackedLayer(
+      layerConfig1, reversed, layerSize, dataLayer, para, bias);
+  LayerPtr testLayer2 = initMKLPackedLayer(
+      layerConfig2, reversed, layerSize, dataLayer, para, bias);
+
+  const VectorPtr& weightGrad =
+      (testLayer1->getParameters()[0])->getBuf(PARAMETER_GRADIENT);
+  const MatrixPtr& inputGrad = testLayer1->getPrev(0)->getOutputGrad();
+  CpuVector wgt_grad1(weightGrad->getSize());
+  CpuVector wgt_grad2(weightGrad->getSize());
+  CpuMatrix input_grad1(inputGrad->getHeight(), inputGrad->getWidth());
+  CpuMatrix input_grad2(inputGrad->getHeight(), inputGrad->getWidth());
+
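+  // Run the forward/backward check twice so the second iteration also
+  // covers reuse of the layers' already-initialized internal buffers.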
+  for (int i = 0; i < 2; i++) {
+    FLAGS_rnn_use_batch = useBatch1;
+
+    testLayer1->forward(PASS_GC);
+
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->forward(PASS_GC);
+
+    testLayer1->getOutputGrad()->randomizeUniform();
+    testLayer2->getOutputGrad()->copyFrom(*testLayer1->getOutputGrad());
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch1;
+    testLayer1->backward(nullptr);
+
+    wgt_grad1.copyFrom(*weightGrad);
+    input_grad1.copyFrom(*inputGrad);
+
+    weightGrad->zero();
+    inputGrad->zero();
+    FLAGS_rnn_use_batch = useBatch2;
+    testLayer2->backward(nullptr);
+
+    wgt_grad2.copyFrom(*weightGrad);
+    input_grad2.copyFrom(*inputGrad);
+
+    checkError(*testLayer1->getOutputValue(), *testLayer2->getOutputValue());
+    checkError(wgt_grad1, wgt_grad2);
+    checkError(input_grad1, input_grad2);
+  }
+}
+
+TEST(MKLPackedLayer, RecurrentLayer) {
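+  // Check that mkl_packed_recurrent produces the same outputs and gradients
+  // as the reference recurrent layer across sizes, batch sizes, and modes.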
+  LayerConfig layerConfig1;
+  LayerConfig layerConfig2;
+
+  layerConfig1.set_name("paddle-rnn");
+  layerConfig1.set_type("recurrent");
+  layerConfig1.set_active_type("relu");
+
+  layerConfig2.set_name("mkl-packed-rnn");
+  layerConfig2.set_type("mkl_packed_recurrent");
+  layerConfig2.set_active_type("relu");
+
+  FLAGS_use_gpu = false;
+
+  for (auto layerSize : {32, 64, 128, 256, 512}) {
+    for (auto batchSize : {1, 5, 100, 500}) {
+      for (auto reversed : {true, false}) {
+        for (auto paddle_use_batch : {true, false}) {
+          for (auto MKLPacked_use_batch : {true, false}) {
+            LOG(INFO) << " layerSize=" << layerSize
+                      << " batchSize=" << batchSize << " reversed=" << reversed
+                      << " paddle_use_batch=" << paddle_use_batch
+                      << " MKLPacked_use_batch=" << MKLPacked_use_batch;
+
+            checkMKLPackedLayer(layerConfig1,
+                                layerConfig2,
+                                reversed,
+                                layerSize,
+                                batchSize,
+                                paddle_use_batch,
+                                MKLPacked_use_batch);
+          }
+        }
+      }
+    }
+  }
+}
+#endif
+
 int main(int argc, char** argv) {
-  if (version::isWithGpu()) {
-    testing::InitGoogleTest(&argc, argv);
-    initMain(argc, argv);
-    return RUN_ALL_TESTS();
-  } else {
-    return 0;
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  if (!version::isWithGpu()) {
+    testing::GTEST_FLAG(filter) = "-Layer.*";
   }
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index ab23d00a2c..d164e382c4 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h"
 #include "paddle/math/CpuSparseMatrix.h"
-#include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -321,7 +320,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
       "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
 
   for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     if (useGpu) {
       break;
     }
@@ -388,7 +387,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                           outMatSelfc->getWidth(),
                           outMatSelfc->getElementCnt()));
   cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -418,7 +417,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
   MatrixPtr cpuOutMatFc(
       new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
   cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -443,7 +442,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
   selLayerConfig.set_size(fcLayerWidth);
 
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
new file mode 100644
index 0000000000..3dbffc5634
--- /dev/null
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+const int MAX_SEQ_NUM = 17;
+const int MAX_SEQ_LEN = 23;
+const int MAX_BEAM_SIZE = 13;
+
+const size_t SEED = static_cast<size_t>(time(NULL));
+
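+// Randomly sample n distinct values from [0, range) and return them sorted.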
+vector<real> randSampling(real range, int n) {
+  CHECK_GE(range, n);
+  vector<real> num(range);
+  iota(begin(num), end(num), 0.);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  sort(begin(num), end(num));
+  return num;
+}
+
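+/*
+  generate random sequence and sub-sequence start positions.
+ */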
+void genSeqInfo(vector<int>& seqStartPos, vector<int>& subSeqStartPos) {
+  seqStartPos.resize(1, 0);
+  subSeqStartPos.resize(1, 0);
+
+  srand(SEED);
+  int seqNum = 1 + (rand() % MAX_SEQ_NUM);
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqNum = 1 + (rand() % MAX_SEQ_NUM);
+    for (int j = 0; j < subSeqNum; ++j)
+      subSeqStartPos.push_back(subSeqStartPos.back() +
+                               (1 + (rand() % MAX_SEQ_LEN)));
+    seqStartPos.push_back(subSeqStartPos.back());
+  }
+}
+
+/*
+  generate start indices according to sequence start positions.
+ */
+void genStarts(vector<int>& seqStartPos,
+               vector<vector<real>>& starts,
+               size_t beamSize) {
+  starts.clear();
+  starts.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < seqStartPos.size() - 1; ++i) {
+    int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+    vector<real> randStarts =
+        randSampling(seqLen, min(seqLen, static_cast<int>(beamSize)));
+    copy(begin(randStarts), end(randStarts), begin(starts[i]));
+  }
+}
+
+/*
+  generate end indices according to sequence start positions and start indices.
+ */
+void genEnds(vector<int>& seqStartPos,
+             vector<vector<real>>& starts,
+             vector<vector<real>>& ends,
+             size_t beamSize) {
+  CHECK_EQ(seqStartPos.size() - 1, starts.size());
+  ends.clear();
+  ends.resize(seqStartPos.size() - 1, vector<real>(beamSize, -1.));
+
+  for (size_t i = 0; i < starts.size(); ++i) {
+    for (size_t j = 0; j < starts[i].size(); ++j) {
+      int seqLen = seqStartPos[i + 1] - seqStartPos[i];
+      CHECK_GE(seqLen - 1, starts[i][j]);
+      if (starts[i][j] == -1.) break;
+      if (starts[i][j] == (seqLen - 1)) {
+        ends[i][j] = starts[i][j];
+      } else {
+        ends[i][j] = starts[i][j] + randSampling(seqLen - starts[i][j], 1)[0];
+      }
+    }
+  }
+}
+
+void genTestData(vector<int>& seqStartPos,
+                 vector<int>& subSeqStartPos,
+                 vector<vector<real>>& starts,
+                 vector<vector<real>>& ends,
+                 bool hasSubseq) {
+  size_t beamSize = 1 + (rand() % MAX_BEAM_SIZE);
+  genSeqInfo(seqStartPos, subSeqStartPos);
+
+  genStarts(hasSubseq ? subSeqStartPos : seqStartPos, starts, beamSize);
+  genEnds(hasSubseq ? subSeqStartPos : seqStartPos, starts, ends, beamSize);
+}
+
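+// Flatten a vector of vectors into outVec, moving elements to avoid copies.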
+template <typename T>
+void flatten2dVector(vector<vector<T>>& inVec, vector<T>& outVec) {
+  size_t totalSize{0};
+  for (auto const& items : inVec) totalSize += items.size();
+  outVec.reserve(totalSize);
+
+  for (auto& items : inVec)
+    move(items.begin(), items.end(), back_inserter(outVec));
+}
+
+void testSeqSliceLayer(bool hasSubseq,
+                       bool useGpu,
+                       vector<int>& seqStartPos,
+                       vector<int>& subSeqStartPos,
+                       vector<vector<real>>& starts,
+                       vector<vector<real>>& ends) {
+  // Layer size is not crucial for this layer,
+  // so a small layer size is used in this unit test.
+  const size_t layerSize{4};
+  TestConfig config;
+  config.layerConfig.set_type("seq_slice");
+  config.layerConfig.set_size(layerSize);
+
+  // add the first input
+  MatrixPtr seqInputPtr =
+      Matrix::create(hasSubseq ? subSeqStartPos.back() : seqStartPos.back(),
+                     layerSize,
+                     false,
+                     false);
+  seqInputPtr->randomizeUniform();
+
+  if (hasSubseq) {
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                "seq_input",
+                                seqInputPtr,
+                                seqStartPos,
+                                subSeqStartPos});
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "seq_input", seqInputPtr, seqStartPos});
+  }
+  config.layerConfig.add_inputs();
+
+  // add start indices
+  if (starts.size()) {
+    vector<real> startsToVec;
+    flatten2dVector(starts, startsToVec);
+
+    MatrixPtr startMatrixPtr =
+        Matrix::create(starts.size(), starts[0].size(), false, false);
+    startMatrixPtr->copyFrom(startsToVec.data(), startsToVec.size());
+
+    config.inputDefs.push_back(
+        {INPUT_SELF_DEFINE_DATA, "starts", startMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(true);
+  }
+
+  // add end indices
+  if (ends.size()) {
+    vector<real> endsToVec;
+    flatten2dVector(ends, endsToVec);
+
+    MatrixPtr endMatrixPtr =
+        Matrix::create(ends.size(), ends[0].size(), false, false);
+    endMatrixPtr->copyFrom(endsToVec.data(), endsToVec.size());
+
+    config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA, "ends", endMatrixPtr});
+    config.layerConfig.add_inputs();
+    config.layerConfig.set_select_first(false);
+  }
+
+  testLayerGrad(config, "seq_slice", /*batchSize*/ 100, false, useGpu, false);
+}
+
+TEST(Layer, SeqSliceLayer) {
+  vector<int> seqStartPos;
+  vector<int> subSeqStartPos;
+  vector<vector<real>> starts;
+  vector<vector<real>> ends;
+
+  std::vector<bool> mode = {false};
+#ifdef PADDLE_WITH_CUDA
+  mode.push_back(true);
+#endif
+  genSeqInfo(seqStartPos, subSeqStartPos);
+  for (bool hasSubseq : {true, false}) {
+    LOG(INFO) << "hasSubSeq : " << hasSubseq;
+    genTestData(seqStartPos, subSeqStartPos, starts, ends, hasSubseq);
+    for (bool useGpu : mode) {
+      vector<vector<real>> tmp;
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, tmp, ends);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, tmp);
+      testSeqSliceLayer(
+          hasSubseq, useGpu, seqStartPos, subSeqStartPos, starts, ends);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 55427e2f12..da82946006 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
     for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
           if (useGpu) continue;
 #endif
           LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
new file mode 100644
index 0000000000..0288266c08
--- /dev/null
+++ b/paddle/inference/CMakeLists.txt
@@ -0,0 +1,29 @@
+set(FLUID_CORE_MODULES proto_desc paddle_memory executor prune init)
+
+cc_library(paddle_fluid_api
+    SRCS io.cc
+    DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+# Merge all modules into a single static library
+cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
+
+# Create shared library
+add_library(paddle_fluid_shared SHARED io.cc)
+
+target_circle_link_libraries(paddle_fluid_shared
+  ARCHIVE_START
+  ${GLOB_OP_LIB}
+  ARCHIVE_END
+  ${FLUID_CORE_MODULES})
+
+set_target_properties(paddle_fluid_shared PROPERTIES OUTPUT_NAME paddle_fluid)
+
+# install library & headers
+if(NOT WITH_C_API AND WITH_FLUID)
+  install(FILES io.h DESTINATION include/paddle/inference)
+  install(TARGETS paddle_fluid_shared DESTINATION lib)
+endif()
+
+if(WITH_TESTING)
+  add_subdirectory(tests/book)
+endif()
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
new file mode 100644
index 0000000000..60ad7af1c0
--- /dev/null
+++ b/paddle/inference/io.cc
@@ -0,0 +1,98 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/inference/io.h"
+
+#include <fstream>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/feed_fetch_type.h"
+
+namespace paddle {
+namespace inference {
+
+bool IsParameter(const framework::VarDesc* var,
+                 const framework::ProgramDesc& main_program) {
+  if (var->Persistable()) {
+    // There are many unreachable variables in the program
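+    // A persistable variable is treated as a parameter only when some
+    // non-feed op actually takes it as an input.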
+    for (size_t i = 0; i < main_program.Size(); ++i) {
+      const framework::BlockDesc& block = main_program.Block(i);
+      for (auto* op : block.AllOps()) {
+        if (op->Type() == framework::kFeedOpType) {
+          continue;
+        }
+        for (auto input_argument_name : op->InputArgumentNames()) {
+          if (input_argument_name == var->Name()) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const std::string& dirname,
+                      const framework::ProgramDesc& main_program) {
+  const framework::BlockDesc& global_block = main_program.Block(0);
+
+  framework::ProgramDesc* load_program = new framework::ProgramDesc();
+  framework::BlockDesc* load_block = load_program->MutableBlock(0);
+  for (auto* var : global_block.AllVars()) {
+    if (IsParameter(var, main_program)) {
+      VLOG(3) << "parameter's name: " << var->Name();
+
+      framework::VarDesc* new_var = load_block->Var(var->Name());
+      new_var->SetShape(var->Shape());
+      new_var->SetDataType(var->GetDataType());
+      new_var->SetType(var->GetType());
+      new_var->SetLoDLevel(var->GetLoDLevel());
+      new_var->SetPersistable(true);
+
+      // Append a load op that restores this parameter from its file.
+      framework::OpDesc* op = load_block->AppendOp();
+      op->SetType("load");
+      op->SetOutput("Out", {new_var->Name()});
+      op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+      op->CheckAttrs();
+    }
+  }
+  executor.Run(*load_program, &scope, 0, true, true);
+  delete load_program;
+}
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname) {
+  std::string model_filename = dirname + "/__model__";
+  LOG(INFO) << "loading model from " << model_filename;
+  std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+  std::string program_desc_str;
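+  // Read the entire serialized ProgramDesc into program_desc_str.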
+  inputfs.seekg(0, std::ios::end);
+  program_desc_str.resize(inputfs.tellg());
+  inputfs.seekg(0, std::ios::beg);
+  LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+  inputfs.read(&program_desc_str[0], program_desc_str.size());
+  inputfs.close();
+
+  std::unique_ptr<framework::ProgramDesc> main_program(
+      new framework::ProgramDesc(program_desc_str));
+
+  LoadPersistables(executor, scope, dirname, *main_program);
+  return main_program;
+}
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/io.h b/paddle/inference/io.h
new file mode 100644
index 0000000000..962b6c4e20
--- /dev/null
+++ b/paddle/inference/io.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace inference {
+
+void LoadPersistables(framework::Executor& executor,
+                      framework::Scope& scope,
+                      const std::string& dirname,
+                      const framework::ProgramDesc& main_program);
+
+std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
+                                             framework::Scope& scope,
+                                             const std::string& dirname);
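+
+// Example usage (a sketch, mirroring paddle/inference/tests/book):
+//   paddle::framework::Executor executor(paddle::platform::CPUPlace());
+//   paddle::framework::Scope scope;
+//   auto program = paddle::inference::Load(executor, scope, "/path/to/model");
+//   // Build feed/fetch maps from program->GetFeedTargetNames() and
+//   // program->GetFetchTargetNames(), then:
+//   // executor.Run(*program, &scope, feed_targets, fetch_targets);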
+
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000..d3798fb8fd
--- /dev/null
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(PYTHON_TESTS_DIR ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/tests)
+cc_test(test_inference_recognize_digits_mlp
+    SRCS test_inference_recognize_digits.cc
+    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
+    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
+set_tests_properties(test_inference_recognize_digits_mlp
+    PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
new file mode 100644
index 0000000000..26dc2aee04
--- /dev/null
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <time.h>
+#include <sstream>
+#include "gflags/gflags.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/inference/io.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+
+template <typename Place, typename T>
+void TestInference(const std::string& dirname,
+                   const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
+                   std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
+  // 1. Define place, executor and scope
+  auto place = Place();
+  auto executor = paddle::framework::Executor(place);
+  auto* scope = new paddle::framework::Scope();
+
+  // 2. Initialize the inference_program and load all parameters from file
+  auto inference_program = paddle::inference::Load(executor, *scope, dirname);
+
+  // 3. Get the feed_target_names and fetch_target_names
+  const std::vector<std::string>& feed_target_names =
+      inference_program->GetFeedTargetNames();
+  const std::vector<std::string>& fetch_target_names =
+      inference_program->GetFetchTargetNames();
+
+  // 4. Prepare inputs: set up maps for feed targets
+  std::map<std::string, const paddle::framework::LoDTensor*> feed_targets;
+  for (size_t i = 0; i < feed_target_names.size(); ++i) {
+    // Please make sure that cpu_feeds[i] corresponds to feed_target_names[i]
+    feed_targets[feed_target_names[i]] = cpu_feeds[i];
+  }
+
+  // 5. Define Tensor to get the outputs: set up maps for fetch targets
+  std::map<std::string, paddle::framework::LoDTensor*> fetch_targets;
+  for (size_t i = 0; i < fetch_target_names.size(); ++i) {
+    fetch_targets[fetch_target_names[i]] = cpu_fetchs[i];
+  }
+
+  // 6. Run the inference program
+  executor.Run(*inference_program, scope, feed_targets, fetch_targets);
+
+  delete scope;
+}
+
+TEST(inference, recognize_digits) {
+  if (FLAGS_dirname.empty()) {
+    LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
+  }
+
+  LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+  std::string dirname = FLAGS_dirname;
+
+  // 0. Call `paddle::framework::InitDevices()` to initialize all the devices.
+  // In unit tests, this is done in paddle/testing/paddle_gtest_main.cc.
+
+  paddle::framework::LoDTensor input;
+  srand(time(0));
+  float* input_ptr =
+      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
+  for (int i = 0; i < 784; ++i) {
+    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+  }
+  std::vector<paddle::framework::LoDTensor*> cpu_feeds;
+  cpu_feeds.push_back(&input);
+
+  paddle::framework::LoDTensor output1;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
+  cpu_fetchs1.push_back(&output1);
+
+  // Run inference on CPU
+  TestInference<paddle::platform::CPUPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs1);
+  LOG(INFO) << output1.dims();
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::framework::LoDTensor output2;
+  std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
+  cpu_fetchs2.push_back(&output2);
+
+  // Run inference on CUDA GPU
+  TestInference<paddle::platform::CUDAPlace, float>(
+      dirname, cpu_feeds, cpu_fetchs2);
+  LOG(INFO) << output2.dims();
+
+  EXPECT_EQ(output1.dims(), output2.dims());
+  EXPECT_EQ(output1.numel(), output2.numel());
+
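+  // Compare CPU and GPU results element-wise with an absolute tolerance.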
+  float err = 1E-3;
+  int count = 0;
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+#endif
+}
diff --git a/paddle/majel/.gitignore b/paddle/majel/.gitignore
deleted file mode 100644
index 1f5acdebb5..0000000000
--- a/paddle/majel/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-build
-third-party
\ No newline at end of file
diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt
deleted file mode 100644
index 93e5e2c22f..0000000000
--- a/paddle/majel/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-cc_library(place SRCS place.cc)
-cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
-
-cc_library(ddim SRCS ddim.cc)
-cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-
-nv_test(cuda_test SRCS cuda_test.cu)
-nv_test(dim_test SRCS dim_test.cu DEPS ddim)
diff --git a/paddle/majel/cuda_test.cu b/paddle/majel/cuda_test.cu
deleted file mode 100644
index 4067dda2f1..0000000000
--- a/paddle/majel/cuda_test.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include "gtest/gtest.h"
-
-#define CHECK_ERR(x)                 \
-  if (x != cudaSuccess) {            \
-    fprintf(stderr,                  \
-            "%s in %s at line %d\n", \
-            cudaGetErrorString(err), \
-            __FILE__,                \
-            __LINE__);               \
-    exit(-1);                        \
-  }
-
-__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < n) {
-    d_C[i] = d_A[i] + d_B[i];
-  }
-}
-
-TEST(Cuda, Equality) {
-  int n = 10;
-  // Memory allocation for h_A, h_B and h_C (in the host)
-  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
-  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
-  float h_C[10];
-  float *d_A, *d_B, *d_C;
-  cudaError_t err;
-  // Memory allocation for d_A, d_B and d_C (in the device)
-  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  // Copying memory to device
-  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  // Calling the kernel
-  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
-
-  // Copying results back to host
-  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
-  CHECK_ERR(err);
-
-  EXPECT_EQ(h_C[0], 1.0);
-  for (int i = 1; i < n - 1; ++i) {
-    EXPECT_EQ(h_C[i], 11.0);
-  }
-  EXPECT_EQ(h_C[9], 1.0);
-}
diff --git a/paddle/majel/ddim.cc b/paddle/majel/ddim.cc
deleted file mode 100644
index f32408ed53..0000000000
--- a/paddle/majel/ddim.cc
+++ /dev/null
@@ -1,222 +0,0 @@
-#include "paddle/majel/ddim.h"
-
-namespace majel {
-
-///@cond HIDDEN
-
-template <int i>
-Dim<i> make_dim(const int* d) {
-  return Dim<i>(*d, make_dim<i - 1>(d + 1));
-}
-
-template <>
-Dim<1> make_dim<1>(const int* d) {
-  return Dim<1>(*d);
-}
-
-void make_ddim(DDim& ddim, const int* dims, int n) {
-  switch (n) {
-    case 1:
-      ddim = make_dim<1>(dims);
-      break;
-    case 2:
-      ddim = make_dim<2>(dims);
-      break;
-    case 3:
-      ddim = make_dim<3>(dims);
-      break;
-    case 4:
-      ddim = make_dim<4>(dims);
-      break;
-    case 5:
-      ddim = make_dim<5>(dims);
-      break;
-    case 6:
-      ddim = make_dim<6>(dims);
-      break;
-    case 7:
-      ddim = make_dim<7>(dims);
-      break;
-    case 8:
-      ddim = make_dim<8>(dims);
-      break;
-    case 9:
-      ddim = make_dim<9>(dims);
-      break;
-    default:
-      throw std::invalid_argument(
-          "Dynamic dimensions must have between [1, 9] dimensions.");
-  }
-}
-
-///@endcond
-
-DDim make_ddim(std::initializer_list<int> dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, dims.begin(), dims.size());
-  return result;
-}
-
-DDim make_ddim(const std::vector<int>& dims) {
-  DDim result(make_dim(0));
-  make_ddim(result, &dims[0], dims.size());
-  return result;
-}
-
-///@cond HIDDEN
-// XXX For some reason, putting this in an anonymous namespace causes errors
-class DynamicMutableIndexer : public boost::static_visitor<int&> {
-public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int& operator()(Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
-private:
-  int idx_;
-};
-
-class DynamicConstIndexer : public boost::static_visitor<int> {
-public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
-
-  template <int D>
-  int operator()(const Dim<D>& dim) const {
-    return dim[idx_];
-  }
-
-private:
-  int idx_;
-};
-
-///@endcond
-
-int& DDim::operator[](int idx) {
-  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
-}
-
-int DDim::operator[](int idx) const {
-  return boost::apply_visitor(DynamicConstIndexer(idx), var);
-}
-
-bool DDim::operator==(DDim d) const {
-  if (var.which() != d.getVar().which()) {
-    return false;
-  } else {
-    std::vector<int> v1 = vectorize(*this);
-    std::vector<int> v2 = vectorize(d);
-
-    for (unsigned int i = 0; i < v1.size(); i++) {
-      if (v1[i] != v2[i]) {
-        return false;
-      }
-    }
-
-    return true;
-  }
-}
-
-bool DDim::operator!=(DDim d) const { return !(*this == d); }
-
-DDim DDim::operator+(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
-
-  std::vector<int> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] + v2[i]);
-  }
-
-  return make_ddim(v3);
-}
-
-DDim DDim::operator*(DDim d) const {
-  std::vector<int> v1 = vectorize(*this);
-  std::vector<int> v2 = vectorize(d);
-
-  std::vector<int> v3;
-
-  assert(v1.size() == v2.size());
-
-  for (unsigned int i = 0; i < v1.size(); i++) {
-    v3.push_back(v1[i] * v2[i]);
-  }
-
-  return make_ddim(v3);
-}
-
-int get(const DDim& ddim, int idx) { return ddim[idx]; }
-
-void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
-
-///@cond HIDDEN
-struct VectorizeVisitor : public boost::static_visitor<> {
-  std::vector<int>& vector;
-
-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
-
-  template <typename T>
-  void operator()(const T& t) {
-    vector.push_back(t.head);
-    this->operator()(t.tail);
-  }
-
-  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
-};
-///@endcond
-
-std::vector<int> vectorize(const DDim& ddim) {
-  std::vector<int> result;
-  VectorizeVisitor visitor(result);
-  boost::apply_visitor(visitor, ddim);
-  return result;
-}
-
-ssize_t product(const DDim& ddim) {
-  ssize_t result = 1;
-  std::vector<int> v = vectorize(ddim);
-  for (auto i : v) {
-    result *= i;
-  }
-  return result;
-}
-
-///\cond HIDDEN
-
-struct ArityVisitor : boost::static_visitor<int> {
-  template <int D>
-  int operator()(Dim<D>) const {
-    return D;
-  }
-};
-
-///\endcond
-
-int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
-
-///\cond HIDDEN
-
-struct DDimPrinter : boost::static_visitor<void> {
-  std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
-
-  template <typename T>
-  void operator()(const T& t) {
-    os << t;
-  }
-};
-
-///\endcond
-
-std::ostream& operator<<(std::ostream& os, const majel::DDim& ddim) {
-  DDimPrinter printer(os);
-  boost::apply_visitor(printer, ddim);
-  return os;
-}
-
-}  // namespace majel
diff --git a/paddle/majel/ddim.h b/paddle/majel/ddim.h
deleted file mode 100644
index 7be756f8c0..0000000000
--- a/paddle/majel/ddim.h
+++ /dev/null
@@ -1,109 +0,0 @@
-#pragma once
-
-#include <boost/variant.hpp>
-#include <initializer_list>
-#include <stdexcept>
-#include <vector>
-
-#include "paddle/majel/dim.h"
-
-namespace majel {
-
-namespace {
-typedef boost::variant<Dim<1>,
-                       Dim<2>,
-                       Dim<3>,
-                       Dim<4>,
-                       Dim<5>,
-                       Dim<6>,
-                       Dim<7>,
-                       Dim<8>,
-                       Dim<9>>
-    DDimVar;
-}
-
-/**
- * \brief A dynamically sized dimension.
- *
- * The number of dimensions must be between [1, 9].
- */
-struct DDim {
-  DDimVar var;
-
-  DDim() : var(Dim<1>()) {}
-
-  template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
-
-  template <int D>
-  DDim& operator=(const Dim<D>& in) {
-    var = in;
-    return *this;
-  }
-
-  int& operator[](int idx);
-  int operator[](int idx) const;
-
-  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) {
-    return var.apply_visitor(visitor);
-  }
-
-  template <typename Visitor>
-  typename Visitor::result_type apply_visitor(Visitor& visitor) const {
-    return var.apply_visitor(visitor);
-  }
-
-  DDimVar getVar() { return var; }
-
-  bool operator==(DDim d) const;
-
-  bool operator!=(DDim d) const;
-
-  DDim operator+(DDim d) const;
-
-  DDim operator*(DDim d) const;
-};
-
-/**
- * \brief Make a DDim from std::vector<int>
- *
- * \param dims An vector of ints. Must be sized between [1, 9]
- */
-DDim make_ddim(const std::vector<int>& dims);
-
-/**
- * \brief Make a DDim from an initializer list
- *
- * \param dims An initializer list of ints. Must be sized between [1, 9]
- *
- */
-DDim make_ddim(std::initializer_list<int> dims);
-
-int get(const DDim& dim, int idx);
-void set(DDim& dim, int idx, int val);
-
-std::vector<int> vectorize(const DDim& ddim);
-
-ssize_t product(const DDim& ddim);
-
-/**
- * \brief What is the length of this dimension?
- *
- * \param Dynamic dimension to inspect
- */
-
-int arity(const DDim& ddim);
-
-std::ostream& operator<<(std::ostream&, const majel::DDim&);
-
-}  // namespace majel
-
-namespace boost {
-
-template <typename T>
-T get(const majel::DDim& in) {
-  return boost::get<T>(in.var);
-}
-
-}  // namespace boost
diff --git a/paddle/majel/ddim_test.cc b/paddle/majel/ddim_test.cc
deleted file mode 100644
index a5b8a7c4d2..0000000000
--- a/paddle/majel/ddim_test.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-//#include <stdexcept>
-//#include <unittest/unittest.h>
-#include <sstream>
-#include <vector>
-
-#include "gtest/gtest.h"
-#include "paddle/majel/ddim.h"
-
-TEST(DDim, Equality) {
-  // construct a DDim from an initialization list
-  majel::DDim ddim = majel::make_ddim({9, 1, 5});
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // construct a DDim from a vector
-  std::vector<int> vec({9, 1, 5});
-  majel::DDim vddim = majel::make_ddim(vec);
-  EXPECT_EQ(ddim[0], 9);
-  EXPECT_EQ(ddim[1], 1);
-  EXPECT_EQ(ddim[2], 5);
-
-  // mutate a DDim
-  ddim[1] = 2;
-  EXPECT_EQ(ddim[1], 2);
-  majel::set(ddim, 0, 6);
-  EXPECT_EQ(majel::get(ddim, 0), 6);
-
-  // vectorize a DDim
-  std::vector<int> res_vec = majel::vectorize(vddim);
-  EXPECT_EQ(res_vec[0], 9);
-  EXPECT_EQ(res_vec[1], 1);
-  EXPECT_EQ(res_vec[2], 5);
-  majel::Dim<3> d(3, 2, 1);
-  res_vec = majel::vectorize(majel::DDim(d));
-  EXPECT_EQ(res_vec[0], 3);
-  EXPECT_EQ(res_vec[1], 2);
-  EXPECT_EQ(res_vec[2], 1);
-
-  // add two DDims
-  majel::DDim ddim_sum = ddim + vddim;
-  EXPECT_EQ(ddim_sum[0], 15);
-  EXPECT_EQ(ddim_sum[1], 3);
-  EXPECT_EQ(ddim_sum[2], 10);
-
-  // multiply two DDims
-  majel::DDim ddim_mul = ddim * vddim;
-  EXPECT_EQ(ddim_mul[0], 54);
-  EXPECT_EQ(ddim_mul[1], 2);
-  EXPECT_EQ(ddim_mul[2], 25);
-
-  // arity of a DDim
-  EXPECT_EQ(majel::arity(ddim), 3);
-
-  // product of a DDim
-  EXPECT_EQ(majel::product(vddim), 45);
-}
-
-TEST(DDim, Print) {
-  // print a DDim
-  std::stringstream ss;
-  majel::DDim ddim = majel::make_ddim({2, 3, 4});
-  ss << ddim;
-  EXPECT_EQ("2, 3, 4", ss.str());
-}
diff --git a/paddle/majel/detail/cuda_assert.h b/paddle/majel/detail/cuda_assert.h
deleted file mode 100644
index 9490d0ae3e..0000000000
--- a/paddle/majel/detail/cuda_assert.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#pragma once
-
-#define STRINGIFY(x) #x
-#define TOSTRING(x) STRINGIFY(x)
-
-#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
-#include <stdio.h>
-#define MAJEL_ASSERT(e)                                                       \
-  do {                                                                        \
-    if (!(e)) {                                                               \
-      printf(                                                                 \
-          "%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, TOSTRING(e)); \
-      asm("trap;");                                                           \
-    }                                                                         \
-  } while (0)
-
-#define MAJEL_ASSERT_MSG(e, m)                      \
-  do {                                              \
-    if (!(e)) {                                     \
-      printf("%s:%d Assertion `%s` failed (%s).\n", \
-             __FILE__,                              \
-             __LINE__,                              \
-             TOSTRING(e),                           \
-             m);                                    \
-      asm("trap;");                                 \
-    }                                               \
-  } while (0)
-#else
-#include <assert.h>
-#define MAJEL_ASSERT(e) assert(e)
-#define MAJEL_ASSERT_MSG(e, m) assert((e) && (m))
-#endif
diff --git a/paddle/majel/detail/hostdevice.h b/paddle/majel/detail/hostdevice.h
deleted file mode 100644
index e7de86b7b2..0000000000
--- a/paddle/majel/detail/hostdevice.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#pragma once
-
-#ifdef __CUDACC__
-#define HOSTDEVICE __host__ __device__
-#define HOST __host__
-#else
-#define HOSTDEVICE
-#define HOST
-#endif
diff --git a/paddle/majel/dim_test.cu b/paddle/majel/dim_test.cu
deleted file mode 100644
index a7d81e595b..0000000000
--- a/paddle/majel/dim_test.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-#include <thrust/device_vector.h>
-#include <sstream>
-
-#include "paddle/majel/dim.h"
-#include "gtest/gtest.h"
-
-__global__ void test(majel::Dim<2>* o) {
-    o[0] = majel::make_dim(5, 6);
-}
-
-__global__ void dyn_idx_gpu(int* o) {
-    auto d = majel::make_dim(5, 6);
-    o[0] = d[1];
-}
-
-TEST(Dim, Equality) {
-    // construct a Dim on the CPU
-    auto a = majel::make_dim(3, 4);
-    EXPECT_EQ(majel::get<0>(a), 3);
-    EXPECT_EQ(majel::get<1>(a), 4);
-
-    // construct a Dim on the GPU
-    thrust::device_vector<majel::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
-    EXPECT_EQ(majel::get<0>(a), 5);
-    EXPECT_EQ(majel::get<1>(a), 6);
-
-    // linearization
-    auto b = majel::make_dim(7, 8);
-    EXPECT_EQ(majel::linearize(a, b), 83);
-
-    // product
-    EXPECT_EQ(majel::product(a), 30);
-
-    // mutate a Dim
-    majel::get<1>(b) = 10;
-    EXPECT_EQ(majel::get<0>(b), 7);
-    EXPECT_EQ(majel::get<1>(b), 10);
-
-    // dynamic access
-    majel::get(b, 0) = 8;
-    b[1] = 11;
-    EXPECT_EQ(majel::get<0>(b), 8);
-    EXPECT_EQ(majel::get<1>(b), 11);
-    EXPECT_EQ(majel::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
-
-    // dynamic access on GPU
-    thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
-    EXPECT_EQ(res, 6);
-
-    // ex_prefix_mul
-    majel::Dim<3> c = majel::ex_prefix_mul(majel::Dim<3>(3, 4, 5));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 12);
-
-    // contiguous_strides
-    c = majel::contiguous_strides(majel::Dim<3>(10, 1, 10));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 0);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(10, 10, 1));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 10);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::contiguous_strides(majel::Dim<3>(1, 10, 10));
-    EXPECT_EQ(majel::get<0>(c), 0);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 10);
-    c = majel::contiguous_strides(majel::Dim<3>(2, 3, 4));
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 2);
-    EXPECT_EQ(majel::get<2>(c), 6);
-
-    // generate from an index
-    auto size = majel::make_dim(4, 5, 2);
-    c = majel::Dim<3>(14, size);
-    EXPECT_EQ(majel::get<0>(c), 2);
-    EXPECT_EQ(majel::get<1>(c), 3);
-    EXPECT_EQ(majel::get<2>(c), 0);
-    c = majel::Dim<3>(25, size);
-    EXPECT_EQ(majel::get<0>(c), 1);
-    EXPECT_EQ(majel::get<1>(c), 1);
-    EXPECT_EQ(majel::get<2>(c), 1);
-}
-
-TEST(Dim, Bool) {
-    auto a = majel::make_dim(3, 4);
-    auto b = majel::make_dim(5, 6);
-    auto c = majel::make_dim(3, 4);
-
-    // in_bounds check
-    EXPECT_TRUE(majel::contained(a, b));
-    EXPECT_FALSE(majel::contained(b, a));
-
-    // comparison
-    EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    majel::Dim<3> sizef(x, y, z);
-    majel::Dim<3> stridea(1, x, x*y);
-    majel::Dim<3> strideb(2, 2*x, 2*x*y);
-    majel::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(majel::contiguous(sizef, stridea));
-    EXPECT_FALSE(majel::contiguous(sizef, strideb));
-    EXPECT_FALSE(majel::contiguous(sizef, stridec));
-}
-
-TEST(Dim, Print) {
-    {
-        std::stringstream ss;
-        auto a = majel::make_dim(2, 3);
-        ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
-    }
-    {
-        std::stringstream ss;
-        ss << majel::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
-    }
-}
diff --git a/paddle/majel/place.cc b/paddle/majel/place.cc
deleted file mode 100644
index ca50b37843..0000000000
--- a/paddle/majel/place.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "paddle/majel/place.h"
-
-namespace majel {
-
-namespace detail {
-
-class PlacePrinter : public boost::static_visitor<> {
-private:
-  std::ostream& os_;
-
-public:
-  PlacePrinter(std::ostream& os) : os_(os) {}
-
-  void operator()(const CpuPlace&) { os_ << "CpuPlace"; }
-
-  void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; }
-};
-
-}  // namespace detail
-
-static Place the_default_place;
-
-void set_place(const Place& place) { the_default_place = place; }
-
-const Place& get_place() { return the_default_place; }
-
-const GpuPlace default_gpu() { return GpuPlace(0); }
-
-const CpuPlace default_cpu() { return CpuPlace(); }
-
-bool is_gpu_place(const Place& p) {
-  return boost::apply_visitor(IsGpuPlace(), p);
-}
-
-bool is_cpu_place(const Place& p) {
-  return !boost::apply_visitor(IsGpuPlace(), p);
-}
-
-bool places_are_same_class(const Place& p1, const Place& p2) {
-  return is_gpu_place(p1) == is_gpu_place(p2);
-}
-
-std::ostream& operator<<(std::ostream& os, const majel::Place& p) {
-  majel::detail::PlacePrinter printer(os);
-  boost::apply_visitor(printer, p);
-  return os;
-}
-
-}  // namespace majel
diff --git a/paddle/majel/place.h b/paddle/majel/place.h
deleted file mode 100644
index ad3dc3fe0b..0000000000
--- a/paddle/majel/place.h
+++ /dev/null
@@ -1,50 +0,0 @@
-#pragma once
-#include <boost/variant.hpp>
-#include <iostream>
-
-namespace majel {
-
-struct CpuPlace {
-  CpuPlace() {}  // WORKAROUND: for some reason, omitting this constructor
-                 // causes errors with boost 1.59 and OSX
-  // needed for variant equality comparison
-  inline bool operator==(const CpuPlace&) const { return true; }
-
-  inline bool operator!=(const CpuPlace&) const { return false; }
-};
-
-struct GpuPlace {
-  GpuPlace(int d) : device(d) {}
-
-  // needed for variant equality comparison
-  inline bool operator==(const GpuPlace& o) const { return device == o.device; }
-
-  inline bool operator!=(const GpuPlace& o) const { return !(*this == o); }
-
-  GpuPlace() : GpuPlace(0) {}
-  int device;
-};
-
-class IsGpuPlace : public boost::static_visitor<bool> {
-public:
-  bool operator()(const CpuPlace&) const { return false; }
-
-  bool operator()(const GpuPlace& gpu) const { return true; }
-};
-
-typedef boost::variant<GpuPlace, CpuPlace> Place;
-
-void set_place(const Place&);
-
-const Place& get_place();
-
-const GpuPlace default_gpu();
-const CpuPlace default_cpu();
-
-bool is_gpu_place(const Place&);
-bool is_cpu_place(const Place&);
-bool places_are_same_class(const Place&, const Place&);
-
-std::ostream& operator<<(std::ostream&, const majel::Place&);
-
-}  // namespace majel
diff --git a/paddle/majel/place_test.cc b/paddle/majel/place_test.cc
deleted file mode 100644
index 6a099ae6b6..0000000000
--- a/paddle/majel/place_test.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "paddle/majel/place.h"
-#include <sstream>
-#include "gtest/gtest.h"
-
-TEST(Place, Equality) {
-  majel::CpuPlace cpu;
-  majel::GpuPlace g0(0), g1(1), gg0(0);
-
-  EXPECT_EQ(cpu, cpu);
-  EXPECT_EQ(g0, g0);
-  EXPECT_EQ(g1, g1);
-  EXPECT_EQ(g0, gg0);
-
-  EXPECT_NE(g0, g1);
-
-  EXPECT_TRUE(majel::places_are_same_class(g0, gg0));
-  EXPECT_FALSE(majel::places_are_same_class(g0, cpu));
-}
-
-TEST(Place, Default) {
-  EXPECT_TRUE(majel::is_gpu_place(majel::get_place()));
-  EXPECT_TRUE(majel::is_gpu_place(majel::default_gpu()));
-  EXPECT_TRUE(majel::is_cpu_place(majel::default_cpu()));
-
-  majel::set_place(majel::CpuPlace());
-  EXPECT_TRUE(majel::is_cpu_place(majel::get_place()));
-}
-
-TEST(Place, Print) {
-  {
-    std::stringstream ss;
-    ss << majel::GpuPlace(1);
-    EXPECT_EQ("GpuPlace(1)", ss.str());
-  }
-  {
-    std::stringstream ss;
-    ss << majel::CpuPlace();
-    EXPECT_EQ("CpuPlace", ss.str());
-  }
-}
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 666a8b8368..17563bf5e1 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -48,7 +48,13 @@ public:
    */
   virtual void* alloc(size_t size) {
     void* ptr;
+#ifdef PADDLE_WITH_MKLDNN
+    // Use 4096-byte alignment for MKL-DNN; for the memory alignment
+    // requirement, refer to
+    // https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+    CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0);
+#else
     CHECK_EQ(posix_memalign(&ptr, 32ul, size), 0);
+#endif
     CHECK(ptr) << "Fail to allocate CPU memory: size=" << size;
     return ptr;
   }
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index de48b6fac9..e3eff59dc5 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -12,21 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cmath>
-#include <string.h>
 #include <paddle/utils/Logging.h>
+#include <string.h>
+#include <cmath>
 #include "BaseMatrix.h"
-#include "hl_matrix_ops.cuh"
-#include "hl_matrix_base.cuh"
-#include "hl_matrix_apply.cuh"
-#include "SIMDFunctions.h"
 #include "MathFunctions.h"
+#include "NEONFunctions.h"
+#include "SIMDFunctions.h"
+#include "hl_matrix_apply.cuh"
+#include "hl_matrix_base.cuh"
+#include "hl_matrix_ops.cuh"
 
 namespace paddle {
 
 const char* SPARSE_SUPPORT_ERROR = "Sparse Matrix/Vector is not supported.";
 
-template<class T>
+template <class T>
 template <class Op>
 int BaseMatrixT<T>::applyUnary(Op op) {
   MatrixOffset offset(0, 0);
@@ -34,9 +35,11 @@ int BaseMatrixT<T>::applyUnary(Op op) {
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
-int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols,
+int BaseMatrixT<T>::applyUnary(Op op,
+                               int numRows,
+                               int numCols,
                                MatrixOffset& offset) {
   CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
   int dimM = numRows;
@@ -56,7 +59,7 @@ int BaseMatrixT<T>::applyUnary(Op op, int numRows, int numCols,
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
 int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
   CHECK(height_ == b.height_ && width_ == b.width_)
@@ -67,18 +70,23 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b) {
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
-int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
-                                MatrixOffset& offset) {
+int BaseMatrixT<T>::applyBinary(
+    Op op, BaseMatrixT& b, int numRows, int numCols, MatrixOffset& offset) {
   applyBinary(op, b, numRows, numCols, offset, false_type(), false_type());
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op, class bAsRowVector, class bAsColVector>
-int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
-                            MatrixOffset& offset, bAsRowVector, bAsColVector) {
+int BaseMatrixT<T>::applyBinary(Op op,
+                                BaseMatrixT& b,
+                                int numRows,
+                                int numCols,
+                                MatrixOffset& offset,
+                                bAsRowVector,
+                                bAsColVector) {
   CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
   CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
   CHECK(useGpu_ == b.useGpu_) << "Matrix type mismatch";
@@ -91,8 +99,8 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
   T* A = data_;
   T* B = b.data_;
   CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
-                           offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
   CHECK_LE(dimM + offset.aRow_, this->height_);
   CHECK_LE(dimN + offset.aCol_, this->width_);
   if (!bAsRowVector::value && !bAsColVector::value) {
@@ -115,7 +123,7 @@ int BaseMatrixT<T>::applyBinary(Op op, BaseMatrixT& b, int numRows, int numCols,
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
 int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
   CHECK_EQ(height_, b.height_);
@@ -129,21 +137,29 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c) {
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
-int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
-                                 int numRows, int numCols,
+int BaseMatrixT<T>::applyTernary(Op op,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 int numRows,
+                                 int numCols,
                                  MatrixOffset& offset) {
   applyTernary(op, b, c, numRows, numCols, offset, false_type(), false_type());
 
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op, class cAsRowVector, class cAsColVector>
-int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
-                                 int numRows, int numCols, MatrixOffset& offset,
-                                 cAsRowVector, cAsColVector) {
+int BaseMatrixT<T>::applyTernary(Op op,
+                                 BaseMatrixT& b,
+                                 BaseMatrixT& c,
+                                 int numRows,
+                                 int numCols,
+                                 MatrixOffset& offset,
+                                 cAsRowVector,
+                                 cAsColVector) {
   CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
   CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
   CHECK(!c.isSparse()) << SPARSE_SUPPORT_ERROR;
@@ -160,10 +176,10 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
   T* B = b.data_;
   T* C = c.data_;
   CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
-                           offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
-                           offset.cRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
 
   CHECK_LE(dimM + offset.aRow_, this->height_);
   CHECK_LE(dimN + offset.aCol_, this->width_);
@@ -180,21 +196,21 @@ int BaseMatrixT<T>::applyTernary(Op op, BaseMatrixT& b, BaseMatrixT& c,
   }
 
   if (true == useGpu_) {
-    hl_gpu_apply_ternary_op
-      <T, Op, cAsRowVector::value, cAsColVector::value>(
+    hl_gpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
         op, A, B, C, dimM, dimN, lda, ldb, ldc);
   } else {
-    hl_cpu_apply_ternary_op
-      <T, Op, cAsRowVector::value, cAsColVector::value>(
+    hl_cpu_apply_ternary_op<T, Op, cAsRowVector::value, cAsColVector::value>(
         op, A, B, C, dimM, dimN, lda, ldb, ldc);
   }
 
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
+int BaseMatrixT<T>::applyQuaternary(Op op,
+                                    BaseMatrixT& b,
+                                    BaseMatrixT& c,
                                     BaseMatrixT& d) {
   CHECK_EQ(height_, b.height_);
   CHECK_EQ(width_, b.width_);
@@ -209,10 +225,14 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
   return 0;
 }
 
-template<class T>
+template <class T>
 template <class Op>
-int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
-                                    BaseMatrixT& d, int numRows, int numCols,
+int BaseMatrixT<T>::applyQuaternary(Op op,
+                                    BaseMatrixT& b,
+                                    BaseMatrixT& c,
+                                    BaseMatrixT& d,
+                                    int numRows,
+                                    int numCols,
                                     MatrixOffset& offset) {
   CHECK(!this->isSparse()) << SPARSE_SUPPORT_ERROR;
   CHECK(!b.isSparse()) << SPARSE_SUPPORT_ERROR;
@@ -234,12 +254,12 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
   T* C = c.data_;
   T* D = d.data_;
   CAL_MATRIX_START_ADDRESS(A, height_, width_, lda, offset.aCol_, offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
-                           offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
-                           offset.cRow_);
-  CAL_MATRIX_START_ADDRESS(D, d.height_, d.width_, ldd, offset.dCol_,
-                           offset.dRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
+  CAL_MATRIX_START_ADDRESS(
+      D, d.height_, d.width_, ldd, offset.dCol_, offset.dRow_);
 
   CHECK_LE(dimM + offset.aRow_, this->height_);
   CHECK_LE(dimN + offset.aCol_, this->width_);
@@ -250,22 +270,29 @@ int BaseMatrixT<T>::applyQuaternary(Op op, BaseMatrixT& b, BaseMatrixT& c,
   CHECK_LE(dimM + offset.dRow_, d.height_);
   CHECK_LE(dimN + offset.dCol_, d.width_);
   if (true == useGpu_) {
-    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb,
-                               ldc, ldd);
+    hl_gpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
   } else {
-    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb,
-                               ldc, ldd);
+    hl_cpu_apply_quaternary_op(op, A, B, C, D, dimM, dimN, lda, ldb, ldc, ldd);
   }
 
   return 0;
 }
 
-template<class T>
-template <class Agg, class Op, class Saver, class aAsRowVector,
+template <class T>
+template <class Agg,
+          class Op,
+          class Saver,
+          class aAsRowVector,
           class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
-                              int numRows, int numCols, MatrixOffset& offset,
-                              aAsRowVector, aAsColVector) {
+int BaseMatrixT<T>::aggregate(Agg agg,
+                              Op op,
+                              Saver sv,
+                              BaseMatrixT& b,
+                              int numRows,
+                              int numCols,
+                              MatrixOffset& offset,
+                              aAsRowVector,
+                              aAsColVector) {
   CHECK_EQ(useGpu_, b.useGpu_);
 
   int ld = stride_;
@@ -273,10 +300,10 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
 
   T* dst = data_;
   T* B = b.data_;
-  CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_,
-                           offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
-                           offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
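+  // Tag dispatch on the destination shape: a row-vector destination reduces
+  // each column of b (matrix_column_op), while a column-vector destination
+  // reduces each row (matrix_row_op).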
 
   if (aAsRowVector::value && !aAsColVector::value) {
     if (useGpu_) {
@@ -297,12 +324,21 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
   return 0;
 }
 
-template<class T>
-template <class Agg, class Op, class Saver, class aAsRowVector,
+template <class T>
+template <class Agg,
+          class Op,
+          class Saver,
+          class aAsRowVector,
           class aAsColVector>
-int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
-                              BaseMatrixT& c, int numRows, int numCols,
-                              MatrixOffset& offset, aAsRowVector,
+int BaseMatrixT<T>::aggregate(Agg agg,
+                              Op op,
+                              Saver sv,
+                              BaseMatrixT& b,
+                              BaseMatrixT& c,
+                              int numRows,
+                              int numCols,
+                              MatrixOffset& offset,
+                              aAsRowVector,
                               aAsColVector) {
   CHECK_EQ(useGpu_, b.useGpu_);
   CHECK_EQ(useGpu_, c.useGpu_);
@@ -314,28 +350,28 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
   T* dst = data_;
   T* B = b.data_;
   T* C = c.data_;
-  CAL_MATRIX_START_ADDRESS(dst, height_, width_, ld, offset.aCol_,
-                           offset.aRow_);
-  CAL_MATRIX_START_ADDRESS(B, b.height_, b.width_, ldb, offset.bCol_,
-                           offset.bRow_);
-  CAL_MATRIX_START_ADDRESS(C, c.height_, c.width_, ldc, offset.cCol_,
-                           offset.cRow_);
+  CAL_MATRIX_START_ADDRESS(
+      dst, height_, width_, ld, offset.aCol_, offset.aRow_);
+  CAL_MATRIX_START_ADDRESS(
+      B, b.height_, b.width_, ldb, offset.bCol_, offset.bRow_);
+  CAL_MATRIX_START_ADDRESS(
+      C, c.height_, c.width_, ldc, offset.cCol_, offset.cRow_);
 
   if (aAsRowVector::value && !aAsColVector::value) {
     if (useGpu_) {
-      hl_gpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B,
-                              ldb, C, ldc);
+      hl_gpu_matrix_column_op(
+          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
     } else {
-      hl_cpu_matrix_column_op(agg, op, sv, numRows, numCols, dst, B,
-                              ldb, C, ldc);
+      hl_cpu_matrix_column_op(
+          agg, op, sv, numRows, numCols, dst, B, ldb, C, ldc);
     }
   } else if (!aAsRowVector::value && aAsColVector::value) {
     if (useGpu_) {
-      hl_gpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B,
-                           ldb, C, ldc);
+      hl_gpu_matrix_row_op(
+          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
     } else {
-      hl_cpu_matrix_row_op(agg, op, sv, numRows, numCols, dst, ld, B,
-                           ldb, C, ldc);
+      hl_cpu_matrix_row_op(
+          agg, op, sv, numRows, numCols, dst, ld, B, ldb, C, ldc);
     }
   } else {
     LOG(FATAL) << "not supported";
@@ -350,15 +386,19 @@ int BaseMatrixT<T>::aggregate(Agg agg, Op op, Saver sv, BaseMatrixT& b,
  */
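+// Each DEFINE_MATRIX_*_OP(Name, expr) macro below expands to a small functor
+// (e.g. unary::Neg<T>) that applies expr to every element; the apply*()
+// helpers then run that functor on either CPU or GPU.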
 
 DEFINE_MATRIX_UNARY_OP(Neg, a = -a);
-template<class T>
-void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
+template <class T>
+void BaseMatrixT<T>::neg() {
+  applyUnary(unary::Neg<T>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
-template<>
-void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
+template <>
+void BaseMatrixT<real>::exp2() {
+  applyUnary(unary::Exp<real>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
-template<>
+template <>
 void BaseMatrixT<real>::log2() {
   if (useGpu_) {
     applyUnary(unary::Log<real>());
@@ -368,30 +408,42 @@ void BaseMatrixT<real>::log2() {
 }
 
 DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
-template<>
-void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
+template <>
+void BaseMatrixT<real>::sqrt2() {
+  applyUnary(unary::Sqrt<real>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
-template<class T>
-void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
+template <class T>
+void BaseMatrixT<T>::square2() {
+  applyUnary(unary::Square<T>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
-template<class T>
-void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
+template <class T>
+void BaseMatrixT<T>::reciprocal2() {
+  applyUnary(unary::Reciprocal<T>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
-template<class T>
-void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
+template <class T>
+void BaseMatrixT<T>::abs2() {
+  applyUnary(unary::Abs<T>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
-template<class T>
-void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
+template <class T>
+void BaseMatrixT<T>::sign2() {
+  applyUnary(unary::Sign<T>());
+}
 
 DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
-template<class T>
-void BaseMatrixT<T>::zero() { applyUnary(unary::Zero<T>()); }
+template <class T>
+void BaseMatrixT<T>::zero() {
+  applyUnary(unary::Zero<T>());
+}
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
   int numRows = height_;
   int numCols = numColumns;
@@ -400,11 +452,13 @@ void BaseMatrixT<T>::zeroAtOffset(int64_t columnOffset, int64_t numColumns) {
 }
 
 DEFINE_MATRIX_UNARY_OP(One, a = 1);
-template<class T>
-void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
+template <class T>
+void BaseMatrixT<T>::one() {
+  applyUnary(unary::One<T>());
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
-template<>
+template <>
 void BaseMatrixT<real>::pow2(real p) {
   if (useGpu_) {
     applyUnary(unary::Pow<real>(p));
@@ -414,44 +468,67 @@ void BaseMatrixT<real>::pow2(real p) {
 }
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a -= p);
-template<class T>
-void BaseMatrixT<T>::subScalar(T p) { applyUnary(unary::SubScalar<T>(p)); }
+template <class T>
+void BaseMatrixT<T>::subScalar(T p) {
+  applyUnary(unary::SubScalar<T>(p));
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a *= p);
-template<class T>
-void BaseMatrixT<T>::mulScalar(T p) { applyUnary(unary::MulScalar<T>(p)); }
+template <class T>
+void BaseMatrixT<T>::mulScalar(T p) {
+  applyUnary(unary::MulScalar<T>(p));
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a /= p);
-template<class T>
-void BaseMatrixT<T>::divScalar(T p) { applyUnary(unary::DivScalar<T>(p)); }
+template <class T>
+void BaseMatrixT<T>::divScalar(T p) {
+  applyUnary(unary::DivScalar<T>(p));
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Assign, ONE_PARAMETER, a = p);
-template<class T>
-void BaseMatrixT<T>::assign(T p) { applyUnary(unary::Assign<T>(p)); }
+template <class T>
+void BaseMatrixT<T>::assign(T p) {
+  applyUnary(unary::Assign<T>(p));
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Add, ONE_PARAMETER, a += p);
-template<class T>
-void BaseMatrixT<T>::add(T p) { applyUnary(unary::Add<T>(p)); }
+template <class T>
+void BaseMatrixT<T>::add(T p) {
+  applyUnary(unary::Add<T>(p));
+}
 
 DEFINE_MATRIX_UNARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = a * p1 + p2);
-template<class T>
-void BaseMatrixT<T>::add(T p1, T p2) { applyUnary(unary::Add2<T>(p1, p2)); }
+template <class T>
+void BaseMatrixT<T>::add(T p1, T p2) {
+  applyUnary(unary::Add2<T>(p1, p2));
+}
 
-DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
+DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip,
+                                 TWO_PARAMETER,
                                  a = a < p1 ? p1 : (a > p2 ? p2 : a));
-template<class T>
-void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }
+template <class T>
+void BaseMatrixT<T>::clip(T p1, T p2) {
+  applyUnary(unary::Clip<T>(p1, p2));
+}
 
-DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative,
+                                  TWO_PARAMETER,
+                                  a = b < p1 ? 0 : (b > p2 ? 0 : 1));
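+// Backward pass of clip(): keep the gradient (1) where p1 <= b <= p2 and
+// zero it where the forward clip saturated.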
+template <class T>
+void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
+}
+
+DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar,
+                                 ONE_PARAMETER,
                                  a = a > p ? 1.0f : 0.0f);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::biggerThanScalar(T p) {
   applyUnary(unary::BiggerThanScalar<T>(p));
 }
 
-DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER,
-                                 a = a > p ? a : p);
-template<class T>
+DEFINE_MATRIX_UNARY_PARAMETER_OP(DownClip, ONE_PARAMETER, a = a > p ? a : p);
+template <class T>
 void BaseMatrixT<T>::downClip(T p) {
   applyUnary(unary::DownClip<T>(p));
 }
@@ -462,12 +539,12 @@ void BaseMatrixT<T>::downClip(T p) {
  */
 
 DEFINE_MATRIX_BINARY_OP(Add, a += b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add(BaseMatrixT& b) {
   applyBinary(binary::Add<T>(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::add(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::Add<real>(), b);
@@ -478,7 +555,7 @@ void BaseMatrixT<real>::add(BaseMatrixT& b) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   if (columnOffset + b.width_ <= width_) {
     int numRows = height_;
@@ -497,43 +574,53 @@ void BaseMatrixT<T>::addAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addP2P(BaseMatrixT& b) {
   T* A = data_;
   T* B = b.data_;
   int dimM = height_;
   int dimN = width_;
 
-  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>
-    (binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
+  hl_gpu_apply_binary_op<T, binary::Add<T>, 0, 0>(
+      binary::Add<T>(), A, B, dimM, dimN, dimN, dimN);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addColVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::Add<T>(), b, numRows, numCols, offset, false_type(),
+  applyBinary(binary::Add<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
               true_type() /* bAsColVector */);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addRowVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::Add<T>(), b, numRows, numCols, offset,
-              true_type() /* bAsRowVector */, false_type());
+  applyBinary(binary::Add<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Add1, ONE_PARAMETER, a += b * p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
   applyBinary(binary::Add1<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
-template<>
+template <>
 void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
   if (useGpu_) {
     applyBinary(binary::Pow<real>(p), b);
@@ -543,36 +630,52 @@ void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Add2, TWO_PARAMETER, a = p1 * a + p2 * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, T p2) {
   applyBinary(binary::Add2<T>(p1, p2), b);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addBias(BaseMatrixT& b, T scale) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::Add1<T>(scale), b, numRows, numCols, offset,
-              true_type() /* bAsRowVector */, false_type());
+  applyBinary(binary::Add1<T>(scale),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
 }
 
 DEFINE_MATRIX_BINARY_OP(Sub, a -= b);
-template<class T>
-void BaseMatrixT<T>::sub(BaseMatrixT& b) { applyBinary(binary::Sub<T>(), b); }
+template <class T>
+void BaseMatrixT<T>::sub(BaseMatrixT& b) {
+  applyBinary(binary::Sub<T>(), b);
+}
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Sub1, ONE_PARAMETER, a -= b * p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sub(BaseMatrixT& b, T p) {
   applyBinary(binary::Sub1<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Relu, b = a > 0.0f ? a : 0.0f);
-template<class T>
-void BaseMatrixT<T>::relu(BaseMatrixT& b) { applyBinary(binary::Relu<T>(), b); }
+template <class T>
+void BaseMatrixT<T>::relu(BaseMatrixT& b) {
+  applyBinary(binary::Relu<T>(), b);
+}
+
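+// On ARM builds the float specialization calls the NEON-optimized kernel
+// directly instead of going through the generic applyBinary path.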
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+template <>
+void BaseMatrixT<float>::relu(BaseMatrixT& b) {
+  neon::relu(data_, b.data_, height_ * width_);
+}
+#endif
 
 DEFINE_MATRIX_BINARY_OP(ReluDerivative, a *= (b > 0.0f ? 1.0f : 0.0f));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::reluDerivative(BaseMatrixT& b) {
   applyBinary(binary::ReluDerivative<T>(), b);
 }
@@ -582,7 +685,7 @@ DEFINE_MATRIX_BINARY_OP(Softrelu, const T THRESHOLD = 40.0;
                                               ? THRESHOLD
                                               : ((a < -THRESHOLD) ? (-THRESHOLD)
                                                                   : a))));
-template<>
+template <>
 void BaseMatrixT<real>::softrelu(BaseMatrixT& b) {
   applyBinary(binary::Softrelu<real>(), b);
 }
@@ -592,97 +695,100 @@ DEFINE_MATRIX_BINARY_OP(
     a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
                                 ? THRESHOLD
                                 : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
-template<>
+template <>
 void BaseMatrixT<real>::softreluDerivative(BaseMatrixT& b) {
   applyBinary(binary::SoftreluDerivative<real>(), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(Brelu, TWO_PARAMETER, b = a > p1 ? a : p1;
                                   b = b < p2 ? b : p2);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::brelu(BaseMatrixT& b) {
-  int p1 = 0, p2 = 24;    //! TODO(yuyang18): Make p1,p2 configuable.
+  int p1 = 0, p2 = 24;  //! TODO(yuyang18): Make p1, p2 configurable.
   applyBinary(binary::Brelu<T>(p1, p2), b);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative, TWO_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(BreluDerivative,
+                                  TWO_PARAMETER,
                                   a *= (b > p1 && b < p2) ? 1.0 : 0.0);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
   int p1 = 0, p2 = 24;
   applyBinary(binary::BreluDerivative<T>(p1, p2), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::square2(BaseMatrixT& b) {
   applyBinary(binary::Square<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(SquareDerivative, a *= 2.0 * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
   applyBinary(binary::SquareDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(Tanh,
-    T tmp = -2.0 * a;
-    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template<>
+DEFINE_MATRIX_BINARY_OP(Tanh, T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
+template <>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
   applyBinary(binary::Tanh<real>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(TanhDerivative, a *= 1 - b * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::tanhDerivative(BaseMatrixT& b) {
   applyBinary(binary::TanhDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanh, TWO_PARAMETER,
-                                  b = p1 *
-                                      (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
-template<>
+DEFINE_MATRIX_BINARY_PARAMETER_OP(
+    ScaledTanh, TWO_PARAMETER, b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0));
+template <>
 void BaseMatrixT<real>::scaledTanh(BaseMatrixT& b, real p1, real p2) {
   applyBinary(binary::ScaledTanh<real>(p1, p2), b);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative, TWO_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ScaledTanhDerivative,
+                                  TWO_PARAMETER,
                                   a *= p2 * (p1 - b * b));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
   applyBinary(binary::ScaledTanhDerivative<T>(p1 * p1, p2 / p1), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
   applyBinary(binary::Reciprocal<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(ReciprocalDerivative, a *= -b * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
   applyBinary(binary::ReciprocalDerivative<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
-template<class T>
-void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
+template <class T>
+void BaseMatrixT<T>::abs2(BaseMatrixT& b) {
+  applyBinary(binary::Abs<T>(), b);
+}
 
 DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::absDerivative(BaseMatrixT& b) {
   applyBinary(binary::AbsDerivative<T>(), b);
 }
 
-DEFINE_MATRIX_BINARY_OP(
-    Sigmoid, const T THRESHOLD_MIN = -40.0; const T THRESHOLD_MAX = 13.0;
-    T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
-                                   : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
-    b = 1.0f / (1.0f + exp(-tmp)));
-template<>
+DEFINE_MATRIX_BINARY_OP(Sigmoid, const T THRESHOLD_MIN = -40.0;
+                        const T THRESHOLD_MAX = 13.0;
+                        T tmp = (a < THRESHOLD_MIN)
+                                    ? THRESHOLD_MIN
+                                    : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
+                        b = 1.0f / (1.0f + exp(-tmp)));
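+// Clamping the input to [THRESHOLD_MIN, THRESHOLD_MAX] keeps exp() within
+// single-precision range.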
+template <>
 void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::Sigmoid<real>(), b);
@@ -716,31 +822,31 @@ void BaseMatrixT<real>::sigmoid(BaseMatrixT& b) {
 }
 
 DEFINE_MATRIX_BINARY_OP(SigmoidDerivative, a *= b * (1 - b));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sigmoidDerivative(BaseMatrixT& b) {
   applyBinary(binary::SigmoidDerivative<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(ExpDerivative, a *= b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
   applyBinary(binary::ExpDerivative<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
   applyBinary(binary::Sign<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
-template<>
+template <>
 void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
   applyBinary(binary::Exp<real>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
-template<>
+template <>
 void BaseMatrixT<real>::log2(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::Log<real>(), b);
@@ -750,13 +856,13 @@ void BaseMatrixT<real>::log2(BaseMatrixT& b) {
 }
 
 DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
-template<>
+template <>
 void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
   applyBinary(binary::Sqrt<real>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(InvSqrt, a = 1.0f / sqrt(b));
-template<>
+template <>
 void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::InvSqrt<real>(), b);
@@ -768,37 +874,37 @@ void BaseMatrixT<real>::invSqrt(BaseMatrixT& b) {
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(IsEqual, ONE_PARAMETER, a = (b == p));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::isEqualTo(BaseMatrixT& b, T value) {
   applyBinary(binary::IsEqual<T>(value), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(AddScalar, ONE_PARAMETER, a = b + p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addScalar(BaseMatrixT& b, T p) {
   applyBinary(binary::AddScalar<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(SubScalar, ONE_PARAMETER, a = b - p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::subScalar(BaseMatrixT& b, T p) {
   applyBinary(binary::SubScalar<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(MulScalar, ONE_PARAMETER, a = b * p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::mulScalar(BaseMatrixT& b, T p) {
   applyBinary(binary::MulScalar<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(DivScalar, ONE_PARAMETER, a = b / p);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::divScalar(BaseMatrixT& b, T p) {
   applyBinary(binary::DivScalar<T>(p), b);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(ScalarDiv, ONE_PARAMETER, a = p / b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
   applyBinary(binary::ScalarDiv<T>(p), b);
 }
@@ -810,20 +916,20 @@ void BaseMatrixT<T>::scalarDiv(BaseMatrixT& b, T p) {
 
 DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropy,
                          a = -c * log(b) - (1 - c) * log(1 - b));
-template<>
+template <>
 void BaseMatrixT<real>::softCrossEntropy(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::SoftCrossEntropy<real>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(SoftCrossEntropyBp, a += (b - c) / (b * (1 - b)));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::softCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::SoftCrossEntropyBp<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropy,
                          a = c > 0.5 ? -log(b) : -log(1.0 - b));
-template<>
+template <>
 void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
                                                 BaseMatrixT& c) {
   if (useGpu_) {
@@ -851,70 +957,73 @@ void BaseMatrixT<real>::binaryLabelCrossEntropy(BaseMatrixT& b,
 
 DEFINE_MATRIX_TERNARY_OP(BinaryCrossEntropyBp,
                          a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::binaryLabelCrossEntropyBp(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::BinaryCrossEntropyBp<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(Add, a = b + c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::Add<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add1, TWO_PARAMETER, a = p1 * b + p2 * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
   applyTernary(ternary::Add1<T>(p1, p2), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(Sub, a = b - c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sub(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::Sub<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(Sub1, TWO_PARAMETER, a = p1 * b - p2 * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sub(BaseMatrixT& b, T p1, BaseMatrixT& c, T p2) {
   applyTernary(ternary::Sub1<T>(p1, p2), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(Add2, a = a + b + c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::Add2<T>(), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(Add3,
+                                   THREE_PARAMETER,
                                    a = p1 * a + p2 * b + p3 * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::add2(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
   applyTernary(ternary::Add3<T>(p1, p2, p3), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(SgdUpdate,
+                                   THREE_PARAMETER,
                                    c = p2 * c - p1 * (b + p3 * a);
                                    a = a + c);
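+// Momentum SGD with weight decay: b holds the gradient, p1 the learning rate,
+// p2 the momentum and p3 the decay rate; c accumulates the momentum step that
+// is then applied to the parameter matrix a.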
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad
                                BaseMatrixT& c,  // mom
-                               T p1,        // learningRate,
-                               T p2,        // momentum,
-                               T p3) {      // decayRate
+                               T p1,            // learningRate,
+                               T p2,            // momentum,
+                               T p3) {          // decayRate
   applyTernary(ternary::SgdUpdate<T>(p1, p2, p3), b, c);
 }
 
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate, THREE_PARAMETER,
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(SgdUpdate,
+                                      THREE_PARAMETER,
                                       c = p2 * c - p1 * d * (b + p3 * a);
                                       a += c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::sgdUpdate(BaseMatrixT& b,  // grad,
                                BaseMatrixT& c,  // mom,
                                BaseMatrixT& d,  // lr,
-                               T p1,        // learningRate,
-                               T p2,        // momentum,
-                               T p3) {      // decayRate
+                               T p1,            // learningRate,
+                               T p2,            // momentum,
+                               T p3) {          // decayRate
   applyQuaternary(quaternary::SgdUpdate<T>(p1, p2, p3), b, c, d);
 }
 
@@ -922,19 +1031,22 @@ DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p * b;
                                   a = (a > lambda)
                                           ? (a - lambda)
                                           : (a < -lambda) ? (a + lambda) : 0);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::applyL1(BaseMatrixT& lr, T learningRate, T decayRate) {
   applyBinary(binary::ApplyL1<T>(learningRate * decayRate), lr);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::applyL1(BaseMatrixT& lr,
                                 real learningRate,
                                 real decayRate) {
   if (useGpu_) {
     applyBinary(binary::ApplyL1<real>(learningRate * decayRate), lr);
   } else {
-    simd::decayL1(this->data_, this->data_, lr.data_, learningRate * decayRate,
+    simd::decayL1(this->data_,
+                  this->data_,
+                  lr.data_,
+                  learningRate * decayRate,
                   height_ * width_);
   }
 }
@@ -943,24 +1055,25 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(ApplyL1, ONE_PARAMETER, T lambda = p;
                                  a = (a > lambda)
                                          ? (a - lambda)
                                          : (a < -lambda) ? (a + lambda) : 0);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::applyL1(T learningRate, T decayRate) {
   applyUnary(unary::ApplyL1<T>(learningRate * decayRate));
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::applyL1(real learningRate, real decayRate) {
   if (useGpu_) {
     applyUnary(unary::ApplyL1<real>(learningRate * decayRate));
   } else {
-    simd::decayL1(this->data_, this->data_, learningRate * decayRate,
-                  height_ * width_);
+    simd::decayL1(
+        this->data_, this->data_, learningRate * decayRate, height_ * width_);
   }
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2, ONE_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ApplyL2,
+                                  ONE_PARAMETER,
                                   a *= (1.0f / (1.0f + p * b)));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
   if (useGpu_) {
     applyBinary(binary::ApplyL2<T>(learningRate * decayRate), lr);
@@ -973,32 +1086,33 @@ void BaseMatrixT<T>::applyL2(BaseMatrixT& lr, T learningRate, T decayRate) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::applyL2(T learningRate, T decayRate) {
   BaseMatrixT<T>::mulScalar(1.0f / (1.0f + learningRate * decayRate));
 }
 
 DEFINE_MATRIX_BINARY_OP(DotMul, a *= b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotMul(BaseMatrixT& b) {
   applyBinary(binary::DotMul<T>(), b);
 }
 
 DEFINE_MATRIX_TERNARY_OP(DotMul, a = b * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotMul(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::DotMul<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(DotDiv, a = (b == 0.0) ? 0.0 : b / c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::DotDiv<T>(), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotDiv2P,
+                                   TWO_PARAMETER,
                                    a = (b + p1) / (c + p2));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotDiv(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
   applyTernary(ternary::DotDiv2P<T>(p1, p2), b, c);
 }
@@ -1008,7 +1122,7 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLoss, const T THRESHOLD = 40.0; a = b - c;
                                     ? THRESHOLD
                                     : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
                             a = log(1 + exp(a)) - a * d);
-template<>
+template <>
 void BaseMatrixT<real>::rankLoss(BaseMatrixT& b,
                                  BaseMatrixT& c,
                                  BaseMatrixT& d) {
@@ -1019,8 +1133,9 @@ DEFINE_MATRIX_QUATERNARY_OP(RankLossBp, const T THRESHOLD = 40.0; a = b - c;
                             a = (a > THRESHOLD)
                                     ? THRESHOLD
                                     : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
-                            a = exp(a); a = (a / (1 + a) - d));
-template<>
+                            a = exp(a);
+                            a = (a / (1 + a) - d));
+template <>
 void BaseMatrixT<real>::rankLossBp(BaseMatrixT& b,
                                    BaseMatrixT& c,
                                    BaseMatrixT& d) {
@@ -1033,7 +1148,7 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLoss, const T THRESHOLD = 40.0;
                                                                  ? -THRESHOLD
                                                                  : b;
                          a = log(1 + exp(x)) - c * x);
-template<>
+template <>
 void BaseMatrixT<real>::logisticRegressionLoss(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::LogisticRegressionLoss<real>(), b, c);
 }
@@ -1043,22 +1158,23 @@ DEFINE_MATRIX_TERNARY_OP(LogisticRegressionLossBp, const T THRESHOLD = 40.0;
                          T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
                                                                  ? -THRESHOLD
                                                                  : b;
-                         x = exp(x); a = x / (1 + x) - c);
-template<>
+                         x = exp(x);
+                         a = x / (1 + x) - c);
+template <>
 void BaseMatrixT<real>::logisticRegressionLossBp(BaseMatrixT& b,
                                                  BaseMatrixT& c) {
   applyTernary(ternary::LogisticRegressionLossBp<real>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(BiggerThan, a = (b > c) ? 1.0f : 0.0f);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::BiggerThan<T>(), b, c);
 }
 
 DEFINE_MATRIX_QUATERNARY_OP(
     BiggerThan, a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
                                 BaseMatrixT& c,
                                 BaseMatrixT& d) {
@@ -1066,25 +1182,34 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
 }
 
 DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::Max<T>(), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError, ONE_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(BinaryClassificationError,
+                                   ONE_PARAMETER,
                                    c += ((a > p) == (b > p)) ? 0.0f : 1.0f);
-template<class T>
-void BaseMatrixT<T>::binaryClassificationError2(size_t destCol, BaseMatrixT& b,
-                                                BaseMatrixT& c, T p) {
+template <class T>
+void BaseMatrixT<T>::binaryClassificationError2(size_t destCol,
+                                                BaseMatrixT& b,
+                                                BaseMatrixT& c,
+                                                T p) {
   CHECK(!useGpu_) << "do not support gpu";
   MatrixOffset offset(0, 0, 0, 0, destCol, 0);
   int numRows = b.height_;
   int numCols = b.width_;
-  b.applyTernary(ternary::BinaryClassificationError<T>(p), c, *this, numRows,
-                 numCols, offset, false_type(), true_type() /*cAsColVector*/);
+  b.applyTernary(ternary::BinaryClassificationError<T>(p),
+                 c,
+                 *this,
+                 numRows,
+                 numCols,
+                 offset,
+                 false_type(),
+                 true_type() /*cAsColVector*/);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
                                                   BaseMatrixT& b,
                                                   BaseMatrixT& c,
@@ -1092,127 +1217,148 @@ void BaseMatrixT<real>::binaryClassificationError(size_t destCol,
   MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
-  aggregate(aggregate::sum(), base::binary::classificationError(p),
-            base::binary::add(), b, c, numRows, numCols, offset, false_type(),
+  aggregate(aggregate::sum(),
+            base::binary::classificationError(p),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
             true_type() /*aAsColVector*/);
 }
 
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3, THREE_PARAMETER,
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(Add3,
+                                      THREE_PARAMETER,
                                       a = p1 * b + p2 * c + p3 * d);
-template<class T>
-void BaseMatrixT<T>::add3(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1,
-                          T p2, T p3) {
+template <class T>
+void BaseMatrixT<T>::add3(
+    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT& d, T p1, T p2, T p3) {
   applyQuaternary(quaternary::Add3<T>(p1, p2, p3), b, c, d);
 }
 
 DEFINE_MATRIX_TERNARY_OP(DotMulSquare, a = b * c * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::DotMulSquare<T>(), b, c);
 }
 
 DEFINE_MATRIX_TERNARY_OP(DotSquareSquare, a = b * b * c * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotSquareSquare(BaseMatrixT& b, BaseMatrixT& c) {
   applyTernary(ternary::DotSquareSquare<T>(), b, c);
 }
 
 DEFINE_MATRIX_BINARY_OP(DotMulSquare, a *= b * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotMulSquare(BaseMatrixT& b) {
   applyBinary(binary::DotMulSquare<T>(), b);
 }
 
 DEFINE_MATRIX_BINARY_OP(DotSquareMul, a = a * a * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotSquareMul(BaseMatrixT& b) {
   applyBinary(binary::DotSquareMul<T>(), b);
 }
 
-DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum, THREE_PARAMETER,
+DEFINE_MATRIX_QUATERNARY_PARAMETER_OP(AddSquareSum,
+                                      THREE_PARAMETER,
                                       T tmp = p1 * b + p2 * c + p3 * d;
                                       a += tmp * tmp);
-template<class T>
-void BaseMatrixT<T>::addSquareSum(BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d,
-                                  T p1, T p2, T p3) {
+template <class T>
+void BaseMatrixT<T>::addSquareSum(
+    BaseMatrixT& b, BaseMatrixT& c, BaseMatrixT d, T p1, T p2, T p3) {
   applyQuaternary(quaternary::AddSquareSum<T>(p1, p2, p3), b, c, d);
 }
 
 DEFINE_MATRIX_BINARY_PARAMETER_OP(AddSquare, ONE_PARAMETER, a += p * b * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addSquare(BaseMatrixT& b, T p) {
   applyBinary(binary::AddSquare<T>(p), b);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare, TWO_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(DecayAddSquare,
+                                  TWO_PARAMETER,
                                   a = p1 * a + p2 * b * b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::decayAddSquare(BaseMatrixT& b, T p1, T p2) {
   applyBinary(binary::DecayAddSquare<T>(p1, p2), b);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DecayAddSquareMul,
+                                   TWO_PARAMETER,
                                    a = p1 * a + p2 * b * b * c * c);
-template<class T>
-void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b, BaseMatrixT& c, T p1,
+template <class T>
+void BaseMatrixT<T>::decayAddSquareMul(BaseMatrixT& b,
+                                       BaseMatrixT& c,
+                                       T p1,
                                        T p2) {
   applyTernary(ternary::DecayAddSquareMul<T>(p1, p2), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum, THREE_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(ReciprocalSum,
+                                   THREE_PARAMETER,
                                    a = 1 / (p1 * b + p2 * c + p3));
-template<class T>
-void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
-                                   T p3) {
+template <class T>
+void BaseMatrixT<T>::reciprocalSum(
+    BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, T p3) {
   applyTernary(ternary::ReciprocalSum<T>(p1, p2, p3), b, c);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
+DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2,
+                                  TWO_PARAMETER,
                                   a = 1 / (p1 * b + p2));
-template<class T>
+template <class T>
 void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
   applyBinary(binary::Reciprocal2<T>(p1, p2), b);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSquareSum,
+                                   TWO_PARAMETER,
                                    T tmp = p1 * b + p2 * c;
                                    a *= tmp * tmp);
-template<class T>
-void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1,
+template <class T>
+void BaseMatrixT<T>::dotMulSquareSum(BaseMatrixT& b,
+                                     BaseMatrixT& c,
+                                     T p1,
                                      T p2) {
   applyTernary(ternary::DotMulSquareSum<T>(p1, p2), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotSquareSum,
+                                   TWO_PARAMETER,
                                    T tmp = p1 * b + p2 * c;
                                    a = tmp * tmp);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotSquareSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
   applyTernary(ternary::DotSquareSum<T>(p1, p2), b, c);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(DotMulSum,
+                                   TWO_PARAMETER,
                                    a *= p1 * b + p2 * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::dotMulSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
   applyTernary(ternary::DotMulSum<T>(p1, p2), b, c);
 }
 
 DEFINE_MATRIX_BINARY_OP(CopyAndClear, b = a; a = 0);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::copyAndClear(BaseMatrixT& b) {
   applyBinary(binary::CopyAndClear<T>(), b);
 }
 
-DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul, TWO_PARAMETER,
+DEFINE_MATRIX_TERNARY_PARAMETER_OP(AddDotMul,
+                                   TWO_PARAMETER,
                                    a = p1 * a + p2 * b * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addDotMul(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2) {
   applyTernary(ternary::AddDotMul<T>(p1, p2), b, c);
 }
 
 DEFINE_MATRIX_BINARY_OP(Assign, a = b;);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::assign(BaseMatrixT& b) {
   if (useGpu_) {
     applyBinary(binary::Assign<T>(), b);
@@ -1223,7 +1369,7 @@ void BaseMatrixT<T>::assign(BaseMatrixT& b) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
   if (columnOffset + b.width_ <= width_) {
     int numRows = height_;
@@ -1243,24 +1389,31 @@ void BaseMatrixT<T>::assignAtOffset(BaseMatrixT& b, int64_t columnOffset) {
 }
 
 DEFINE_MATRIX_BINARY_OP(DeepSwap, T tmp = a; a = b; b = tmp);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::deepSwap(BaseMatrixT& b) {
-    applyBinary(binary::DeepSwap<T>(), b);
+  applyBinary(binary::DeepSwap<T>(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::rowDotMul(size_t destCol,
                                   BaseMatrixT& b,
                                   BaseMatrixT& c) {
   int numRows = b.height_;
   int numCols = b.width_;
   MatrixOffset offset(destCol, 0, 0, 0, 0, 0);
-  aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c,
-            numRows, numCols, offset, false_type(),
+  aggregate(aggregate::sum(),
+            base::binary::mul(),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
             true_type() /*aAsColVector*/);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::rowDotMul2(size_t destCol,
                                 BaseMatrixT& b,
                                 BaseMatrixT& c) {
@@ -1283,17 +1436,24 @@ void BaseMatrixT<T>::rowDotMul2(size_t destCol,
   }
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::addDotMulVMM(BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = b.height_;
   int numCols = b.width_;
-  aggregate(aggregate::sum(), base::binary::mul(), base::binary::add(), b, c,
-            numRows, numCols, offset, true_type() /*aAsRowVector*/,
+  aggregate(aggregate::sum(),
+            base::binary::mul(),
+            base::binary::add(),
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
             false_type());
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
   CHECK(!useGpu_) << "do not support gpu";
 
@@ -1314,16 +1474,22 @@ void BaseMatrixT<T>::addDotMulVMM2(BaseMatrixT& b, BaseMatrixT& c) {
 }
 
 DEFINE_MATRIX_TERNARY_OP(addDotMulMMV, a += b * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addDotMulMMV(BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
-               true_type() /*cAsRowVector*/, false_type());
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /*cAsRowVector*/,
+               false_type());
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
   CHECK(!useGpu_) << "do not support gpu";
 
@@ -1343,16 +1509,22 @@ void BaseMatrixT<T>::addDotMulMMV2(BaseMatrixT& b, BaseMatrixT& c) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::rowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, cCol, 0);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset,
-    false_type(), true_type() /*cAsColVector*/);
+  applyTernary(ternary::DotMul<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   CHECK(!useGpu_) << "do not support gpu";
 
@@ -1372,52 +1544,82 @@ void BaseMatrixT<T>::rowScale2(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::colScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, cRow);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::DotMul<T>(), b, c, numRows, numCols, offset,
-               true_type() /* cAsRowVector */, false_type() /* cAsColVector */);
+  applyTernary(ternary::DotMul<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /* cAsRowVector */,
+               false_type() /* cAsColVector */);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addColScale(size_t cRow, BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, cRow);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
-               true_type() /* cAsRowVector */, false_type() /* cAsColVector */);
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               true_type() /* cAsRowVector */,
+               false_type() /* cAsColVector */);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::addRowScale(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, cCol, 0);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::addDotMulMMV<T>(), b, c, numRows, numCols, offset,
-               false_type(), true_type() /*cAsColVector*/);
+  applyTernary(ternary::addDotMulMMV<T>(),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
 }
 
 DEFINE_MATRIX_TERNARY_PARAMETER_OP(RowAdd, ONE_PARAMETER, a = b + p * c);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::rowAdd(size_t cCol, BaseMatrixT& b, BaseMatrixT& c, T p) {
   MatrixOffset offset(0, 0, 0, 0, cCol, 0);
   int numRows = height_;
   int numCols = width_;
-  applyTernary(ternary::RowAdd<T>(p), b, c, numRows, numCols, offset,
-    false_type(), true_type() /*cAsColVector*/);
+  applyTernary(ternary::RowAdd<T>(p),
+               b,
+               c,
+               numRows,
+               numCols,
+               offset,
+               false_type(),
+               true_type() /*cAsColVector*/);
 }
 
 DEFINE_MATRIX_TERNARY_OP(RowPow, a = pow(b, c));
-template<>
+template <>
 void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   if (useGpu_) {
     MatrixOffset offset(0, 0, 0, 0, cCol, 0);
     int numRows = height_;
     int numCols = width_;
-    applyTernary(ternary::RowPow<real>(), b, c, numRows, numCols, offset,
-                 false_type(), true_type() /*cAsColVector*/);
+    applyTernary(ternary::RowPow<real>(),
+                 b,
+                 c,
+                 numRows,
+                 numCols,
+                 offset,
+                 false_type(),
+                 true_type() /*cAsColVector*/);
   } else {
     size_t height = this->height_;
     size_t width = this->width_;
@@ -1434,44 +1636,64 @@ void BaseMatrixT<real>::rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c) {
   }
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::mulRowVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
-              true_type() /* bAsRowVector */, false_type());
+  applyBinary(binary::DotMul<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
 }
 
 DEFINE_MATRIX_BINARY_OP(DotDiv, a /= b);
-template<class T>
+template <class T>
 void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
-              true_type() /* bAsRowVector */, false_type());
+  applyBinary(binary::DotDiv<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              true_type() /* bAsRowVector */,
+              false_type());
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
-              false_type(), true_type() /* bAsColVector */);
+  applyBinary(binary::DotMul<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
+              true_type() /* bAsColVector */);
 }
 
-template<class T>
+template <class T>
 void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0);
   int numRows = height_;
   int numCols = width_;
-  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
-              false_type(), true_type() /* bAsColVector */);
+  applyBinary(binary::DotDiv<T>(),
+              b,
+              numRows,
+              numCols,
+              offset,
+              false_type(),
+              true_type() /* bAsColVector */);
 }
 
-template<>
+template <>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
@@ -1479,13 +1701,20 @@ int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
   size_t numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
-  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
-            numCols, offset, false_type(), true_type() /*aAsColVector*/);
+  aggregate(agg,
+            base::unary::identity(),
+            base::binary::second(),
+            b,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
 
   return 0;
 }
 
-template<>
+template <>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
@@ -1493,16 +1722,25 @@ int BaseMatrixT<real>::applyRow(Agg agg, Saver sv, BaseMatrixT& b) {
   size_t numCols = b.width_;
   CHECK_EQ(height_, numRows);
   CHECK_EQ(width_, 1UL);
-  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
-            false_type(), true_type() /*aAsColVector*/);
+  aggregate(agg,
+            base::unary::identity(),
+            sv,
+            b,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
 
   return 0;
 }
 
-template<>
+template <>
 template <class Agg>
-int BaseMatrixT<real>::applyRow(
-     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+int BaseMatrixT<real>::applyRow(Agg agg,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b) {
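+  // Computes this = scaleDest * this + scaleAgg * rowAgg(b); when the
+  // destination is kept (scaleDest != 0), the add2 saver fuses both scales
+  // into the aggregation pass.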
   if (scaleDest != 0) {
     applyRow(agg, base::binary::add2(scaleDest, scaleAgg), b);
   } else {
@@ -1514,10 +1752,10 @@ int BaseMatrixT<real>::applyRow(
   return 0;
 }
 
-template<>
+template <>
 template <class Agg, class Op, class Saver>
-int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
-                                BaseMatrixT& b, BaseMatrixT& c) {
+int BaseMatrixT<real>::applyRow(
+    Agg agg, Op op, Saver sv, BaseMatrixT& b, BaseMatrixT& c) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
   size_t numRows = b.height_;
   size_t numCols = b.width_;
@@ -1525,16 +1763,27 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, Saver sv,
   CHECK_EQ(width_, 1UL);
   CHECK_EQ(c.height_, numRows);
   CHECK_EQ(c.width_, numCols);
-  aggregate(agg, op, sv,
-            b, c, numRows, numCols, offset,
-            false_type(), true_type() /*aAsColVector*/);
+  aggregate(agg,
+            op,
+            sv,
+            b,
+            c,
+            numRows,
+            numCols,
+            offset,
+            false_type(),
+            true_type() /*aAsColVector*/);
   return 0;
 }
 
-template<>
+template <>
 template <class Agg, class Op>
-int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
-                                BaseMatrixT& b, BaseMatrixT& c) {
+int BaseMatrixT<real>::applyRow(Agg agg,
+                                Op op,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b,
+                                BaseMatrixT& c) {
   if (scaleDest != 0) {
     applyRow(agg, op, base::binary::add2(scaleDest, scaleAgg), b, c);
   } else {
@@ -1546,7 +1795,7 @@ int BaseMatrixT<real>::applyRow(Agg agg, Op op, real scaleDest, real scaleAgg,
   return 0;
 }
 
-template<>
+template <>
 template <class Agg>
 int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
@@ -1554,13 +1803,20 @@ int BaseMatrixT<real>::applyCol(Agg agg, BaseMatrixT& b) {
   size_t numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
-  aggregate(agg, base::unary::identity(), base::binary::second(), b, numRows,
-            numCols, offset, true_type() /*aAsRowVector*/, false_type());
+  aggregate(agg,
+            base::unary::identity(),
+            base::binary::second(),
+            b,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
+            false_type());
 
   return 0;
 }
 
-template<>
+template <>
 template <class Agg, class Saver>
 int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   MatrixOffset offset(0, 0, 0, 0, 0, 0);
@@ -1568,16 +1824,25 @@ int BaseMatrixT<real>::applyCol(Agg agg, Saver sv, BaseMatrixT& b) {
   size_t numCols = b.width_;
   CHECK_EQ(width_, numCols);
   CHECK_EQ(height_, 1UL);
-  aggregate(agg, base::unary::identity(), sv, b, numRows, numCols, offset,
-            true_type() /*aAsRowVector*/, false_type());
+  aggregate(agg,
+            base::unary::identity(),
+            sv,
+            b,
+            numRows,
+            numCols,
+            offset,
+            true_type() /*aAsRowVector*/,
+            false_type());
 
   return 0;
 }
 
-template<>
+template <>
 template <class Agg>
-int BaseMatrixT<real>::applyCol(
-     Agg agg, real scaleDest, real scaleAgg, BaseMatrixT& b) {
+int BaseMatrixT<real>::applyCol(Agg agg,
+                                real scaleDest,
+                                real scaleAgg,
+                                BaseMatrixT& b) {
   if (scaleDest != 0) {
     applyCol(agg, base::binary::add2(scaleDest, scaleAgg), b);
   } else {
@@ -1589,50 +1854,100 @@ int BaseMatrixT<real>::applyCol(
   return 0;
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::sumRows(BaseMatrixT& b, real scaleSum, real scaleDest) {
   applyRow(aggregate::sum(), scaleDest, scaleSum, b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::maxRows(BaseMatrixT& b) {
   applyRow(aggregate::max(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::minRows(BaseMatrixT& b) {
   applyRow(aggregate::min(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::maxCols(BaseMatrixT& b) {
   applyCol(aggregate::max(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::minCols(BaseMatrixT& b) {
   applyCol(aggregate::min(), b);
 }
 
-template<>
+template <>
 void BaseMatrixT<real>::sumCols(BaseMatrixT& b, real scaleSum, real scaleDest) {
   applyCol(aggregate::sum(), scaleDest, scaleSum, b);
 }
 
-template<>
-void BaseMatrixT<real>::sumOfSquaredDiffs(
-    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::squaredDiff(),
-           scaleDest, scaleSum, b, c);
+template <>
+void BaseMatrixT<real>::sumOfSquaredDiffs(BaseMatrixT& b,
+                                          BaseMatrixT& c,
+                                          real scaleSum,
+                                          real scaleDest) {
+  applyRow(
+      aggregate::sum(), base::binary::squaredDiff(), scaleDest, scaleSum, b, c);
 }
 
-template<>
-void BaseMatrixT<real>::sumOfProducts(
-    BaseMatrixT& b, BaseMatrixT& c, real scaleSum, real scaleDest) {
-  applyRow(aggregate::sum(), base::binary::mul(),
-           scaleDest, scaleSum, b, c);
+template <>
+void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
+                                      BaseMatrixT& c,
+                                      real scaleSum,
+                                      real scaleDest) {
+  applyRow(aggregate::sum(), base::binary::mul(), scaleDest, scaleSum, b, c);
 }
 
 template class BaseMatrixT<real>;
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 template class BaseMatrixT<int>;
+
+#else
+
+template <>
+void BaseMatrixT<int>::zero() {
+  applyUnary(unary::Zero<int>());
+}
+
+template <>
+void BaseMatrixT<int>::assign(int p) {
+  applyUnary(unary::Assign<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
+  applyBinary(binary::IsEqual<int>(value), b);
+}
+
+template <>
+void BaseMatrixT<int>::neg() {
+  applyUnary(unary::Neg<int>());
+}
+
+template <>
+void BaseMatrixT<int>::abs2() {
+  applyUnary(unary::Abs<int>());
+}
+
+template <>
+void BaseMatrixT<int>::add(int p) {
+  applyUnary(unary::Add<int>(p));
+}
+
+template <>
+void BaseMatrixT<int>::add(int p1, int p2) {
+  applyUnary(unary::Add2<int>(p1, p2));
+}
+
+template <>
+void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
+  applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
+}
+
+#endif
 }  // namespace paddle
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 120d69f718..12ad2d45a0 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -488,6 +488,13 @@ public:
    */
   void clip(T p1, T p2);
 
+  /**
+   * @code
+   * this = (b < p1 || b > p2) ? 0 : 1
+   * @endcode
+   */
+  void clipDerivative(BaseMatrixT& b, T p1, T p2);
+
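// A minimal element-wise sketch of the contract above (illustration only):
//   for each i: out[i] = (b[i] < p1 || b[i] > p2) ? 0 : 1
// i.e. the gradient of clip(b, p1, p2) passes through inside [p1, p2] and is
// zero outside.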
   /**
    * @code
    * a = a > p ? 1.0f : 0.0f
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index f5657c4690..922fb51722 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -14,14 +14,36 @@
 #
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
+
+if(NOT WITH_MKLDNN)
+    set(DNN_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.h")
+    set(DNN_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/MKLDNNMatrix.cpp")
+    list(REMOVE_ITEM MATH_HEADERS "${DNN_HEADER}")
+    list(REMOVE_ITEM MATH_SOURCES "${DNN_SOURCE}")
+    message(STATUS "Skip compiling with MKLDNNMatrix")
+else()
+    message(STATUS "Compile with MKLDNNMatrix")
+endif()
+
+if(MOBILE_INFERENCE)
+    # Remove sparse
+    list(REMOVE_ITEM MATH_HEADERS
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
+    list(REMOVE_ITEM MATH_SOURCES
+         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
+         ${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
+endif()
 set(MATH_SOURCES
-    "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
-    "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
     ${MATH_SOURCES})
 if(NOT WITH_GPU)
     # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu")
     add_library(paddle_math STATIC
         ${MATH_SOURCES})
 else()
@@ -33,7 +55,7 @@ endif()
 add_style_check_target(paddle_math ${MATH_SOURCES})
 add_style_check_target(paddle_math ${MATH_HEADERS})
 
-add_dependencies(paddle_math gen_proto_cpp)  # depends
+add_dependencies(paddle_math paddle_proto ${external_project_dependencies})  # depends
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index bf62229c03..dc6979cf5a 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
   os << ";";
 }
 
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "SPARSE_CSC format is not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
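// A dense analogue of rowScale above (hypothetical helper, for illustration),
// assuming row-major h x w buffers: every stored element of row i in b is
// scaled by c(i, cCol).
#include <cstddef>
inline void rowScaleDense(float* a, const float* b, const float* c,
                          size_t h, size_t w, size_t cWidth, size_t cCol) {
  for (size_t i = 0; i < h; ++i) {
    for (size_t j = 0; j < w; ++j) {
      a[i * w + j] = b[i * w + j] * c[i * cWidth + cCol];
    }
  }
}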
+
 void CpuSparseMatrix::randomizeUniform() {
   CHECK_LE(elementCnt_, height_ * width_);
   if (valueType_ == FLOAT_VALUE) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 860cad1047..522b436a2a 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "Matrix.h"
 
@@ -236,6 +239,15 @@ public:
               const unsigned int* cols,
               const real* values);
 
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
+
   void randomizeUniform();
 
   void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);
@@ -302,6 +314,64 @@ public:
   bool isSparse() const { return true; }
 
 private:
+  using Matrix::mul;
   using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
 };
 }  // namespace paddle
+
+#else
+
+#include "Matrix.h"
+
+namespace paddle {
+
+class CpuSparseMatrix : public Matrix {
+public:
+  CpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  CpuSparseMatrix(real* data,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  real* getValue() const { return nullptr; }
+  size_t getColStartIdx(size_t i) const { return 0; }
+  size_t getRowStartIdx(size_t i) const { return 0; }
+  size_t getColNum(size_t i) const { return 0; }
+  int* getRowCols(size_t i) const { return nullptr; }
+
+  CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
+    return nullptr;
+  }
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
+};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
new file mode 100644
index 0000000000..a710479bab
--- /dev/null
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNMatrix.h"
+
+using namespace mkldnn;  // NOLINT
+
+namespace paddle {
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
+  memory::desc md = pd.desc();
+  size_t ndims = md.data.ndims;
+  int* dims = md.data.dims;
+  CHECK(ndims > 0) << "Input dims should not be empty";
+  size_t cnts = 1;
+  for (size_t i = 0; i < ndims; ++i) {
+    cnts *= dims[i];
+  }
+
+  if (m == nullptr) {
+    size_t height = dims[0];
+    size_t width = cnts / dims[0];
+    m = Matrix::create(height, width, false, false);
+  }
+  CHECK(m) << "Matrix should not be empty";
+
+  CpuMatrixPtr cpuMatrix = std::dynamic_pointer_cast<CpuMatrix>(m);
+  CHECK(cpuMatrix) << "Only creating from a CPU matrix is supported yet";
+  CHECK_EQ(cpuMatrix->getElementCnt(), cnts) << "Element counts do not match";
+  return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
+}
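// Shape note for create() above: a primitive desc with dims {d0, d1, ..., dk}
// yields, when no MatrixPtr is supplied, a dense d0 x (d1*...*dk) CpuMatrix,
// since height is taken from dims[0] and width from cnts / dims[0].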
+
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
+                                     memory::format fmt,
+                                     engine& eg,
+                                     MatrixPtr m,
+                                     mkldnn::memory::data_type dtype) {
+  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
+}
+
+std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
+                                                     const MKLDNNMatrixPtr& dst,
+                                                     bool checkData) {
+  if (src == dst || src->getPrimitiveDesc() == dst->getPrimitiveDesc()) {
+    return nullptr;
+  }
+
+  if (checkData && (src->getData() == dst->getData())) {
+    LOG(FATAL) << "cannot create a reorder with in-place data";
+    return nullptr;
+  }
+
+  memory::dims srcDims = src->getDims();
+  memory::dims dstDims = dst->getDims();
+  CHECK_EQ(srcDims.size(), dstDims.size());
+  for (size_t i = 0; i < srcDims.size(); ++i) {
+    CHECK_EQ(srcDims[i], dstDims[i]);
+  }
+  return std::make_shared<reorder>(*src, *dst);
+}
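// A usage sketch for createReorder (function name hypothetical), reusing only
// calls that appear in this file: build the reorder, then execute it eagerly
// the same way reorderOnce below does.
static void reorderEagerly(const MKLDNNMatrixPtr& src,
                           const MKLDNNMatrixPtr& dst) {
  auto rdr = MKLDNNMatrix::createReorder(src, dst);
  if (rdr == nullptr) {
    return;  // same primitive desc on both sides: nothing to convert
  }
  stream(stream::kind::eager).submit({*rdr}).wait();
}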
+
+void MKLDNNMatrix::reorderDataFrom(const MKLDNNMatrixPtr& m,
+                                   memory::format srcFmt,
+                                   memory::dims targetDim) {
+  memory::format dstFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "element counts should be equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderDataTo(const MKLDNNMatrixPtr& m,
+                                 memory::format dstFmt,
+                                 memory::dims targetDim) {
+  memory::format srcFmt = getFormat();
+  if (srcFmt == dstFmt) {
+    return;
+  }
+  CHECK_EQ(getElementCnt(), m->getElementCnt()) << "element counts should be equal";
+  reorderOnce(getData(), m->getData(), srcFmt, dstFmt, targetDim);
+}
+
+void MKLDNNMatrix::reorderOnce(void* srcData,
+                               void* dstData,
+                               memory::format srcFmt,
+                               memory::format dstFmt,
+                               memory::dims dm) {
+  CHECK(srcData);
+  CHECK(dstData);
+  MatrixPtr tmpSrc;
+  if (dstData == srcData) {
+    // inplace data
+    size_t sz = 1;
+    for (size_t i = 0; i < dm.size(); ++i) {
+      sz *= dm[i];
+    }
+    tmpSrc = Matrix::create(sz, 1, false, false);
+    tmpSrc->copyFrom((real*)srcData, sz);
+    srcData = tmpSrc->getData();
+  }
+
+  auto dtype = this->getDtype();
+  auto srcMD = memory::desc(dm, dtype, srcFmt);
+  auto dstMD = memory::desc(dm, dtype, dstFmt);
+
+  auto eg = this->getEngine();
+  auto src = memory(memory::primitive_desc(srcMD, eg), srcData);
+  auto dst = memory(memory::primitive_desc(dstMD, eg), dstData);
+
+  auto r = reorder(src, dst);
+  stream(stream::kind::eager).submit({r}).wait();
+}
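// In-place note for reorderOnce: when srcData and dstData alias, the source is
// first copied into the temporary tmpSrc so the mkldnn reorder never reads
// elements it has already overwritten.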
+
+void MKLDNNMatrix::downSpatial() {
+  int fmt = getFormat();
+  if (!(fmt == memory::format::nchw || fmt == memory::format::oihw)) {
+    // only nchw and oihw are supported yet; more formats like nhwc and ihwo
+    // can be added later
+    return;
+  }
+
+  // TODO(TJ): change H(height) and W(width) when supporting nhwc or more
+  const int H = 2, W = 3;
+  memory::dims srcDims = getDims();
+  if (srcDims[H] != 1 || srcDims[W] != 1) {
+    // cannot reduce the spatial dims
+    return;
+  }
+
+  memory::dims dstDims = memory::dims{srcDims[0], srcDims[1]};
+  memory::format dstFmt;
+  switch (fmt) {
+    case memory::format::nchw:
+      dstFmt = memory::format::nc;
+      break;
+    case memory::format::oihw:
+      dstFmt = memory::format::oi;
+      break;
+    default:
+      LOG(FATAL) << "unsupported format";
+  }
+  memory::desc md = memory::desc(dstDims, getDtype(), dstFmt);
+  memory::primitive_desc pd = memory::primitive_desc(md, getEngine());
+  resetMKLDNNMemory(pd, data_);
+}
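// Illustrative effect of downSpatial (dims hypothetical): a 4-D memory whose
// spatial extent is 1x1 collapses to the matching 2-D format, e.g.
//   nchw {64, 32, 1, 1}  -->  nc {64, 32}
//   oihw {128, 64, 1, 1} -->  oi {128, 64}
// while memories with H != 1 or W != 1 are returned unchanged.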
+
+}  // namespace paddle
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
new file mode 100644
index 0000000000..39d40a1f61
--- /dev/null
+++ b/paddle/math/MKLDNNMatrix.h
@@ -0,0 +1,256 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Matrix.h"
+#include "mkldnn.hpp"
+#include "paddle/parameter/Parameter.h"
+
+namespace paddle {
+
+class MKLDNNMatrix;
+typedef std::shared_ptr<MKLDNNMatrix> MKLDNNMatrixPtr;
+
+#define CHECK_PRIMITIVE_DESC_EQ(MAT, PD, ...)                          \
+  do {                                                                 \
+    CHECK(MAT) << #MAT " cannot be empty.";                            \
+    CHECK((MAT)->getPrimitiveDesc() == (PD))                           \
+        << #MAT "->getPrimitiveDesc() and " #PD " should be equal.\n " \
+        << "" __VA_ARGS__;                                             \
+  } while (0)
+
+/**
+ * @brief MKLDNN Matrix.
+ *
+ */
+class MKLDNNMatrix : public CpuMatrix, public mkldnn::memory {
+public:
+  MKLDNNMatrix(CpuMatrixPtr m, mkldnn::memory::primitive_desc pd)
+      : CpuMatrix(m->getData(), m->getHeight(), m->getWidth(), false),
+        mkldnn::memory(pd, m->getData()),
+        m_(m) {}
+
+  ~MKLDNNMatrix() {}
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
+   */
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);
+
+  /**
+   * Create MKLDNNMatrix from a MatrixPtr and memory details info
+   */
+  static MKLDNNMatrixPtr create(
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
+      mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
+
+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
+  /**
+   * Create Memory descriptor.
+   * default with any format and f32 dtype
+   */
+  static mkldnn::memory::desc createMemoryDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::desc(dims, dtype, fmt);
+  }
+
+  /**
+   * Create reorder primitive.
+   * Create a mkldnn::reorder handle for converting src MKLDNNMatrix to dst.
+   * checkData: whether to check the data handles of src and dst.
+   *            If true, the two data handles must differ, so no in-place
+   *            reorder is created; otherwise the handles are not checked,
+   *            and the resulting reorder may operate on an in-place buffer.
+   *            Only pass false if you can guarantee that in-place reordering
+   *            is correct for your use case.
+   */
+  static std::shared_ptr<mkldnn::reorder> createReorder(
+      const MKLDNNMatrixPtr& src,
+      const MKLDNNMatrixPtr& dst,
+      bool checkData = true);
+
+  void copyFrom(const Matrix& src) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    m_->copyFrom(src);
+  }
+
+  void copyTo(Matrix& dst) {
+    // TODO(TJ): reorder data if this format is not nchw or x
+    dst.copyFrom(*m_);
+  }
+
+public:
+  /**
+   * Reorder this MKLDNNMatrix from another format.
+   * In-place reorder is supported.
+   * @note: this function only reorders the data layout;
+   *        it will NOT change the original dims or format info.
+   */
+  void reorderDataFrom(const MKLDNNMatrixPtr& m,
+                       memory::format srcFmt,
+                       memory::dims targetDim);
+
+  /**
+   * Reorder this MKLDNNMatrix to another format.
+   * In-place reorder is supported.
+   * @note: this function only reorders the data layout;
+   *        it will NOT change the dst dims or format info.
+   */
+  void reorderDataTo(const MKLDNNMatrixPtr& m,
+                     memory::format dstFmt,
+                     memory::dims targetDim);
+
+  /**
+   * Dimensionality reduction.
+   * Change format "nchw --> nc" or "oihw --> oi" if the h and w are both 1
+   */
+  void downSpatial();
+
+  /**
+   * Set the memory data handle.
+   * Caution: this does not check the buffer size of the data;
+   *          the caller must ensure it is large enough.
+   */
+  void setData(real* data) {
+    set_data_handle(data);
+    CpuMatrix::setData(data);
+    m_.reset();
+  }
+
+  /**
+   * override the CpuMatrix::resize
+   */
+  void resize(size_t newHeight, size_t newWidth) override {
+    m_->resize(newHeight, newWidth);
+    if (data_ == m_->getData() && elementCnt_ == newHeight * newWidth) {
+      return;
+    }
+    CpuMatrix::setData(data_);
+    height_ = newHeight;
+    width_ = newWidth;
+    elementCnt_ = newHeight * newWidth;
+    stride_ = width_;
+    auto pd = mkldnn::memory::primitive_desc(
+        mkldnn::memory::desc({(int)newHeight, (int)newWidth},
+                             getDtype(),
+                             mkldnn::memory::format::nc),
+        getEngine());
+    resetMKLDNNMemory(pd, data_);
+  }
+
+  /**
+   * override Matrix::getData
+   * check the data handle before returning
+   */
+  real* getData() override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  const real* getData() const override {
+    CHECK_EQ((void*)data_, get_data_handle());
+    return data_;
+  }
+
+  /**
+   * Get primitive descriptor.
+   */
+  mkldnn::memory::primitive_desc getPrimitiveDesc() {
+    return this->get_primitive_desc();
+  }
+
+  /**
+   * Get memory descriptor.
+   */
+  mkldnn::memory::desc getMemoryDesc() { return getPrimitiveDesc().desc(); }
+
+  /**
+   * Get dimensions.
+   */
+  mkldnn::memory::dims getDims() {
+    mkldnn::memory::desc md = getMemoryDesc();
+    const int* src = md.data.dims;
+    int ndims = md.data.ndims;
+    mkldnn::memory::dims dst;
+    dst.resize(ndims);
+    for (int i = 0; i < ndims; ++i) {
+      dst[i] = src[i];
+    }
+    return dst;
+  }
+
+  /**
+   * Get format.
+   */
+  mkldnn::memory::format getFormat() {
+    return (mkldnn::memory::format)(getMemoryDesc().data.format);
+  }
+
+  /**
+   * Get memory data type.
+   */
+  mkldnn::memory::data_type getDtype() {
+    return (mkldnn::memory::data_type)(getMemoryDesc().data.data_type);
+  }
+
+  /**
+   * Get engine.
+   */
+  mkldnn::engine getEngine() { return getPrimitiveDesc().get_engine(); }
+
+protected:
+  /**
+   * Perform one reorder.
+   * In-place operation is supported.
+   */
+  void reorderOnce(void* srcData,
+                   void* dstData,
+                   memory::format srcFmt,
+                   memory::format dstFmt,
+                   memory::dims dm);
+  /**
+   * reset this MKLDNN Memory from a primitive desc
+   */
+  void resetMKLDNNMemory(memory::primitive_desc pd, real* data) {
+    mkldnn_primitive_t result;
+    mkldnn::error::wrap_c_api(
+        mkldnn_primitive_create(&result, pd.get(), nullptr, nullptr),
+        "could not create a memory primitive");
+    reset(result);
+    set_data_handle(data);
+  }
+
+private:
+  // keep the CpuMatrixPtr alive in case the buffer is released outside
+  CpuMatrixPtr m_;
+};
+
+}  // namespace paddle
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 7045562dd4..28ab54b450 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -84,6 +84,7 @@ LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void gemm<float>(const CBLAS_TRANSPOSE transA,
                  const CBLAS_TRANSPOSE transB,
@@ -143,6 +144,7 @@ void gemm<double>(const CBLAS_TRANSPOSE transA,
               C,
               ldc);
 }
+#endif
 
 template <>
 int getrf<float>(const CBLAS_ORDER order,
@@ -182,6 +184,7 @@ int getri<double>(const CBLAS_ORDER order,
   return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
 }
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <>
 void axpy<float>(const int n, const float alpha, const float* x, float* y) {
   cblas_saxpy(n, alpha, x, 1, y, 1);
@@ -201,8 +204,9 @@ template <>
 double dotProduct<double>(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
+#endif
 
-#ifdef PADDLE_USE_MKL
+#if defined(PADDLE_WITH_MKLML)
 
 template <>
 void vExp<float>(const int n, const float* a, float* r) {
@@ -243,36 +247,6 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
   vdAdd(n, a, b, r);
 }
-
-template <>
-void vInvSqrt<float>(const int n, const float* a, float* r) {
-  vsInvSqrt(n, a, r);
-}
-
-template <>
-void vInvSqrt<double>(const int n, const double* a, double* r) {
-  vdInvSqrt(n, a, r);
-}
-
-template <>
-void vLog1p<float>(const int n, const float* a, float* r) {
-  vsLog1p(n, a, r);
-}
-
-template <>
-void vLog1p<double>(const int n, const double* a, double* r) {
-  vdLog1p(n, a, r);
-}
-
-template <>
-void vTanh<float>(const int n, const float* a, float* r) {
-  vsTanh(n, a, r);
-}
-
-template <>
-void vTanh<double>(const int n, const double* a, double* r) {
-  vdTanh(n, a, r);
-}
 #else
 
 DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
@@ -289,29 +263,6 @@ void vLog(const int n, const T* a, T* r) {
       binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
-template <class T>
-void vInvSqrt(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
-      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
-template <class T>
-void vLog1p(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
-      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
-                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
-                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
-template <class T>
-void vTanh(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
-      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
 DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
 template <class T>
 void vPow(const int n, const T* a, const T b, T* r) {
@@ -337,12 +288,6 @@ template void vExp(const int n, const float* a, float* r);
 template void vExp(const int n, const double* a, double* r);
 template void vLog(const int n, const float* a, float* r);
 template void vLog(const int n, const double* a, double* r);
-template void vInvSqrt(const int n, const double* a, double* r);
-template void vInvSqrt(const int n, const float* a, float* r);
-template void vLog1p(const int n, const float* a, float* r);
-template void vLog1p(const int n, const double* a, double* r);
-template void vTanh(const int n, const float* a, float* r);
-template void vTanh(const int n, const double* a, double* r);
 template void vPow(const int n, const float* a, const float b, float* r);
 template void vPow(const int n, const double* a, const double b, double* r);
 template void vAdd(const int n, const float* a, const float* b, float* r);
@@ -350,4 +295,34 @@ template void vAdd(const int n, const double* a, const double* b, double* r);
 
 #endif
 
+DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
+template <class T>
+void vInvSqrt(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vInvSqrt<T>, 0, 0>(
+      binary::vInvSqrt<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog1p, b = std::log(1.0f + a));
+template <class T>
+void vLog1p(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog1p<T>, 0, 0>(
+      binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vTanh, T tmp = -2.0 * a;
+                        tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+                        b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
+template <class T>
+void vTanh(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
+      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
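// The clamped expression above is algebraically tanh(a), using the identity
// tanh(a) = 2 / (1 + exp(-2a)) - 1; the EXP_MAX_INPUT clamp only guards
// std::exp against overflow for large negative a, where the result saturates
// to -1 just as tanh does.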
+
+template void vInvSqrt(const int n, const double* a, double* r);
+template void vInvSqrt(const int n, const float* a, float* r);
+template void vLog1p(const int n, const float* a, float* r);
+template void vLog1p(const int n, const double* a, double* r);
+template void vTanh(const int n, const float* a, float* r);
+template void vTanh(const int n, const double* a, double* r);
+
 }  // namespace paddle
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8ada0d34c6..29fe36e3a4 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,12 +15,13 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 
-#ifdef PADDLE_USE_MKL
-#include <mkl.h>
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
 #endif
 
-#ifdef PADDLE_USE_ATLAS
+#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB)
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
@@ -34,7 +35,14 @@ extern "C" {
 
 #ifndef LAPACK_FOUND
 extern "C" {
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 #include <cblas.h>
+#else
+typedef enum CBLAS_ORDER {
+  CblasRowMajor = 101,
+  CblasColMajor = 102
+} CBLAS_ORDER;
+#endif
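// When Eigen provides the BLAS routines, no cblas.h is included, so a minimal
// CBLAS_ORDER definition is supplied above solely for the LAPACKE declarations
// below.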
 int LAPACKE_sgetrf(
     int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
 int LAPACKE_dgetrf(
@@ -50,6 +58,7 @@ int LAPACKE_dgetri(
 
 namespace paddle {
 
+#ifndef PADDLE_USE_EIGEN_FOR_BLAS
 template <class T>
 void gemm(const CBLAS_TRANSPOSE transA,
           const CBLAS_TRANSPOSE transB,
@@ -64,6 +73,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
           const T beta,
           T* C,
           const int ldc);
+#endif
 
 template <class T>
 int getrf(const CBLAS_ORDER Order,
@@ -78,10 +88,21 @@ int getri(
     const CBLAS_ORDER Order, const int N, T* A, const int lda, const int* ipiv);
 
 template <class T>
-void axpy(const int n, const T alpha, const T* x, T* y);
+void axpy(const int n, const T alpha, const T* x, T* y) {
+  /// y = y + alpha * x
+  for (int i = 0; i < n; i++) {
+    y[i] = y[i] + alpha * x[i];
+  }
+}
 
 template <class T>
-T dotProduct(const int n, const T* x, const T* y);
+T dotProduct(const int n, const T* x, const T* y) {
+  T result = static_cast<T>(0);
+  for (int i = 0; i < n; i++) {
+    result += x[i] * y[i];
+  }
+  return result;
+}
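// A quick check of the two loop fallbacks above (values hypothetical):
//   float x[3] = {1, 2, 3}, y[3] = {1, 1, 1};
//   axpy(3, 2.0f, x, y);            // y becomes {3, 5, 7}
//   float d = dotProduct(3, x, y);  // 1*3 + 2*5 + 3*7 = 34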
 
 template <class T>
 void vExp(const int n, const T* a, T* r);
diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp
index 5bbc3e4e37..980b6e1388 100644
--- a/paddle/math/MathUtils.cpp
+++ b/paddle/math/MathUtils.cpp
@@ -25,7 +25,7 @@ namespace paddle {
  */
 void sparseRand(
     int* major, int* minor, int nnz, int majorLen, int minorMax, bool useGpu) {
-  CHECK(size_t(nnz) > size_t(1));
+  CHECK(size_t(nnz) >= size_t(1));
   int* cpuMajor;
   int* cpuMinor;
   CpuIVector cpuMinorVec(nnz);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index c910146164..1ec4336cab 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -28,6 +28,8 @@ limitations under the License. */
 #include "hl_top_k.h"
 #include "paddle/utils/Logging.h"
 
+#include "NEONFunctions.h"
+#include "paddle/function/GemmFunctor.h"
 #include "paddle/utils/ThreadLocal.h"
 
 #include "SIMDFunctions.h"
@@ -450,6 +452,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
 }
 
 void GpuMatrix::collectBias(Matrix& a, real scale) {
+#ifdef PADDLE_WITH_CUDA
   CHECK_EQ(getHeight(), (size_t)1);
   CHECK_EQ(width_, a.getWidth());
   GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@@ -460,6 +463,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
     hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
     hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
   }
+#endif
 }
 
 void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@@ -551,6 +555,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                     const GpuMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(b.isContiguous());
   CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@@ -577,12 +582,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
                           b.height_,
                           scaleAB,
                           scaleT);
+#endif
 }
 
 void GpuMatrix::mul(const GpuMatrix& a,
                     const GpuSparseMatrix& b,
                     real scaleAB,
                     real scaleT) {
+#ifdef PADDLE_WITH_CUDA
   CHECK(isContiguous());
   CHECK(a.isContiguous());
   CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@@ -621,6 +628,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
                             scaleAB,
                             scaleT);
   }
+#endif
 }
 
 /* this = a*b */
@@ -669,7 +677,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
 }
 
 void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -693,7 +701,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
 }
 
 void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -740,7 +748,7 @@ void GpuMatrix::rowMax(Matrix& max) {
 }
 
 void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
@@ -1016,81 +1024,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
   LOG(INFO) << "the  diffCnt is " << diffCnt;
 }
 
-void GpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == true) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  hl_expand_feature2col(feature.getData(),
-                        channels,
-                        feaImgHeight,
-                        feaImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData());
-}
-
-void GpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockW * blockH * channels;
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-  hl_shrink_col2feature(expandFeat.getData(),
-                        channels,
-                        thisImgHeight,
-                        thisImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData(),
-                        alpha,
-                        beta);
-}
-
 void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
@@ -1102,22 +1035,28 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
+  real* maskData = NULL;
   size_t frameNum = inputMat.getHeight();
-  size_t width = imgSizeW;
-  size_t height = imgSizeH;
-  CHECK(height * width * channels == inputMat.getWidth());
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
   CHECK(height_ == inputMat.getHeight());
   CHECK(width_ == outputH * outputW * channels);
 
+  if (maskMatP != NULL) {
+    CHECK(maskMatP->useGpu_ == true) << "Matrix type is not correct";
+    CHECK(outputH * outputW * channels == maskMatP->getWidth());
+    maskData = maskMatP->getData();
+  }
+
   hl_maxpool_forward(frameNum,
                      inputData,
                      channels,
-                     height,
-                     width,
+                     imgSizeH,
+                     imgSizeW,
                      outputH,
                      outputW,
                      sizeX,
@@ -1127,7 +1066,8 @@ void GpuMatrix::maxPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     maskData);
 }
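// Note on maskMatP (an assumption based on the wiring above, not stated in
// this file): when supplied, the kernel records for each output element the
// position of the winning input within its pooling window, which an unpooling
// or backward pass can later consume.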
 
 void GpuMatrix::maxPoolBackward(Matrix& inputMat,
@@ -1154,11 +1094,8 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat,
   real* outDiff = outGrad.getData();
   size_t frameNum = inputMat.getHeight();
   size_t channels = outV.getWidth() / outputH / outputW;
-  size_t width = imgSizeW;
-  size_t height = imgSizeH;
-  CHECK(height * width * channels == inputMat.getWidth());
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
   CHECK(height_ == inputMat.getHeight());
-  CHECK(width_ == width * height * channels);
   CHECK(outGrad.getHeight() == outV.getHeight() &&
         outGrad.getWidth() == outV.getWidth());
 
@@ -1167,8 +1104,8 @@ void GpuMatrix::maxPoolBackward(Matrix& inputMat,
                       outData,
                       outDiff,
                       channels,
-                      height,
-                      width,
+                      imgSizeH,
+                      imgSizeW,
                       outputH,
                       outputW,
                       sizeX,
@@ -1194,22 +1131,21 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
 
   real* inputData = inputMat.getData();
   size_t frameNum = inputMat.getHeight();
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-  CHECK(height * width * channels == inputMat.getWidth());
+  CHECK(imgSizeH * imgSizeW * channels == inputMat.getWidth());
   CHECK(height_ == inputMat.getHeight());
   CHECK(width_ == outputH * outputW * channels);
 
   hl_avgpool_forward(frameNum,
                      inputData,
                      channels,
-                     height,
-                     width,
+                     imgSizeH,
+                     imgSizeW,
                      outputH,
                      outputW,
                      sizeX,
@@ -1219,7 +1155,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                      paddingH,
                      paddingW,
                      data_,
-                     getStride());
+                     getStride(),
+                     excludeMode);
 }
 
 void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1234,23 +1171,22 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
 
   real* outDiff = outGrad.getData();
   size_t frameNum = outGrad.getHeight();
   size_t channels = outGrad.getWidth() / outputH / outputW;
-  size_t height = imgSizeH;
-  size_t width = imgSizeW;
-  CHECK(height * width * channels == width_);
+  CHECK(imgSizeH * imgSizeW * channels == width_);
   CHECK(height_ == outGrad.getHeight());
   CHECK(outGrad.getWidth() == outputH * outputW * channels);
 
   hl_avgpool_backward(frameNum,
                       outDiff,
                       channels,
-                      height,
-                      width,
+                      imgSizeH,
+                      imgSizeW,
                       outputH,
                       outputW,
                       sizeX,
@@ -1262,7 +1198,210 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                       scaleTargets,
                       scaleOutput,
                       data_,
-                      outGrad.getStride());
+                      outGrad.getStride(),
+                      excludeMode);
+}
+
+void GpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_) << "Matrix type is not correct";
+
+  real* inputData = inputMat.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t num = inputMat.getHeight();
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_maxpool3D_forward(num,
+                       inputData,
+                       channels,
+                       imgSizeD,
+                       imgSizeH,
+                       imgSizeW,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       maxPoolIdxData,
+                       getStride());
+}
+
+void GpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_ && maxPoolIdx.useGpu_) << "Matrix types are not equal";
+
+  real* outDiff = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t frameNum = getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == getWidth());
+  CHECK(outGrad.getHeight() == maxPoolIdx.getHeight() &&
+        outGrad.getWidth() == maxPoolIdx.getWidth());
+
+  hl_maxpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        imgSizeD,
+                        imgSizeH,
+                        imgSizeW,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        maxPoolIdxData,
+                        outGrad.getStride());
+}
+
+void GpuMatrix::avgPool3DForward(Matrix& inputMat,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  CHECK(inputMat.useGpu_) << "Matrix type is not correct";
+
+  real* inputData = inputMat.getData();
+  size_t frameNum = inputMat.getHeight();
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == inputMat.getWidth());
+  CHECK(height_ == inputMat.getHeight());
+  CHECK(width_ == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_forward(frameNum,
+                       inputData,
+                       channels,
+                       imgSizeD,
+                       imgSizeH,
+                       imgSizeW,
+                       outputD,
+                       outputH,
+                       outputW,
+                       sizeZ,
+                       sizeY,
+                       sizeX,
+                       strideD,
+                       strideH,
+                       strideW,
+                       paddingD,
+                       paddingH,
+                       paddingW,
+                       getData(),
+                       getStride());
+}
+
+void GpuMatrix::avgPool3DBackward(Matrix& outGrad,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  CHECK(outGrad.useGpu_) << "Matrix type is not correct";
+
+  real* outDiff = outGrad.getData();
+  size_t frameNum = outGrad.getHeight();
+  size_t channels = outGrad.getWidth() / outputD / outputH / outputW;
+  CHECK(imgSizeD * imgSizeH * imgSizeW * channels == width_);
+  CHECK(height_ == outGrad.getHeight());
+  CHECK(outGrad.getWidth() == outputD * outputH * outputW * channels);
+
+  hl_avgpool3D_backward(frameNum,
+                        outDiff,
+                        channels,
+                        imgSizeD,
+                        imgSizeH,
+                        imgSizeW,
+                        outputD,
+                        outputH,
+                        outputW,
+                        sizeZ,
+                        sizeY,
+                        sizeX,
+                        strideD,
+                        strideH,
+                        strideW,
+                        paddingD,
+                        paddingH,
+                        paddingW,
+                        scaleTargets,
+                        scaleOutput,
+                        getData(),
+                        outGrad.getStride());
 }
 
 void GpuMatrix::maxSequenceForward(Matrix& input,
@@ -1429,6 +1568,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1444,9 +1584,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy(
       output_d, entropy_d, mat_d, height_, outputPtr->width_);
+#endif
 }
 
 void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
+#ifdef PADDLE_WITH_CUDA
   GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
   auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
 
@@ -1462,6 +1604,73 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
   hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
   hl_matrix_multi_binary_cross_entropy_bp(
       output_d, grad_d, mat_d, height_, width_);
+#endif
+}
+
+void GpuMatrix::vol2Col(real* dataSrc,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  hl_matrix_vol2Col(dataSrc,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData());
+}
+
+void GpuMatrix::col2Vol(real* dataDst,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  hl_matrix_col2Vol(dataDst,
+                    channels,
+                    depth,
+                    height,
+                    width,
+                    filterD,
+                    filterH,
+                    filterW,
+                    strideD,
+                    strideH,
+                    strideW,
+                    paddingD,
+                    paddingH,
+                    paddingW,
+                    getData(),
+                    alpha,
+                    beta);
 }
 
 /**
@@ -1565,6 +1774,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) {
                     const_cast<real*>(src.getData()),
                     sizeof(real) * elementCnt_,
                     stream);
+    // Synchronize the stream to ensure the copy has completed before returning.
+    hl_stream_synchronize(stream);
   } else if (typeid(src) == typeid(CpuMatrix)) {
     memcpy(data_, src.getData(), sizeof(real) * elementCnt_);
   } else {
@@ -1775,103 +1986,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(info, 0);
 }
 
-void CpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  int channelsCol = channels * blockH * blockW;
-  real* srcData = feature.getData();
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockH / blockW;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        // no c_im*height to Exclude the channel number
-        int imgRowIdx = h * strideH + hOffset;
-        int imgColIdx = w * strideW + wOffset;
-        if ((imgRowIdx - paddingH) < 0 ||
-            (imgRowIdx - paddingH) >= feaImgHeight ||
-            (imgColIdx - paddingW) < 0 ||
-            (imgColIdx - paddingW) >= feaImgWidth) {
-          data_[(c * outputH + h) * outputW + w] = 0;
-        } else {
-          imgRowIdx += c_im * feaImgHeight - paddingH;
-          imgColIdx -= paddingW;
-          data_[(c * outputH + h) * outputW + w] =
-              srcData[imgRowIdx * feaImgWidth + imgColIdx];
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* expandData = expandFeat.getData();
-  int channelsCol = channels * blockH * blockW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockW / blockH;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        int imRowIdx = h * strideH + hOffset;
-        int imColIdx = w * strideW + wOffset;
-        if ((imRowIdx - paddingH) >= 0 &&
-            (imRowIdx - paddingH) < thisImgHeight &&
-            (imColIdx - paddingW) >= 0 &&
-            (imColIdx - paddingW) < thisImgWidth) {
-          imRowIdx += c_im * thisImgHeight - paddingH;
-          imColIdx -= paddingW;
-          data_[imRowIdx * thisImgWidth + imColIdx] =
-              alpha * expandData[(c * outputH + h) * outputW + w] +
-              beta * data_[imRowIdx * thisImgWidth + imColIdx];
-        }
-      }
-    }
-  }
-}
-
 void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
@@ -1883,17 +1997,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               MatrixPtr maskMatP) {
   real* inputData = inputMat.getData();
   real* outData = data_;
+  real* maskData = NULL;
   size_t num = inputMat.getHeight();
-  size_t inWidth = imgSizeW;
-  size_t inHeight = imgSizeH;
-  CHECK(inHeight * inWidth == inputMat.getWidth() / channels);
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength == inputMat.getWidth() / channels);
   CHECK_EQ(num, this->getHeight());
-  CHECK_EQ(channels * outputH * outputW, this->getWidth());
+  CHECK_EQ(channels * outLength, this->getWidth());
   size_t outStride = getStride();
 
+  if (maskMatP != NULL) {
+    maskData = maskMatP->getData();
+    CHECK_EQ(channels * outLength, maskMatP->getWidth());
+  }
+
   /* initialize the data_ */
   for (size_t i = 0; i < height_; i++) {
     for (size_t j = 0; j < width_; j++) {
@@ -1908,24 +2029,37 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
     }
     for (size_t c = 0; c < channels; ++c) {  // channel by channel
       for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
         for (size_t pw = 0; pw < outputW; ++pw) {
-          int hstart = ph * strideH - paddingH;
           int wstart = pw * strideW - paddingW;
-          int hend = std::min(hstart + sizeY, inHeight);
-          int wend = std::min(wstart + sizeX, inWidth);
-          hstart = std::max(hstart, 0);
+          int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          for (int h = hstart; h < hend; ++h) {
-            for (int w = wstart; w < wend; ++w) {
-              outData[ph * outputW + pw] = std::max(outData[ph * outputW + pw],
-                                                    inputData[h * inWidth + w]);
+          if (maskData == NULL) {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                outData[ph * outputW + pw] = std::max(
+                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+              }
+            }
+          } else {
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (outData[ph * outputW + pw] < inputData[h * imgSizeW + w]) {
+                  outData[ph * outputW + pw] = inputData[h * imgSizeW + w];
+                  maskData[ph * outputW + pw] = h * imgSizeW + w;
+                }
+              }
             }
           }
         }
       }
       // compute offset
-      inputData += inHeight * inWidth;
-      outData += outputH * outputW;
+      inputData += inLength;
+      outData += outLength;
+
+      if (maskData != NULL) maskData += outLength;
     }
   }
 }
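Stripped of batching and channels, the masked forward pass above reduces to the following standalone sketch: each output cell takes the maximum over its (clipped) window, and when a mask buffer is supplied the flattened input offset of the winner is recorded, exactly as the `maskData` branch does. All names here are hypothetical:

```cpp
#include <algorithm>
#include <cfloat>

// 2D max pooling over a single channel; mask may be nullptr.
void maxPool2D(const float* in, int H, int W, float* out, float* mask,
               int outH, int outW, int sizeY, int sizeX, int strideH,
               int strideW, int padH, int padW) {
  for (int ph = 0; ph < outH; ++ph) {
    int hstart = std::max(ph * strideH - padH, 0);
    int hend = std::min(ph * strideH - padH + sizeY, H);
    for (int pw = 0; pw < outW; ++pw) {
      int wstart = std::max(pw * strideW - padW, 0);
      int wend = std::min(pw * strideW - padW + sizeX, W);
      float best = -FLT_MAX;
      int bestIdx = -1;
      for (int h = hstart; h < hend; ++h)
        for (int w = wstart; w < wend; ++w)
          if (in[h * W + w] > best) {
            best = in[h * W + w];
            bestIdx = h * W + w;  // flattened offset, as maskData stores it
          }
      out[ph * outW + pw] = best;
      if (mask) mask[ph * outW + pw] = static_cast<float>(bestIdx);
    }
  }
}
```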
@@ -1946,8 +2080,10 @@ void CpuMatrix::maxPoolBackward(Matrix& image,
                                 size_t paddingH,
                                 size_t paddingW) {
   size_t num = image.getHeight();
-  size_t channels = size_t(width_ / imgSizeH / imgSizeW);
-  CHECK(image.getWidth() == imgSizeH * imgSizeW * channels);
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  size_t channels = size_t(width_ / inLength);
+  CHECK(image.getWidth() == inLength * channels);
   CHECK(image.getHeight() == height_ && image.getWidth() == width_);
   CHECK(outV.getHeight() == outGrad.getHeight() &&
         outV.getWidth() == outGrad.getWidth());
@@ -1968,12 +2104,12 @@ void CpuMatrix::maxPoolBackward(Matrix& image,
     }
     for (size_t c = 0; c < channels; ++c) {
       for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
         for (size_t pw = 0; pw < outputW; ++pw) {
-          int hstart = ph * strideH - paddingH;
           int wstart = pw * strideW - paddingW;
-          int hend = std::min(hstart + sizeY, imgSizeH);
           int wend = std::min(wstart + sizeX, imgSizeW);
-          hstart = std::max(hstart, 0);
           wstart = std::max(wstart, 0);
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
@@ -1986,10 +2122,10 @@ void CpuMatrix::maxPoolBackward(Matrix& image,
         }
       }
       // offset
-      inData += imgSizeH * imgSizeW;
-      tgtGrad += imgSizeH * imgSizeW;
-      otData += outputH * outputW;
-      otGrad += outputH * outputW;
+      inData += inLength;
+      tgtGrad += inLength;
+      otData += outLength;
+      otGrad += outLength;
     }
   }
 }
@@ -2005,13 +2141,14 @@ void CpuMatrix::avgPoolForward(Matrix& input,
                                size_t outputH,
                                size_t outputW,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
   // The main loop
   size_t num = input.getHeight();
-  size_t inHeight = imgSizeH;
-  size_t inWidth = imgSizeW;
-  CHECK(inHeight * inWidth * channels == input.getWidth());
-  CHECK(outputH * outputW * channels * num == height_ * width_);
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength * channels == input.getWidth());
+  CHECK(outLength * channels * num == height_ * width_);
   real* tgtData = data_;
   real* inData = input.getData();
 
@@ -2021,30 +2158,28 @@ void CpuMatrix::avgPoolForward(Matrix& input,
     }
     for (size_t c = 0; c < channels; ++c) {
       for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
         for (size_t pw = 0; pw < outputW; ++pw) {
-          int hstart = ph * strideH - paddingH;
           int wstart = pw * strideW - paddingW;
-          int hend = std::min(hstart + sizeY, inHeight + paddingH);
-          int wend = std::min(wstart + sizeX, inWidth + paddingW);
-          int poolSize = (hend - hstart) * (wend - wstart);
-          hstart = std::max(hstart, 0);
+          int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          hend = std::min(hend, static_cast<int>(inHeight));
-          wend = std::min(wend, static_cast<int>(inWidth));
-
-          CHECK(poolSize);
           tgtData[ph * outputW + pw] = 0;  // clear
           for (int h = hstart; h < hend; ++h) {
             for (int w = wstart; w < wend; ++w) {
-              tgtData[ph * outputW + pw] += inData[h * inWidth + w];
+              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
             }
           }
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+          CHECK(poolSize);
           tgtData[ph * outputW + pw] /= poolSize;
         }
       }
       // compute offset
-      inData += inHeight * inWidth;
-      tgtData += outputH * outputW;
+      inData += inLength;
+      tgtData += outLength;
     }
   }
 }
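The new `excludeMode` flag only changes the divisor: when true, the average is taken over the valid (unpadded) cells of the window; when false, it always divides by the full `sizeY * sizeX` area, so zero-padding pulls border averages toward zero. A one-function restatement of the `poolSize` expression above:

```cpp
// Divisor for average pooling at one output cell; hstart/hend/wstart/wend
// are already clipped to the image, as in the loops above.
inline int avgPoolDivisor(bool excludeMode, int hstart, int hend, int wstart,
                          int wend, int sizeY, int sizeX) {
  return excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
}
```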
@@ -2061,10 +2196,13 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
                                 real scaleTargets,
                                 real scaleOutput,
                                 size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
   size_t num = input.getHeight();
   size_t channels = input.getWidth() / outputH / outputW;
-  CHECK(imgSizeH * imgSizeW * channels == getWidth());
+  size_t inLength = imgSizeH * imgSizeW;
+  size_t outLength = outputH * outputW;
+  CHECK(inLength * channels == getWidth());
   real* inData = input.getData();
   real* outData = getData();
 
@@ -2074,16 +2212,15 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
     }
     for (size_t c = 0; c < channels; ++c) {
       for (size_t ph = 0; ph < outputH; ++ph) {
+        int hstart = ph * strideH - paddingH;
+        int hend = std::min(hstart + sizeY, imgSizeH);
+        hstart = std::max(hstart, 0);
         for (size_t pw = 0; pw < outputW; ++pw) {
-          int hstart = ph * strideH - paddingH;
           int wstart = pw * strideW - paddingW;
-          int hend = std::min(hstart + sizeY, imgSizeH + paddingH);
-          int wend = std::min(wstart + sizeX, imgSizeW + paddingW);
-          int poolSize = (hend - hstart) * (wend - wstart);
-          hstart = std::max(hstart, 0);
+          int wend = std::min(wstart + sizeX, imgSizeW);
           wstart = std::max(wstart, 0);
-          hend = std::min(hend, static_cast<int>(imgSizeH));
-          wend = std::min(wend, static_cast<int>(imgSizeW));
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
           CHECK(poolSize);
 
           for (int h = hstart; h < hend; ++h) {
@@ -2094,8 +2231,274 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
         }
       }
       // offset
-      outData += imgSizeH * imgSizeW;
-      inData += outputH * outputW;
+      outData += inLength;
+      inData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DForward(Matrix& inputMat,
+                                 Matrix& maxPoolIdx,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  real* inputData = inputMat.getData();
+  real* outData = getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t num = inputMat.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  CHECK(inLength == inputMat.getWidth() / channels);
+  CHECK_EQ(num, this->getHeight());
+  CHECK_EQ(channels * outLength, this->getWidth());
+  size_t outStride = getStride();
+
+  /* initialize the data_ */
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      outData[(i)*outStride + j] = -(real)FLT_MAX;
+      maxPoolIdxData[(i)*outStride + j] = -1;
+    }
+  }
+
+  /* pool max one by one */
+  for (size_t n = 0; n < num; ++n) {  // frame by frame
+    if (!isContiguous()) {
+      outData = getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {  // channel by channel
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+            int maxIdx = -1;
+            real maxOutData = outData[(pd * outputH + ph) * outputW + pw];
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  if (maxOutData <
+                      inputData[(d * imgSizeH + h) * imgSizeW + w]) {
+                    maxOutData = inputData[(d * imgSizeH + h) * imgSizeW + w];
+                    maxIdx = (d * imgSizeH + h) * imgSizeW + w;
+                  }
+                }
+              }
+            }
+            outData[(pd * outputH + ph) * outputW + pw] = maxOutData;
+            maxPoolIdxData[(pd * outputH + ph) * outputW + pw] = maxIdx;
+          }
+        }
+      }
+      // compute offset
+      inputData += inLength;
+      outData += outLength;
+      maxPoolIdxData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::maxPool3DBackward(Matrix& outGrad,
+                                  Matrix& maxPoolIdx,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  size_t channels = size_t(width_ / inLength);
+  CHECK(maxPoolIdx.getHeight() == outGrad.getHeight() &&
+        maxPoolIdx.getWidth() == outGrad.getWidth());
+
+  real* tgtGrad = getData();
+  real* otGrad = outGrad.getData();
+  real* maxPoolIdxData = maxPoolIdx.getData();
+  size_t outStride = outGrad.getStride();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!outGrad.isContiguous()) {
+      otGrad = outGrad.getData() + n * outStride;
+      maxPoolIdxData = maxPoolIdx.getData() + n * outStride;
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            const size_t index = (pd * outputH + ph) * outputW + pw;
+            const size_t tgtIdx = static_cast<size_t>(maxPoolIdxData[index]);
+            tgtGrad[tgtIdx] =
+                scaleTargets * tgtGrad[tgtIdx] + scaleOutput * otGrad[index];
+          }
+        }
+      }
+      // offset
+      tgtGrad += inLength;
+      otGrad += outLength;
+      maxPoolIdxData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DForward(Matrix& input,
+                                 size_t channels,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW) {
+  // The main loop
+  size_t num = input.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  CHECK(inLength * channels == input.getWidth());
+  CHECK(outLength * channels * num == height_ * width_);
+  real* tgtData = getData();
+  real* inData = input.getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!isContiguous()) {
+      tgtData = data_ + n * getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+
+            tgtData[(pd * outputH + ph) * outputW + pw] = 0;  // clear
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  tgtData[(pd * outputH + ph) * outputW + pw] +=
+                      inData[(d * imgSizeH + h) * imgSizeW + w];
+                }
+              }
+            }
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            CHECK(poolSize);
+            tgtData[(pd * outputH + ph) * outputW + pw] /= poolSize;
+          }
+        }
+      }
+      // compute offset
+      inData += inLength;
+      tgtData += outLength;
+    }
+  }
+}
+
+void CpuMatrix::avgPool3DBackward(Matrix& input,
+                                  size_t imgSizeD,
+                                  size_t imgSizeH,
+                                  size_t imgSizeW,
+                                  size_t outputD,
+                                  size_t outputH,
+                                  size_t outputW,
+                                  size_t sizeZ,
+                                  size_t sizeY,
+                                  size_t sizeX,
+                                  size_t strideD,
+                                  size_t strideH,
+                                  size_t strideW,
+                                  size_t paddingD,
+                                  size_t paddingH,
+                                  size_t paddingW,
+                                  real scaleTargets,
+                                  real scaleOutput) {
+  size_t num = input.getHeight();
+  size_t inLength = imgSizeH * imgSizeW * imgSizeD;
+  size_t outLength = outputH * outputW * outputD;
+  size_t channels = input.getWidth() / outLength;
+  CHECK(inLength * channels == getWidth());
+  real* inData = input.getData();
+  real* outData = getData();
+
+  for (size_t n = 0; n < num; ++n) {
+    if (!input.isContiguous()) {
+      inData = input.getData() + n * input.getStride();
+    }
+    for (size_t c = 0; c < channels; ++c) {
+      for (size_t pd = 0; pd < outputD; ++pd) {
+        int dstart = pd * strideD - paddingD;
+        int dend = std::min(dstart + sizeZ, imgSizeD);
+        dstart = std::max(dstart, 0);
+        for (size_t ph = 0; ph < outputH; ++ph) {
+          int hstart = ph * strideH - paddingH;
+          int hend = std::min(hstart + sizeY, imgSizeH);
+          hstart = std::max(hstart, 0);
+          for (size_t pw = 0; pw < outputW; ++pw) {
+            int wstart = pw * strideW - paddingW;
+            int wend = std::min(wstart + sizeX, imgSizeW);
+            wstart = std::max(wstart, 0);
+            int poolSize = (dend - dstart) * (hend - hstart) * (wend - wstart);
+            CHECK(poolSize);
+            for (int d = dstart; d < dend; ++d) {
+              for (int h = hstart; h < hend; ++h) {
+                for (int w = wstart; w < wend; ++w) {
+                  outData[(d * imgSizeH + h) * imgSizeW + w] +=
+                      inData[(pd * outputH + ph) * outputW + pw] / poolSize;
+                }
+              }
+            }
+          }
+        }
+      }
+      // offset
+      outData += inLength;
+      inData += outLength;
     }
   }
 }
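Note how `maxPool3DBackward` above avoids recomputing pooling windows entirely: the forward pass stored the flattened argmax offset of every output cell in `maxPoolIdx`, so the backward pass is a pure scatter of output gradients. A minimal single-channel sketch of that scatter (hypothetical helper; assumes every recorded index is valid, i.e. no window was empty):

```cpp
#include <cstddef>

// Route each output gradient to the input position recorded at forward time.
// outLen = outputD * outputH * outputW for one channel; idx holds flattened
// input offsets as floats, matching the real-typed maxPoolIdx matrix.
void maxPoolScatter(const float* outGrad, const float* idx, size_t outLen,
                    float* inGrad, float scaleTargets, float scaleOutput) {
  for (size_t i = 0; i < outLen; ++i) {
    size_t t = static_cast<size_t>(idx[i]);
    inGrad[t] = scaleTargets * inGrad[t] + scaleOutput * outGrad[i];
  }
}
```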
@@ -2392,24 +2795,24 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   CHECK(!isTransposed()) << "Not supported";
 
   size_t a_col, b_col, a_row, b_row;
-  CBLAS_TRANSPOSE a_trans, b_trans;
+  bool a_trans, b_trans;
   if (!a->isTransposed()) {
     a_col = a->getWidth();
     a_row = a->getHeight();
-    a_trans = CblasNoTrans;
+    a_trans = false;
   } else {
     a_col = a->getHeight();
     a_row = a->getWidth();
-    a_trans = CblasTrans;
+    a_trans = true;
   }
   if (!b->isTransposed()) {
     b_col = b->getWidth();
     b_row = b->getHeight();
-    b_trans = CblasNoTrans;
+    b_trans = false;
   } else {
     b_col = b->getHeight();
     b_row = b->getWidth();
-    b_trans = CblasTrans;
+    b_trans = true;
   }
 
   CHECK_EQ(a_col, b_row);
@@ -2426,7 +2829,7 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   int lda = a->getStride();
   int ldb = b->getStride();
   int ldc = getStride();
-  gemm<real>(
+  BlasGemm<DEVICE_TYPE_CPU, real>::compute(
       a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }
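`gemm<real>` is replaced by a device-templated `BlasGemm<Device, T>::compute` that takes plain `bool` transpose flags instead of `CBLAS_TRANSPOSE` enums, so the caller no longer depends on a CBLAS header. A hedged sketch of what the CPU float path might reduce to; the name `CpuBlasGemmFloat` and the `cblas_sgemm` mapping are assumptions for illustration, not Paddle's actual implementation:

```cpp
#include <cblas.h>

struct CpuBlasGemmFloat {
  static void compute(bool transA, bool transB, int M, int N, int K,
                      float alpha, const float* A, int lda, const float* B,
                      int ldb, float beta, float* C, int ldc) {
    // Translate bool flags back to CBLAS enums; Paddle matrices are row-major.
    cblas_sgemm(CblasRowMajor, transA ? CblasTrans : CblasNoTrans,
                transB ? CblasTrans : CblasNoTrans, M, N, K, alpha, A, lda, B,
                ldb, beta, C, ldc);
  }
};
```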
 
@@ -2871,6 +3274,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
                                                            real scaleAB,
                                                            real scaleT);
 
+#ifndef PADDLE_MOBILE_INFERENCE
 void SharedCpuMatrix::mul(CpuSparseMatrix* a,
                           CpuMatrix* b,
                           real scaleAB,
@@ -2999,6 +3403,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
   }
 }
 
+#endif
 /* Add a (column) vector b to matrix a, column by column */
 void CpuMatrix::addColumnVector(const Matrix& b) {
   BaseMatrix::addColVector(const_cast<Matrix&>(b));
@@ -3761,16 +4166,36 @@ void CpuMatrix::print(std::ostream& os) const {
 void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* input = data.getData();
   real* w = W.getData();
+  real* output = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
   size_t paraSize = W.getHeight() * W.getWidth();
   CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+
   size_t partial_sum = numElements / paraSize;
+  if (paraSize == numElements) {
+    for (size_t n = 0; n < numSamples * numElements; ++n) {
+      output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements];
+    }
+    return;
+  }
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+  for (size_t n = 0; n < numSamples; ++n) {
+    for (size_t i = 0; i < paraSize; i++) {
+      neon::prelu(
+          input + i * partial_sum, w[i], output + i * partial_sum, partial_sum);
+    }
+    input = input + numElements;
+    output = output + numElements;
+  }
+#else
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
-      data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
+      output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
     }
   }
+#endif
 }
 
 void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
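In `paramReluForward`, each of the `paraSize` weights covers a contiguous run of `partial_sum = numElements / paraSize` input positions, which is why the NEON path can hand `neon::prelu` one scalar weight per run. A scalar reference of the same index-to-weight mapping for one sample (hypothetical helper):

```cpp
#include <cstddef>

// PReLU with shared slopes: w[i] applies to positions
// [i * partialSum, (i + 1) * partialSum) of the sample.
void preluScalar(const float* in, const float* w, float* out, size_t paraSize,
                 size_t partialSum) {
  for (size_t i = 0; i < paraSize; ++i)
    for (size_t k = 0; k < partialSum; ++k) {
      size_t idx = i * partialSum + k;
      out[idx] = in[idx] > 0 ? in[idx] : in[idx] * w[i];
    }
}
```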
@@ -4145,6 +4570,95 @@ void CpuMatrix::bilinearBackward(const Matrix& out,
   }
 }
 
+void CpuMatrix::vol2Col(real* data,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW) {
+  real* outData = getData();
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIn = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] =
+                data[((cIn * depth + dPad) * height + hPad) * width + wPad];
+          else
+            outData[((c * outDepth + d) * outHeight + h) * outWidth + w] = 0;
+        }
+      }
+    }
+  }
+}
+
+void CpuMatrix::col2Vol(real* trg,
+                        int channels,
+                        int depth,
+                        int height,
+                        int width,
+                        int filterD,
+                        int filterH,
+                        int filterW,
+                        int strideD,
+                        int strideH,
+                        int strideW,
+                        int paddingD,
+                        int paddingH,
+                        int paddingW,
+                        real alpha,
+                        real beta) {
+  real* src = getData();
+  int outDepth = (depth + 2 * paddingD - filterD) / strideD + 1;
+  int outHeight = (height + 2 * paddingH - filterH) / strideH + 1;
+  int outWidth = (width + 2 * paddingW - filterW) / strideW + 1;
+  int channelsCol = channels * filterD * filterH * filterW;
+  for (int c = 0; c < channelsCol; ++c) {
+    int wOffset = c % filterW;
+    int hOffset = (c / filterW) % filterH;
+    int dOffset = (c / filterW / filterH) % filterD;
+    int cIm = c / filterW / filterH / filterD;
+    for (int d = 0; d < outDepth; ++d) {
+      for (int h = 0; h < outHeight; ++h) {
+        for (int w = 0; w < outWidth; ++w) {
+          int dPad = d * strideD - paddingD + dOffset;
+          int hPad = h * strideH - paddingH + hOffset;
+          int wPad = w * strideW - paddingW + wOffset;
+          if (hPad >= 0 && hPad < height && wPad >= 0 && wPad < width &&
+              dPad >= 0 && dPad < depth)
+            trg[((cIm * depth + dPad) * height + hPad) * width + wPad] =
+                alpha *
+                    src[((c * outDepth + d) * outHeight + h) * outWidth + w] +
+                beta *
+                    trg[((cIm * depth + dPad) * height + hPad) * width + wPad];
+        }
+      }
+    }
+  }
+}
+
 ////////////////////////////////////////////////////////////////
 //               functions executed via cpu                   //
 ////////////////////////////////////////////////////////////////
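`vol2Col` and `col2Vol` use the standard convolution output arithmetic along each spatial axis, computed inline above. Factored out for clarity (hypothetical helper):

```cpp
// Output extent along one axis, matching the expressions in vol2Col/col2Vol.
inline int convOutSize(int inSize, int filterSize, int stride, int padding) {
  return (inSize + 2 * padding - filterSize) / stride + 1;
}
// e.g. depth = 8, filterD = 3, strideD = 1, paddingD = 1  ->  outDepth = 8
```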
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 748be850b4..c8e690e642 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -239,7 +239,8 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  // asynchronous copy
+  // For GpuMatrix this is an asynchronous copy interface.
+  // For CpuMatrix this is a synchronous copy interface.
   virtual void copyFrom(const Matrix& src, hl_stream_t stream) {
     LOG(FATAL) << "Not implemented";
   }
@@ -858,52 +859,10 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * This function is used to calculate the convolution:
-   *
-   * It will expand a feature matrix according to the
-   * convolution filters
-   */
-  virtual void convExpand(Matrix& feature,
-                          int feaImgHeight,
-                          int feaImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * This function is the reverse implementation of convExpand:
-   *
-   * Its function is to restore a expanded-matrix into a feature matrix
-   */
-  virtual void convShrink(Matrix& expandColMat,
-                          int thisImgHeight,
-                          int thisImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW,
-                          real alpha = 1.0f,
-                          real beta = 0.0f) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Pooling forward operation, pick out the largest element
-   * in the sizeX of value
+   * in the sizeX of value. If maskMatP is not NULL, it will
+   * also calculate the location indices.
    */
   virtual void maxPoolForward(Matrix& inputMat,
                               size_t imgSizeH,
@@ -916,7 +875,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              MatrixPtr maskMatP = NULL) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -951,7 +911,8 @@ public:
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                              size_t paddingW) {
+                              size_t paddingW,
+                              bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
 
@@ -967,18 +928,107 @@ public:
                                real scaleTargets,
                                real scaleOutput,
                                size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode = true) {
     LOG(FATAL) << "Not implemeted";
   }
 
   /**
-   * Input: one or more sequences. Each sequence contains some instances.
-   *
-   * Output: output size is the number of input sequences (NOT input
-   * instances).
-   *
-   * output[i] is set to max_input[i].
+   * Pooling 3D forward operation, pick out the largest element
+   * in the sizeX of value
    */
+  virtual void maxPool3DForward(Matrix& inputMat,
+                                Matrix& maxPoolIdx,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void maxPool3DBackward(Matrix& outGrad,
+                                 Matrix& maxPoolIdx,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void avgPool3DForward(Matrix& input,
+                                size_t channels,
+                                size_t imgSizeD,
+                                size_t imgSizeH,
+                                size_t imgSizeW,
+                                size_t outputD,
+                                size_t outputH,
+                                size_t outputW,
+                                size_t sizeZ,
+                                size_t sizeY,
+                                size_t sizeX,
+                                size_t strideD,
+                                size_t strideH,
+                                size_t strideW,
+                                size_t paddingD,
+                                size_t paddingH,
+                                size_t paddingW) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void avgPool3DBackward(Matrix& input,
+                                 size_t imgSizeD,
+                                 size_t imgSizeH,
+                                 size_t imgSizeW,
+                                 size_t outputD,
+                                 size_t outputH,
+                                 size_t outputW,
+                                 size_t sizeZ,
+                                 size_t sizeY,
+                                 size_t sizeX,
+                                 size_t strideD,
+                                 size_t strideH,
+                                 size_t strideW,
+                                 size_t paddingD,
+                                 size_t paddingH,
+                                 size_t paddingW,
+                                 real scaleTargets,
+                                 real scaleOutput) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * Input: one or more sequences. Each sequence contains some instances.
+   *
+   * Output: output size is the number of input sequences (NOT input
+   * instances).
+   *
+   * output[i] is set to max_input[i].
+   */
   virtual void maxSequenceForward(Matrix& input,
                                   const IVector& sequence,
                                   IVector& index) {
@@ -1081,6 +1131,42 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
+  virtual void vol2Col(real* data,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  virtual void col2Vol(real* trg,
+                       int channels,
+                       int depth,
+                       int height,
+                       int width,
+                       int filterD,
+                       int filterH,
+                       int filterW,
+                       int strideD,
+                       int strideH,
+                       int strideW,
+                       int paddingD,
+                       int paddingH,
+                       int paddingW,
+                       real alpha,
+                       real beta) {
+    LOG(FATAL) << "Not implemented";
+  }
+
   virtual void bilinearForward(const Matrix& in,
                                const size_t inImgH,
                                const size_t inImgW,
@@ -1334,34 +1420,6 @@ public:
 
   void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandColMat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blochW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingWreal,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
@@ -1373,7 +1431,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -1402,7 +1461,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1416,7 +1476,84 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
+
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
 
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
@@ -1444,6 +1581,38 @@ public:
                         const real ratioH,
                         const real ratioW);
 
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
   void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
 
   void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
@@ -1455,6 +1624,10 @@ public:
 };
 
 class CpuMatrix : public Matrix {
+private:
+  MatrixPtr sftmaxSum_;
+  MatrixPtr sftmaxDot_;
+
 public:
   CpuMatrix(size_t height, size_t width, bool trans = false);
   CpuMatrix(real* data, size_t height, size_t width, bool trans = false)
@@ -1521,34 +1694,6 @@ public:
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blcokH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandFeat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
@@ -1560,7 +1705,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      MatrixPtr maskMatP);
 
   void maxPoolBackward(Matrix& image,
                        size_t imgSizeH,
@@ -1589,7 +1735,8 @@ public:
                       size_t outputH,
                       size_t outputW,
                       size_t paddingH,
-                      size_t paddingW);
+                      size_t paddingW,
+                      bool excludeMode = true);
 
   void avgPoolBackward(Matrix& input,
                        size_t imgSizeH,
@@ -1603,7 +1750,84 @@ public:
                        real scaleTargets,
                        real scaleOutput,
                        size_t paddingH,
-                       size_t paddingW);
+                       size_t paddingW,
+                       bool excludeMode = true);
+
+  void maxPool3DForward(Matrix& inputMat,
+                        Matrix& maxPoolIdx,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void maxPool3DBackward(Matrix& outGrad,
+                         Matrix& maxPoolIdx,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
+
+  void avgPool3DForward(Matrix& input,
+                        size_t channels,
+                        size_t imgSizeD,
+                        size_t imgSizeH,
+                        size_t imgSizeW,
+                        size_t outputD,
+                        size_t outputH,
+                        size_t outputW,
+                        size_t sizeZ,
+                        size_t sizeY,
+                        size_t sizeX,
+                        size_t strideD,
+                        size_t strideH,
+                        size_t strideW,
+                        size_t paddingD,
+                        size_t paddingH,
+                        size_t paddingW);
+
+  void avgPool3DBackward(Matrix& input,
+                         size_t imgSizeD,
+                         size_t imgSizeH,
+                         size_t imgSizeW,
+                         size_t outputD,
+                         size_t outputH,
+                         size_t outputW,
+                         size_t sizeZ,
+                         size_t sizeY,
+                         size_t sizeX,
+                         size_t strideD,
+                         size_t strideH,
+                         size_t strideW,
+                         size_t paddingD,
+                         size_t paddingH,
+                         size_t paddingW,
+                         real scaleTargets,
+                         real scaleOutput);
 
   void maxSequenceForward(Matrix& input,
                           const IVector& sequence,
@@ -1813,6 +2037,38 @@ public:
                         const real ratioH,
                         const real ratioW);
 
+  void vol2Col(real* data,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW);
+
+  void col2Vol(real* trg,
+               int channels,
+               int depth,
+               int height,
+               int width,
+               int filterD,
+               int filterH,
+               int filterW,
+               int strideD,
+               int strideH,
+               int strideW,
+               int paddingD,
+               int paddingH,
+               int paddingW,
+               real alpha,
+               real beta);
+
   template <typename ExpressionType>
   void operator=(const ExpressionType& expr) {
     TensorCpuApply<real>(*this, expr);
@@ -1821,6 +2077,7 @@ public:
 
 class SharedCpuMatrix : public CpuMatrix {
 public:
+#ifndef PADDLE_MOBILE_INFERENCE
   /* blockNum is number of partitions of the matrix  */
   SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
       : CpuMatrix(height, width, trans) {
@@ -1866,6 +2123,7 @@ private:
   ThreadLocal<CpuMatrixPtr> localBuf_;
   ThreadLocal<std::vector<int>> localBufRows_;
   ThreadLocal<std::vector<int>> blockSeq_;
+#endif
 };
 
 typedef struct { unsigned int col; } sparse_non_value_t;
diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp
new file mode 100644
index 0000000000..0f83149422
--- /dev/null
+++ b/paddle/math/NEONFunctions.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+
+#include "NEONFunctions.h"
+#include <arm_neon.h>
+
+namespace paddle {
+namespace neon {
+
+// b[i] = a[i] > 0.0f ? a[i] : 0.0f
+void relu(const float* a, float* b, int len) {
+  int offset = len % 16;
+  float32x4_t ma0, ma1, ma2, ma3;
+  float32x4_t mb0, mb1, mb2, mb3;
+
+  float32x4_t zero = vdupq_n_f32(0.f);
+  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = vld1q_f32(a);
+    ma1 = vld1q_f32(a + 4);
+    ma2 = vld1q_f32(a + 8);
+    ma3 = vld1q_f32(a + 12);
+
+    mb0 = vmaxq_f32(ma0, zero);
+    mb1 = vmaxq_f32(ma1, zero);
+    mb2 = vmaxq_f32(ma2, zero);
+    mb3 = vmaxq_f32(ma3, zero);
+
+    vst1q_f32(b, mb0);
+    vst1q_f32(b + 4, mb1);
+    vst1q_f32(b + 8, mb2);
+    vst1q_f32(b + 12, mb3);
+  }
+
+  for (int i = 0; i < offset; i++) {
+    b[i] = a[i] > 0.0f ? a[i] : 0.0f;
+  }
+}
+
+// b[i] = a[i] > 0.0f ? a[i] : a[i] * w
+void prelu(const float* a, float w, float* b, int len) {
+  int offset = len % 16;
+  float32x4_t ma0, ma1, ma2, ma3;
+
+  float32x4_t zero = vdupq_n_f32(0.f);
+  float32x4_t vw = vdupq_n_f32(w);
+
+  for (int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = vld1q_f32(a);
+    ma1 = vld1q_f32(a + 4);
+    ma2 = vld1q_f32(a + 8);
+    ma3 = vld1q_f32(a + 12);
+
+    uint32x4_t flag0 = vcgtq_f32(ma0, zero);
+    uint32x4_t flag1 = vcgtq_f32(ma1, zero);
+    uint32x4_t flag2 = vcgtq_f32(ma2, zero);
+    uint32x4_t flag3 = vcgtq_f32(ma3, zero);
+
+    float32x4_t mul0 = vmulq_f32(ma0, vw);
+    float32x4_t mul1 = vmulq_f32(ma1, vw);
+    float32x4_t mul2 = vmulq_f32(ma2, vw);
+    float32x4_t mul3 = vmulq_f32(ma3, vw);
+
+    ma0 = vbslq_f32(flag0, ma0, mul0);
+    ma1 = vbslq_f32(flag1, ma1, mul1);
+    ma2 = vbslq_f32(flag2, ma2, mul2);
+    ma3 = vbslq_f32(flag3, ma3, mul3);
+
+    vst1q_f32(b, ma0);
+    vst1q_f32(b + 4, ma1);
+    vst1q_f32(b + 8, ma2);
+    vst1q_f32(b + 12, ma3);
+  }
+
+  for (int i = 0; i < offset; i++) {
+    b[i] = a[i] > 0.0f ? a[i] : a[i] * w;
+  }
+}
+
+}  // namespace neon
+}  // namespace paddle
+
+#endif
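A small usage sketch for the new helpers. Both functions vectorize 16 floats per iteration and then fall back to a scalar loop for the `len % 16` tail (note the pointers have already been advanced), so any length is accepted; this assumes a NEON-enabled build, since the definitions are compiled only under `__ARM_NEON`:

```cpp
#include <vector>
#include "paddle/math/NEONFunctions.h"

void applyPrelu(const std::vector<float>& x, float slope,
                std::vector<float>& y) {
  y.resize(x.size());
  paddle::neon::prelu(x.data(), slope, y.data(), static_cast<int>(x.size()));
}
```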
diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h
new file mode 100644
index 0000000000..d67b2f47a8
--- /dev/null
+++ b/paddle/math/NEONFunctions.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace neon {
+
+void relu(const float* a, float* b, int len);
+void prelu(const float* a, float w, float* b, int len);
+
+}  // namespace neon
+}  // namespace paddle
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index dbb829c4e2..e457d71f1b 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -60,7 +60,7 @@ public:
    */
   inline real* get(int row) const {
     if (preallocatedBuf_) {
-      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      CHECK_LE((row)*width_ * sizeof(real), preallocatedBuf_->getSize());
       return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
     } else {
       CHECK_LE((row + 1) * width_, rowStore_.size());
@@ -99,7 +99,11 @@ public:
   /**
    * @brief clear local buffer. It only affect auto-growth buffer.
    */
-  inline void clear() { rowStore_.clear(); }
+  inline void clear() {
+    // Swap with an empty vector so the allocated memory is actually released.
+    std::vector<real, AlignedAllocator<real, 32>> empty;
+    rowStore_.swap(empty);
+  }
 
   /**
    * @brief get current number of rows.
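The swap idiom above is needed because `std::vector::clear()` destroys the elements but keeps the capacity allocated, and `shrink_to_fit()` is only a non-binding request. Swapping with a freshly constructed temporary reliably returns the storage:

```cpp
#include <vector>

void releaseVectorMemory(std::vector<float>& v) {
  v.clear();                     // size() == 0, but capacity() is unchanged
  std::vector<float>().swap(v);  // capacity() drops to that of an empty vector
}
```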
diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h
index 439f11b79d..76909720f6 100644
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
 }
 
 namespace internal {
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len);
 void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
 void colMaxImpl(float* result, const float* data, int dim, int numSamples);
+#endif
 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
 void decayL1AvxImpl(
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 6370c77386..284b68d590 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
 }
 
 void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index f6cd5df338..e0a3c6d228 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <cstddef>
 #include "CpuSparseMatrix.h"
 #include "Matrix.h"
@@ -231,6 +234,53 @@ public:
 private:
   using Matrix::mul;
   using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
+};
+
+}  // namespace paddle
+
+#else
+
+#include "CpuSparseMatrix.h"
+
+namespace paddle {
+
+class GpuSparseMatrix : public Matrix {
+public:
+  GpuSparseMatrix(size_t height,
+                  size_t width,
+                  size_t nnz, /* used to allocate space */
+                  SparseValueType valueType = FLOAT_VALUE,
+                  SparseFormat format_ = SPARSE_CSR,
+                  bool trans = false)
+      : Matrix(NULL, height, width, trans, false) {}
+
+  GpuSparseMatrix(real* value,
+                  int* rows,
+                  int* cols,
+                  size_t height,
+                  size_t width,
+                  size_t nnz,
+                  SparseValueType valueType,
+                  SparseFormat format,
+                  bool trans)
+      : Matrix(NULL, height, width, trans, true) {}
+
+  void resize(size_t newHeight,
+              size_t newWidth,
+              size_t newNnz, /* used to allocate space */
+              SparseValueType valueType,
+              SparseFormat format) {}
+  void resize(size_t newHeight, size_t newWidth) {}
+  MatrixPtr getTranspose() { return nullptr; }
+  void setRow(size_t row,
+              size_t colNum,
+              const unsigned int* cols,
+              const real* values) {}
 };
 
 }  // namespace paddle
+
+#endif
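Editor's note: the structure of this header is a pattern repeated across the mobile-inference changes in this patch. When PADDLE_MOBILE_INFERENCE is defined, the full GPU sparse-matrix class is replaced by a stub with the same interface and empty bodies, so code that merely mentions GpuSparseMatrix still compiles while the mobile binary sheds the CUDA-dependent implementation. A minimal sketch of the same compile-time substitution (Widget and LITE_BUILD are illustrative names, not Paddle symbols):

    #ifndef LITE_BUILD

    class Widget {
    public:
      void run() { /* full implementation with heavy dependencies */ }
    };

    #else

    // Same interface, empty bodies: callers still compile, the heavy code is gone.
    class Widget {
    public:
      void run() {}
    };

    #endif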
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 8704eb038d..ca7a6806da 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#ifndef PADDLE_MOBILE_INFERENCE
+
 #include <gflags/gflags.h>
 #include <string.h>
 #include <algorithm>
@@ -313,3 +315,27 @@ private:
 };
 
 }  // namespace paddle
+
+#else
+namespace paddle {
+
+class SparseRowCpuMatrix : public CpuMatrix {
+public:
+  void reserveStore() {}
+  void clearIndices() {}
+};
+
+class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
+public:
+  void setupIndices() {}
+  void addRows(MatrixPtr input) {}
+  void addRows(IVectorPtr ids) {}
+};
+
+class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
+class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
+class SparseRowIdsCpuMatrix : public CpuMatrix {};
+
+}  // namespace paddle
+
+#endif
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 7ce17a3207..a2ef731ecb 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -17,9 +17,13 @@ limitations under the License. */
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
 DEFINE_int32(pool_limit_size,
              536870912,
              "maximum memory size managed by a memory pool, default is 512M");
+#else
+DEFINE_int32(pool_limit_size,
+             0,
+             "maximum memory size managed by a memory pool, "
+             "default is 0 for mobile inference");
+#endif
 
 namespace paddle {
 
@@ -32,9 +36,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
 StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
 
 StorageEngine::~StorageEngine() {
-  if (cpuAllocator_) {
-    delete cpuAllocator_;
-  }
+  delete cpuAllocator_;
   for (auto it : gpuAllocator_) {
     delete it;
   }
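Editor's note: the destructor cleanup above is safe because delete on a null pointer is defined by C++ to be a no-op, so the if (cpuAllocator_) guard was redundant:

    int* p = nullptr;
    delete p;  // well-defined no-op; no null check needed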
diff --git a/paddle/math/TrainingAlgorithmOp.cu b/paddle/math/TrainingAlgorithmOp.cu
index 72ff077270..fc746b8533 100644
--- a/paddle/math/TrainingAlgorithmOp.cu
+++ b/paddle/math/TrainingAlgorithmOp.cu
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "BaseMatrix.h"
 #include "TrainingAlgorithmOp.h"
+#include "paddle/utils/Logging.h"
 
 #if __cplusplus > 199711L
 
@@ -32,10 +32,10 @@ void sparseMomentumApply(BaseMatrix& value,
                          real tau,
                          real learningRate) {
   auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
-  auto expr2 = momV.lazyAssign(
-    momV + (tau * alpha * gamma * learningRate) * grad);
-  auto expr3 = value.lazyAssign(
-    (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
+  auto expr2 =
+      momV.lazyAssign(momV + (tau * alpha * gamma * learningRate) * grad);
+  auto expr3 = value.lazyAssign((tau / beta + (real)1 / alpha) * momU +
+                                ((real)1 / beta) * momV);
 
   AssignEvaluate(expr1, expr2, expr3);
 }
@@ -52,12 +52,12 @@ void adadeltaApply(BaseMatrix& value,
                    real momentum,
                    real decayRate) {
   auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
-  auto expr2 = lr.lazyAssign(
-    ((accum_update + epsilon) / (accum + epsilon)).sqrt());
-  auto expr3 = accum_update.lazyAssign(
-    rou * accum_update + ((real)1 - rou) * (grad * lr).square());
-  auto expr4 = mom.lazyAssign(
-    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr2 =
+      lr.lazyAssign(((accum_update + epsilon) / (accum + epsilon)).sqrt());
+  auto expr3 = accum_update.lazyAssign(rou * accum_update +
+                                       ((real)1 - rou) * (grad * lr).square());
+  auto expr4 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
   auto expr5 = value.lazyAssign(value + mom);
 
   AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
@@ -74,10 +74,10 @@ void adagradApply(BaseMatrix& value,
                   real momentum,
                   real decayRate) {
   auto expr1 = accum.lazyAssign(accum + grad.square());
-  auto expr2 = lr.lazyAssign(
-    (accum_buffer + accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(
-    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr2 =
+      lr.lazyAssign((accum_buffer + accum + epsilon).sqrt().reciprocal());
+  auto expr3 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
   auto expr4 = value.lazyAssign(value + mom);
 
   AssignEvaluate(expr1, expr2, expr3, expr4);
@@ -98,8 +98,8 @@ void rmspropApply(BaseMatrix& value,
                   bool firstTime) {
   auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
   auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
-  auto expr4 = mom.lazyAssign(
-    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr4 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
   auto expr5 = value.lazyAssign(value + mom);
 
   if (firstTime) {
@@ -107,8 +107,8 @@ void rmspropApply(BaseMatrix& value,
 
     AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
   } else {
-    auto expr1 = g.lazyAssign(
-      accumulatedRou * g + ((real)1 - rou) * grad.square());
+    auto expr1 =
+        g.lazyAssign(accumulatedRou * g + ((real)1 - rou) * grad.square());
 
     AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
   }
@@ -127,8 +127,8 @@ void decayedAdagradApply(BaseMatrix& value,
                          real decayRate,
                          bool firstTime) {
   auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
-  auto expr3 = mom.lazyAssign(
-    mom * momentum - learningRate * lr * (grad + value * decayRate));
+  auto expr3 = mom.lazyAssign(mom * momentum -
+                              learningRate * lr * (grad + value * decayRate));
   auto expr4 = value.lazyAssign(value + mom);
 
   if (firstTime) {
@@ -136,8 +136,8 @@ void decayedAdagradApply(BaseMatrix& value,
 
     AssignEvaluate(expr1, expr2, expr3, expr4);
   } else {
-    auto expr1 = accum.lazyAssign(
-      accumulatedRou * accum + ((real)1 - rou) * grad.square());
+    auto expr1 = accum.lazyAssign(accumulatedRou * accum +
+                                  ((real)1 - rou) * grad.square());
 
     AssignEvaluate(expr1, expr2, expr3, expr4);
   }
@@ -153,13 +153,12 @@ void adamApply(BaseMatrix& value,
                real beta2_power,
                real epsilon,
                real learningRate) {
-  real alpha = learningRate *
-      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
 
   auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
   auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
-  auto expr3 = value.lazyAssign(
-    value - (mom * alpha) / (v.sqrt() + epsilon));
+  auto expr3 = value.lazyAssign(value - (mom * alpha) / (v.sqrt() + epsilon));
 
   AssignEvaluate(expr1, expr2, expr3);
 }
@@ -173,10 +172,10 @@ void adamaxApply(BaseMatrix& value,
                  int64_t step,
                  real alpha) {
   auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
-  auto expr2 = u.lazyAssign(
-    (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
+  auto expr2 =
+      u.lazyAssign((beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
   auto expr3 = value.lazyAssign(
-    value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
+      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
 
   AssignEvaluate(expr1, expr2, expr3);
 }
@@ -322,8 +321,8 @@ void adamApply(BaseMatrix& value,
                real beta2_power,
                real epsilon,
                real learningRate) {
-  real alpha = learningRate *
-      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
+  real alpha =
+      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
 
   // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
   mom = beta1 * mom + ((real)1 - beta1) * grad;
@@ -331,7 +330,7 @@ void adamApply(BaseMatrix& value,
   // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
   v = beta2 * v + ((real)1 - beta2) * grad.square();
 
-  value -=  (mom * alpha) / (v.sqrt() + epsilon);
+  value -= (mom * alpha) / (v.sqrt() + epsilon);
 }
 
 void adamaxApply(BaseMatrix& value,
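Editor's note: for reference, the update that both adamApply variants above implement is the standard Adam rule with the two bias corrections folded into a single scalar; writing it out makes the alpha precomputation clear (\eta is learningRate, and beta1_power, beta2_power are \beta_1^t, \beta_2^t):

    m_t = \beta_1 m_{t-1} + (1 - \beta_1) g_t
    v_t = \beta_2 v_{t-1} + (1 - \beta_2) g_t^2
    \alpha_t = \eta \, \frac{\sqrt{1 - \beta_2^t}}{1 - \beta_1^t}
    \theta_t = \theta_{t-1} - \frac{\alpha_t \, m_t}{\sqrt{v_t} + \epsilon}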
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index eaa1cdce30..346008439c 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <memory>
 #include "Matrix.h"
 #include "hl_gpu.h"
+#include "hl_matrix.h"
 #include "hl_table_apply.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Logging.h"
@@ -99,6 +100,19 @@ MatrixPtr VectorT<int>::toOneHotSparseMatrix(size_t idRange, bool useGpu) {
   return mat;
 }
 
+template <>
+std::shared_ptr<VectorT<int>> VectorT<real>::castToInt() {
+  std::shared_ptr<VectorT<int>> ret = IVector::create(this->getSize(), useGpu_);
+  if (useGpu_) {
+    hl_vector_cast2int(ret->getData(), this->getData(), this->getSize());
+  } else {
+    for (size_t i = 0; i < getSize(); ++i) {
+      ret->getData()[i] = int(this->getData()[i]);
+    }
+  }
+  return ret;
+}
+
 template <class T>
 GpuVectorT<T>::GpuVectorT(size_t size)
     : VectorT<T>(size,
@@ -172,7 +186,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
 
 template <class T>
 void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_vector_select_from<T>(this->getData(),
                            this->getSize(),
                            src.getData(),
@@ -657,6 +671,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) {
                     (void*)src.getData(),
                     sizeof(T) * this->getSize(),
                     stream);
+    // Synchronize here to ensure the copy has completed before returning.
+    hl_stream_synchronize(stream);
   } else {
     src.copyTo(this);
   }
@@ -848,7 +864,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                 size_t size)
     : sync_(nullptr) {
   CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   SyncedFlag* flag = src.getSync();
   if (*flag == DATA_AT_CPU) {
     src.copyToGpu();  // will set synchronous data between CPU and GPU
@@ -859,7 +875,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
   auto cMemHandle = (src.getVector(false))->getMemoryHandle();
   cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
       size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   auto gMemHandle = (src.getVector(true))->getMemoryHandle();
   gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
       size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
@@ -908,12 +924,13 @@ const T* CpuGpuVectorT<T>::getData(bool useGpu) const {
 // Operation will change data and need to reset sync_ & syncFlag_.
 #define MUTABLE_VECTOR_OP(OP, useGpu, args...) \
   do {                                         \
-    setSync(useGpu);                           \
     if (useGpu) {                              \
       copyToGpu();                             \
+      setSync(useGpu);                         \
       return gpuVectorT_->OP(args);            \
     } else {                                   \
       copyToCpu();                             \
+      setSync(useGpu);                         \
       return cpuVectorT_->OP(args);            \
     }                                          \
   } while (0)
@@ -1030,7 +1047,7 @@ void CpuGpuVectorT<T>::copyToCpu() {
     case DATA_AT_GPU:
       CHECK(gpuVectorT_);
       this->resizeOrCreate(gpuVectorT_->getSize(), false);
-      cpuVectorT_->copyFrom(*gpuVectorT_, HPPL_STREAM_DEFAULT);
+      cpuVectorT_->copyFrom(*gpuVectorT_);
       setSync(SYNCED);
       break;
     case DATA_AT_CPU:
@@ -1049,7 +1066,7 @@ void CpuGpuVectorT<T>::copyToGpu() {
     case DATA_AT_CPU:
       CHECK(cpuVectorT_);
       this->resizeOrCreate(cpuVectorT_->getSize(), true);
-      gpuVectorT_->copyFrom(*cpuVectorT_, HPPL_STREAM_DEFAULT);
+      gpuVectorT_->copyFrom(*cpuVectorT_);
       setSync(SYNCED);
       break;
     case DATA_AT_GPU:
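Editor's note: the castToInt() specialization added above converts by value, truncating each element toward zero (int(...) on the CPU path, hl_vector_cast2int on the GPU path), rather than reinterpreting bits. A usage sketch, assuming a CPU build and the usual Vector::create factory:

    // Sketch only; values chosen to show truncation toward zero.
    auto v = paddle::Vector::create(4, /* useGpu = */ false);
    v->getData()[0] = 0.9f;
    v->getData()[1] = -1.2f;
    v->getData()[2] = 3.0f;
    v->getData()[3] = 2.7f;

    auto ids = v->castToInt();  // holds {0, -1, 3, 2}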
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 9af6e30c9e..f965a58092 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -162,17 +162,24 @@ public:
    */
   std::shared_ptr<Matrix> toOneHotSparseMatrix(size_t idRange, bool useGpu);
 
+  /**
+   * @brief cast vector of "real" elements to "int" elements.
+   *
+   * @note: floats must be cast to ints element by element,
+   * or you will get wrong data.
+   */
+  std::shared_ptr<VectorT<int>> castToInt();
+
   /**
    * This function will crash if the size of src and dest is different.
    */
   virtual void copyFrom(const VectorT<T>& src) = 0;
 
   /**
-   * If use_gpu, this function will push the copy-task to the specifed-stream
-   * and return immediately.
+   * For GpuVector, this function is asynchronous: it pushes the copy task
+   * to the specified stream and returns immediately.
    *
-   * If not use GPU, this function is same as
-   * the copyFrom(const VectorT<T>& src), which use stream HPPL_STREAM_DEFAULT.
+   * For CpuVector, this function is synchronous, the same as
+   * copyFrom(const VectorT<T>& src).
    */
   virtual void copyFrom(const VectorT<T>& src, hl_stream_t stream) = 0;
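Editor's note: the revised comment encodes an asymmetry that is easy to trip over: the stream-taking copyFrom is asynchronous for GpuVectorT but synchronous for CpuVectorT (which is also why Vector.cpp above now calls hl_stream_synchronize after the CPU-side async memcpy). A hedged sketch of the safe GPU-side pattern, assuming a CUDA build and an existing stream handle:

    void copyThenRead(paddle::GpuVectorT<float>& dst,
                      const paddle::VectorT<float>& src,
                      hl_stream_t stream) {
      dst.copyFrom(src, stream);      // enqueued on `stream`, returns immediately
      hl_stream_synchronize(stream);  // wait before the host reads dst
    }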
 
diff --git a/paddle/math/float16.h b/paddle/math/float16.h
new file mode 100644
index 0000000000..efebbce504
--- /dev/null
+++ b/paddle/math/float16.h
@@ -0,0 +1,739 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdint.h>
+
+#ifdef PADDLE_WITH_CUDA
+#include <cuda.h>
+#endif  // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __GNUC__
+#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
+#else
+#define PADDLE_GNUC_VER 0
+#endif  // __GNUC__
+
+#ifdef __clang__
+#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__)
+#else
+#define PADDLE_CLANG_VER 0
+#endif  // __clang__
+
+#if defined(__CUDACC__) && CUDA_VERSION >= 7050
+#define PADDLE_CUDA_FP16
+#include <cuda_fp16.h>
+#endif
+
+#if defined(__arm__) || defined(__aarch64__)
+#define PADDLE_ARM
+#endif
+
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#define PADDLE_NEON
+#include <arm_neon.h>
+#endif
+
+#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \
+    (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37)
+#define PADDLE_WITH_NATIVE_FP16
+#endif
+
+#ifndef PADDLE_ARM
+#include <immintrin.h>
+#endif  // PADDLE_ARM
+
+#define PADDLE_ALIGN(x) __attribute__((aligned(x)))
+
+namespace paddle {
+
+// Use PADDLE_ALIGN(2) to ensure that each float16 will be allocated
+// and aligned at least on a 2-byte boundary, which leads to efficient
+// memory access of float16 struct and also makes float16 compatible
+// with CUDA half, ARM float16_t, and Eigen::half data types.
+struct PADDLE_ALIGN(2) float16 {
+public:
+  uint16_t x;
+
+  // Constructors
+  HOSTDEVICE inline float16() : x(0) {}
+
+  HOSTDEVICE inline float16(const float16& h) : x(h.x) {}
+
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit float16(const half& h) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&h))->x;
+#else
+    x = h.x;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {}
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  // __fp16 is the native half-precision type on ARM CPUs;
+  // float16_t is an alias for __fp16
+  HOSTDEVICE inline explicit float16(const float16_t& h) {
+    x = *reinterpret_cast<const uint16_t*>(&h);
+  }
+#endif
+
+  HOSTDEVICE inline explicit float16(float val) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = __float2half(val);
+    x = *reinterpret_cast<uint16_t*>(&tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float32x4_t tmp = vld1q_dup_f32(&val);
+    float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0);
+    x = *reinterpret_cast<uint16_t*>(&res);
+
+#elif defined(__F16C__)
+    x = _cvtss_sh(val, 0);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v, s;
+    v.f = val;
+    uint32_t sign = v.si & sigN;
+    v.si ^= sign;
+    sign >>= shiftSign;  // logical shift
+    s.si = mulN;
+    s.si = s.f * v.f;  // correct subnormals
+    v.si ^= (s.si ^ v.si) & -(minN > v.si);
+    v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN));
+    v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN));
+    v.ui >>= shift;  // logical shift
+    v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC);
+    v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC);
+    x = v.ui | sign;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {}
+
+  template <class T>
+  HOSTDEVICE inline explicit float16(const T& val)
+      : x(float16(static_cast<float>(val)).x) {}
+
+  HOSTDEVICE inline float16& operator=(const float16& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+// Assignment operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline float16& operator=(const half& rhs) {
+#if CUDA_VERSION >= 9000
+    x = reinterpret_cast<__half_raw*>(const_cast<half*>(&rhs))->x;
+#else
+    x = rhs.x;
+#endif
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) {
+    x = rhs.x;
+    return *this;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline float16& operator=(const float16_t& rhs) {
+    x = *reinterpret_cast<const uint16_t*>(&rhs);
+    return *this;
+  }
+#endif
+
+  HOSTDEVICE inline float16& operator=(bool b) {
+    x = b ? 0x3c00 : 0;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint8_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint16_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint32_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(int64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(uint64_t val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(float val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+  HOSTDEVICE inline float16& operator=(double val) {
+    x = float16(val).x;
+    return *this;
+  }
+
+// Conversion operators
+#ifdef PADDLE_CUDA_FP16
+  HOSTDEVICE inline explicit operator half() const {
+#if CUDA_VERSION >= 9000
+    __half_raw h;
+    h.x = x;
+    return half(h);
+#else
+    half h;
+    h.x = x;
+    return h;
+#endif  // CUDA_VERSION >= 9000
+  }
+#endif  // PADDLE_CUDA_FP16
+
+  HOSTDEVICE inline explicit operator Eigen::half() const {
+    Eigen::half h;
+    h.x = x;
+    return h;
+  }
+
+#ifdef PADDLE_WITH_NATIVE_FP16
+  HOSTDEVICE inline explicit operator float16_t() const {
+    return *reinterpret_cast<const float16_t*>(this);
+  }
+#endif
+
+  HOSTDEVICE inline explicit operator float() const {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+    half tmp = *reinterpret_cast<const half*>(this);
+    return __half2float(tmp);
+
+#elif defined(PADDLE_WITH_NATIVE_FP16)
+    float16x4_t res = vld1_dup_f16(reinterpret_cast<const float16_t*>(this));
+    return vgetq_lane_f32(vcvt_f32_f16(res), 0);
+
+#elif defined(__F16C__)
+    return _cvtsh_ss(this->x);
+
+#else
+    // Conversion routine adapted from
+    // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion
+    Bits v;
+    v.ui = this->x;
+    int32_t sign = v.si & sigC;
+    v.si ^= sign;
+    sign <<= shiftSign;
+    v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC);
+    v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC);
+    Bits s;
+    s.si = mulC;
+    s.f *= v.si;
+    int32_t mask = -(norC > v.si);
+    v.si <<= shift;
+    v.si ^= (s.si ^ v.si) & mask;
+    v.si |= sign;
+    return v.f;
+
+#endif
+  }
+
+  HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; }
+
+  HOSTDEVICE inline explicit operator int8_t() const {
+    return static_cast<int8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint8_t() const {
+    return static_cast<uint8_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int16_t() const {
+    return static_cast<int16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint16_t() const {
+    return static_cast<uint16_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int32_t() const {
+    return static_cast<int32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint32_t() const {
+    return static_cast<uint32_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator int64_t() const {
+    return static_cast<int64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator uint64_t() const {
+    return static_cast<uint64_t>(float(*this));
+  }
+
+  HOSTDEVICE inline explicit operator double() const {
+    return static_cast<double>(float(*this));
+  }
+
+private:
+  union Bits {
+    float f;
+    int32_t si;
+    uint32_t ui;
+  };
+
+  static const int shift = 13;
+  static const int shiftSign = 16;
+
+  static const int32_t infN = 0x7F800000;
+  static const int32_t maxN = 0x477FE000;  // max flt16 as flt32
+  static const int32_t minN = 0x38800000;  // min flt16 normal as flt32
+  static const int32_t sigN = 0x80000000;  // sign bit
+
+  static constexpr int32_t infC = infN >> shift;
+  static constexpr int32_t nanN = (infC + 1)
+                                  << shift;  // minimum flt16 nan as float32
+  static constexpr int32_t maxC = maxN >> shift;
+  static constexpr int32_t minC = minN >> shift;
+  static constexpr int32_t sigC = sigN >> shiftSign;
+
+  static const int32_t mulN = 0x52000000;  // (1 << 23) / minN
+  static const int32_t mulC = 0x33800000;  // minN / (1 << (23 - shift))
+  static const int32_t subC = 0x003FF;     // max flt32 subnormal downshifted
+  static const int32_t norC = 0x00400;     // min flt32 normal downshifted
+
+  static constexpr int32_t maxD = infC - maxC - 1;
+  static constexpr int32_t minD = minC - subC - 1;
+};
+
+// Arithmetic operators on GPU
+// CUDA 9.0 provides built-in arithmetic operators for half, while
+// CUDA 7.5 and 8.0 do not. The operators defined here let users write
+// the same half-precision CUDA code under CUDA 7.5 and 8.0 as under
+// CUDA 9.0.
+#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000
+
+DEVICE inline half operator+(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hadd(a, b);
+#else
+  float res = float(float16(a)) + float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hsub(a, b);
+#else
+  float res = float(float16(a)) - float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator*(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hmul(a, b);
+#else
+  float res = float(float16(a)) * float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator/(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
+  float num = __half2float(a);
+  float denom = __half2float(b);
+  return __float2half(num / denom);
+#else
+  float res = float(float16(a)) / float(float16(b));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half operator-(const half& a) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hneg(a);
+#else
+  float res = -float(float16(a));
+  return half(float16(res));
+#endif
+}
+
+DEVICE inline half& operator+=(half& a, const half& b) {
+  a = a + b;
+  return a;
+}
+
+DEVICE inline half& operator-=(half& a, const half& b) {
+  a = a - b;
+  return a;
+}
+
+DEVICE inline half& operator*=(half& a, const half& b) {
+  a = a * b;
+  return a;
+}
+
+DEVICE inline half& operator/=(half& a, const half& b) {
+  a = a / b;
+  return a;
+}
+
+DEVICE inline bool operator==(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __heq(a, b);
+#else
+  return float(float16(a)) == float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator!=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hne(a, b);
+#else
+  return float(float16(a)) != float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hlt(a, b);
+#else
+  return float(float16(a)) < float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator<=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hle(a, b);
+#else
+  return float(float16(a)) <= float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hgt(a, b);
+#else
+  return float(float16(a)) > float(float16(b));
+#endif
+}
+
+DEVICE inline bool operator>=(const half& a, const half& b) {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hge(a, b);
+#else
+  return float(float16(a)) >= float(float16(b));
+#endif
+}
+
+#endif  // PADDLE_CUDA_FP16 && CUDA_VERSION < 9000
+
+// Arithmetic operators on ARMv8.2-A CPU
+#if defined(PADDLE_WITH_NATIVE_FP16)
+HOST inline float16 operator+(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fadd h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fsub h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator*(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fmul h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator/(const float16& a, const float16& b) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fdiv h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0", "v1");
+  return res;
+}
+
+HOST inline float16 operator-(const float16& a) {
+  float16 res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "fneg h0, h0\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [res_ptr] "r"(&(res.x))
+      :  // clobbers
+      "memory", "v0");
+  return res;
+}
+
+HOST inline float16& operator+=(float16& a, const float16& b) {
+  a = a + b;
+  return a;
+}
+
+HOST inline float16& operator-=(float16& a, const float16& b) {
+  a = a - b;
+  return a;
+}
+
+HOST inline float16& operator*=(float16& a, const float16& b) {
+  a = a * b;
+  return a;
+}
+
+HOST inline float16& operator/=(float16& a, const float16& b) {
+  a = a / b;
+  return a;
+}
+
+HOST inline bool operator==(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmeq h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator!=(const float16& a, const float16& b) {
+  return !(a == b);
+}
+
+HOST inline bool operator<(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator<=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v1.h}[0], [%[a_ptr]]\n"
+      "ld1 {v0.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmgt h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+HOST inline bool operator>=(const float16& a, const float16& b) {
+  uint16_t res;
+  asm volatile(
+      "ld1 {v0.h}[0], [%[a_ptr]]\n"
+      "ld1 {v1.h}[0], [%[b_ptr]]\n"
+      "fcmge h0, h0, h1\n"
+      "st1 {v0.h}[0], [%[res_ptr]]\n"
+      :  // outputs
+      :  // inputs
+      [a_ptr] "r"(&(a.x)),
+      [b_ptr] "r"(&(b.x)),
+      [res_ptr] "r"(&res)
+      :  // clobbers
+      "memory", "v0", "v1");
+  return (res & 0xffff) != 0;
+}
+
+// Arithmetic operators, software emulated on other CPUs
+#else
+HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) {
+  return float16(float(a) + float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) {
+  return float16(float(a) - float(b));
+}
+
+HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) {
+  return float16(float(a) * float(b));
+}
+
+HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) {
+  return float16(float(a) / float(b));
+}
+
+HOSTDEVICE inline float16 operator-(const float16& a) {
+  float16 res;
+  res.x = a.x ^ 0x8000;
+  return res;
+}
+
+HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) {
+  a = float16(float(a) + float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) {
+  a = float16(float(a) - float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) {
+  a = float16(float(a) * float(b));
+  return a;
+}
+
+HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) {
+  a = float16(float(a) / float(b));
+  return a;
+}
+
+HOSTDEVICE inline bool operator==(const float16& a, const float16& b) {
+  return float(a) == float(b);
+}
+
+HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) {
+  return float(a) != float(b);
+}
+
+HOSTDEVICE inline bool operator<(const float16& a, const float16& b) {
+  return float(a) < float(b);
+}
+
+HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) {
+  return float(a) <= float(b);
+}
+
+HOSTDEVICE inline bool operator>(const float16& a, const float16& b) {
+  return float(a) > float(b);
+}
+
+HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
+  return float(a) >= float(b);
+}
+#endif
+}  // namespace paddle
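Editor's note: one detail worth spelling out for the float16 class above is the IEEE binary16 layout behind magic constants such as 0x3c00: 1 sign bit, 5 exponent bits (bias 15), and 10 mantissa bits, so 1.0 is 0|01111|0000000000 = 0x3C00, exactly what the bool constructor stores for true. A small round-trip check (the values used are exactly representable, so the asserts hold on every conversion path):

    #include <cassert>
    #include "paddle/math/float16.h"

    int main() {
      paddle::float16 one(true);
      assert(one.x == 0x3c00);   // 0 | 01111 | 0000000000 == 1.0

      paddle::float16 h(0.5f);
      assert(h.x == 0x3800);     // exponent field 14 == -1 + bias 15, mantissa 0
      assert(float(h) == 0.5f);  // 0.5 round-trips exactly

      paddle::float16 neg = -h;  // the emulated path just flips the sign bit
      assert(float(neg) == -0.5f);
      return 0;
    }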
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index ceb96b2e25..dcd2a34583 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -3,8 +3,10 @@
 add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
-add_simple_unittest(test_SparseMatrix)
 add_simple_unittest(test_RowBuffer)
+if(NOT MOBILE_INFERENCE)
+    add_simple_unittest(test_SparseMatrix)
+endif()
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
@@ -20,6 +22,7 @@ if(WITH_GPU)
     link_paddle_test(test_Tensor)
     CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
     link_paddle_test(test_lazyAssign)
+    nv_test(test_float16_gpu SRCS test_float16.cu)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
@@ -31,3 +34,4 @@ add_simple_unittest(test_FPException)
 add_simple_unittest(test_GpuProfiler)
 add_simple_unittest(test_BaseMatrix)
 add_simple_unittest(test_Matrix)
+add_simple_unittest(test_float16)
diff --git a/paddle/math/tests/TensorCheck.h b/paddle/math/tests/TensorCheck.h
index 5bc4a03067..b998e5772e 100644
--- a/paddle/math/tests/TensorCheck.h
+++ b/paddle/math/tests/TensorCheck.h
@@ -169,7 +169,7 @@ void TensorCheck(AssertEq compare,
       count++;
     }
   }
-  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
+  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
 }
 
 template <typename AssertEq, typename Tensor1, typename Tensor2>
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 1ca70ea84c..1fecf659e5 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -68,7 +68,7 @@ void testPoolAllocator() {
 
 TEST(Allocator, Pool) {
   testPoolAllocator<CpuAllocator>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolAllocator<GpuAllocator>();
 #endif
 }
@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
   EXPECT_EQ(ptr1, ptr2);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MemoryHandle, Gpu) {
   int numGpu = hl_get_device_count();
 
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 22ce39701f..1766257860 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
  * implementation of CPU and GPU member function in
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 58bc43a38b..c72f89c824 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 04c856453d..25e0ba11de 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(ExecViaCpu, test1) {
   testWrapper(f);
   testWrapper(&f);
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index e6b5dba446..d9f146f0d1 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
@@ -162,4 +162,4 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-#endif /* PADDLE_ONLY_CPU */
+#endif
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 1c21da5b76..2f99fa3581 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithArg to compares the
  * implementation of CPU and GPU member function in Matrix.cpp.
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index c0572dfdbf..8abbe8d82e 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -47,7 +47,7 @@ struct MatrixPara {
   SparseFormat format;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void test_sparse_matrix_mul(MatrixPara paraA,
                             MatrixPara paraB,
                             MatrixPara paraC) {
@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
   matC->trimFrom(*mat);
@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
   matC->trimFrom(*mat);
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 40e38434fa..d03698dee2 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/math/Matrix.h"
 #include "TensorCheck.h"
+#include "paddle/math/Matrix.h"
 
 using paddle::Matrix;
 using paddle::CpuMatrix;
@@ -26,25 +26,25 @@ using paddle::GpuIVector;
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
-#define INIT_UNARY(A1, A2)                  \
-    Tensor A1(height, width);               \
-    Tensor A2(height, width);               \
-    A1.randomizeUniform();                  \
-    A2.copyFrom(A1)
-#define INIT_BINARY(A1, A2, B)              \
-    INIT_UNARY(A1, A2);                     \
-    Tensor B(height, width);                \
-    B.randomizeUniform()
-#define INIT_TERNARY(A1, A2, B, C)          \
-    INIT_BINARY(A1, A2, B);                 \
-    Tensor C(height, width);                \
-    C.randomizeUniform()
-#define INIT_QUATERNARY(A1, A2, B, C, D)    \
-    INIT_TERNARY(A1, A2, B, C);             \
-    Tensor D(height, width);                \
-    D.randomizeUniform()
-
-template<typename Tensor>
+#define INIT_UNARY(A1, A2)  \
+  Tensor A1(height, width); \
+  Tensor A2(height, width); \
+  A1.randomizeUniform();    \
+  A2.copyFrom(A1)
+#define INIT_BINARY(A1, A2, B) \
+  INIT_UNARY(A1, A2);          \
+  Tensor B(height, width);     \
+  B.randomizeUniform()
+#define INIT_TERNARY(A1, A2, B, C) \
+  INIT_BINARY(A1, A2, B);          \
+  Tensor C(height, width);         \
+  C.randomizeUniform()
+#define INIT_QUATERNARY(A1, A2, B, C, D) \
+  INIT_TERNARY(A1, A2, B, C);            \
+  Tensor D(height, width);               \
+  D.randomizeUniform()
+
+template <typename Tensor>
 struct TestUnaryMatrix {
   typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
 
@@ -59,7 +59,7 @@ struct TestUnaryMatrix {
   }
 };
 
-template<typename Tensor>
+template <typename Tensor>
 struct TestBinaryMatrix {
   typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;
 
@@ -74,10 +74,10 @@ struct TestBinaryMatrix {
   }
 };
 
-template<typename Tensor>
+template <typename Tensor>
 struct TestTernaryMatrix {
-  typedef std::function<void(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)> TernaryFunc;
+  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)>
+      TernaryFunc;
 
   explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
     for (auto height : {1, 11, 73, 128, 200, 330}) {
@@ -90,10 +90,11 @@ struct TestTernaryMatrix {
   }
 };
 
-template<typename Tensor>
+template <typename Tensor>
 struct TestQuaternaryMatrix {
   typedef std::function<void(
-    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> QuaternaryFunc;
+      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)>
+      QuaternaryFunc;
 
   explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
     for (auto height : {1, 11, 73, 128, 200, 330}) {
@@ -106,7 +107,7 @@ struct TestQuaternaryMatrix {
   }
 };
 
-template<typename Tensor, class T>
+template <typename Tensor, class T>
 struct TestUnaryVectorT {
   typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;
 
@@ -142,11 +143,11 @@ void SetTensorValue(Matrix& matrix, real value) {
   }
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAddScalar(Tensor& A1, Tensor& A2) {
   real p1 = 2.5;
   real p2 = 3.0;
-  A1.add(p1);   // a += p
+  A1.add(p1);  // a += p
   A2 += p1;
   TensorCheckEqual(A1, A2);
 
@@ -155,7 +156,7 @@ void testTensorAddScalar(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSubScalar(Tensor& A1, Tensor& A2) {
   real p = 2.5;
   A1.subScalar(p);  // a -= p
@@ -163,7 +164,7 @@ void testTensorSubScalar(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorMulScalar(Tensor& A1, Tensor& A2) {
   real p = 2.5;
   A1.mulScalar(p);  // a *= p
@@ -177,7 +178,7 @@ void testTensorMulScalar(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorDivScalar(Tensor& A1, Tensor& A2) {
   real p = 2.5;
   A1.divScalar(p);  // a /= p
@@ -185,44 +186,44 @@ void testTensorDivScalar(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorNeg(Tensor& A1, Tensor& A2) {
   A1.neg();  // a = -a
   A2 = -A2;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAbs(Tensor& A1, Tensor& A2) {
   A1.abs2();  // a = a > 0 ? a : -a
   A2 = A2.abs();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSquare(Tensor& A1, Tensor& A2) {
   A1.square2();  // a = a * a
   A2 = A2.square();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorReciprocal(Tensor& A1, Tensor& A2) {
   A1.reciprocal2();  // a = 1.0f / a
   A2 = A2.reciprocal();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSign(Tensor& A1, Tensor& A2) {
   A1.sign2();  // a = (a > 0) - (a < 0)
   A2 = A2.sign();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAssign(Tensor& A1, Tensor& A2) {
-  A1.assign(1.5);   // a = p
+  A1.assign(1.5);  // a = p
   A2 = A2.constant(1.5);
   TensorCheckEqual(A1, A2);
 
@@ -235,7 +236,7 @@ void testTensorAssign(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
   testTensorAddScalar(A1, A2);
   testTensorSubScalar(A1, A2);
@@ -249,9 +250,9 @@ void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
   testTensorAssign(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
-  A1.add(2);   // a += p
+  A1.add(2);  // a += p
   A2 += 2;
   TensorCheckEqual(A1, A2);
 
@@ -266,46 +267,46 @@ void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
 TEST(Unary, BaseOp) {
   TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
   TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
-  TestUnaryVectorT<CpuIVector, int>
-    testCpuIVector(testUnaryBaseOpInt<CpuIVector>);
+  TestUnaryVectorT<CpuIVector, int> testCpuIVector(
+      testUnaryBaseOpInt<CpuIVector>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
   TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
-  TestUnaryVectorT<GpuIVector, int>
-    testGpuIVector(testUnaryBaseOpInt<GpuIVector>);
+  TestUnaryVectorT<GpuIVector, int> testGpuIVector(
+      testUnaryBaseOpInt<GpuIVector>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorExp(Tensor& A1, Tensor& A2) {
   A1.exp2();  // a = exp(a)
   A2 = A2.exp();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorLog(Tensor& A1, Tensor& A2) {
   A1.log2();  // a = log(a)
   A2 = A2.log();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSqrt(Tensor& A1, Tensor& A2) {
   A1.sqrt2();  // a = sqrt(a)
   A2 = A2.sqrt();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorPow(Tensor& A1, Tensor& A2) {
   A1.pow2(3.2);  // a = pow(a, p)
   A2 = A2.pow(3.2);
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testUnayrMathOp(Tensor& A1, Tensor& A2) {
   testTensorExp(A1, A2);
   testTensorLog(A1, A2);
@@ -316,12 +317,12 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, MathOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorClip(Tensor& A1, Tensor& A2) {
   real p1 = 0.003f;
   real p2 = 0.877f;
@@ -331,7 +332,7 @@ void testTensorClip(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
   real p = 0.5f;
   A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
@@ -339,7 +340,7 @@ void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorapplyL1(Tensor& A1, Tensor& A2) {
   /**
    * T lambda = p;
@@ -351,14 +352,15 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2) {
   real learningRate = 0.7f;
   real decayRate = 0.6f;
   A1.applyL1(learningRate, decayRate);
-  A2 = (A2 > (learningRate * decayRate)).condition(
-    (A2 - (learningRate * decayRate)),
-    (A2 < -(learningRate * decayRate)).condition(
-      (A2 + (learningRate * decayRate)), (real)0.0));
+  A2 = (A2 > (learningRate * decayRate))
+           .condition(
+               (A2 - (learningRate * decayRate)),
+               (A2 < -(learningRate * decayRate))
+                   .condition((A2 + (learningRate * decayRate)), (real)0.0));
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
   testTensorClip(A1, A2);
   testTensorBiggerThanScalar(A1, A2);
@@ -372,12 +374,12 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, CompareOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
   real p1 = 2.5;
   real p2 = 3.2;
@@ -406,7 +408,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
   real p = 2.5;
   A1.sub(B);  // a -= b
@@ -422,7 +424,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
   real p = 2.5;
   A1.mulScalar(B, p);  // a = b * p
@@ -442,7 +444,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
   real p = 2.5;
   A1.divScalar(B, p);  // a = b / p
@@ -454,28 +456,28 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.assign(B);  // a = b
   A2 = B;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
-  B.square2(A1);   // b = a * a
+  B.square2(A1);  // b = a * a
   A2 = B.square();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.squareDerivative(B);  // a *= 2.0 * b
   A2 = A2 * (real)2.0 * B;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
   B.reciprocal2(A1);  // b = 1.0f / a
   A2 = B.reciprocal();
@@ -490,33 +492,33 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
   real learningRate = 0.7f;
   real decayRate = 1.2f;
   A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
-  A2 *= (B.constant(1.0f) +
-    B.constant(learningRate * decayRate) * B).reciprocal();
+  A2 *= (B.constant(1.0f) + B.constant(learningRate * decayRate) * B)
+            .reciprocal();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.reciprocalDerivative(B);  // a *= -b * b
   A2 *= (-B) * B;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
   B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
   A2 = B.sign();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
   B.abs2(A1);  // b = a > 0.0f ? a : -a
   A2 = B.abs();
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
   testTensorAdd(A1, A2, B);
   testTensorSub(A1, A2, B);
@@ -534,12 +536,12 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, BaseOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
   // a = exp(b)
   A1.exp2(B);
@@ -547,14 +549,14 @@ void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.expDerivative(B);  // a *= b
   A2 *= B;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
   // a = log(b)
   A1.log2(B);
@@ -562,7 +564,7 @@ void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
   // a = sqrt(b)
   A1.sqrt2(B);
@@ -570,7 +572,7 @@ void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
   // a = 1.0f / sqrt(b)
   A1.invSqrt(B);
@@ -578,14 +580,14 @@ void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.pow2(B, 2.5f);  // a = pow(b, p)
   A2 = B.pow(2.5f);
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
   /*
    * const T THRESHOLD = 40.0;
@@ -597,12 +599,14 @@ void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
 
   real THRESHOLD = 40.0;
   A2 = (B.constant(1.0f) +
-        (B > THRESHOLD).condition(
-          THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log();
+        (B > THRESHOLD)
+            .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))
+            .exp())
+           .log();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   /*
    * const T THRESHOLD = 40.0;
@@ -612,14 +616,16 @@ void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
    */
   A1.softreluDerivative(B);
   real THRESHOLD = 40.0;
-  A2 = A2 * (B.constant(1.0f) -
-             (B.constant(-1.0f) *
-              (B > THRESHOLD).condition(
-                THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp());
+  A2 = A2 *
+       (B.constant(1.0f) -
+        (B.constant(-1.0f) *
+         (B > THRESHOLD)
+             .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)))
+            .exp());
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
   /*
     const T THRESHOLD_MIN = -40.0;
@@ -632,46 +638,47 @@ void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
 
   const real THRESHOLD_MIN = -40.0;
   const real THRESHOLD_MAX = 13.0;
-  auto tmp = (B < THRESHOLD_MIN).condition(
-    THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
+  auto tmp = (B < THRESHOLD_MIN)
+                 .condition(THRESHOLD_MIN,
+                            (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
   A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.sigmoidDerivative(B);  // a *= b * (1 - b)
   A2 *= B * (B.constant(1.0f) - B);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
   B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
   A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.tanhDerivative(B);  // a *= 1 - b * b
   A2 *= B.constant(1.0f) - B * B;
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
   real p1 = 2.5;
   real p2 = 3.1;
   // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
   B.scaledTanh(A1, p1, p2);
   A2 = B.constant(p1) *
-      (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0)
-       - (real)1.0);
+       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0) -
+        (real)1.0);
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   real p1 = 2.5;
   real p2 = 3.1;
@@ -681,7 +688,7 @@ void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
   testTensorTanhDerivative(A1, A2, B);
   testTensorScaledTanhDerivative(A1, A2, B);
@@ -703,26 +710,26 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, MathOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
   B.relu(A1);  // b = a > 0.0f ? a : 0.0f
   A2 = (B > (real)0.0f).condition(B, (real)0.0f);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
   A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
   /*
    * b = a > p1 ? a : p1
@@ -736,7 +743,7 @@ void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   SetTensorValue(B, 32.0f);
   /*
@@ -748,15 +755,15 @@ void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
   A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
-  A2 = (B > (real)0.0f).condition(A2,
-    (B < (real)0.0f).condition(-A2, (real)0.0f));
+  A2 = (B > (real)0.0f)
+           .condition(A2, (B < (real)0.0f).condition(-A2, (real)0.0f));
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
   real p = 0.613;
   SetTensorValue(B, p);
@@ -765,7 +772,7 @@ void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
   /**
    * T lambda = p * b;
@@ -778,12 +785,13 @@ void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
   real decayRate = 0.6f;
   A1.applyL1(B, learningRate, decayRate);
   auto lambda = B.constant(learningRate * decayRate) * B;
-  A2 = (A2 > lambda).condition(
-    (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
+  A2 = (A2 > lambda)
+           .condition((A2 - lambda),
+                      (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
   B.subScalar(0.5f);
   SetTensorValue(B, 0.0f);
@@ -802,12 +810,12 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, CompareOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.add(B, C);  // a = b + c
   A2 = B + C;
@@ -833,7 +841,7 @@ void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.sub(B, C);  // a = b - c
   A2 = B - C;
@@ -846,7 +854,7 @@ void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.dotMul(B, C);  // a = b * c
   A2 = B * C;
@@ -892,7 +900,7 @@ void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
   A2 = (B == (real)0.0).condition((real)0.0, B / C);
@@ -905,7 +913,7 @@ void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   real p1 = 1.5;
   real p2 = 2.5;
@@ -915,14 +923,14 @@ void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
   A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorSoftCrossEntropyBp(Tensor& A1,
                                   Tensor& A2,
                                   Tensor& B,
@@ -932,7 +940,7 @@ void testTensorSoftCrossEntropyBp(Tensor& A1,
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   testTensorAdd(A1, A2, B, C);
   testTensorSub(A1, A2, B, C);
@@ -947,35 +955,35 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, BaseOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBinaryLabelCrossEntropy(Tensor& A1,
                                        Tensor& A2,
                                        Tensor& B,
                                        Tensor& C) {
   A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
-  A2 = (C > (real)0.5).condition(
-    -(B.log()), -((B.constant(1.0f) - B).log()));
+  A2 = (C > (real)0.5).condition(-(B.log()), -((B.constant(1.0f) - B).log()));
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
                                          Tensor& A2,
                                          Tensor& B,
                                          Tensor& C) {
   // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
   A1.binaryLabelCrossEntropyBp(B, C);
-  A2 += (C > (real)0.5).condition(
-    (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal());
+  A2 += (C > (real)0.5)
+            .condition((B.constant(-1.0f) / B),
+                       (B.constant(1.0f) - B).reciprocal());
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorLogisticRegressionLoss(Tensor& A1,
                                       Tensor& A2,
                                       Tensor& B,
@@ -991,13 +999,14 @@ void testTensorLogisticRegressionLoss(Tensor& A1,
    */
   A1.logisticRegressionLoss(B, C);
   real THRESHOLD = 40.0;
-  auto tmp = (B > THRESHOLD).condition(
-    THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
   A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorLogisticRegressionLossBp(Tensor& A1,
                                         Tensor& A2,
                                         Tensor& B,
@@ -1013,28 +1022,29 @@ void testTensorLogisticRegressionLossBp(Tensor& A1,
    */
   A1.logisticRegressionLossBp(B, C);
   real THRESHOLD = 40.0;
-  auto tmp = (B > THRESHOLD).condition(
-    THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
+  auto tmp =
+      (B > THRESHOLD)
+          .condition(THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
   auto tmp2 = tmp.exp();
   A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
   A2 = (B > C).condition((real)1.0f, (real)0.0f);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   A1.max2(B, C);  // a = (b > c) ? b : c
   A2 = (B > C).condition(B, C);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
   testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
   testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
@@ -1048,17 +1058,14 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, CompareOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
-void testQuaternaryAdd(Tensor& A1,
-                       Tensor& A2,
-                       Tensor& B,
-                       Tensor& C,
-                       Tensor& D) {
+template <typename Tensor>
+void testQuaternaryAdd(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
   // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
   // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
   // TensorCheckEqual(A1, A2);
@@ -1079,30 +1086,24 @@ void testQuaternaryAdd(Tensor& A1,
 TEST(Quaternary, BaseOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
 #endif
 }
 
-template<typename Tensor>
-void testTensorBiggerThan(Tensor& A1,
-                          Tensor& A2,
-                          Tensor& B,
-                          Tensor& C,
-                          Tensor& D) {
+template <typename Tensor>
+void testTensorBiggerThan(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
   // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f;
   A1.biggerThan(B, C, D);
-  A2 = ((B > C && D > (real)0.5)
-        || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0);
+  A2 = ((B > C && D > (real)0.5) || (B < C && D < (real)0.5))
+           .condition((real)1.0, (real)0.0);
   TensorCheckEqual(A1, A2);
 }
 
-template<typename Tensor>
-void testTensorRankLoss(Tensor& A1,
-                        Tensor& A2,
-                        Tensor& B,
-                        Tensor& C,
-                        Tensor& D) {
+template <typename Tensor>
+void testTensorRankLoss(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
   /**
    * const T THRESHOLD = 40.0; a = b - c;
    * a = (a > THRESHOLD)
@@ -1114,19 +1115,17 @@ void testTensorRankLoss(Tensor& A1,
 
   real THRESHOLD = 40.0;
   auto tmp = B - C;
-  auto tmp2 = (tmp > THRESHOLD).condition(
-    THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
   A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
 
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
-void testTensorRankLossBp(Tensor& A1,
-                          Tensor& A2,
-                          Tensor& B,
-                          Tensor& C,
-                          Tensor& D) {
+template <typename Tensor>
+void testTensorRankLossBp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
   /**
    * const T THRESHOLD = 40.0; a = b - c;
    * a = (a > THRESHOLD)
@@ -1137,20 +1136,18 @@ void testTensorRankLossBp(Tensor& A1,
   A1.rankLossBp(B, C, D);
   real THRESHOLD = 40.0;
   auto tmp = B - C;
-  auto tmp2 = (tmp > THRESHOLD).condition(
-    THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
+  auto tmp2 =
+      (tmp > THRESHOLD)
+          .condition(THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
   auto tmp3 = tmp2.exp();
   A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
 
   TensorCheckErr(A1, A2);
 }
 
-template<typename Tensor>
-void testQuaternaryCompareOp(Tensor& A1,
-                             Tensor& A2,
-                             Tensor& B,
-                             Tensor& C,
-                             Tensor& D) {
+template <typename Tensor>
+void testQuaternaryCompareOp(
+    Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D) {
   testTensorBiggerThan(A1, A2, B, C, D);
   testTensorRankLoss(A1, A2, B, C, D);
   testTensorRankLossBp(A1, A2, B, C, D);
@@ -1159,7 +1156,7 @@ void testQuaternaryCompareOp(Tensor& A1,
 TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 4a88844b43..5ae0aa036f 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
 typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
 
 void testCase(testMatrixFunc matrixFunc) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   for (auto useGpu : {false, true}) {
 #else
   for (auto useGpu : {false}) {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 4eb9837909..b70a619764 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   const int nx = 100;
   const int ny = 50;
diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp
new file mode 100644
index 0000000000..74cc55aa37
--- /dev/null
+++ b/paddle/math/tests/test_float16.cpp
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(float16, conversion_cpu) {
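+  // float16::x holds the raw IEEE 754 binary16 bits (1 sign bit, 5
+  // exponent bits, 10 mantissa bits): 1.0f encodes as 0x3c00, the
+  // largest finite value 65504.0f as 0x7bff, and anything larger
+  // saturates to infinity, 0x7c00.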
+  // Explicit conversion from Eigen::half
+  EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800);
+  EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555);
+  EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000);
+  EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000);
+  EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff);
+  EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00);
+
+  // Conversion from float
+  EXPECT_EQ(float16(1.0f).x, 0x3c00);
+  EXPECT_EQ(float16(0.5f).x, 0x3800);
+  EXPECT_EQ(float16(0.33333f).x, 0x3555);
+  EXPECT_EQ(float16(0.0f).x, 0x0000);
+  EXPECT_EQ(float16(-0.0f).x, 0x8000);
+  EXPECT_EQ(float16(65504.0f).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0f).x, 0x7c00);
+
+  // Conversion from double
+  EXPECT_EQ(float16(1.0).x, 0x3c00);
+  EXPECT_EQ(float16(0.5).x, 0x3800);
+  EXPECT_EQ(float16(0.33333).x, 0x3555);
+  EXPECT_EQ(float16(0.0).x, 0x0000);
+  EXPECT_EQ(float16(-0.0).x, 0x8000);
+  EXPECT_EQ(float16(65504.0).x, 0x7bff);
+  EXPECT_EQ(float16(65536.0).x, 0x7c00);
+
+  // Conversion from int
+  EXPECT_EQ(float16(-1).x, 0xbc00);
+  EXPECT_EQ(float16(0).x, 0x0000);
+  EXPECT_EQ(float16(1).x, 0x3c00);
+  EXPECT_EQ(float16(2).x, 0x4000);
+  EXPECT_EQ(float16(3).x, 0x4200);
+
+  // Conversion from bool
+  EXPECT_EQ(float16(true).x, 0x3c00);
+  EXPECT_EQ(float16(false).x, 0x0000);
+
+  // Default constructor
+  float16 v_def;
+  EXPECT_EQ(v_def.x, 0x0000);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = v_def;
+  EXPECT_EQ(v_assign.x, 0x0000);
+  v_assign = Eigen::half(1.0f);
+  EXPECT_EQ(v_assign.x, 0x3c00);
+  v_assign = 0.5f;
+  EXPECT_EQ(v_assign.x, 0x3800);
+  v_assign = 0.33333;
+  EXPECT_EQ(v_assign.x, 0x3555);
+  v_assign = -1;
+  EXPECT_EQ(v_assign.x, 0xbc00);
+  v_assign = true;
+  EXPECT_EQ(v_assign.x, 0x3c00);
+
+  // Conversion operator
+  EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00);
+  EXPECT_EQ(float(float16(0.5f)), 0.5f);
+  EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001);
+  EXPECT_EQ(int(float16(-1)), -1);
+  EXPECT_EQ(bool(float16(true)), true);
+}
+
+TEST(float16, arithmetic_cpu) {
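+  // fp16 has a 10-bit mantissa (roughly 3 decimal digits), so results
+  // that are not exactly representable are checked with EXPECT_NEAR.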
+  EXPECT_EQ(float(float16(1) + float16(1)), 2);
+  EXPECT_EQ(float(float16(5) + float16(-5)), 0);
+  EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001);
+  EXPECT_EQ(float(float16(3) - float16(5)), -2);
+  EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001);
+  EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01);
+  EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01);
+  EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001);
+  EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f);
+  EXPECT_EQ(float(-float16(512.0f)), -512.0f);
+  EXPECT_EQ(float(-float16(-512.0f)), 512.0f);
+}
+
+TEST(float16, comparison_cpu) {
+  EXPECT_TRUE(float16(1.0f) == float16(1.0f));
+  EXPECT_FALSE(float16(-1.0f) == float16(-0.5f));
+  EXPECT_TRUE(float16(1.0f) != float16(0.5f));
+  EXPECT_FALSE(float16(-1.0f) != float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) < float16(2.0f));
+  EXPECT_FALSE(float16(-1.0f) < float16(-1.0f));
+  EXPECT_TRUE(float16(1.0f) <= float16(1.0f));
+  EXPECT_TRUE(float16(2.0f) > float16(1.0f));
+  EXPECT_FALSE(float16(-2.0f) > float16(-2.0f));
+  EXPECT_TRUE(float16(2.0f) >= float16(2.0f));
+
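+  // Per IEEE 754, +0.0 and -0.0 compare equal even though their bit
+  // patterns (0x0000 vs 0x8000) differ.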
+  EXPECT_TRUE(float16(0.0f) == float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) <= float16(-0.0f));
+  EXPECT_TRUE(float16(0.0f) >= float16(-0.0f));
+  EXPECT_FALSE(float16(0.0f) < float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) < float16(0.0f));
+  EXPECT_FALSE(float16(0.0f) > float16(-0.0f));
+  EXPECT_FALSE(float16(-0.0f) > float16(0.0f));
+}
+
+}  // namespace paddle
diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu
new file mode 100644
index 0000000000..4b520feaaf
--- /dev/null
+++ b/paddle/math/tests/test_float16.cu
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/math/float16.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/utils/Logging.h"
+
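+// The *_KERNEL macros below stamp out a one-thread __global__ kernel
+// for the given operator; the matching *_KERNEL_LAUNCH macros generate
+// a Test* host function that copies the half operands to the device,
+// launches the kernel, and checks the result back on the host.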
+#define ARITHMETIC_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, half* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define COMPOUND_KERNEL(op_type, sign) \
+  __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; }
+
+#define COMPARISON_KERNEL(op_type, sign)                                 \
+  __global__ void op_type(const half* in1, const half* in2, bool* out) { \
+    out[0] = in1[0] sign in2[0];                                         \
+  }
+
+#define ARITHMETIC_KERNEL_LAUNCH(op_type)                     \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2, *out;                                    \
+    half *d_in1, *d_in2, *d_out;                              \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    cudaMalloc((void**)&d_out, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    out = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
+    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(out[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    free(out);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+    cudaFree(d_out);                                          \
+  }
+
+#define COMPOUND_KERNEL_LAUNCH(op_type)                       \
+  void Test##op_type(float v_in1, float v_in2, float v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
+    half *in1, *in2;                                          \
+    half *d_in1, *d_in2;                                      \
+    int size = sizeof(half);                                  \
+    cudaMalloc((void**)&d_in1, size);                         \
+    cudaMalloc((void**)&d_in2, size);                         \
+    in1 = (half*)malloc(size);                                \
+    in2 = (half*)malloc(size);                                \
+    in1[0] = half(float16(v_in1));                            \
+    in2[0] = half(float16(v_in2));                            \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
+    op_type<<<1, 1>>>(d_in1, d_in2);                          \
+    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
+    EXPECT_EQ(float(float16(in1[0])), v_out);                 \
+    free(in1);                                                \
+    free(in2);                                                \
+    cudaFree(d_in1);                                          \
+    cudaFree(d_in2);                                          \
+  }
+
+#define COMPARISON_KERNEL_LAUNCH(op_type)                    \
+  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
+    LOG(INFO) << "Test " << #op_type << " on GPU!";          \
+    half *in1, *in2;                                         \
+    half *d_in1, *d_in2;                                     \
+    bool *out, *d_out;                                       \
+    int size = sizeof(half);                                 \
+    cudaMalloc((void**)&d_in1, size);                        \
+    cudaMalloc((void**)&d_in2, size);                        \
+    cudaMalloc((void**)&d_out, 1);                           \
+    in1 = (half*)malloc(size);                               \
+    in2 = (half*)malloc(size);                               \
+    out = (bool*)malloc(1);                                  \
+    in1[0] = half(float16(v_in1));                           \
+    in2[0] = half(float16(v_in2));                           \
+    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
+    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);    \
+    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                  \
+    cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost);       \
+    EXPECT_EQ(out[0], v_out);                                \
+    free(in1);                                               \
+    free(in2);                                               \
+    free(out);                                               \
+    cudaFree(d_in1);                                         \
+    cudaFree(d_in2);                                         \
+    cudaFree(d_out);                                         \
+  }
+
+#ifdef PADDLE_CUDA_FP16
+namespace paddle {
+
+#if CUDA_VERSION < 9000
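+// The device-side half operators exercised below are (presumably) the
+// ones paddle/math/float16.h defines for toolkits older than CUDA 9.0,
+// which introduced built-in half arithmetic and comparison operators.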
+ARITHMETIC_KERNEL(Add, +)
+ARITHMETIC_KERNEL(Sub, -)
+ARITHMETIC_KERNEL(Mul, *)
+ARITHMETIC_KERNEL(Div, /)
+
+ARITHMETIC_KERNEL_LAUNCH(Add)
+ARITHMETIC_KERNEL_LAUNCH(Sub)
+ARITHMETIC_KERNEL_LAUNCH(Mul)
+ARITHMETIC_KERNEL_LAUNCH(Div)
+
+// Negative sign kernel
+__global__ void Neg(half* in) { in[0] = -in[0]; }
+
+void TestNeg(float v_in, float v_out) {
+  LOG(INFO) << "Test Neg on GPU!";
+  half *in, *d_in;
+  int size = sizeof(half);
+  cudaMalloc((void**)&d_in, size);
+  in = (half*)malloc(size);
+  in[0] = half(float16(v_in));
+  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
+  Neg<<<1, 1>>>(d_in);
+  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
+  EXPECT_EQ(float(float16(in[0])), v_out);
+  free(in);
+  cudaFree(d_in);
+}
+
+COMPOUND_KERNEL(AddAssign, +=)
+COMPOUND_KERNEL(SubAssign, -=)
+COMPOUND_KERNEL(MulAssign, *=)
+COMPOUND_KERNEL(DivAssign, /=)
+
+COMPOUND_KERNEL_LAUNCH(AddAssign)
+COMPOUND_KERNEL_LAUNCH(SubAssign)
+COMPOUND_KERNEL_LAUNCH(MulAssign)
+COMPOUND_KERNEL_LAUNCH(DivAssign)
+
+COMPARISON_KERNEL(Equal, ==)
+COMPARISON_KERNEL(NotEqual, !=)
+COMPARISON_KERNEL(Less, <)
+COMPARISON_KERNEL(LessEqual, <=)
+COMPARISON_KERNEL(Greater, >)
+COMPARISON_KERNEL(GreaterEqual, >=)
+
+COMPARISON_KERNEL_LAUNCH(Equal)
+COMPARISON_KERNEL_LAUNCH(NotEqual)
+COMPARISON_KERNEL_LAUNCH(Less)
+COMPARISON_KERNEL_LAUNCH(LessEqual)
+COMPARISON_KERNEL_LAUNCH(Greater)
+COMPARISON_KERNEL_LAUNCH(GreaterEqual)
+
+TEST(float16, arithmetic_on_gpu) {
+  TestAdd(1, 2, 3);
+  TestSub(2, 1, 1);
+  TestMul(2, 3, 6);
+  TestDiv(6, 2, 3);
+  TestNeg(1, -1);
+}
+
+TEST(float16, compound_on_gpu) {
+  TestAddAssign(1, 2, 3);
+  TestSubAssign(2, 1, 1);
+  TestMulAssign(2, 3, 6);
+  TestDivAssign(6, 2, 3);
+}
+
+TEST(float16, comparison_on_gpu) {
+  TestEqual(1, 1, true);
+  TestEqual(1, 2, false);
+  TestNotEqual(2, 3, true);
+  TestNotEqual(2, 2, false);
+  TestLess(3, 4, true);
+  TestLess(3, 3, false);
+  TestLessEqual(3, 3, true);
+  TestLessEqual(3, 2, false);
+  TestGreater(4, 3, true);
+  TestGreater(4, 4, false);
+  TestGreaterEqual(4, 4, true);
+  TestGreaterEqual(4, 5, false);
+}
+#endif  // CUDA_VERSION < 9000
+
+TEST(float16, conversion_on_gpu) {
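+  // half <-> float16 conversion needs none of the half operators, so
+  // this test runs regardless of the CUDA version.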
+  // Explicit conversion to and from cuda half
+  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
+  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
+  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
+  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
+  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
+  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
+  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);
+
+  // Assignment operator
+  float16 v_assign;
+  v_assign = half(float16(1.0f));
+  EXPECT_EQ(v_assign.x, 0x3c00);
+}
+
+}  // namespace paddle
+#endif  // PADDLE_CUDA_FP16
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 786d863a53..04f23cff55 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+#include "PerfUtils.h"
+#include "TensorCheck.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/TensorAssign.h"
-#include "TensorCheck.h"
-#include "PerfUtils.h"
 
 using paddle::BaseMatrix;
 using paddle::CpuMatrix;
@@ -27,14 +27,28 @@ using autotest::TensorCheckErr;
 typedef std::function<void(int height, int width)> testMatrixFunc;
 void testMatrixCase(testMatrixFunc matrixFunc) {
   for (auto height : {1}) {
-    for (auto width : {1, 32, 64, 128, 512, 1024, 4096, 32768, 65536, 131072,
-                       262144, 524288, 1048576, 2097152, 4194304, 8388608}) {
+    for (auto width : {1,
+                       32,
+                       64,
+                       128,
+                       512,
+                       1024,
+                       4096,
+                       32768,
+                       65536,
+                       131072,
+                       262144,
+                       524288,
+                       1048576,
+                       2097152,
+                       4194304,
+                       8388608}) {
       matrixFunc(height, width);
     }
   }
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testLazyAssign(int height, int width) {
   Tensor A1(height, width);
   Tensor A2(height, width);
@@ -49,40 +63,39 @@ void testLazyAssign(int height, int width) {
 
   EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);
 
-  EXPRESSION_PERFORMANCE(
-    auto expr1 = A2.lazyAssign(B + C);
-    auto expr2 = A2.lazyAssign(A2 * D);
-    AssignEvaluate(expr1, expr2););
+  EXPRESSION_PERFORMANCE(auto expr1 = A2.lazyAssign(B + C);
+                         auto expr2 = A2.lazyAssign(A2 * D);
+                         AssignEvaluate(expr1, expr2););
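+  // lazyAssign defers evaluation so that AssignEvaluate can
+  // (presumably) fuse both assignments into a single pass over the
+  // data; the two timed variants above compare exactly that.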
 
   TensorCheckErr(A1, A2);
 }
 
-TEST(lazyAssign, CPU) {
-  testMatrixCase(testLazyAssign<CpuMatrix>);
-}
+TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
-TEST(lazyAssign, GPU) {
-  testMatrixCase(testLazyAssign<GpuMatrix>);
-}
+#ifdef PADDLE_WITH_CUDA
+TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
 #endif
 
-template<typename Tensor>
-void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
-     real p1, real p2, real p3) {
+template <typename Tensor>
+void sgdUpdateTensor(
+    Tensor& A, Tensor& B, Tensor& C, Tensor& D, real p1, real p2, real p3) {
   C = C * p2 - D * (B + A * p3) * p1;
   A += C;
 }
 
-void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
-    BaseMatrix& C, BaseMatrix& D,
-    real p1, real p2, real p3) {
+void sgdUpdateLazyAssign(BaseMatrix& A,
+                         BaseMatrix& B,
+                         BaseMatrix& C,
+                         BaseMatrix& D,
+                         real p1,
+                         real p2,
+                         real p3) {
   auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
   auto expr2 = A.lazyAssign(A + C);
   AssignEvaluate(expr1, expr2);
 }
 
-template<typename Tensor>
+template <typename Tensor>
 void testSgdUpdate(int height, int width) {
   Tensor A1(height, width);
   Tensor A2(height, width);
@@ -113,16 +126,13 @@ void testSgdUpdate(int height, int width) {
    * a = a + c;
    */
   // BaseMatrix API
-  EXPRESSION_PERFORMANCE(
-  A1.sgdUpdate(B, C1, D, p1, p2, p3););
+  EXPRESSION_PERFORMANCE(A1.sgdUpdate(B, C1, D, p1, p2, p3););
 
   // Tensor expression
-  EXPRESSION_PERFORMANCE(
-    sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
+  EXPRESSION_PERFORMANCE(sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));
 
   // lazyAssign
-  EXPRESSION_PERFORMANCE(
-    sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
+  EXPRESSION_PERFORMANCE(sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));
 
   TensorCheckErr(A1, A2);
   TensorCheckErr(A1, A3);
@@ -130,12 +140,8 @@ void testSgdUpdate(int height, int width) {
   TensorCheckErr(C1, C3);
 }
 
-TEST(sgdUpdate, CPU) {
-  testMatrixCase(testSgdUpdate<CpuMatrix>);
-}
+TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
-TEST(sgdUpdate, GPU) {
-  testMatrixCase(testSgdUpdate<GpuMatrix>);
-}
+#ifdef PADDLE_WITH_CUDA
+TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
 #endif
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 5a0dffe086..afb8d9d599 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks that GpuMatrix and CpuMatrix produce the same
 /// results, so it is disabled in the CPU-only build.
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
+#include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
@@ -79,8 +80,8 @@ void testMatrixMaxSequence(int batchSize, int inputDim) {
 }
 
 TEST(Matrix, maxSequence) {
-  for (auto batchSize : {1, 10, 128, 1000, 6000}) {
-    for (auto inputDim : {1, 32, 100, 512}) {
+  for (auto batchSize : {1, 3, 997}) {   // prime numbers close to 1, 4, 1024
+    for (auto inputDim : {1, 7, 131}) {  // prime numbers close to 1, 8, 128
       VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
       testMatrixMaxSequence(batchSize, inputDim);
     }
@@ -240,14 +241,10 @@ TEST(Matrix, unary) {
     // inverse matrix
     testMatrixInverse(height);
 #else
-    LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n"
-                 << "Failed to find lapack library in current system.\n"
-                 << "To address this issue, Please adopt one of the following "
-                    "approaches: \n"
-                 << "1. Simply issue `sudo apt-get install liblapacke-dev` to "
-                    "avoid re-build source code. \n"
-                 << "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle "
-                    "source code.";
+    LOG(WARNING) << "This version of PaddlePaddle was not built with LAPACK "
+                 << "support, so we cannot test matrix inverse. To test "
+                 << "matrix inverse, please install LAPACKE "
+                 << "and MKL/OpenBLAS, and re-build PaddlePaddle.";
 #endif
   }
 }
@@ -341,8 +338,8 @@ void testMatrixSoftmaxBp(int height, int width) {
 }
 
 TEST(Matrix, softmax) {
-  for (auto height : {1, 11, 73, 128, 200}) {
-    for (auto width : {1, 32, 100, 512, 1000}) {
+  for (auto height : {1, 3, 131}) {    // prime numbers close to 1, 4, 128
+    for (auto width : {1, 17, 251}) {  // prime numbers close to 1, 16, 256
       VLOG(3) << " height=" << height << " width=" << width;
 
       testMatrixSoftmax(height, width);
@@ -527,7 +524,7 @@ void testVectorRowFunc(int size) {
 }
 
 TEST(Vector, rowFunc) {
-  for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
     VLOG(3) << " size=" << size;
     testVectorRowFunc(size);
   }
@@ -604,7 +601,7 @@ void testVectorIsEqual(int size) {
 }
 
 TEST(Vector, Equal) {
-  for (auto size : {1, 5, 31, 90, 150, 500, 1000, 4000}) {
+  for (auto size : {1, 3, 997}) {  // prime numbers close to 1, 4, 1024
     VLOG(3) << " size=" << size;
     testVectorReset<int>(size);
     testVectorReset<real>(size);
@@ -635,9 +632,8 @@ void testMatrixTopK(int samples, int dim, int beamSize) {
 }
 
 TEST(Matrix, topK) {
-  for (auto samples : {1, 5, 31, 90, 150, 500}) {
-    for (auto dim :
-         {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
+  for (auto samples : {1, 17, 131}) {  // prime numbers close to 1, 16, 128
+    for (auto dim : {1, 3, 997}) {     // prime numbers close to 1, 4, 1024
       for (auto beamSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
         if (beamSize > dim) continue;
         VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
@@ -650,6 +646,7 @@ TEST(Matrix, topK) {
 
 void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
   int nnz = samples * dim * ratio;
+  if (nnz < 1) nnz = 1;  // Because sparseRand in MathUtil.cpp requires this.
   MatrixPtr cpuSrc = std::make_shared<CpuSparseMatrix>(samples, dim, nnz);
   MatrixPtr gpuSrc = std::make_shared<GpuSparseMatrix>(samples, dim, nnz);
   MatrixPtr cpuVal = std::make_shared<CpuMatrix>(samples, beamSize);
@@ -683,9 +680,9 @@ void testSMatrixTopK(int samples, int dim, int beamSize, real ratio) {
 }
 
 TEST(SMatrix, topK) {
-  for (auto samples : {1, 5, 100}) {
-    for (auto dim : {10000, 10000, 50000}) {
-      for (auto beamSize : {1, 5, 40, 100, 500}) {
+  for (auto samples : {1, 3, 61}) {
+    for (auto dim : {1, 3, 61}) {
+      for (auto beamSize : {1, 3, 61}) {
         for (auto ratio : {0.01, 0.001}) {
           if (beamSize > dim) continue;
           VLOG(3) << " samples=" << samples << " beamSize=" << beamSize
@@ -806,10 +803,9 @@ void testClassificationError(int numSamples, int dim, int topkSize) {
 }
 
 TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 5, 31, 90, 150, 300}) {
-    for (auto dim :
-         {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
-      for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+  for (auto numSamples : {1, 3, 31}) {
+    for (auto dim : {1, 3, 31}) {
+      for (auto topkSize : {1, 3, (int)rand() % dim + 1}) {
         if (topkSize > dim) continue;
         VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
                 << " dim= " << dim;
@@ -829,9 +825,8 @@ void testMaxPoolFwdBwd(int numSamples,
                        int strideW,
                        int padH,
                        int padW) {
-  int outH = 0, outW = 0;
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
 
   int inWidth = imgSizeH * imgSizeW * channels;
   MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@@ -931,9 +926,8 @@ void testAvgPoolFwdBwd(int numSamples,
                        int strideW,
                        int padH,
                        int padW) {
-  int outH = 0, outW = 0;
-  outH = (imgSizeH - ksizeH + 2 * padH + strideH - 1) / strideH + 1;
-  outW = (imgSizeW - ksizeW + 2 * padW + strideW - 1) / strideW + 1;
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
 
   int inWidth = imgSizeH * imgSizeW * channels;
   MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
@@ -1016,13 +1010,15 @@ void testAvgPoolFwdBwd(int numSamples,
   TensorCheckErr(*inputGrad, *inputGpuGrad);
 }
 
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are of no help in locating defects at all.
 TEST(Matrix, PoolFwdBwd) {
-  for (auto numSamples : {5, 32}) {
-    for (auto channels : {1, 9, 32}) {
-      for (auto imgSizeH : {14, 28}) {
-        for (auto imgSizeW : {16, 30}) {
-          for (auto sizeX : {2, 5}) {
-            for (auto sizeY : {2, 5}) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {1, 3}) {
+      for (auto imgSizeH : {13, 17}) {
+        for (auto imgSizeW : {17, 19}) {
+          for (auto sizeX : {2, 3}) {
+            for (auto sizeY : {2, 3}) {
               for (auto sH : {1, 2}) {
                 for (auto sW : {1, 2}) {
                   for (auto pH : {0, (sizeY - 1) / 2}) {
@@ -1127,4 +1123,576 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
+TEST(CpuMatrix, copyFrom) {
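+  // Round-trip a random matrix CPU -> GPU -> CPU and require the copy
+  // to match the source exactly.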
+  const size_t height = 31;
+  const size_t width = 53;
+  CpuMatrix cpu(height, width);
+  GpuMatrix gpu(height, width);
+  CpuMatrix copy(height, width);
+
+  cpu.randomizeUniform();
+  gpu.copyFrom(cpu);
+  copy.copyFrom(gpu, HPPL_STREAM_DEFAULT);
+
+  TensorCheckEqual(cpu, copy);
+}
+
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  for (int i = 0; i < int(cpuSequence->getSize()); ++i) {
+    (cpuSequence->getData())[i] += 1;  // ensure that maxSeqLen can never be 0
+  }
+
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  size_t numSeq = cpuSequence->getSize() - 1;
+  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
+                                       cpuSequence->getData() + numSeq);
+
+  printf("numSeq = %ld, maxSeqLen = %ld\n", numSeq, maxSeqLen);
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
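+  // NOTE: the GPU padding copy and its CPU reference implementation
+  // below are commented out, so this helper currently exercises only
+  // the setup code above.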
+  // hl_sequence2batch_copy_padding(gBatch->getData(),
+  //                                gpuInput->getData(),
+  //                                cpuSequence->getData(),
+  //                                inputDim,
+  //                                maxSeqLen,
+  //                                numSeq,
+  //                                false,
+  //                                true);
+  // cCheck->copyFrom(*gBatch);
+
+  // int* seqStart = cpuSequence->getData();
+  // float* batchData = cBatch->getData();
+  // float* seqData = cpuInput->getData();
+  // for (size_t i = 0; i < maxSeqLen; i++) {
+  //   for (size_t j = 0; j < numSeq; j++) {
+  //     size_t sequenceStart = seqStart[j];
+  //     size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+  //     if (i < sequenceLength) {
+  //       memcpy(batchData + (i * numSeq + j) * inputDim,
+  //              seqData + (sequenceStart + i) * inputDim,
+  //              inputDim * sizeof(real));
+  //     } else {
+  //       memset(batchData + (i * numSeq + j) * inputDim,
+  //              0,
+  //              inputDim * sizeof(real));
+  //     }
+  //   }
+  // }
+
+  // TensorCheckErr(*cBatch, *cCheck);
+}
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {1, 3, 17}) {
+    for (auto inputDim : {1, 3, 31}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
+void testMaxPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = channels * imgSizeD * imgSizeH * imgSizeW;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr maxIdx = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr maxIdxGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->maxPool3DForward(*input,
+                           *maxIdx,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+  targetGpu->maxPool3DForward(*inputGpu,
+                              *maxIdxGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  targetCheck->copyFrom(*targetGpu);
+  checkMatrixEqual(target, targetCheck);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->maxPool3DBackward(*targetGrad,
+                               *maxIdx,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+  inputGpuGrad->maxPool3DBackward(*targetGpuGrad,
+                                  *maxIdxGpu,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  MatrixPtr targetBwdCheck =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+  targetBwdCheck->copyFrom(*inputGpuGrad);
+  checkMatrixEqual(inputGrad, targetBwdCheck);
+}
+
+void testAvgPool3DFwdBwd(int numSamples,
+                         int channels,
+                         int imgSizeD,
+                         int imgSizeH,
+                         int imgSizeW,
+                         int ksizeD,
+                         int ksizeH,
+                         int ksizeW,
+                         int strideD,
+                         int strideH,
+                         int strideW,
+                         int padD,
+                         int padH,
+                         int padW) {
+  int outD = outputSize(imgSizeD, ksizeD, padD, strideD, true);
+  int outH = outputSize(imgSizeH, ksizeH, padH, strideH, true);
+  int outW = outputSize(imgSizeW, ksizeW, padW, strideW, true);
+
+  int inWidth = imgSizeD * imgSizeH * imgSizeW * channels;
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+
+  int outWidth = channels * outD * outH * outW;
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+
+  input->randomizeUniform();
+  target->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  targetGpu->copyFrom(*target);
+
+  target->avgPool3DForward(*input,
+                           channels,
+                           imgSizeD,
+                           imgSizeH,
+                           imgSizeW,
+                           outD,
+                           outH,
+                           outW,
+                           ksizeD,
+                           ksizeH,
+                           ksizeW,
+                           strideD,
+                           strideH,
+                           strideW,
+                           padD,
+                           padH,
+                           padW);
+
+  targetGpu->avgPool3DForward(*inputGpu,
+                              channels,
+                              imgSizeD,
+                              imgSizeH,
+                              imgSizeW,
+                              outD,
+                              outH,
+                              outW,
+                              ksizeD,
+                              ksizeH,
+                              ksizeW,
+                              strideD,
+                              strideH,
+                              strideW,
+                              padD,
+                              padH,
+                              padW);
+
+  TensorCheckErr(*target, *targetGpu);
+
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
+
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+
+  inputGrad->avgPool3DBackward(*targetGrad,
+                               imgSizeD,
+                               imgSizeH,
+                               imgSizeW,
+                               outD,
+                               outH,
+                               outW,
+                               ksizeD,
+                               ksizeH,
+                               ksizeW,
+                               strideD,
+                               strideH,
+                               strideW,
+                               padD,
+                               padH,
+                               padW,
+                               1.0,
+                               1.0);
+
+  inputGpuGrad->avgPool3DBackward(*targetGpuGrad,
+                                  imgSizeD,
+                                  imgSizeH,
+                                  imgSizeW,
+                                  outD,
+                                  outH,
+                                  outW,
+                                  ksizeD,
+                                  ksizeH,
+                                  ksizeW,
+                                  strideD,
+                                  strideH,
+                                  strideW,
+                                  padD,
+                                  padH,
+                                  padW,
+                                  1.0,
+                                  1.0);
+  TensorCheckErr(*inputGrad, *inputGpuGrad);
+}
+
+// TODO(yi): I noticed many such blindly combinatorial tests in this
+// file.  They are of no help in locating defects.
+TEST(Matrix, Pool3DFwdBwd) {
+  for (auto numSamples : {1, 3}) {
+    for (auto channels : {3}) {
+      for (auto imgSizeD : {9, 16}) {
+        for (auto imgSizeH : {9, 32}) {
+          for (auto imgSizeW : {9, 32}) {
+            for (auto sizeX : {3}) {
+              for (auto sizeY : {3}) {
+                for (auto sizeZ : {3}) {
+                  for (auto sD : {2}) {
+                    for (auto sH : {2}) {
+                      for (auto sW : {2}) {
+                        for (auto pD : {0, (sizeZ - 1) / 2}) {
+                          for (auto pH : {0, (sizeY - 1) / 2}) {
+                            for (auto pW : {0, (sizeX - 1) / 2}) {
+                              VLOG(3) << " numSamples=" << numSamples
+                                      << " channels=" << channels
+                                      << " imgSizeD=" << imgSizeD
+                                      << " imgSizeH=" << imgSizeH
+                                      << " imgSizeW=" << imgSizeW
+                                      << " sizeX=" << sizeX
+                                      << " sizeY=" << sizeY
+                                      << " sizeZ=" << sizeZ << " strideD=" << sD
+                                      << " strideH=" << sH << " strideW=" << sW
+                                      << " padingD=" << pD << " padingH=" << pH
+                                      << " padingW=" << pW;
+
+                              testMaxPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                              testAvgPool3DFwdBwd(numSamples,
+                                                  channels,
+                                                  imgSizeD,
+                                                  imgSizeH,
+                                                  imgSizeW,
+                                                  sizeX,
+                                                  sizeY,
+                                                  sizeZ,
+                                                  sD,
+                                                  sH,
+                                                  sW,
+                                                  pD,
+                                                  pH,
+                                                  pW);
+                            }
+                          }
+                        }
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void testMatrixCol2Vol(int depth, int height, int width) {
+  int channel = 3;
+  int filterX = 3, filterY = 4, filterZ = 5;
+  int strideX = 2, strideY = 2, strideZ = 2;
+  int padX = 1, padY = 1, padZ = 1;
+
+  MatrixPtr cpuImage =
+      std::make_shared<CpuMatrix>(channel, depth * height * width);
+  MatrixPtr gpuImage =
+      std::make_shared<GpuMatrix>(channel, depth * height * width);
+  cpuImage->randomizeUniform();
+  gpuImage->copyFrom(*cpuImage);
+
+  int outD = outputSize(depth, filterZ, padZ, strideZ, true);
+  int outH = outputSize(height, filterY, padY, strideY, true);
+  int outW = outputSize(width, filterX, padX, strideX, true);
+
+  int colBufHeight = channel * filterZ * filterY * filterX;
+  int colBufWidth = outD * outH * outW;
+  MatrixPtr cpuColBuf = std::make_shared<CpuMatrix>(colBufHeight, colBufWidth);
+  MatrixPtr gpuColBuf = std::make_shared<GpuMatrix>(colBufHeight, colBufWidth);
+  cpuColBuf->vol2Col(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  gpuColBuf->vol2Col(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX);
+  TensorCheckEqual(*cpuColBuf, *gpuColBuf);
+
+  cpuColBuf->randomizeUniform();
+  gpuColBuf->copyFrom(*cpuColBuf);
+  cpuColBuf->col2Vol(cpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  gpuColBuf->col2Vol(gpuImage->getData(),
+                     channel,
+                     depth,
+                     height,
+                     width,
+                     filterZ,
+                     filterY,
+                     filterX,
+                     strideZ,
+                     strideY,
+                     strideX,
+                     padZ,
+                     padY,
+                     padX,
+                     1.0,
+                     1.0);
+  TensorCheckErr(*cpuImage, *gpuImage);
+}
+
+TEST(Matrix, col2Vol) {
+  for (auto depth : {9, 16, 64}) {
+    for (auto height : {9, 11, 128}) {
+      for (auto width : {9, 32, 128}) {
+        VLOG(3) << "depth=" << depth << " height=" << height
+                << " width=" << width;
+        testMatrixCol2Vol(depth, height, width);
+      }
+    }
+  }
+}
+
 #endif
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index 60ebae0153..c7c07c817a 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index a9185a4b24..2b2a391b9d 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
 //  so disable when
 /// only cpu version.
diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format
new file mode 120000
index 0000000000..7d28cb3924
--- /dev/null
+++ b/paddle/memory/.clang-format
@@ -0,0 +1 @@
+../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
new file mode 100644
index 0000000000..496098f804
--- /dev/null
+++ b/paddle/memory/CMakeLists.txt
@@ -0,0 +1,23 @@
+add_subdirectory(detail)
+
+cc_library(memory SRCS memory.cc DEPS place enforce)
+cc_library(memcpy SRCS memcpy.cc DEPS place)
+
+cc_library(paddle_memory
+    DEPS
+    memory
+    memcpy
+    meta_data
+    meta_cache
+    memory_block
+    buddy_allocator
+    system_allocator)
+
+cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
+
+if(NOT WITH_C_API AND WITH_FLUID)
+  file(GLOB MEMORY_HEADERS *.h)
+  file(GLOB MEMORY_DETAIL_HEADERS detail/*.h)
+  install(FILES ${MEMORY_HEADERS} DESTINATION include/paddle/memory)
+  install(FILES ${MEMORY_DETAIL_HEADERS} DESTINATION include/paddle/memory/detail)
+endif()
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
new file mode 100644
index 0000000000..7cf61d089b
--- /dev/null
+++ b/paddle/memory/README.md
@@ -0,0 +1,141 @@
+# Region-based Heterogeneous Memory Management
+## Design
+
+### Usage
+
+To allocate 4KB CPU memory:
+
+```cpp
+p = memory::Alloc(platform::CPUPlace(), 4*1024);
+```
+
+To allocate 4KB memory on the 3rd GPU:
+
+```cpp
+p = memory::Alloc(platform::CUDAPlace(2), 4*1024);
+```
+
+To free memory and check the amount of memory used so far on a place:
+
+```cpp
+auto pl = platform::CUDAPlace(0);
+p = memory::Alloc(pl, 4*1024);
+cout << memory::Used(pl);
+memory::Free(pl, p);
+```
+
+### API
+
+In `paddle/memory/memory.h` we have:
+
+```cpp
+namespace memory {
+template <typename Place> void* Alloc(Place, size_t);
+template <typename Place> void Free(Place, void*);
+template <typename Place> size_t Used(Place);
+}  // namespace memory
+```
+
+These function templates are specialized for either `platform::CPUPlace` or `platform::CUDAPlace`:
+
+```cpp
+template<>
+void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+```
+
+and 
+
+```cpp
+template<>
+void* Alloc<CUDAPlace>(CUDAPlace p, size_t size) {
+  return GetGPUBuddyAllocator(p.id)->Alloc(size);
+}
+```
+
+Similar specializations exist for `Free` and `Used`.
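+
+For instance, a sketch of the `Used` specialization, assuming the same singleton accessors shown above:
+
+```cpp
+template <>
+size_t Used<CPUPlace>(CPUPlace p) {
+  return GetCPUBuddyAllocator()->Used();
+}
+```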
+
+### Implementation
+
+`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` return singleton allocators:
+
+```cpp
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static BuddyAllocator* a = NULL;
+  if (a == NULL) {
+    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
+  }
+  return a;
+}
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    as = new BuddyAllocator*[platform::NumGPUs()];
+    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
+      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
+    }
+  }
+  return as[gpu_id];
+}
+```
+
+#### `BuddyAllocator`
+
+`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes only parameters related to the algorithm:
+
+```cpp
+BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
+  ...
+}
+```
+
+Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned to 32 bytes, which is enough to hold a `BuddyAllocator::Block` object:
+
+```cpp
+class BuddyAllocator {
+ private:
+  struct Block {
+    size_t size;
+    Block *left, *right;
+    size_t index; // allocator id
+  };
+  ...
+};
+```
+
+Because `BuddyAllocator` has the metadata of each block, it can trace the used memory -- recording the amount returned by `Alloc` and reclaimed by `Free`.  In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this tracking.
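+
+In the implementation added in this change (`paddle/memory/detail/buddy_allocator.cc`), this tracking is just a pair of counters updated on every `Alloc` and `Free`:
+
+```cpp
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // ... find or refill a chunk of `size` bytes ...
+  total_used_ += size;
+  total_free_ -= size;
+  // ...
+}
+
+size_t BuddyAllocator::Used() { return total_used_; }
+```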
+
+#### System Allocators
+
+The `GPUAllocator` and `CPUAllocator` are called *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
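+
+Their shared interface, as declared in `paddle/memory/detail/system_allocator.h` in this change, is:
+
+```cpp
+class SystemAllocator {
+ public:
+  virtual ~SystemAllocator() {}
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() const = 0;
+};
+```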
+
+## Justification
+
+I drew inspiration from Majel and Caffe2, though the above design looks different from both.
+
+### Caffe2
+
+In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
+
+There are two implementations of `Context`:
+
+1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
+
+1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
+
+### Majel
+
+In Majel, there are basically two allocator types:
+
+1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
+1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
+
+However, memory allocation does not go through these two allocators directly.  Instead, these two allocators are defined in hidden namespaces.
+
+In Majel there are hidden global variables like:
+
+1. `cpu::SystemAllocator g_cpu_allocator`, and
+1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
+
+Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
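+
+`BuddyAllocator::RefillPool` in this change implements the same idea; simplified:
+
+```cpp
+// Simplified from paddle/memory/detail/buddy_allocator.cc: when no free
+// chunk fits, grow the pool with one max-sized chunk obtained from the
+// fallback (system) allocator.
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+  if (p == nullptr) return pool_.end();
+  total_free_ += max_chunk_size_;
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+}
+```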
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
new file mode 100644
index 0000000000..b9c3fc31c1
--- /dev/null
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -0,0 +1,15 @@
+if(${WITH_GPU})
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
+else(${WITH_GPU})
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
+endif(${WITH_GPU})
+
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+
+cc_library(meta_data SRCS meta_data.cc)
+
+cc_library(meta_cache SRCS meta_cache.cc)
+
+cc_library(memory_block SRCS memory_block.cc)
+
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
new file mode 100644
index 0000000000..2bc2c06a15
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -0,0 +1,329 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size)
+    : min_chunk_size_(min_chunk_size),
+      max_chunk_size_(max_chunk_size),
+      cache_(system_allocator->UseGpu()),
+      system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  VLOG(10) << "BuddyAllocator Disconstructor makes sure that all of these "
+              "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    VLOG(10) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
+}
+
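+// Round size up to the next multiple of alignment, e.g. align(37, 32) == 64.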
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // reserve space for the Metadata header and round up to min_chunk_size_
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  VLOG(10) << "Allocate " << unaligned_size << " bytes from chunk size "
+           << size;
+
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    VLOG(10) << "Allocate from system allocator.";
+    return SystemAlloc(size);
+  }
+
+  // try to find a suitable chunk in the existing pool
+  auto it = FindExistChunk(size);
+
+  // refill the pool on failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+    // if refilling also fails, give up and return nullptr
+    if (it == pool_.end()) {
+      return nullptr;
+    }
+  } else {
+    VLOG(10) << "Allocation from existing memory block " << std::get<2>(*it)
+             << " at address "
+             << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+void BuddyAllocator::Free(void* p) {
+  // Point back to metadata
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+
+  // Acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  VLOG(10) << "Free from address " << block;
+
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+    VLOG(10) << "Free directly from system allocator";
+    system_allocator_->Free(block, block->total_size(cache_),
+                            block->index(cache_));
+
+    // Invalidate GPU allocation from cache
+    cache_.invalidate(block);
+
+    return;
+  }
+
+  block->mark_as_free(cache_);
+
+  total_used_ -= block->total_size(cache_);
+  total_free_ += block->total_size(cache_);
+
+  // Trying to merge the right buddy
+  if (block->has_right_buddy(cache_)) {
+    VLOG(10) << "Merging this block " << block << " with its right buddy "
+             << block->right_buddy(cache_);
+
+    auto right_buddy = block->right_buddy(cache_);
+
+    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
+
+      // merge its right buddy to the block
+      block->merge(cache_, right_buddy);
+    }
+  }
+
+  // Trying to merge the left buddy
+  if (block->has_left_buddy(cache_)) {
+    VLOG(10) << "Merging this block " << block << " with its left buddy "
+             << block->left_buddy(cache_);
+
+    auto left_buddy = block->left_buddy(cache_);
+
+    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away left buddy from pool
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
+
+      // merge the block to its left buddy
+      left_buddy->merge(cache_, block);
+      block = left_buddy;
+    }
+  }
+
+  // Dumping this block into pool
+  VLOG(10) << "Inserting free block (" << block << ", "
+           << block->total_size(cache_) << ")";
+  pool_.insert(
+      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
+
+  // Clean up if there is too much free memory
+
+  // Prefer freeing fallback allocation first
+  CleanIdleFallBackAlloc();
+
+  // Free normal allocation
+  CleanIdleNormalAlloc();
+}
+
+size_t BuddyAllocator::Used() { return total_used_; }
+
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+
+  VLOG(10) << "Allocated " << p << " from system allocator.";
+
+  if (p == nullptr) return nullptr;
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+
+  return static_cast<MemoryBlock*>(p)->data();
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifdef PADDLE_WITH_CUDA
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif
+
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+
+  if (p == nullptr) return pool_.end();
+
+  VLOG(10) << "Creating and inserting new block " << p
+           << " from system allocator";
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
+  total_free_ += max_chunk_size_;
+
+  // dump the block into pool
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  while (1) {
+    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
+
+    // no matching chunk in the pool
+    if (it == pool_.end()) return it;
+
+    if (std::get<0>(*it) > index) {
+      // find suitable one
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+      // update and continue
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+  pool_.erase(it);
+
+  VLOG(10) << "Split block (" << block << ", " << block->total_size(cache_)
+           << ") into";
+  block->split(cache_, size);
+
+  VLOG(10) << "Left block (" << block << ", " << block->total_size(cache_)
+           << ")";
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  // insert the remaining (right) part back into the pool, if it exists
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      VLOG(10) << "Insert right block (" << block->right_buddy(cache_) << ", "
+               << block->right_buddy(cache_)->total_size(cache_) << ")";
+
+      pool_.insert(
+          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
+                           block->right_buddy(cache_)->total_size(cache_),
+                           block->right_buddy(cache_)));
+    }
+  }
+
+  return block;
+}
+
+void BuddyAllocator::CleanIdleFallBackAlloc() {
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If the free memory block is smaller than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    VLOG(10) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+
+    // If no fallback allocation remains, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If the free memory block is smaller than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    VLOG(10) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+
+    if (!shall_free_alloc()) return;
+  }
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
new file mode 100644
index 0000000000..4e0135dd65
--- /dev/null
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
+#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
+
+#include <mutex>
+#include <set>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class BuddyAllocator {
+ public:
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+
+  ~BuddyAllocator();
+
+ public:
+  void* Alloc(size_t unaligned_size);
+  void Free(void* ptr);
+  size_t Used();
+
+ public:
+  // Disable copy and assignment
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
+ private:
+  // Tuple (allocator index, memory size, memory address)
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  // Each element in PoolSet is a free allocation
+  using PoolSet = std::set<IndexSizeAddress>;
+
+  /*! \brief Allocate memory directly from the system allocator */
+  void* SystemAlloc(size_t size);
+
+  /*! \brief If existing chunks are not suitable, refill pool */
+  PoolSet::iterator RefillPool();
+
+  /**
+   *  \brief   Find the suitable chunk from existing pool and split
+   *           it to left and right buddies
+   *
+   *  \param   it     the iterator of pool list
+   *  \param   size   the size of allocation
+   *
+   *  \return  the left buddy address
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);
+
+  /*! \brief Find an existing chunk that fits the allocation */
+  PoolSet::iterator FindExistChunk(size_t size);
+
+  /*! \brief Clean idle fallback allocation */
+  void CleanIdleFallBackAlloc();
+
+  /*! \brief Clean idle normal allocation */
+  void CleanIdleNormalAlloc();
+
+ private:
+  size_t total_used_ = 0;  // the total size of used memory
+  size_t total_free_ = 0;  // the total size of free memory
+
+  size_t min_chunk_size_;  // the minimum size of each chunk
+  size_t max_chunk_size_;  // the maximum size of each chunk
+
+ private:
+  /**
+   * \brief A set of free allocations
+   *
+   * \note  Only free chunk memory is stored in the pool
+   */
+  PoolSet pool_;
+
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
+ private:
+  /*! Unify the metadata format between GPU and CPU allocations */
+  MetadataCache cache_;
+
+ private:
+  /*! Allocate CPU/GPU memory from system */
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
new file mode 100644
index 0000000000..f50eceba09
--- /dev/null
+++ b/paddle/memory/detail/memory_block.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
+}
+
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  PADDLE_ASSERT(total_size(cache) >= size);
+
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= sizeof(Metadata)) {
+    return;
+  }
+
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+
+  size_t remaining_size = total_size(cache) - size;
+
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
+
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - sizeof(Metadata);
+  metadata.total_size = size;
+
+  cache.store(this, metadata);
+
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+
+  auto metadata = cache.load(this);
+
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+
+    buddy_metadata.left_buddy = this;
+
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+
+  cache.store(this, metadata);
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+}
+
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
+
+  set_type(cache, FREE_CHUNK);
+}
+
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+
+  metadata.type = t;
+
+  cache.store(this, metadata);
+}
+
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+void* MemoryBlock::data() const {
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+}
+
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const Metadata*>(this) - 1));
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
new file mode 100644
index 0000000000..a4ca51b31b
--- /dev/null
+++ b/paddle/memory/detail/memory_block.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// Forward Declarations
+class MetadataCache;
+
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+
+  /*! \brief Get a pointer to the memory block's metadata */
+  MemoryBlock* metadata() const;
+
+ public:
+  static size_t overhead();
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
new file mode 100644
index 0000000000..2bacca7510
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/meta_cache.h"
+#include "glog/logging.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
+
+Metadata MetadataCache::load(const MemoryBlock* block) {
+  if (uses_gpu_) {
+    auto existing_metadata = cache_.find(block);
+    PADDLE_ASSERT(existing_metadata->second.check_guards());
+    return existing_metadata->second;
+  } else {
+    auto* meta = reinterpret_cast<const Metadata*>(block);
+    VLOG(10) << "Load MetaData type=" << meta->type;
+    PADDLE_ASSERT(meta->check_guards());
+    return *meta;
+  }
+}
+
+void MetadataCache::store(MemoryBlock* block,
+                          const Metadata& original_metadata) {
+  auto metadata = original_metadata;
+
+  metadata.update_guards();
+
+  if (uses_gpu_) {
+    cache_[block] = metadata;
+  } else {
+    *reinterpret_cast<Metadata*>(block) = metadata;
+  }
+}
+
+void MetadataCache::invalidate(MemoryBlock* block) {
+  if (uses_gpu_) {
+    cache_.erase(block);
+  }
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
new file mode 100644
index 0000000000..db8ffd49ae
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+/**
+ *  \brief A cache for accessing memory block meta-data that may be expensive
+ *         to access directly.
+ *
+ *  \note  This class exists to unify the metadata format between GPU and CPU
+ *         allocations. It should be removed when the CPU can access all GPU
+ *         allocations directly via UVM.
+ */
+class MetadataCache {
+ public:
+  explicit MetadataCache(bool uses_gpu);
+
+ public:
+  /*! \brief Load the associated metadata for the specified memory block. */
+  Metadata load(const MemoryBlock* memory_block);
+
+  /*! \brief Store the associated metadata for the specified memory block. */
+  void store(MemoryBlock* memory_block, const Metadata& meta_data);
+
+  /*! \brief Indicate that the specified metadata will no longer be used. */
+  void invalidate(MemoryBlock* memory_block);
+
+ public:
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+ private:
+  bool uses_gpu_;
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
+
+ private:
+  MetadataMap cache_;
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc
new file mode 100644
index 0000000000..dc57d4d237
--- /dev/null
+++ b/paddle/memory/detail/meta_data.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/meta_data.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+Metadata::Metadata()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+
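+// Mix v into seed, following the well-known boost::hash_combine recipe;
+// 0x9e3779b9 is the 32-bit golden-ratio constant.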
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+
+  return seed;
+}
+
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h
new file mode 100644
index 0000000000..6b83c42eb8
--- /dev/null
+++ b/paddle/memory/detail/meta_data.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+
+#include <stddef.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+  Metadata();
+
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
new file mode 100644
index 0000000000..509250debc
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/gpu_info.h"
+
+#include <stdlib.h>    // for malloc and free
+#include <sys/mman.h>  // for mlock and munlock
+#include <algorithm>   // for std::max
+
+#include "gflags/gflags.h"
+
+// If use_pinned_memory is true, CPUAllocator calls mlock, which
+// pins and locks the allocated memory as a staging area for data exchange
+// between host and device.  Allocating too much would reduce the amount
+// of memory available to the system for paging.  So, by default, we
+// should set use_pinned_memory to false.
+DEFINE_bool(use_pinned_memory, true, "If set, allocate cpu pinned memory.");
+DECLARE_double(fraction_of_gpu_memory_to_use);
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
+  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
+  // malloc might not return nullptr if size is zero, but the returned
+  // pointer shall not be dereferenced -- so we make it nullptr.
+  if (size <= 0) return nullptr;
+
+  index = 0;  // 0 indicates unlocked (non-pinned) memory
+
+  void* p;
+
+#ifdef PADDLE_WITH_MKLDNN
+  // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
+  // memory alignment
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0);
+#else
+  PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0);
+#endif
+  PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
+  }
+
+  return p;
+}
+
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
+  if (p != nullptr && index == 1) {
+    munlock(p, size);
+  }
+  free(p);
+}
+
+bool CPUAllocator::UseGpu() const { return false; }
+
+#ifdef PADDLE_WITH_CUDA
+
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
+  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
+  // if size is 0.  We just make sure it does.
+  if (size <= 0) return nullptr;
+  void* p;
+  cudaError_t result = cudaMalloc(&p, size);
+  if (result == cudaSuccess) {
+    index = 0;
+    gpu_alloc_size_ += size;
+    return p;
+  } else {
+    LOG(WARNING)
+        << "Cannot malloc " << size / 1024.0 / 1024.0
+        << " MB GPU memory. Please set the "
+           "FLAGS_fraction_of_gpu_memory_to_use flag to a lower value. "
+           "Current value is "
+        << FLAGS_fraction_of_gpu_memory_to_use;
+    return nullptr;
+  }
+}
+
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFree after the
+  // driver has already shut down. This happens only if the
+  // process is terminating, in which case we don't care whether
+  // cudaFree succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free.");
+  }
+}
+
+bool GPUAllocator::UseGpu() const { return true; }
+
+#endif
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
new file mode 100644
index 0000000000..552cab4f96
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>  // for size_t
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+/**
+ * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
+ *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ *        underlying system allocator.
+ */
+class SystemAllocator {
+ public:
+  virtual ~SystemAllocator() {}
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() const = 0;
+};
+
+class CPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+};
+
+#ifdef PADDLE_WITH_CUDA
+class GPUAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
+};
+#endif
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
new file mode 100644
index 0000000000..6a8558937b
--- /dev/null
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>
+#include <vector>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DECLARE_bool(use_pinned_memory);
+
+void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
+  bool freed = false;
+  {
+    size_t index;
+    void* p = a.Alloc(index, size);
+    if (size > 0) {
+      EXPECT_NE(p, nullptr);
+    } else {
+      EXPECT_EQ(p, nullptr);
+    }
+
+    int* i = static_cast<int*>(p);
+    std::shared_ptr<int> ptr(i, [&](void* p) {
+      freed = true;
+      a.Free(p, size, index);
+    });
+  }
+  EXPECT_TRUE(freed);
+}
+
+TEST(CPUAllocator, NoLockMem) {
+  FLAGS_use_pinned_memory = false;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
+}
+
+TEST(CPUAllocator, LockMem) {
+  FLAGS_use_pinned_memory = true;
+  paddle::memory::detail::CPUAllocator a;
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(GPUAllocator, Alloc) {
+  paddle::memory::detail::GPUAllocator a;
+  TestAllocator(a, 2048);
+  TestAllocator(a, 0);
+}
+#endif
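
TestAllocator above relies on a std::shared_ptr with a custom deleter to guarantee that Free runs when the block leaves scope, whatever path the test takes. The pattern in isolation (a sketch, not part of the patch):

```cpp
#include <cstdlib>
#include <iostream>
#include <memory>

int main() {
  bool freed = false;
  {
    void* raw = std::malloc(64);
    // The deleter fires exactly once when the last shared_ptr copy is
    // destroyed, so the matching free() cannot be skipped on any exit path.
    std::shared_ptr<void> guard(raw, [&](void* p) {
      freed = true;
      std::free(p);
    });
  }  // guard leaves scope here and the lambda runs
  std::cout << std::boolalpha << freed << std::endl;  // prints: true
  return 0;
}
```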
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
new file mode 100644
index 0000000000..b46141aafd
--- /dev/null
+++ b/paddle/memory/memcpy.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memcpy.h"
+
+#include <cstring>  // for memcpy
+
+namespace paddle {
+namespace memory {
+
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+#ifdef PADDLE_WITH_CUDA
+template <>
+void Copy<platform::CPUPlace, platform::CUDAPlace>(
+    platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CPUPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::CUDAPlace, platform::CUDAPlace>(
+    platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place,
+    const void* src, size_t num, cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
+#endif
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
new file mode 100644
index 0000000000..29c20e1860
--- /dev/null
+++ b/paddle/memory/memcpy.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+/**
+ * \brief   Copy memory from one place to another place.
+ *
+ * \param[in]  DstPlace Destination allocation place (CPU).
+ * \param[in]  dst      Destination memory address.
+ * \param[in]  SrcPlace Source allocation place (CPU).
+ * \param[in]  src      Source memory address.
+ * \param[in]  num      Memory size in bytes to copy.
+ *
+ */
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifdef PADDLE_WITH_CUDA
+
+/**
+ * \brief   Copy memory from one place to another place.
+ *
+ * \param[in]  DstPlace Destination allocation place (CPU or GPU).
+ * \param[in]  dst      Destination memory address.
+ * \param[in]  SrcPlace Source allocation place (CPU or GPU).
+ * \param[in]  src      Source memory address.
+ * \param[in]  num      Memory size in bytes to copy.
+ * \param[in]  stream   CUDA stream.
+ *
+ * \note    For GPU memory copies, a CUDA stream must be specified
+ *          so that the copy can run asynchronously.
+ *
+ */
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+
+#endif
+}  // namespace memory
+}  // namespace paddle
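
A hedged sketch of how the two overloads are meant to be called. It assumes the paddle headers above are on the include path and, for the GPU branch, that PADDLE_WITH_CUDA is defined; the Alloc/Free calls come from memory.h, introduced later in this patch:

```cpp
#include "paddle/memory/memcpy.h"
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif

void CopyExample() {
  paddle::platform::CPUPlace cpu;
  float src[4] = {1.f, 2.f, 3.f, 4.f};
  float dst[4] = {};
  // Host-to-host: the synchronous overload, std::memcpy underneath.
  paddle::memory::Copy(cpu, dst, cpu, src, sizeof(src));

#ifdef PADDLE_WITH_CUDA
  paddle::platform::CUDAPlace gpu(0);
  void* d = paddle::memory::Alloc(gpu, sizeof(src));
  cudaStream_t stream;
  cudaStreamCreate(&stream);
  // Host-to-device: asynchronous, ordered by the given stream.
  paddle::memory::Copy(gpu, d, cpu, src, sizeof(src), stream);
  cudaStreamSynchronize(stream);  // wait before reusing src or reading d
  cudaStreamDestroy(stream);
  paddle::memory::Free(gpu, d);
#endif
}
```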
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
new file mode 100644
index 0000000000..1a73a94567
--- /dev/null
+++ b/paddle/memory/memory.cc
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+
+#include "glog/logging.h"
+
+#include "paddle/memory/detail/buddy_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/gpu_info.h"
+
+DECLARE_double(fraction_of_gpu_memory_to_use);
+
+namespace paddle {
+namespace memory {
+
+using BuddyAllocator = detail::BuddyAllocator;
+
+BuddyAllocator* GetCPUBuddyAllocator() {
+  static detail::BuddyAllocator* a = nullptr;
+  if (a == nullptr) {
+    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+                                   platform::CpuMinChunkSize(),
+                                   platform::CpuMaxChunkSize());
+  }
+  return a;
+}
+
+template <>
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  void* p = GetCPUBuddyAllocator()->Alloc(size);
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+
+template <>
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}
+
+template <>
+size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+template <>
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::CUDAPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  return ptr;
+}
+
+template <>
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+#endif
+
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+  return Used(cpu);
+}
+
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(gpu);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t memory_usage(const platform::Place& p) {
+  return boost::apply_visitor(Usage(), p);
+}
+
+}  // namespace memory
+}  // namespace paddle
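
memory_usage above dispatches on the runtime alternative held by platform::Place through boost::apply_visitor. The mechanism in miniature, with stand-in place types and made-up byte counts (a standalone sketch, independent of the paddle sources):

```cpp
#include <boost/variant.hpp>
#include <cstddef>
#include <iostream>

struct CPUPlace {};
struct CUDAPlace { int device; };
using Place = boost::variant<CPUPlace, CUDAPlace>;

// Mirrors the Usage visitor: one operator() per alternative of the variant.
struct Usage : public boost::static_visitor<size_t> {
  size_t operator()(const CPUPlace&) const { return 1024; }  // made-up value
  size_t operator()(const CUDAPlace& gpu) const {
    return 4096 + static_cast<size_t>(gpu.device);  // made-up value
  }
};

int main() {
  Place p = CUDAPlace{0};
  // apply_visitor selects the overload matching what p currently holds.
  std::cout << boost::apply_visitor(Usage(), p) << std::endl;  // prints: 4096
  p = CPUPlace{};
  std::cout << boost::apply_visitor(Usage(), p) << std::endl;  // prints: 1024
  return 0;
}
```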
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
new file mode 100644
index 0000000000..7012b6d331
--- /dev/null
+++ b/paddle/memory/memory.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+/**
+ * \brief   Allocate memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  size   Allocation size.
+ *
+ * \return  Allocated memory block address.
+ *
+ * \note    A nullptr return value indicates that memory allocation
+ *          failed due to insufficient memory in the current system.
+ *          Whenever Alloc is invoked, the caller must check whether
+ *          the returned address is valid.
+ */
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  ptr    Memory block address to free.
+ *
+ */
+template <typename Place>
+void Free(Place place, void* ptr);
+
+/**
+ * \brief   Total size of used memory in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ *
+ */
+template <typename Place>
+size_t Used(Place place);
+
+struct Usage : public boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
+/**
+ * \brief   A deleter that releases memory through memory::Free.
+ *
+ * \note    In some cases, a custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
+ public:
+  explicit PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
+}  // namespace memory
+}  // namespace paddle
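
A hedged sketch of the intended call pattern for this header: the nullptr check that the Alloc note requires, plus PODDeleter wired into a std::unique_ptr so Free happens automatically (assumes the paddle headers are on the include path):

```cpp
#include <memory>
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"

void MemoryExample() {
  paddle::platform::CPUPlace cpu;

  // Alloc returns nullptr on exhaustion, so every call must be checked.
  void* p = paddle::memory::Alloc(cpu, 1024);
  if (p != nullptr) {
    paddle::memory::Free(cpu, p);
  }

  // PODDeleter lets unique_ptr release the block through memory::Free.
  using Deleter =
      paddle::memory::PODDeleter<float, paddle::platform::CPUPlace>;
  std::unique_ptr<float, Deleter> buf(
      static_cast<float*>(paddle::memory::Alloc(cpu, 256 * sizeof(float))),
      Deleter(cpu));
  // buf going out of scope calls Deleter::operator(), i.e. memory::Free.
}
```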
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
new file mode 100644
index 0000000000..b3f699f9b7
--- /dev/null
+++ b/paddle/memory/memory_test.cc
@@ -0,0 +1,144 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+inline bool is_aligned(void const *p) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
+}
+
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CPUPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CPUMultAlloc) {
+  paddle::platform::CPUPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // The buddy allocator doesn't manage overly large memory chunks
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // The buddy allocator doesn't manage overly large memory chunks
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+size_t align(size_t size, paddle::platform::CUDAPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, GPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CUDAPlace gpu(0);
+  p = paddle::memory::Alloc(gpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::platform::Place place = gpu;
+  EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
+
+  paddle::memory::Free(gpu, p);
+}
+
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::CUDAPlace gpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(gpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(gpu, size)] = size;
+
+    // The buddy allocator doesn't manage overly large memory chunks
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(size, gpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(gpu, p.first);
+
+    // The buddy allocator doesn't manage overly large memory chunks
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, gpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+}
+
+#endif
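
The align helpers above implement one rounding rule: add the metadata header, then round up to the allocator's minimum chunk size. The arithmetic in isolation, with an assumed 4096-byte chunk size and 32-byte metadata header chosen purely for illustration:

```cpp
#include <cstddef>
#include <iostream>

// Same rounding rule as align() above; the metadata size and alignment
// here are assumed values, not the real CpuMinChunkSize()/Metadata.
size_t AlignUp(size_t size, size_t metadata, size_t alignment) {
  size += metadata;
  size_t remaining = size % alignment;
  return remaining == 0 ? size : size + (alignment - remaining);
}

int main() {
  // 128-byte request + 32-byte header = 160 bytes -> one 4096-byte chunk.
  std::cout << AlignUp(128, 32, 4096) << std::endl;   // prints: 4096
  // 4095 + 32 = 4127 bytes crosses a chunk boundary -> two chunks, 8192.
  std::cout << AlignUp(4095, 32, 4096) << std::endl;  // prints: 8192
  return 0;
}
```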
diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format
new file mode 120000
index 0000000000..7d28cb3924
--- /dev/null
+++ b/paddle/operators/.clang-format
@@ -0,0 +1 @@
+../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
new file mode 100644
index 0000000000..b2e73b6f23
--- /dev/null
+++ b/paddle/operators/CMakeLists.txt
@@ -0,0 +1,197 @@
+file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
+set(DEPS_OPS "")
+set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/pybind/pybind.h)
+file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt.  DO NOT EDIT!\n\n")
+function(op_library TARGET)
+    # op_library is a function that creates an op library. Its interface is
+    # the same as cc_library's, but it additionally splits GPU/CPU code and
+    # links the common libraries that ops depend on.
+    set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
+    set(cc_srcs)
+    set(cu_srcs)
+    set(cu_cc_srcs)
+    set(op_common_deps operator op_registry math_function)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    set(pybind_flag 0)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    list(LENGTH op_library_SRCS op_library_SRCS_len)
+    if (${op_library_SRCS_len} EQUAL 0)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+            list(APPEND cc_srcs ${TARGET}.cc)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+            list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
+        endif()
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+            list(APPEND cu_srcs ${TARGET}.cu)
+        endif()
+    else()
+        foreach(src ${op_library_SRCS})
+            if (${src} MATCHES ".*\\.cu$")
+                list(APPEND cu_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cu.cc$")
+                list(APPEND cu_cc_srcs ${src})
+            elseif(${src} MATCHES ".*\\.cc$")
+                list(APPEND cc_srcs ${src})
+            else()
+                message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+            endif()
+        endforeach()
+    endif()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (${cc_srcs_len} EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    endif()
+
+    list(LENGTH op_library_DEPS op_library_DEPS_len)
+    if (${op_library_DEPS_len} GREATER 0)
+        set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
+    endif()
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    endif()
+
+    # Define operators that don't need pybind here.
+    foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+        if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+            set(pybind_flag 1)
+        endif()
+    endforeach()
+
+    # For the registration of USE_OP, please refer to paddle/framework/op_registry.h.
+    # Note that it's enough to add just one operator to pybind in a *_op.cc file.
+    # For detailed pybind information, please see the generated paddle/pybind/pybind.h.
+    file(READ ${TARGET}.cc TARGET_CONTENT)
+    string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+    if (one_register STREQUAL "")
+        string(REPLACE "_op" "" TARGET "${TARGET}")
+    else ()
+        string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+        string(REPLACE "," "" TARGET "${TARGET}")
+    endif()
+
+    # pybind USE_NO_KERNEL_OP
+    # HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
+    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
+    string(REPLACE "_op" "" TARGET "${TARGET}")
+    if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+
+    # pybind USE_CPU_ONLY_OP
+    list(LENGTH cu_srcs cu_srcs_len)
+    list(LENGTH cu_cc_srcs cu_cc_srcs_len)
+    if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
+        set(pybind_flag 1)
+    endif()
+
+    # pybind USE_OP
+    if (${pybind_flag} EQUAL 0)
+        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
+    endif()
+endfunction()
+
+add_subdirectory(math)
+add_subdirectory(nccl)
+
+if(WITH_GPU)
+    op_library(nccl_op DEPS nccl_common)
+    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
+else()
+    set(DEPS_OPS ${DEPS_OPS} nccl_op)
+endif()
+
+if(WITH_DISTRIBUTE)
+    add_subdirectory(detail)
+    set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
+    set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    op_library(send_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    op_library(recv_op DEPS ${DISTRIBUTE_DEPS})
+    set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+else()
+    set(DEPS_OPS ${DEPS_OPS} send_op recv_op)
+endif()
+
+op_library(cond_op DEPS framework_proto tensor net_op)
+op_library(cross_entropy_op DEPS cross_entropy)
+op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(softmax_op DEPS softmax)
+op_library(detection_output_op DEPS softmax)
+op_library(sequence_softmax_op DEPS softmax)
+op_library(sum_op DEPS selected_rows_functor)
+op_library(sgd_op DEPS selected_rows_functor)
+op_library(print_op DEPS lod_tensor)
+op_library(adagrad_op DEPS selected_rows_functor)
+op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
+op_library(pool_with_index_op DEPS pooling)
+op_library(lod_rank_table_op DEPS lod_rank_table)
+op_library(lod_tensor_to_array_op DEPS lod_rank_table_op)
+op_library(array_to_lod_tensor_op DEPS lod_rank_table_op)
+op_library(max_sequence_len_op DEPS lod_rank_table)
+op_library(sequence_conv_op DEPS context_project)
+op_library(sequence_pool_op DEPS sequence_pooling)
+op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(lstmp_op DEPS sequence2batch lstm_compute)
+op_library(gru_op DEPS sequence2batch gru_compute)
+op_library(recurrent_op DEPS executor)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale math_function)
+op_library(cos_sim_op DEPS cos_sim_functor)
+op_library(parallel_do_op DEPS executor)
+
+# Register multiple kernels to pybind
+if (WITH_GPU)
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
+op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
+op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
+  conv_transpose_cudnn_op.cu.cc DEPS vol2col)
+file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d, CUDNN);\n")
+file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(pool2d, CUDNN);\n")
+file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(conv2d_transpose, CUDNN);\n")
+else()
+op_library(conv_op SRCS conv_op.cc DEPS vol2col)
+op_library(pool_op SRCS pool_op.cc DEPS pooling)
+op_library(conv_transpose_op SRCS conv_transpose_op.cc DEPS vol2col)
+endif()
+
+# FIXME(typhoonzero): save/load depend on LoDTensor serialization functions
+op_library(save_op DEPS lod_tensor)
+op_library(load_op DEPS lod_tensor)
+op_library(save_combine_op DEPS lod_tensor)
+op_library(load_combine_op DEPS lod_tensor)
+
+list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
+foreach(src ${GENERAL_OPS})
+    op_library(${src})
+endforeach()
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+
+set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
+
+cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
+cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
+cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor)
+cc_test(beam_search_op_test SRCS beam_search_op_test.cc DEPS lod_tensor beam_search_op)
+cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
+if(WITH_GPU)
+    cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
+endif()
+cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
+cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
new file mode 100644
index 0000000000..8e8a3c7dd3
--- /dev/null
+++ b/paddle/operators/accuracy_op.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/accuracy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AccuracyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input (Out) of accuracy op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input (Indices) of accuracy op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input (Label) of accuracy op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
+                   "Output (Accuracy) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Correct"),
+                   "Output (Correct) of AccuracyOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Total"),
+                   "Output (Total) of AccuracyOp should not be null.");
+
+    auto inference_dim = ctx->GetInputDim("Out");
+    auto label_dim = ctx->GetInputDim("Label");
+    // Assume indices has the same shape as inference, because
+    // it's the output of topk.
+
+    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
+    PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1");
+    PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0],
+                      "the inference tensor's num_rows must be"
+                      " the same as label.");
+
+    ctx->SetOutputDim("Accuracy", {1});
+    ctx->SetOutputDim("Correct", {1});
+    ctx->SetOutputDim("Total", {1});
+    ctx->ShareLoD("Out", /*->*/ "Accuracy");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AccuracyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // TODO(typhoonzero): support both inference value and indices.
+    AddInput("Out", "The network output of topk (inferences)");
+    AddInput("Indices", "The the network output of topk (indices)");
+    AddInput("Label", "Label of the training data");
+    // TODO(typhoonzero): AddInput("Weight", ...
+    AddOutput("Accuracy", "The accuracy of current batch");
+    AddOutput("Correct", "The correct samples count of current batch");
+    AddOutput("Total", "The samples count of current batch");
+
+    AddComment(R"DOC(
+Accuracy Operator.
+
+It computes the accuracy rate for classification.
+The accuracy is calculated as follows:
+
+$$accuracy = \frac{NumOfCorrectPredicts}{NumOfAllSamples}$$
+
+Both the input Out and Label can carry the LoD (Level of Details)
+information or not. But the output only shares the LoD information
+with the input Out (Inference).
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+// FIXME(typhoonzero): the type of T is for inference data.
+// label data is always int.
+REGISTER_OP_CPU_KERNEL(accuracy,
+                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
+                       ops::AccuracyKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
new file mode 100644
index 0000000000..0aadd5af41
--- /dev/null
+++ b/paddle/operators/accuracy_op.cu
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/execution_policy.h>
+#include <thrust/reduce.h>
+#include "paddle/operators/accuracy_op.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <int BlockSize>
+__global__ void AccuracyCudaKernel(const int N, const int D,
+                                   const int64_t* Xdata,
+                                   const int64_t* labeldata, int* correct_data,
+                                   float* accuracy, int* total_data) {
+  int count = 0;
+  __shared__ int total[BlockSize];
+
+  // support only 1 block
+  for (int i = threadIdx.x; i < (N); i += BlockSize) {
+    for (int j = 0; j < D; ++j) {
+      if (Xdata[i * D + j] == labeldata[i]) {
+        ++count;
+        break;
+      }
+    }
+  }
+  total[threadIdx.x] = count;
+  __syncthreads();
+
+  // reduce the count with init value 0, and output accuracy.
+  int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
+  if (threadIdx.x == 0) {
+    *correct_data = result;
+    *accuracy = static_cast<float>(result) / static_cast<float>(N);
+    *total_data = N;
+  }
+}
+
+template <typename T>
+class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* inference = ctx.Input<Tensor>("Out");
+    auto* indices = ctx.Input<Tensor>("Indices");
+    auto* label = ctx.Input<Tensor>("Label");
+
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
+    // FIXME(typhoonzero): only support indices currently
+    // if add support for output values, how to detect the data type?
+    const int64_t* indices_data = indices->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+
+    int num_samples = static_cast<int>(inference->dims()[0]);
+    size_t infer_width = inference->dims()[1];
+    auto stream = ctx.cuda_device_context().stream();
+    platform::GpuMemsetAsync(accuracy_data, 0, sizeof(float), stream);
+
+    if (num_samples == 0) {
+      return;
+    }
+
+    AccuracyCudaKernel<
+        PADDLE_CUDA_NUM_THREADS><<<1, PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        num_samples, infer_width, indices_data, label_data, correct_data,
+        accuracy_data, total_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// FIXME(typhoonzero): the type of T is for inference data.
+// label data is always int64.
+REGISTER_OP_CUDA_KERNEL(accuracy,
+                        paddle::operators::AccuracyOpCUDAKernel<float>,
+                        paddle::operators::AccuracyOpCUDAKernel<double>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
new file mode 100644
index 0000000000..04104a695f
--- /dev/null
+++ b/paddle/operators/accuracy_op.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class AccuracyKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Out");
+    auto* indices = ctx.Input<Tensor>("Indices");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
+
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
+    float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
+
+    const int64_t* indices_data = indices->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+
+    size_t num_samples = inference->dims()[0];
+    size_t class_dim = inference->dims()[1];
+    *accuracy_data = 0.0f;
+
+    if (num_samples == 0) {
+      return;
+    }
+
+    int num_correct = 0;
+    // assume inference is already the topk of the output
+    for (size_t i = 0; i < num_samples; ++i) {
+      PADDLE_ENFORCE_GE(label_data[i], 0, "label must be >= 0");
+      for (size_t j = 0; j < class_dim; ++j) {
+        if (indices_data[i * class_dim + j] == label_data[i]) {
+          ++num_correct;
+          break;
+        }
+      }
+    }
+
+    *correct_data = num_correct;
+    *total_data = num_samples;
+    *accuracy_data =
+        static_cast<float>(num_correct) / static_cast<float>(num_samples);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
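
The CPU kernel above counts a sample as correct when its label appears anywhere in that sample's top-k indices. The same loop on concrete data, as a standalone sketch mirroring the kernel:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // 3 samples, top-2 predicted class indices each (as topk would emit).
  const size_t num_samples = 3, class_dim = 2;
  std::vector<int64_t> indices = {4, 1,   // sample 0
                                  0, 3,   // sample 1
                                  2, 5};  // sample 2
  std::vector<int64_t> labels = {1, 7, 2};  // sample 1's label 7 is missed

  int num_correct = 0;
  for (size_t i = 0; i < num_samples; ++i) {
    for (size_t j = 0; j < class_dim; ++j) {
      if (indices[i * class_dim + j] == labels[i]) {
        ++num_correct;  // label found among this sample's top-k
        break;
      }
    }
  }
  // accuracy = NumOfCorrectPredicts / NumOfAllSamples = 2 / 3
  std::cout << static_cast<float>(num_correct) / num_samples << std::endl;
  return 0;
}
```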
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
new file mode 100644
index 0000000000..4188858a90
--- /dev/null
+++ b/paddle/operators/activation_op.cc
@@ -0,0 +1,615 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/activation_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ActivationOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ActivationOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
+  }
+};
+
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Sigmoid operator");
+    AddOutput("Out", "Output of Sigmoid operator");
+    AddComment(R"DOC(
+Sigmoid Activation Operator
+
+$$out = \frac{1}{1 + e^{-x}}$$
+
+)DOC");
+  }
+};
+
+class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LogSigmoid operator");
+    AddOutput("Out", "Output of LogSigmoid operator");
+    AddComment(R"DOC(
+Logsigmoid Activation Operator
+
+$$out = \log \frac{1}{1 + e^{-x}}$$
+
+)DOC");
+  }
+};
+
+class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Exp operator");
+    AddOutput("Out", "Output of Exp operator");
+    AddComment(R"DOC(
+Exp Activation Operator.
+
+$out = e^x$
+
+)DOC");
+  }
+};
+
+class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Relu operator");
+    AddOutput("Out", "Output of Relu operator");
+    AddComment(R"DOC(
+Relu Activation Operator.
+
+$out = \max(x, 0)$
+
+)DOC");
+  }
+};
+
+class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LeakyReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LeakyRelu operator");
+    AddOutput("Out", "Output of LeakyRelu operator");
+    AddAttr<float>("alpha", "The small negative slope").SetDefault(0.02f);
+    AddComment(R"DOC(
+LeakyRelu Activation Operator.
+
+$out = \max(x, \alpha * x)$
+
+)DOC");
+  }
+};
+
+class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softshrink operator");
+    AddOutput("Out", "Output of Softshrink operator");
+    AddAttr<float>("lambda", "non-negative offset").SetDefault(0.5f);
+    AddComment(R"DOC(
+Softshrink Activation Operator.
+
+$$
+out = \begin{cases} 
+    x - \lambda, \text{if } x > \lambda \\
+    x + \lambda, \text{if } x < -\lambda \\
+    0,  \text{otherwise}
+    \end{cases}
+$$
+
+)DOC");
+  }
+};
+
+class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Tanh operator");
+    AddOutput("Out", "Output of Tanh operator");
+    AddComment(R"DOC(
+Tanh Activation Operator.
+
+$$out = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+
+)DOC");
+  }
+};
+
+class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of TanhShrink operator");
+    AddOutput("Out", "Output of TanhShrink operator");
+    AddComment(R"DOC(
+TanhShrink Activation Operator.
+
+$$out = x - \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}}$$
+
+)DOC");
+  }
+};
+
+class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardShrinkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardShrink operator");
+    AddOutput("Out", "Output of HardShrink operator");
+    AddAttr<float>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(0.5f);
+    AddComment(R"DOC(
+HardShrink Activation Operator.
+
+$$
+out = \begin{cases}
+    x, \text{if } x > threshold \\
+    x, \text{if } x < -threshold \\
+    0,  \text{otherwise}
+    \end{cases}
+$$
+
+)DOC");
+  }
+};
+
+class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SqrtOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Sqrt operator");
+    AddOutput("Out", "Output of Sqrt operator");
+    AddComment(R"DOC(
+Sqrt Activation Operator.
+
+$out = \sqrt{x}$
+
+)DOC");
+  }
+};
+
+class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AbsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Abs operator");
+    AddOutput("Out", "Output of Abs operator");
+    AddComment(R"DOC(
+Abs Activation Operator.
+
+$out = |x|$
+
+)DOC");
+  }
+};
+
+class CeilOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CeilOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Ceil operator");
+    AddOutput("Out", "Output of Ceil operator");
+    AddComment(R"DOC(
+Ceil Activation Operator.
+
+$out = ceil(x)$
+
+)DOC");
+  }
+};
+
+class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FloorOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Floor operator");
+    AddOutput("Out", "Output of Floor operator");
+    AddComment(R"DOC(
+Floor Activation Operator.
+
+$out = floor(x)$
+
+)DOC");
+  }
+};
+
+class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Round operator");
+    AddOutput("Out", "Output of Round operator");
+    AddComment(R"DOC(
+Round Activation Operator.
+
+$out = [x]$
+
+)DOC");
+  }
+};
+
+class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReciprocalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Reciprocal operator");
+    AddOutput("Out", "Output of Reciprocal operator");
+    AddComment(R"DOC(
+Reciprocal Activation Operator.
+
+$$out = \frac{1}{x}$$
+
+)DOC");
+  }
+};
+
+class LogOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Log operator");
+    AddOutput("Out", "Output of Log operator");
+    AddComment(R"DOC(
+Log Activation Operator.
+
+$out = \ln(x)$
+
+Natural logarithm of x.
+
+)DOC");
+  }
+};
+
+class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SquareOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Square operator");
+    AddOutput("Out", "Output of Square operator");
+    AddComment(R"DOC(
+Square Activation Operator.
+
+$out = x^2$
+
+)DOC");
+  }
+};
+
+class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftplusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softplus operator");
+    AddOutput("Out", "Output of Softplus operator");
+    AddComment(R"DOC(
+Softplus Activation Operator.
+
+$out = \ln(1 + e^{x})$
+
+)DOC");
+  }
+};
+
+class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftsignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softsign operator");
+    AddOutput("Out", "Output of Softsign operator");
+    AddComment(R"DOC(
+Softsign Activation Operator.
+
+$$out = \frac{x}{1 + |x|}$$
+
+)DOC");
+  }
+};
+
+class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of BRelu operator");
+    AddOutput("Out", "Output of BRelu operator");
+    AddAttr<float>("t_min", "The min marginal value of BRelu")
+        .SetDefault(static_cast<float>(0));
+    AddAttr<float>("t_max", "The max marginal value of BRelu")
+        .SetDefault(static_cast<float>(24));
+    AddComment(R"DOC(
+BRelu Activation Operator.
+
+$out = \min(\max(x, t_{min}), t_{max})$
+
+)DOC");
+  }
+};
+
+class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of SoftRelu operator");
+    AddOutput("Out", "Output of SoftRelu operator");
+    AddAttr<float>("threshold", "The threshold value of SoftRelu")
+        .SetDefault(40.0f);
+    AddComment(R"DOC(
+SoftRelu Activation Operator.
+
+$out = \ln(1 + \exp(\min(\max(x, -threshold), threshold)))$
+
+)DOC");
+  }
+};
+
+class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ELUOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of ELU operator");
+    AddOutput("Out", "Output of ELU operator");
+    AddAttr<float>("alpha", "The alpha value of ELU").SetDefault(1.0f);
+    AddComment(R"DOC(
+ELU Activation Operator.
+
+Applies the following element-wise computation on the input according to
+https://arxiv.org/abs/1511.07289.
+
+$out = \max(0, x) + \min(0, \alpha * (e^x - 1))$
+
+)DOC");
+  }
+};
+
+class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Relu6OpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Relu6 operator");
+    AddOutput("Out", "Output of Relu6 operator");
+    AddAttr<float>("threshold", "The threshold value of Relu6")
+        .SetDefault(6.0f);
+    AddComment(R"DOC(
+Relu6 Activation Operator.
+
+$out = \min(\max(0, x), 6)$
+
+)DOC");
+  }
+};
+
+class PowOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PowOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Pow operator");
+    AddOutput("Out", "Output of Pow operator");
+    AddAttr<float>("factor", "The exponential factor of Pow").SetDefault(1.0f);
+    AddComment(R"DOC(
+Pow Activation Operator.
+
+$out = x^{factor}$
+
+)DOC");
+  }
+};
+
+class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  STanhOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of STanh operator");
+    AddOutput("Out", "Output of STanh operator");
+    AddAttr<float>("scale_a", "The scale parameter of a for the input")
+        .SetDefault(2.0f / 3.0f);
+    AddAttr<float>("scale_b", "The scale parameter of b for the input")
+        .SetDefault(1.7159f);
+    AddComment(R"DOC(
+STanh Activation Operator.
+
+$$out = b * \frac{e^{a * x} - e^{-a * x}}{e^{a * x} + e^{-a * x}}$$
+
+)DOC");
+  }
+};
+
+class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ThresholdedReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of ThresholdedRelu operator");
+    AddOutput("Out", "Output of ThresholdedRelu operator");
+    AddAttr<float>("threshold", "The threshold location of activation")
+        .SetDefault(1.0f);
+    AddComment(R"DOC(
+ThresholdedRelu Activation Operator.
+
+$$
+out = \begin{cases} 
+    x, \text{if } x > threshold \\
+    0,  \text{otherwise}
+    \end{cases}
+$$
+
+)DOC");
+  }
+};
+
+class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardSigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardSigmoid operator");
+    AddOutput("Out", "Output of HardSigmoid operator");
+    AddAttr<float>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(0.2f);
+    AddAttr<float>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(0.5f);
+    AddComment(R"DOC(
+HardSigmoid Activation Operator.
+
+Segment-wise linear approximation of sigmoid (https://arxiv.org/abs/1603.00391),
+which is much faster than sigmoid.
+
+$out = \max(0, \min(1, slope * x + offset))$
+
+The slope should be positive. The offset can be either positive or negative.
+The default slope and offset are set according to the above reference.
+It is recommended to use the defaults for this activation.
+
+)DOC");
+  }
+};
+
+class SwishOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SwishOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Swish operator");
+    AddOutput("Out", "Output of Swish operator");
+    AddAttr<float>("beta", "Constant beta of swish operator").SetDefault(1.0f);
+    AddComment(R"DOC(
+Swish Activation Operator.
+
+$$out = \frac{x}{1 + e^{- \beta x}}$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
+            logsigmoid_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
+            tanh_shrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
+            softshrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
+            reciprocal_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker, brelu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker,
+            leaky_relu_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker, soft_relu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker, elu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker, relu6_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker, pow_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker, stanh_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker,
+            hard_shrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker,
+            thresholded_relu_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker,
+            hard_sigmoid_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad,
+            ops::ActivationOpGrad);
+
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)   \
+  REGISTER_OP_CPU_KERNEL(                                                 \
+      act_type, ops::ActivationKernel<paddle::platform::CPUDeviceContext, \
+                                      ops::functor<float>>,               \
+      ops::ActivationKernel<paddle::platform::CPUDeviceContext,           \
+                            ops::functor<double>>);                       \
+  REGISTER_OP_CPU_KERNEL(                                                 \
+      act_type##_grad,                                                    \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
+                                ops::grad_functor<float>>,                \
+      ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,       \
+                                ops::grad_functor<double>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
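
For reference, one iteration of that FOR_EACH_KERNEL_FUNCTOR expansion looks roughly like the following. The functor names are assumed from the header's naming convention, since the functor list itself lives in activation_op.h and is not shown in this hunk:

```cpp
// Approximate expansion of REGISTER_ACTIVATION_CPU_KERNEL(relu, ReluFunctor,
// ReluGradFunctor); ReluFunctor/ReluGradFunctor are assumed names.
REGISTER_OP_CPU_KERNEL(
    relu, ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                                ops::ReluFunctor<float>>,
    ops::ActivationKernel<paddle::platform::CPUDeviceContext,
                          ops::ReluFunctor<double>>);
REGISTER_OP_CPU_KERNEL(
    relu_grad,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ReluGradFunctor<float>>,
    ops::ActivationGradKernel<paddle::platform::CPUDeviceContext,
                              ops::ReluGradFunctor<double>>);
```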
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
new file mode 100644
index 0000000000..b9ccdf639c
--- /dev/null
+++ b/paddle/operators/activation_op.cu
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/activation_op.h"
+
+namespace ops = paddle::operators;
+
+#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, functor, grad_functor)   \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
+      act_type, ops::ActivationKernel<paddle::platform::CUDADeviceContext, \
+                                      ops::functor<float>>,                \
+      ops::ActivationKernel<paddle::platform::CUDADeviceContext,           \
+                            ops::functor<double>>);                        \
+  REGISTER_OP_CUDA_KERNEL(                                                 \
+      act_type##_grad,                                                     \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
+                                ops::grad_functor<float>>,                 \
+      ops::ActivationGradKernel<paddle::platform::CUDADeviceContext,       \
+                                ops::grad_functor<double>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
new file mode 100644
index 0000000000..c0809abc05
--- /dev/null
+++ b/paddle/operators/activation_op.h
@@ -0,0 +1,799 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename Functor>
+class ActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
+                          "Cannot get input tensor X, variable name = %s",
+                          context.op().Input("X"));
+
+    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
+                            "Cannot get output tensor Out, variable name = %s",
+                            context.op().Output("Out"));
+    Out.mutable_data<T>(context.GetPlace());
+    auto x = framework::EigenVector<T>::Flatten(X);
+    auto out = framework::EigenVector<T>::Flatten(Out);
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(*place, x, out);
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class ActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
+ public:
+  using T = typename Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<framework::Tensor>("X");
+    auto* Out = context.Input<framework::Tensor>("Out");
+    auto* dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
+    functor(*place, x, out, dout, dx);
+  }
+};
+
+template <typename T>
+struct BaseActivationFunctor {
+  using ELEMENT_TYPE = T;
+
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+
+  AttrPair GetAttrs() { return AttrPair(); }
+};
+
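+// Attribute plumbing: a functor that needs op attributes overrides GetAttrs()
+// so ActivationKernel/ActivationGradKernel can copy each named attribute into
+// the functor's fields before calling it. A minimal sketch with a hypothetical
+// ScaleFunctor (illustration only, not one of the registered ops):
+//
+//   template <typename T>
+//   struct ScaleFunctor : public BaseActivationFunctor<T> {
+//     float scale;
+//     typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+//       return {{"scale", &scale}};  // kernel writes Attr<float>("scale") here
+//     }
+//     template <typename Device, typename X, typename Out>
+//     void operator()(Device d, X x, Out out) const {
+//       out.device(d) = x * static_cast<T>(scale);
+//     }
+//   };
+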
+// sigmoid(x) = 1 / (1 + exp(-x))
+template <typename T>
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+  }
+};
+
+template <typename T>
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out * (static_cast<T>(1) - out);
+  }
+};
+
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// out = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
+//           max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//   = -log( exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+// + exp(-x - max(-x, 0))))
+template <typename T>
+struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    out.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
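+
+// Sanity check of the identity above: at x = 0, temp = max(-0, 0) = 0 and
+// out = -(0 + log(exp(0) + exp(0))) = -log(2), matching -log(1 + exp(0)).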
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template <typename T>
+struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dout * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+};
+
+// exp(x) = e^x
+template <typename T>
+struct ExpFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.exp();
+  }
+};
+
+template <typename T>
+struct ExpGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * out;
+  }
+};
+
+// relu(x) = max(x, 0)
+template <typename T>
+struct ReluFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(0));
+  }
+};
+
+template <typename T>
+struct ReluGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>();
+  }
+};
+
+// tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.tanh();
+  }
+};
+
+template <typename T>
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) - out * out);
+  }
+};
+
+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x - x.tanh();
+  }
+};
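+
+// Gradient: d/dx (x - tanh(x)) = 1 - (1 - tanh(x)^2) = tanh(x)^2, which is
+// what the functor below computes.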
+
+template <typename T>
+struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x.tanh() * x.tanh());
+  }
+};
+
+// hardshrink(x) = x, if x > threshold or x < -threshold; 0 otherwise
+template <typename T>
+struct HardShrinkFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp1 = (x < -static_cast<T>(threshold)).template cast<T>().eval();
+    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
+    out.device(d) = x * (temp1 + temp2);
+  }
+};
+
+template <typename T>
+struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 = (x < -static_cast<T>(threshold)).template cast<T>().eval();
+    auto temp2 = (x > static_cast<T>(threshold)).template cast<T>().eval();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+  }
+};
+
+// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
+// otherwise
+template <typename T>
+struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>().eval();
+    auto temp2 = (x < -lambdaT).template cast<T>().eval();
+    out.device(d) = temp1 * (x - lambdaT) + temp2 * (x + lambdaT);
+  }
+};
+
+template <typename T>
+struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto lambdaT = static_cast<T>(lambda);
+    auto temp1 = (x > lambdaT).template cast<T>().eval();
+    auto temp2 = (x < -lambdaT).template cast<T>().eval();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+  }
+};
+
+// sqrt(x) = x^(1/2)
+template <typename T>
+struct SqrtFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.sqrt();
+  }
+};
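+
+// Gradient: d(sqrt(x))/dx = 1 / (2 * sqrt(x)) = 0.5 / out; conj(out) equals
+// out for the real element types (float, double) registered here.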
+
+template <typename T>
+struct SqrtGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    const Out out_conj = Eigen::numext::conj(out);
+    dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
+  }
+};
+
+// ceil(x) = smallest integer not less than x
+template <typename T>
+struct CeilFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.ceil();
+  }
+};
+
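+// Shared zero gradient for the piecewise-constant functions below (ceil,
+// floor, round): their derivative is zero almost everywhere, and 0 / x is
+// simply a zero-valued expression with the shape of x.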
+template <typename T>
+struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = static_cast<T>(0) / x;
+  }
+};
+
+// floor(x) = largest integer not greater than x
+template <typename T>
+struct FloorFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.floor();
+  }
+};
+
+// round(x) = nearest integer to x
+template <typename T>
+struct RoundFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.round();
+  }
+};
+
+// abs(x) = |x|
+template <typename T>
+struct AbsFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.abs();
+  }
+};
+
+template <typename T>
+struct AbsGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * x.sign();
+  }
+};
+
+// reciprocal(x) = 1 / x
+template <typename T>
+struct ReciprocalFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = static_cast<T>(1) / x;
+  }
+};
+
+template <typename T>
+struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(-1) * out * out;
+  }
+};
+
+// log(x) = natural logarithm of x
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.log();
+  }
+};
+
+template <typename T>
+struct LogGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (static_cast<T>(1) / x);
+  }
+};
+
+// square(x) = x^2
+template <typename T>
+struct SquareFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.square();
+  }
+};
+
+template <typename T>
+struct SquareGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(2) * x;
+  }
+};
+
+template <typename T>
+struct BReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  // NOTE: This GetAttrs intentionally hides `BaseActivationFunctor<T>::GetAttrs`;
+  // the call is resolved at compile time (no polymorphism) for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        x.cwiseMax(static_cast<T>(t_min)).cwiseMin(static_cast<T>(t_max));
+  }
+};
+
+template <typename T>
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((x > static_cast<T>(t_min)) * (x < static_cast<T>(t_max)))
+                       .template cast<T>();
+  }
+};
+
+// relu6(x) = min(max(0, x), 6)
+template <typename T>
+struct Relu6Functor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        x.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(threshold));
+  }
+};
+
+template <typename T>
+struct Relu6GradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((x > static_cast<T>(0)) * (x < static_cast<T>(threshold)))
+                       .template cast<T>();
+  }
+};
+
+// softplus(x) = log(1 + exp(x))
+// When x is a very large positive number, exp(x) may overflow to inf,
+// so we use the trick below for numerical stability:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
+  }
+};
+
+// d(softplus(x))/dx = exp(x) / (1 + exp(x))
+// For numerical stability:
+// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
+// exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    dx.device(d) =
+        dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+  }
+};
+
+// softsign(x) = x / (1 + |x|)
+template <typename T>
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x / (static_cast<T>(1) + x.abs());
+  }
+};
+
+// d(softsign(x))/dx = 1 / (1 + |x|)^2
+// Taken from https://en.wikipedia.org/wiki/Activation_function
+template <typename T>
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
+  }
+};
+
+template <typename T>
+struct SoftReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto tmp = static_cast<T>(threshold);
+    auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
+    out.device(d) = (static_cast<T>(1) + temp.exp()).log();
+  }
+};
+
+template <typename T>
+struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto tmp = static_cast<T>(threshold);
+    auto temp = ((x > -tmp) * (x < tmp)).template cast<T>().eval();
+    dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
+  }
+};
+
+template <typename T>
+struct LeakyReluFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
+  }
+};
+
+template <typename T>
+struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 = static_cast<T>(alpha) *
+                 (x < static_cast<T>(0)).template cast<T>().eval();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
+    dx.device(d) = dout * (temp1 + temp2).template cast<T>();
+  }
+};
+
+template <typename T>
+struct ELUFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.cwiseMax(static_cast<T>(0)) +
+                    (static_cast<T>(alpha) * (x.exp() - static_cast<T>(1)))
+                        .cwiseMin(static_cast<T>(0));
+  }
+};
+
+template <typename T>
+struct ELUGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() +
+                   dout * (out + static_cast<T>(alpha)) *
+                       (x < static_cast<T>(0)).template cast<T>();
+  }
+};
+
+// FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198
+template <typename T>
+struct PowFunctor : public BaseActivationFunctor<T> {
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.pow(static_cast<T>(factor));
+  }
+};
+
+template <typename T>
+struct PowGradFunctor : public BaseActivationFunctor<T> {
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout * static_cast<T>(factor) *
+                   x.pow(static_cast<T>(factor) - static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct STanhFunctor : public BaseActivationFunctor<T> {
+  float scale_a;
+  float scale_b;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) =
+        static_cast<T>(scale_b) * (static_cast<T>(scale_a) * x).tanh();
+  }
+};
+
+template <typename T>
+struct STanhGradFunctor : public BaseActivationFunctor<T> {
+  float scale_a;
+  float scale_b;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto a = static_cast<T>(scale_a);
+    auto b = static_cast<T>(scale_b);
+    auto temp = (a * x).tanh() * (a * x).tanh();
+    dx.device(d) = dout * a * b * (static_cast<T>(1) - temp);
+  }
+};
+
+template <typename T>
+struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto th = static_cast<T>(threshold);
+    out.device(d) = (x > th).template cast<T>() * x;
+  }
+};
+
+template <typename T>
+struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto th = static_cast<T>(threshold);
+    dx.device(d) = dout * (x > th).template cast<T>();
+  }
+};
+
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    out.device(d) =
+        temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;
+  float offset;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    dx.device(d) = dout *
+                   ((out > static_cast<T>(0)) * (out < static_cast<T>(1)))
+                       .template cast<T>() *
+                   static_cast<T>(slope);
+  }
+};
+
+template <typename T>
+struct SwishFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x / (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+  }
+};
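+
+// Gradient: with s = 1 / (1 + exp(-beta * x)) (so out = x * s),
+// d(swish)/dx = s + beta * out * (1 - s) = beta * out + s * (1 - beta * out),
+// which is the form computed below.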
+
+template <typename T>
+struct SwishGradFunctor : public BaseActivationFunctor<T> {
+  float beta;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"beta", &beta}};
+  }
+
+  template <typename Device, typename X, typename Out, typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
+    auto temp1 = static_cast<T>(1) /
+                 (static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
+    auto temp2 = temp1 * (static_cast<T>(1) - (static_cast<T>(beta) * out));
+    dx.device(d) = dout * ((static_cast<T>(beta) * out) + temp2);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                             \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
+  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
+  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
+  __macro(ceil, CeilFunctor, ZeroGradFunctor);                       \
+  __macro(floor, FloorFunctor, ZeroGradFunctor);                     \
+  __macro(round, RoundFunctor, ZeroGradFunctor);                     \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
+  __macro(log, LogFunctor, LogGradFunctor);                          \
+  __macro(square, SquareFunctor, SquareGradFunctor);                 \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                    \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);          \
+  __macro(pow, PowFunctor, PowGradFunctor);                          \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                    \
+  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);           \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);           \
+  __macro(relu6, Relu6Functor, Relu6GradFunctor);                    \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);       \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor);    \
+  __macro(elu, ELUFunctor, ELUGradFunctor);                          \
+  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
+  __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
+  __macro(swish, SwishFunctor, SwishGradFunctor);                    \
+  __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
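+
+// Adding a new activation (sketch, with a hypothetical "foo" op): implement
+// FooFunctor / FooGradFunctor above, append
+//   __macro(foo, FooFunctor, FooGradFunctor);
+// to the list, and register the operator itself (REGISTER_OP with a
+// FooOpMaker) in activation_op.cc.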
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
new file mode 100644
index 0000000000..d8a9491c82
--- /dev/null
+++ b/paddle/operators/adadelta_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adadelta_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdadeltaOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
+                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
+                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredGradOut"),
+        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredUpdateOut"),
+        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "param and grad input of AdadeltaOp should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
+                      "Param and AvgSquaredGrad input of AdadeltaOp "
+                      "should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
+                      "Param and AvgSquaredUpdate input of AdadeltaOp "
+                      "should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
+  }
+};
+
+class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdadeltaOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("AvgSquaredGrad", "(Tensor) Input average of squared gradient");
+    AddInput("AvgSquaredUpdate",
+             "(Tensor) Input average of squared parameter updates");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("AvgSquaredGradOut",
+              "(Tensor) Output average of squared gradient");
+    AddOutput("AvgSquaredUpdateOut",
+              "(Tensor) Output average of squared parameter updates");
+
+    AddAttr<float>("rho",
+                   "(float, default 0.95) Exponential decay rate "
+                   "for squared gradients.")
+        .SetDefault(0.95f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) Constant for "
+                   "numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+Adadelta Optimizer.
+
+The Adadelta optimizer is implemented as explained in:
+https://arxiv.org/abs/1212.5701
+It is a per-dimension adaptive learning rate method for
+gradient descent.
+
+Adadelta updates are as follows:
+
+$$
+avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\
+param\_update =  - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\
+avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\
+param\_out = param + param\_update
+$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
new file mode 100644
index 0000000000..91294a0d5d
--- /dev/null
+++ b/paddle/operators/adadelta_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adadelta_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdadeltaOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
new file mode 100644
index 0000000000..819d0845db
--- /dev/null
+++ b/paddle/operators/adadelta_op.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class AdadeltaOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto avg_squared_grad_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredGradOut");
+    auto avg_squared_update_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    T rho = static_cast<T>(ctx.Attr<float>("rho"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    // Squared gradient accumulator
+    auto avg_squared_grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredGrad"));
+    // Squared updates accumulator
+    auto avg_squared_update = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto avg_squared_grad_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
+    auto avg_squared_update_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    avg_squared_grad_out.device(place) =
+        rho * avg_squared_grad + (1 - rho) * grad.square();
+    auto update =
+        -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
+             .sqrt() *
+        grad;
+    avg_squared_update_out.device(place) =
+        rho * avg_squared_update + (1 - rho) * update.square();
+    param_out.device(place) = param + update;
+  }
+};
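+
+// Illustrative single step (scalar case, rho = 0.95, epsilon = 1e-6, both
+// accumulators starting at zero, grad = 0.1):
+//   avg_squared_grad_out   = 0.05 * 0.1^2                  = 5e-4
+//   update                 = -sqrt(1e-6 / 5.01e-4) * 0.1   ~= -4.47e-3
+//   avg_squared_update_out = 0.05 * update^2               ~= 1e-6
+//   param_out              = param - 4.47e-3 (approximately)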
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
new file mode 100644
index 0000000000..c83318a272
--- /dev/null
+++ b/paddle/operators/adagrad_op.cc
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adagrad_op.h"
+
+#include <cmath>
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+class AdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdagradOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdagradOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdagradOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdagradOp should have the same dimension.");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+  }
+};
+
+class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdagradOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Moment", "(Tensor) Second moment");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output second moment");
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+Adaptive Gradient Algorithm (Adagrad).
+
+The update is done as follows:
+
+$$moment\_out = moment + grad * grad \\
+param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon}
+$$
+
+The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+does not have the epsilon attribute. It is added here in our implementation,
+as also proposed in http://cs231n.github.io/neural-networks-3/#ada,
+for numerical stability to avoid division by zero.
+
+)DOC");
+  }
+};
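+
+// Note on the first step (Moment initialized to zero): moment_out = grad^2,
+// so param_out = param - learning_rate * grad / (|grad| + epsilon); the very
+// first update has magnitude close to learning_rate regardless of the
+// gradient's scale.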
+
+namespace {
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  return std::find(rows.begin(), rows.end(), value) - rows.begin();
+}
+}  // namespace
+
+template <typename T>
+struct SparseAdagradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    auto grad_width = grad.value().dims()[1];
+    math::scatter::MergeAdd<platform::CPUDeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto& merge_rows = grad_merge.rows();
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
+
+    // 2. m += g_m * g_m
+    math::scatter::Mul<platform::CPUDeviceContext, T> square_func;
+    auto grad_square = square_func(context, grad_merge, grad_merge);
+
+    math::SelectedRowsAddToTensor<platform::CPUDeviceContext, T> functor;
+    functor(context, grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    for (size_t i = 0; i < merge_rows.size(); i++) {
+      for (int64_t j = 0; j < grad_width; j++) {
+        param_data[merge_rows[i] * grad_width + j] -=
+            lr[0] * grad_merge_data[i * grad_width + j] /
+            (std::sqrt(moment_data[merge_rows[i] * grad_width + j]) + epsilon);
+      }
+    }
+  }
+};
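+
+// MergeAdd above collapses duplicate rows of the SelectedRows gradient; e.g.
+// rows [0, 0, 2] with row values [a, b, c] merge into rows [0, 2] with values
+// [a + b, c], so the dense moment/param rows are each updated exactly once.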
+
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CPUDeviceContext, double>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    adagrad, ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdagradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
new file mode 100644
index 0000000000..4e57938792
--- /dev/null
+++ b/paddle/operators/adagrad_op.cu
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adagrad_op.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
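+// MergeGradKernel: one thread block per input row (blockIdx.y). Thread 0
+// scans the merged row list to find this row's destination slot, and after
+// the barrier all threads of the block atomically accumulate the row's
+// values into grad_merge.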
+template <typename T, int block_size>
+__global__ void MergeGradKernel(const T* grad, const int64_t* grad_rows,
+                                T* grad_merge, const int64_t* grad_merge_rows,
+                                size_t grad_merge_rows_size,
+                                int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  __shared__ size_t grad_merge_idx;
+
+  if (tid == 0) {
+    for (size_t i = 0; i < grad_merge_rows_size; i++) {
+      if (grad_rows[ty] == grad_merge_rows[i]) {
+        grad_merge_idx = i;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  grad += ty * row_numel;
+  grad_merge += grad_merge_idx * row_numel;
+  for (int index = tid; index < row_numel; index += block_size) {
+    paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
+  }
+}
+
+template <typename T, int block_size>
+__global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows,
+                                           const T* learning_rate, T* param,
+                                           T* moment, int64_t row_numel,
+                                           T epsilon) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  grad += ty * row_numel;
+  param += rows[ty] * row_numel;
+  moment += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(param + index,
+                                    -learning_rate[0] * grad[index] /
+                                        (sqrt(moment[index]) + epsilon));
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param) {
+    // 1. g_m.rows = set(g.rows)
+    auto grad_width = grad.value().dims()[1];
+    math::scatter::MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto grad_merge = merge_func(context, grad);
+    auto* grad_merge_data = grad_merge.mutable_value()->template data<T>();
+    auto& merge_rows = grad_merge.rows();
+    // 2. m += g_m * g_m
+    math::scatter::Mul<platform::CUDADeviceContext, T> square_func;
+    auto grad_square = square_func(context, grad_merge, grad_merge);
+
+    math::SelectedRowsAddToTensor<platform::CUDADeviceContext, T> functor;
+    functor(context, grad_square, moment);
+
+    // 3. update parameter
+    auto* lr = learning_rate.data<T>();
+    auto* param_data = param->data<T>();
+    auto* moment_data = moment->data<T>();
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid2(1, merge_rows.size());
+    SparseAdagradFunctorKernel<
+        T, 256><<<grid2, threads, 0,
+                  reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                      .stream()>>>(grad_merge_data, grad_merge.rows().data(),
+                                   lr, param_data, moment_data, grad_width,
+                                   epsilon);
+  }
+};
+
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, float>;
+template struct SparseAdagradFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    adagrad, ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdagradOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
new file mode 100644
index 0000000000..66f5b0f449
--- /dev/null
+++ b/paddle/operators/adagrad_op.h
@@ -0,0 +1,87 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+struct SparseAdagradFunctor {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& grad,
+                  const framework::Tensor& learning_rate, T epsilon,
+                  framework::Tensor* moment, framework::Tensor* param);
+};
+
+template <typename DeviceContext, typename T>
+class AdagradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto* moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+
+    auto* grad_var = ctx.InputVar("Grad");
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto param = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Param"));
+      auto grad = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Grad"));
+      auto moment = framework::EigenVector<T>::Flatten(
+          *ctx.Input<framework::Tensor>("Moment"));
+      auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+      auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+      auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+      auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+      moment_out.device(*place) = moment + grad * grad;
+      Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        auto* lr = learning_rate->data<T>();
+        param_out.device(*place) =
+            param - lr[0] * grad / (moment_out.sqrt() + epsilon);
+      } else {
+        auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+        param_out.device(*place) =
+            param -
+            lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto* param_tensor = ctx.Input<framework::Tensor>("Param");
+      PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
+
+      auto* moment_tensor = ctx.Input<framework::Tensor>("Moment");
+      PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor);
+
+      SparseAdagradFunctor<DeviceContext, T> functor;
+      functor(ctx.template device_context<DeviceContext>(),
+              *ctx.Input<framework::SelectedRows>("Grad"),
+              *ctx.Input<framework::Tensor>("LearningRate"), epsilon,
+              moment_out_tensor, param_out_tensor);
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
new file mode 100644
index 0000000000..03527de936
--- /dev/null
+++ b/paddle/operators/adam_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adam_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdamOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+                   "Input(Moment1) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+                   "Input(Moment2) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+                   "Input(Beta2Pow) of AdamOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+                   "Output(Moment1Out) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+                   "Output(Moment2Out) of AdamOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have one element");
+    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have one element");
+
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment1"),
+        "Param and Moment1 input of AdamOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment2"),
+        "Param and Moment2 input of AdamOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("Moment1Out", param_dims);
+    ctx->SetOutputDim("Moment2Out", param_dims);
+  }
+};
+
+class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdamOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    AddInput("Moment1", "(Tensor) Input first moment");
+    AddInput("Moment2", "(Tensor) Input second moment");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("Moment1Out", "(Tensor) Output first moment");
+    AddOutput("Moment2Out", "(Tensor) Output second moment");
+
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "first moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the "
+                   "second moment estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+
+    AddComment(R"DOC(
+Adam Optimizer.
+
+This implements the Adam optimizer from Section 2 of the Adam
+paper: https://arxiv.org/abs/1412.6980.
+Adam is a first-order gradient-based optimization method based on
+adaptive estimates of lower-order moments.
+
+Adam updates:
+
+$$
+moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\
+moment\_2\_out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\
+learning\_rate = learning\_rate *
+                  \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_1\_out}{\sqrt{moment\_2\_out} + \epsilon}
+$$
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    adam, ops::AdamOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdamOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
new file mode 100644
index 0000000000..94f840c188
--- /dev/null
+++ b/paddle/operators/adam_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adam_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    adam, ops::AdamOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdamOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
new file mode 100644
index 0000000000..9cc34bdded
--- /dev/null
+++ b/paddle/operators/adam_op.h
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>  // for sqrt in CPU and CUDA
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+namespace scatter = paddle::operators::math::scatter;
+
+template <typename T>
+struct AdamFunctor {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+              const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2,
+              T* mom2_out, const T* lr, const T* grad, const T* param,
+              T* param_out)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out) {}
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    // Merge all memory access together.
+    T g = grad_[i];
+    T mom1 = moment1_[i];
+    T mom2 = moment2_[i];
+    T lr = *lr_;
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    T p = param_[i];
+
+    // Calculation
+    lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+    mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+    mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+    p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+    // Write back to global memory
+    moment1_out_[i] = mom1;
+    moment2_out_[i] = mom2;
+    param_out_[i] = p;
+  }
+};
+
+template <typename T>
+struct SparseAdamFunctor {
+  T beta1_;
+  T beta2_;
+  T epsilon_;
+
+  const T* beta1_pow_;
+  const T* beta2_pow_;
+  const T* moment1_;
+  T* moment1_out_;
+  const T* moment2_;
+  T* moment2_out_;
+  const T* lr_;
+  const T* grad_;
+  const T* param_;
+  T* param_out_;
+
+  const int64_t* rows_;
+  int64_t row_numel_;
+
+  SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
+                    const T* beta2_pow, const T* mom1, T* mom1_out,
+                    const T* mom2, T* mom2_out, const T* lr, const T* grad,
+                    const T* param, T* param_out, const int64_t* rows,
+                    int64_t row_numel)
+      : beta1_(beta1),
+        beta2_(beta2),
+        epsilon_(epsilon),
+        beta1_pow_(beta1_pow),
+        beta2_pow_(beta2_pow),
+        moment1_(mom1),
+        moment1_out_(mom1_out),
+        moment2_(mom2),
+        moment2_out_(mom2_out),
+        lr_(lr),
+        grad_(grad),
+        param_(param),
+        param_out_(param_out),
+        rows_(rows),
+        row_numel_(row_numel) {}
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    T beta1_pow = *beta1_pow_;
+    T beta2_pow = *beta2_pow_;
+    for (int64_t j = 0; j < row_numel_; ++j) {
+      T g = grad_[i * row_numel_ + j];
+      T mom1 = moment1_[rows_[i] * row_numel_ + j];
+      T mom2 = moment2_[rows_[i] * row_numel_ + j];
+      T lr = *lr_;
+      T p = param_[rows_[i] * row_numel_ + j];
+
+      lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow);
+      mom1 = beta1_ * mom1 + (1 - beta1_) * g;
+      mom2 = beta2_ * mom2 + (1 - beta2_) * g * g;
+      p -= lr * (mom1 / (sqrt(mom2) + epsilon_));
+
+      moment1_out_[rows_[i] * row_numel_ + j] = mom1;
+      moment2_out_[rows_[i] * row_numel_ + j] = mom2;
+      param_out_[rows_[i] * row_numel_ + j] = p;
+    }  // for col id
+  }
+};
+
+template <typename DeviceContext, typename T>
+class AdamOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using paddle::framework::LoDTensor;
+    using paddle::operators::detail::Ref;
+
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+    auto& param = Ref(ctx.Input<LoDTensor>("Param"), "Must set Param");
+    // auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+    auto* grad_var = ctx.InputVar("Grad");
+    auto& mom1 = Ref(ctx.Input<LoDTensor>("Moment1"), "Must set Moment1");
+    auto& mom2 = Ref(ctx.Input<LoDTensor>("Moment2"), "Must set Moment2");
+    auto& lr =
+        Ref(ctx.Input<LoDTensor>("LearningRate"), "Must set LearningRate");
+
+    auto& beta1_pow =
+        Ref(ctx.Input<LoDTensor>("Beta1Pow"), "Must set Beta1Pow");
+    auto& beta2_pow =
+        Ref(ctx.Input<LoDTensor>("Beta2Pow"), "Must set Beta2Pow");
+
+    auto& param_out =
+        Ref(ctx.Output<LoDTensor>("ParamOut"), "Must set ParamOut");
+    auto& mom1_out =
+        Ref(ctx.Output<LoDTensor>("Moment1Out"), "Must set Moment1Out");
+    auto& mom2_out =
+        Ref(ctx.Output<LoDTensor>("Moment2Out"), "Must set Moment1Out");
+
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto& grad = Ref(ctx.Input<LoDTensor>("Grad"), "Must set Grad");
+      AdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2.template data<T>(),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          lr.template data<T>(), grad.template data<T>(),
+          param.template data<T>(),
+          param_out.template mutable_data<T>(ctx.GetPlace()));
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          param.numel());
+      for_range(functor);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto& grad =
+          Ref(ctx.Input<framework::SelectedRows>("Grad"), "Must set Grad");
+      // merge duplicated rows if any.
+      scatter::MergeAdd<DeviceContext, T> merge_func;
+      auto grad_merge =
+          merge_func(ctx.template device_context<DeviceContext>(), grad);
+      auto& grad_tensor = grad_merge.value();
+      const T* grad_data = grad_tensor.template data<T>();
+      auto* rows = grad_merge.rows().data();
+      auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
+
+      SparseAdamFunctor<T> functor(
+          beta1, beta2, epsilon, beta1_pow.template data<T>(),
+          beta2_pow.template data<T>(), mom1.template data<T>(),
+          mom1_out.template mutable_data<T>(ctx.GetPlace()),
+          mom2.template data<T>(),
+          mom2_out.template mutable_data<T>(ctx.GetPlace()),
+          lr.template data<T>(), grad_data, param.template data<T>(),
+          param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          grad_merge.rows().size());
+      for_range(functor);
+    } else {
+      PADDLE_THROW("Variable type not supported by adam_op");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
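The sparse kernel above walks the merged gradient's compact rows and scatters each update into the full-height parameter and moment tensors through `rows_[i]`. Below is a minimal sketch of that addressing scheme only, with made-up shapes and plain `std::vector` standing in for SelectedRows:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int64_t row_numel = 3;                   // elements per row
  std::vector<int64_t> rows = {4, 1};            // dense rows the sparse
                                                 // gradient touches
  std::vector<float> grad = {1, 2, 3, 4, 5, 6};  // 2 x row_numel, compact
  std::vector<float> param(6 * row_numel, 0.f);  // 6 x row_numel, dense

  for (size_t i = 0; i < rows.size(); ++i) {
    for (int64_t j = 0; j < row_numel; ++j) {
      // grad is indexed by its compact row i; param by the dense row
      // rows[i], exactly as in SparseAdamFunctor::operator().
      param[rows[i] * row_numel + j] += grad[i * row_numel + j];
    }
  }
  std::printf("dense row 4 starts with %f\n", param[4 * row_numel]);  // 1.0
  return 0;
}
```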
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
new file mode 100644
index 0000000000..3b0b714184
--- /dev/null
+++ b/paddle/operators/adamax_op.cc
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adamax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdamaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InfNorm"),
+                   "Input(InfNorm) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamaxOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
+                   "Output(InfNormOut) of AdamaxOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have 1 element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("InfNorm"),
+        "Param and InfNorm input of AdamaxOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+    ctx->SetOutputDim("InfNormOut", param_dims);
+  }
+};
+
+class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdamaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    AddInput("Moment", "(Tensor) First moment");
+    AddInput("InfNorm",
+             "(Tensor) "
+             "Input exponentially weighted infinity norm");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output first moment");
+    AddOutput("InfNormOut",
+              "(Tensor) "
+              "Output exponentially weighted infinity norm");
+
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "1st moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the weighted "
+                   "infinity norm estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+    AddComment(R"DOC(
+Adamax Optimizer.
+
+We implement the Adamax optimizer from Section 7 of the Adam
+paper: https://arxiv.org/abs/1412.6980. Adamax is a variant of the
+Adam algorithm based on the infinity norm.
+
+Adamax updates:
+
+$$
+moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\
+inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\
+learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\
+param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out}
+$$
+
+The original paper does not have an epsilon attribute.
+However, it is added here for numerical stability to prevent
+division by zero.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    adamax, ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::AdamaxOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu
new file mode 100644
index 0000000000..8f87bb2867
--- /dev/null
+++ b/paddle/operators/adamax_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adamax_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    adamax, ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::AdamaxOpKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
new file mode 100644
index 0000000000..172c179c5f
--- /dev/null
+++ b/paddle/operators/adamax_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class AdamaxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    auto inf_norm_out_tensor = ctx.Output<framework::Tensor>("InfNormOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+    inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
+    T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
+    T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    auto moment = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Moment"));
+    auto inf_norm = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("InfNorm"));
+    auto lr = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("LearningRate"));
+    auto beta1_pow = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Beta1Pow"));
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+    auto inf_norm_out =
+        framework::EigenVector<T>::Flatten(*inf_norm_out_tensor);
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad;
+    inf_norm_out.device(*place) =
+        grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
+    auto lr_t = lr / (1 - beta1_pow);
+    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+    param_out.device(*place) =
+        param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
new file mode 100644
index 0000000000..3fdad5ad9b
--- /dev/null
+++ b/paddle/operators/array_operator.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+class ArrayOp : public framework::OperatorBase {
+ public:
+  ArrayOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  size_t GetOffset(const framework::Scope &scope,
+                   const platform::Place &place) const {
+    auto *i = scope.FindVar(Input("I"));
+    PADDLE_ENFORCE(i != nullptr, "I must be set");
+    auto &i_tensor = i->Get<framework::LoDTensor>();
+    PADDLE_ENFORCE_EQ(i_tensor.numel(), 1);
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    size_t offset;
+    if (platform::is_gpu_place(i_tensor.place())) {
+      // FIXME: Avoid copy from GPU to CPU
+      framework::Tensor t;
+      framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
+      dev_ctx.Wait();
+      offset = static_cast<size_t>(*t.data<int64_t>());
+    } else {
+      offset = static_cast<size_t>(*i_tensor.data<int64_t>());
+    }
+    VLOG(10) << " Offset = " << offset;
+    return offset;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc
new file mode 100644
index 0000000000..ba5c6bd3c6
--- /dev/null
+++ b/paddle/operators/array_to_lod_tensor_op.cc
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <numeric>
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class ArrayToLoDTensorOp : public framework::OperatorBase {
+ public:
+  ArrayToLoDTensorOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    // Check dims, place and data type of input's elements and infer output's
+    // dim
+    PADDLE_ENFORCE(!x.empty(), "There's no element in the input array.");
+    int rank = x[0].dims().size();
+    platform::Place place = x[0].place();
+    std::type_index data_type = x[0].type();
+    framework::DDim ins_dims = framework::slice_ddim(x[0].dims(), 1, rank);
+    int64_t batch_size = x[0].dims()[0];
+    for (size_t i = 1; i < x.size(); ++i) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x[i].dims(), 1, rank), ins_dims,
+                        "The dimension of the %zu'th element in LoDTensorArray "
+                        "differs from previous ones.",
+                        i);
+      PADDLE_ENFORCE(platform::places_are_same_class(x[i].place(), place),
+                     "The place class of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      PADDLE_ENFORCE(x[i].type() == data_type,
+                     "The data type of the %zu'th element in LoDTensorArray "
+                     "differs from previous ones.",
+                     i);
+      batch_size += x[i].dims()[0];
+    }
+    auto ins_dim_vec = framework::vectorize(ins_dims);
+    ins_dim_vec.insert(ins_dim_vec.begin(), batch_size);
+    framework::DDim out_dims = framework::make_ddim(ins_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto &table_items = rank_table.items();
+    std::vector<size_t> table_item_idx(table_items.size());
+    // table_item_idx = range(table_items.size())
+    std::iota(table_item_idx.begin(), table_item_idx.end(), 0);
+    std::sort(table_item_idx.begin(), table_item_idx.end(),
+              [&](size_t a, size_t b) {
+                return table_items[a].index < table_items[b].index;
+              });
+
+    // Build LoDTensor `out`
+    framework::LoD *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+    auto prefix_lod = rank_table.coarse_lod();
+    prefix_lod.emplace_back();
+    auto &cur_level_lod = prefix_lod.back();
+    cur_level_lod.push_back(0);
+    for (size_t idx : table_item_idx) {
+      cur_level_lod.push_back(cur_level_lod.back() + table_items[idx].length);
+      for (size_t x_idx = 0; x_idx < table_items[idx].length; ++x_idx) {
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x[x_idx].lod(), idx, idx + 1, 0);
+
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(out_lod, lod_length);
+
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        VLOG(10) << "idx=" << idx << " x_idx=" << x_idx << " ["
+                 << ", " << end_offset << "]";
+        // Copy data
+        PADDLE_ENFORCE_GE(end_offset, start_offset);
+        size_t len = end_offset - start_offset;
+        if (len == 0) {
+          continue;
+        }
+        auto slice = out->Slice(out_offset, out_offset + len);
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
+
+        framework::Copy(x[x_idx].Slice(start_offset, end_offset), place,
+                        dev_ctx, &slice);
+        out_offset += len;
+      }
+    }
+    out_lod->insert(out_lod->begin(), prefix_lod.begin(), prefix_lod.end());
+  }
+};
+
+class ArrayToLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ArrayToLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(std::vector<LodTensor>) A vector of tensors that is going to "
+             "be casted to a big LoDTensor.");
+    AddInput("RankTable",
+             "(LoDRankTable) RankTable provides the coarse lod infomation to "
+             "build the output LoDTensor. See "
+             "'paddle/framework/lod_rank_table.h' for more details.");
+    AddOutput("Out", "(LoDTensor) The LoDTensor formed by input tensor array.");
+    AddComment(
+        R"DOC(This Op build a big LoDTensor from a std::vector<LoDTensor> 
+          and a LoDRankTable. It is supposed to be used in getting dynamic RNN's
+          outputs back to a normal LoDTensor. The std::vector<LoDTensor> 
+          would be the output of RNN Op and the LoDRankTable would be build 
+          with RNN's input.)DOC");
+  }
+};
+
+class ArrayToLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "ArrayToLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("RankTable"),
+                   "ArrayToLoDTensorOp must has input RankTable.");
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ArrayToLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("lod_tensor_to_array");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(array_to_lod_tensor, ops::ArrayToLoDTensorOp,
+                  ops::ArrayToLoDTensorOpProtoMaker,
+                  ops::ArrayToLoDTensorInferShape,
+                  ops::ArrayToLoDTensorGradMaker);
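The reordering of rank-table items above uses the standard argsort idiom: fill an index vector with `std::iota`, then sort the indices by a key looked up inside the comparator. A self-contained sketch of just that idiom, with a made-up key vector:

```cpp
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> index_key = {2, 0, 1};  // e.g. LoDRankTable item indices
  std::vector<size_t> order(index_key.size());
  std::iota(order.begin(), order.end(), 0);  // order = {0, 1, 2}
  std::sort(order.begin(), order.end(),
            [&](size_t a, size_t b) { return index_key[a] < index_key[b]; });
  for (size_t i : order) std::printf("%zu ", i);  // prints: 1 2 0
  std::printf("\n");
  return 0;
}
```

This visits the items in ascending `index` order without moving the items themselves, which is exactly what the copy loop above needs.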
diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc
new file mode 100644
index 0000000000..e04aa2d28c
--- /dev/null
+++ b/paddle/operators/assign_op.cc
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_type.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+class AssignFunctor {
+ public:
+  AssignFunctor(framework::Variable *out,
+                const platform::DeviceContext &dev_ctx)
+      : out_(out), dev_ctx_(dev_ctx) {}
+
+  void operator()(const framework::LoDTensor &lod_tensor) const {
+    auto &out_tensor = *out_->GetMutable<framework::LoDTensor>();
+    copy_tensor(lod_tensor, &out_tensor);
+  }
+
+  void operator()(const framework::LoDTensorArray &array) const {
+    auto &out_array = *out_->GetMutable<framework::LoDTensorArray>();
+    out_array.resize(array.size());
+    for (size_t i = 0; i < array.size(); ++i) {
+      copy_tensor(array[i], &out_array[i]);
+    }
+  }
+
+  void operator()(const framework::SelectedRows &rows) const {
+    framework::SelectedRows &out_rows =
+        *out_->GetMutable<framework::SelectedRows>();
+    out_rows.set_rows(rows.rows());
+    out_rows.set_height(rows.height());
+    auto &t = rows.value();
+    auto *m = out_rows.mutable_value();
+    framework::Copy(t, t.place(), dev_ctx_, m);
+  }
+
+  template <typename T>
+  void operator()(const T &v) const {
+    PADDLE_THROW("Not support type for assign op %s", typeid(T).name());
+  }
+
+ private:
+  void copy_tensor(const framework::LoDTensor &lod_tensor,
+                   framework::LoDTensor *out) const {
+    auto &out_tensor = *out;
+    Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
+    out_tensor.set_lod(lod_tensor.lod());
+  }
+
+  framework::Variable *out_;
+  const platform::DeviceContext &dev_ctx_;
+};
+
+class AssignOp : public framework::OperatorBase {
+ public:
+  AssignOp(const std::string &type, const framework::VariableNameMap &inputs,
+           const framework::VariableNameMap &outputs,
+           const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (x == nullptr) {
+      return;
+    }
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(
+        out != nullptr,
+        "The Output(Out) should not be null if the Input(X) is set.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    framework::VisitVarType(*x, AssignFunctor(out, dev_ctx));
+  }
+};
+
+class AssignOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, SelectedRows or LoDTensorArray) The input variable "
+             "could be LoDTensor, SelectedRows or LoDTensorArray.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(LoDTensor, SelectedRows or LoDTensorArray) The type of output "
+              "is the same as input X.");
+    AddComment(R"DOC(Assign Operator
+
+Out = X, when the type of X is in [LoDTensor/SelectedRows/LoDTensorArray];
+an error is raised if the type is not listed above.
+)DOC");
+  }
+};
+
+class AssignInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    if (context->HasInput("X")) {
+      auto type = context->GetInputsVarType("X")[0];
+      if (type == framework::proto::VarDesc_VarType_SELECTED_ROWS ||
+          type == framework::proto::VarDesc_VarType_LOD_TENSOR) {
+        context->SetOutputDim("Out", context->GetInputDim("X"));
+      }
+    }
+  }
+};
+
+class AssignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
+    op->SetType("assign");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(assign, ops::AssignOp, ops::AssignGradMaker,
+                  ops::AssignInferShape, ops::AssignOpProtoMaker);
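`AssignFunctor` dispatches on the runtime type held by the variable through `framework::VisitVarType`: one `operator()` overload per supported type plus a templated catch-all that throws. The same pattern can be sketched with `std::variant`/`std::visit` (C++17); the types below are placeholders, not Paddle's:

```cpp
#include <cstdio>
#include <string>
#include <variant>

struct AssignLike {
  void operator()(int v) const { std::printf("int: %d\n", v); }
  void operator()(const std::string& v) const {
    std::printf("string: %s\n", v.c_str());
  }
  template <typename T>
  void operator()(const T&) const {
    std::printf("unsupported type\n");  // plays the role of PADDLE_THROW
  }
};

int main() {
  std::variant<int, std::string, double> var = std::string("hello");
  std::visit(AssignLike{}, var);  // string: hello
  var = 3.5;
  std::visit(AssignLike{}, var);  // unsupported type
  return 0;
}
```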
diff --git a/paddle/operators/assign_value_op.cc b/paddle/operators/assign_value_op.cc
new file mode 100644
index 0000000000..8e3a530489
--- /dev/null
+++ b/paddle/operators/assign_value_op.cc
@@ -0,0 +1,73 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/assign_value_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AssignValueOp : public framework::OperatorWithKernel {
+ public:
+  AssignValueOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AssignValueOp should not be null.");
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::proto::DataType(ctx.Attr<int>("dtype")), ctx.GetPlace());
+  }
+};
+
+class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) Output tensor of assign_value operator.");
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "Shape of values.");
+    AddAttr<int>("dtype", "data type of values")
+        .InEnum({framework::proto::DataType::INT32,
+                 framework::proto::DataType::FP32});
+    AddAttr<std::vector<float>>("fp32_values", "store the float values")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("int32_values", "store the int values")
+        .SetDefault({});
+    AddComment(R"DOC(
+AssignValue operator
+
+$$Out = values$$
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
+                       ops::AssignValueKernel<float>);
diff --git a/paddle/operators/assign_value_op.cu.cc b/paddle/operators/assign_value_op.cu.cc
new file mode 100644
index 0000000000..b17e201500
--- /dev/null
+++ b/paddle/operators/assign_value_op.cu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/assign_value_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
+                        ops::AssignValueKernel<float>);
diff --git a/paddle/operators/assign_value_op.h b/paddle/operators/assign_value_op.h
new file mode 100644
index 0000000000..ec98c53513
--- /dev/null
+++ b/paddle/operators/assign_value_op.h
@@ -0,0 +1,50 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class AssignValueKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int dtype = ctx.Attr<int>("dtype");
+    const char* value_name = nullptr;
+    switch (dtype) {
+      case framework::proto::DataType::INT32:
+        value_name = "int32_values";
+        break;
+      case framework::proto::DataType::FP32:
+        value_name = "fp32_values";
+        break;
+      default:
+        PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
+        break;
+    }
+    auto values = ctx.Attr<std::vector<T>>(value_name);
+    framework::CopyFromVector(values, ctx.device_context(), out);
+    out->Resize(framework::make_ddim(shape));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
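The kernel above picks which attribute holds the payload by switching on the `dtype` attribute. A reduced sketch of that dispatch, with a stand-in enum instead of `framework::proto::DataType`:

```cpp
#include <cstdio>

enum class DataType { INT32, FP32 };  // stand-in for the proto enum

// Maps a dtype to the attribute name that stores its values,
// as AssignValueKernel::Compute does.
const char* ValueAttrName(DataType dtype) {
  switch (dtype) {
    case DataType::INT32:
      return "int32_values";
    case DataType::FP32:
      return "fp32_values";
  }
  return nullptr;  // unreachable; the kernel throws on unknown dtypes
}

int main() {
  std::printf("%s\n", ValueAttrName(DataType::FP32));  // fp32_values
  return 0;
}
```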
diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc
new file mode 100644
index 0000000000..b6494f9509
--- /dev/null
+++ b/paddle/operators/auc_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/auc_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AucOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input of Indices should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input of Label should not be null.");
+    auto inference_height = ctx->GetInputDim("Out")[0];
+    auto label_height = ctx->GetInputDim("Label")[0];
+
+    PADDLE_ENFORCE_EQ(inference_height, label_height,
+                      "Out and Label should have same height.");
+
+    ctx->SetOutputDim("AUC", {1});
+    ctx->ShareLoD("Out", /*->*/ "AUC");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
+        ctx.device_context());
+  }
+};
+
+class AucOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AucOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Out",
+             "A floating point 2D tensor, values are in the range [0, 1]."
+             "Each row is sorted in descending order. This input should be the"
+             "output of topk."
+             "Typically, this tensor indicates the probability of each label");
+    AddInput("Indices",
+             "An int 2D tensor, indicating the indices of original"
+             "tensor before sorting. Typically, this tensor indicates which "
+             "label the probability stands for.");
+    AddInput("Label",
+             "A 2D int tensor indicating the label of the training data."
+             "The height is batch size and width is always 1.");
+    // TODO(typhoonzero): support weight input
+    AddOutput("AUC",
+              "A scalar representing the "
+              "current area-under-the-curve.");
+
+    AddAttr<std::string>("curve", "Curve type, can be 'ROC' or 'PR'.")
+        .SetDefault("ROC");
+    AddAttr<int>("num_thresholds",
+                 "The number of thresholds to use when discretizing the"
+                 " roc curve.")
+        .SetDefault(200);
+
+    AddComment(R"DOC(
+Area Under The Curve (AUC) Operator.
+
+This implementation computes the AUC according to the forward output and label.
+It is widely used in binary classification evaluation. Note that if the
+input label contains values other than 0 and 1, it will be cast to bool.
+You can find the relevant definitions here:
+https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve
+
+There are two types of possible curves:
+1. ROC: Receiver operating characteristic
+2. PR: Precision Recall
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(auc, ops::AucOp, ops::AucOpMaker);
+REGISTER_OP_CPU_KERNEL(auc, ops::AucKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h
new file mode 100644
index 0000000000..b80509e2a9
--- /dev/null
+++ b/paddle/operators/auc_op.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class AucKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* inference = ctx.Input<Tensor>("Out");
+    auto* label = ctx.Input<Tensor>("Label");
+    auto* auc = ctx.Output<Tensor>("AUC");
+
+    float* auc_data = auc->mutable_data<float>(ctx.GetPlace());
+
+    std::string curve = ctx.Attr<std::string>("curve");
+    int num_thresholds = ctx.Attr<int>("num_thresholds");
+    // Construct with num_thresholds elements; reserve() alone would leave
+    // the operator[] writes below out of bounds.
+    std::vector<float> thresholds_list(num_thresholds);
+    for (int i = 1; i < num_thresholds - 1; i++) {
+      thresholds_list[i] = (float)i / (num_thresholds - 1);
+    }
+    const float kEpsilon = 1e-7;
+    thresholds_list[0] = 0.0f - kEpsilon;
+    thresholds_list[num_thresholds - 1] = 1.0f + kEpsilon;
+
+    size_t batch_size = inference->dims()[0];
+    size_t inference_width = inference->dims()[1];
+
+    const T* inference_data = inference->data<T>();
+    const int64_t* label_data = label->data<int64_t>();
+
+    // Create local tensor for storing the curve: TP, FN, TN, FP
+    // TODO(typhoonzero): use eigen op to calculate these values.
+    Tensor true_positive, false_positive, true_negative, false_negative;
+
+    true_positive.Resize({num_thresholds});
+    false_negative.Resize({num_thresholds});
+    true_negative.Resize({num_thresholds});
+    false_positive.Resize({num_thresholds});
+
+    int64_t* tp_data = true_positive.mutable_data<int64_t>(ctx.GetPlace());
+    int64_t* fn_data = false_negative.mutable_data<int64_t>(ctx.GetPlace());
+    int64_t* tn_data = true_negative.mutable_data<int64_t>(ctx.GetPlace());
+    int64_t* fp_data = false_positive.mutable_data<int64_t>(ctx.GetPlace());
+
+    for (int idx_thresh = 0; idx_thresh < num_thresholds; idx_thresh++) {
+      // calculate TP, FN, TN, FP for the current threshold
+      int64_t tp = 0, fn = 0, tn = 0, fp = 0;
+      for (size_t i = 0; i < batch_size; i++) {
+        // NOTE: label_data used as bool, labels >0 will be treated as true.
+        if (label_data[i]) {
+          // use first(max) data in each row
+          if (inference_data[i * inference_width] >=
+              (thresholds_list[idx_thresh])) {
+            tp++;
+          } else {
+            fn++;
+          }
+        } else {
+          if (inference_data[i * inference_width] >=
+              (thresholds_list[idx_thresh])) {
+            fp++;
+          } else {
+            tn++;
+          }
+        }
+      }
+      // store rates
+      tp_data[idx_thresh] = tp;
+      fn_data[idx_thresh] = fn;
+      tn_data[idx_thresh] = tn;
+      fp_data[idx_thresh] = fp;
+    }
+    // epsilon to avoid divide by zero.
+    float epsilon = 1e-6;
+    // Riemann sum to calculate AUC.
+    Tensor tp_rate, fp_rate, rec_rate;
+    tp_rate.Resize({num_thresholds});
+    fp_rate.Resize({num_thresholds});
+    rec_rate.Resize({num_thresholds});
+    float* tp_rate_data = tp_rate.mutable_data<float>(ctx.GetPlace());
+    float* fp_rate_data = fp_rate.mutable_data<float>(ctx.GetPlace());
+    float* rec_rate_data = rec_rate.mutable_data<float>(ctx.GetPlace());
+    for (int i = 0; i < num_thresholds; i++) {
+      tp_rate_data[i] =
+          ((float)tp_data[i] + epsilon) / (tp_data[i] + fn_data[i] + epsilon);
+      fp_rate_data[i] = (float)fp_data[i] / (fp_data[i] + tn_data[i] + epsilon);
+      rec_rate_data[i] =
+          ((float)tp_data[i] + epsilon) / (tp_data[i] + fp_data[i] + epsilon);
+    }
+    *auc_data = 0.0f;
+    if (curve == "ROC") {
+      for (int i = 0; i < num_thresholds - 1; i++) {
+        auto dx = fp_rate_data[i] - fp_rate_data[i + 1];
+        auto y = (tp_rate_data[i] + tp_rate_data[i + 1]) / 2.0f;
+        *auc_data = *auc_data + dx * y;
+      }
+    } else if (curve == "PR") {
+      for (int i = 1; i < num_thresholds; i++) {
+        auto dx = tp_rate_data[i] - tp_rate_data[i - 1];
+        auto y = (rec_rate_data[i] + rec_rate_data[i - 1]) / 2.0f;
+        *auc_data = *auc_data + dx * y;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
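The final accumulation in `AucKernel` is a trapezoidal Riemann sum over the threshold sweep. Here is a standalone sketch of the ROC branch, with hand-picked TPR/FPR values standing in for the counted rates:

```cpp
#include <cstdio>
#include <vector>

int main() {
  // Rates per threshold; the kernel's sweep yields fpr in non-increasing
  // order, so each strip has non-negative width.
  std::vector<float> fpr = {1.0f, 0.5f, 0.0f};
  std::vector<float> tpr = {1.0f, 0.8f, 0.0f};

  float auc = 0.0f;
  for (size_t i = 0; i + 1 < fpr.size(); ++i) {
    float dx = fpr[i] - fpr[i + 1];          // strip width
    float y = (tpr[i] + tpr[i + 1]) / 2.0f;  // average strip height
    auc += dx * y;
  }
  std::printf("AUC = %f\n", auc);  // 0.5*0.9 + 0.5*0.4 = 0.65
  return 0;
}
```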
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
new file mode 100644
index 0000000000..0e984c38ba
--- /dev/null
+++ b/paddle/operators/batch_norm_op.cc
@@ -0,0 +1,448 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+class BatchNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
+
+    // make sure Mean/MeanOut and Variance/VarianceOut share memory in Python
+    PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
+                      "Mean and MeanOut should share the same memory");
+    PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0],
+                      ctx->Outputs("VarianceOut")[0],
+                      "Variance and VarianceOut should share the same memory");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
+
+    const int64_t C =
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
+
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
+
+    ctx->SetOutputDim("Y", x_dims);
+    ctx->SetOutputDim("MeanOut", {C});
+    ctx->SetOutputDim("VarianceOut", {C});
+    ctx->SetOutputDim("SavedMean", {C});
+    ctx->SetOutputDim("SavedVariance", {C});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BatchNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<bool>("is_test", "").SetDefault(false);
+    AddAttr<float>("momentum", "").SetDefault(0.9);
+    AddAttr<float>("epsilon", "")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<std::string>("data_layout", "").SetDefault("NCHW");
+    AddInput("X", "The input tensor");
+    AddInput("Scale",
+             "Scale is a 1-dimensional tensor of size C "
+             "that is applied to the output");
+    AddInput("Bias",
+             "Bias is a 1-dimensional tensor of size C "
+             "that is applied to the output");
+    AddInput("Mean",
+             "The global mean (for training) or "
+             "estimated mean (for testing)");
+    AddInput("Variance",
+             "The global variance (for training) "
+             "or estimated Variance (for testing)");
+    AddOutput("Y", "result after normalization");
+    AddOutput("MeanOut",
+              "Share memory with Mean. "
+              "Store the global mean when training");
+    AddOutput("VarianceOut",
+              "Share memory with Variance. "
+              "Store the global Variance when training");
+    AddOutput("SavedMean",
+              "Mean of the current mini batch, "
+              "will apply to output when training")
+        .AsIntermediate();
+    AddOutput("SavedVariance",
+              "Variance of the current mini batch, "
+              "will apply to output when training")
+        .AsIntermediate();
+    AddComment(R"DOC(
+Batch Normalization.
+
+Batch normalization is implemented as described in the paper:
+https://arxiv.org/pdf/1502.03167.pdf
+It can be used as a normalizer function for conv2d and fully_connected
+operations. The required data format for this layer is one of the following:
+1. NHWC `[batch, in_height, in_width, in_channels]`
+2. NCHW `[batch, in_channels, in_height, in_width]`
+
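+In the training phase, each channel c is normalized with the mini-batch
+statistics:
+    mean_c = mean(X_c)
+    var_c  = mean((X_c - mean_c)^2)
+    Y_c    = scale_c * (X_c - mean_c) / sqrt(var_c + epsilon) + bias_c
+and the running statistics are updated as
+    MeanOut = Mean * momentum + mean_c * (1 - momentum)
+(and likewise for VarianceOut). In the test phase the stored Mean and
+Variance are used instead.
+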
+)DOC");
+  }
+};
+
+template <typename T>
+class BatchNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    const int N = x_dims[0];
+    const int C =
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
+    const int sample_size = x->numel() / N / C;
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+
+    // alloc memory
+    y->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<T>(ctx.GetPlace());
+    variance_out->mutable_data<T>(ctx.GetPlace());
+    saved_mean->mutable_data<T>(ctx.GetPlace());
+    saved_variance->mutable_data<T>(ctx.GetPlace());
+
+    if (!is_test) {
+      // saved_xx is used only for this batch of data
+      EigenVectorArrayMap<T> saved_mean_e(
+          saved_mean->mutable_data<T>(ctx.GetPlace()), C);
+      EigenVectorArrayMap<T> saved_variance_e(
+          saved_variance->mutable_data<T>(ctx.GetPlace()), C);
+      saved_mean_e.setZero();
+      saved_variance_e.setZero();
+
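+      // For NCHW, x is viewed as a column-major (sample_size, N * C) array:
+      // column nc holds the spatial elements of sample nc / C, channel
+      // nc % C. For NHWC it is viewed as (C, N * sample_size) instead.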
+      switch (data_layout) {
+        case DataLayout::kNCHW: {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_mean_e(nc % C) += x_arr.col(nc).sum();
+          }
+          saved_mean_e /= N * sample_size;
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_variance_e(nc % C) +=
+                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        case DataLayout::kNHWC: {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
+          for (int i = 0; i < N * sample_size; ++i) {
+            saved_mean_e += x_arr.col(i);
+          }
+          saved_mean_e /= N * sample_size;
+          for (int i = 0; i < N * sample_size; ++i) {
+            saved_variance_e +=
+                (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        default:
+          PADDLE_THROW("Unknown storage order: %s", data_layout_str);
+      }
+
+      EigenVectorArrayMap<T> running_mean_arr(
+          mean_out->mutable_data<T>(ctx.GetPlace()), C);
+      EigenVectorArrayMap<T> running_var_arr(
+          variance_out->mutable_data<T>(ctx.GetPlace()), C);
+      running_mean_arr =
+          running_mean_arr * momentum + saved_mean_e * (1. - momentum);
+      running_var_arr =
+          running_var_arr * momentum + saved_variance_e * (1. - momentum);
+    }
+
+    // use SavedMean and SavedVariance to normalize
+    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
+    if (is_test) {
+      ConstEigenVectorArrayMap<T> var_arr(
+          ctx.Input<Tensor>("Variance")->data<T>(), C);
+      inv_std = (var_arr + epsilon).sqrt().inverse();
+    } else {
+      EigenVectorArrayMap<T> saved_inv_std(
+          ctx.Output<Tensor>("SavedVariance")->data<T>(), C);
+      // invert SavedVariance in place; the gradient kernel reuses it.
+      saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
+      inv_std = saved_inv_std;
+    }
+    ConstEigenVectorArrayMap<T> mean_arr(
+        is_test ? ctx.Input<Tensor>("Mean")->data<T>()
+                : ctx.Output<Tensor>("SavedMean")->data<T>(),
+        C);
+
+    //   (x - est_mean) * inv_var * scale + bias
+    //   is rewritten as
+    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> bias_arr(bias->data<T>(), C);
+    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
+    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
+        bias_arr - mean_arr * inv_std * scale_arr;
+
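+    // new_scale and new_bias are per-channel vectors of length C, computed
+    // once up front so each output element below costs a single multiply-add.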
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
+        EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
+                               N * C);
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+        }
+        break;
+      }
+      case DataLayout::kNHWC: {
+        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
+                         N * sample_size) =
+            (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
+             new_scale)
+                .colwise() +
+            new_bias;
+        break;
+      }
+      default:
+        PADDLE_THROW("Unknown storage order: %d", data_layout);
+    }
+  }
+};
+
+class BatchNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"));
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), "");
+    PADDLE_ENFORCE(ctx->HasInput("SavedMean"), "");
+    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"), "");
+
+    // check output
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")), "");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")), "");
+
+    const auto x_dims = ctx->GetInputDim("X");
+    const DataLayout data_layout = framework::StringToDataLayout(
+        ctx->Attrs().Get<std::string>("data_layout"));
+    const int C =
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
+
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class BatchNormGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    // SavedVariance was already inverted in the forward operator
+    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    // Get the size for each dimension.
+    // NCHW [batch_size, in_channels, in_height, in_width]
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    const int N = x_dims[0];
+    const int C =
+        (data_layout == DataLayout::kNCHW ? x_dims[1]
+                                          : x_dims[x_dims.size() - 1]);
+    const int sample_size = x->numel() / N / C;
+
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
+    ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<T>(ctx.GetPlace());
+    d_bias->mutable_data<T>(ctx.GetPlace());
+
+    // d_bias = np.sum(d_y, axis=0)
+    // d_scale = np.sum((X - mean) * inv_var * d_y, axis=0)
+    // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0)
+    //   - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0))
+
+    EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
+                                      C);
+    EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
+                                       C);
+
+    d_bias_arr.setZero();
+    d_scale_arr.setZero();
+
+    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
+
+    switch (data_layout) {
+      case DataLayout::kNCHW: {
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
+        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
+                                 sample_size, N * C);
+        d_x_arr.setZero();
+
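+        // first pass: accumulate the per-channel reductions; the second pass
+        // reuses d_bias_arr and d_scale_arr as the two sums in the d_x
+        // formula above.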
+        for (int nc = 0; nc < N * C; ++nc) {
+          int c = nc % C;
+          d_bias_arr(c) += d_y_arr.col(nc).sum();
+          d_scale_arr(c) +=
+              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
+                  .sum();
+        }
+        for (int nc = 0; nc < N * C; ++nc) {
+          int c = nc % C;
+          d_x_arr.col(nc) +=
+              scale_inv_var_nhw(c) *
+              (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
+               (x_arr.col(nc) - mean_arr(c)) * d_scale_arr(c) * inv_var_arr(c));
+        }
+        break;
+      }
+      case DataLayout::kNHWC: {
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
+        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
+        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
+                                 N * sample_size);
+        d_x_arr.setZero();
+
+        const auto d_y_row_sum = d_y_arr.rowwise().sum();
+        const auto x_minus_mean = x_arr.colwise() - mean_arr;
+        const auto d_y_mul_x_minus_mean_row_sum =
+            (d_y_arr * x_minus_mean).rowwise().sum();
+        const auto inv_var_sqr = inv_var_arr * inv_var_arr;
+        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+          d_bias_arr += d_y_arr.col(nhw);
+          d_scale_arr +=
+              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
+          d_x_arr.col(nhw) +=
+              scale_inv_var_nhw *
+              (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
+               x_minus_mean.col(nhw) * inv_var_sqr *
+                   d_y_mul_x_minus_mean_row_sum);
+        }
+        break;
+      }
+      default:
+        PADDLE_THROW("Unknown storage order: %s", data_layout_str);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+            batch_norm_grad, ops::BatchNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    batch_norm,
+    ops::BatchNormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    batch_norm_grad,
+    ops::BatchNormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc
new file mode 100644
index 0000000000..3d17725ab4
--- /dev/null
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -0,0 +1,278 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/batch_norm_op.h"
+#include "paddle/framework/data_layout.h"
+
+#include <cfloat>
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DataLayout = framework::DataLayout;
+template <typename T>
+using CudnnDataType = platform::CudnnDataType<T>;
+
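+// Extract N, C, H, W, D from dims according to data_layout; spatial
+// dimensions absent from dims default to 1, so a 2-D [N, C] input yields
+// H = W = D = 1.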
+void ExtractNCWHD(const framework::DDim &dims, const DataLayout &data_layout,
+                  int *N, int *C, int *H, int *W, int *D) {
+  *N = dims[0];
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
+    *C = data_layout == DataLayout::kNCHW ? dims[1] : dims[dims.size() - 1];
+    *H = data_layout == DataLayout::kNCHW ? dims[2] : dims[1];
+    *W = dims.size() > 3
+             ? (data_layout == DataLayout::kNCHW ? dims[3] : dims[2])
+             : 1;
+    *D = dims.size() > 4
+             ? (data_layout == DataLayout::kNCHW ? dims[4] : dims[3])
+             : 1;
+  }
+}
+
+template <typename T>
+class BatchNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+
+    // Get the size for each dimension.
+    // NCHW [batch_size, in_channels, in_height, in_width]
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    int N, C, H, W, D;
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+    // ------------------- cudnn descriptors ---------------------
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+#else
+    mode_ = CUDNN_BATCHNORM_SPATIAL;
+#endif
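+    // CUDNN_BATCHNORM_SPATIAL_PERSISTENT is a faster training-mode variant
+    // introduced in cuDNN 7; older versions fall back to SPATIAL.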
+
+    VLOG(1) << "Setting descriptors.";
+    std::vector<int> dims;
+    std::vector<int> strides;
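+    // dims is always given in [N, C, H, W, D] order; the actual memory
+    // layout is conveyed to cuDNN through the strides alone (note the
+    // stride of 1 on C for NHWC).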
+    if (data_layout == DataLayout::kNCHW) {
+      dims = {N, C, H, W, D};
+      strides = {C * H * W * D, H * W * D, W * D, D, 1};
+    } else {
+      dims = {N, C, H, W, D};
+      strides = {H * W * D * C, 1, W * D * C, D * C, C};
+    }
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
+        bn_param_desc_, data_desc_, mode_));
+
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+
+    // alloc memory
+    y->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<T>(ctx.GetPlace());
+    variance_out->mutable_data<T>(ctx.GetPlace());
+    saved_mean->mutable_data<T>(ctx.GetPlace());
+    saved_variance->mutable_data<T>(ctx.GetPlace());
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> functor;
+    functor(dev_ctx, saved_mean, 0);
+    functor(dev_ctx, saved_variance, 0);
+
+    auto handle = dev_ctx.cudnn_handle();
+
+    // Now, depending on whether we are running test or not, we have two paths.
+    if (is_test) {
+      // in test mode, use the estimated mean and variance from the inputs.
+      const auto *est_mean = ctx.Input<Tensor>("Mean");
+      const auto *est_var = ctx.Input<Tensor>("Variance");
+      // Run inference mode.
+      PADDLE_ENFORCE_EQ(est_mean->dims().size(), 1UL);
+      PADDLE_ENFORCE_EQ(est_var->dims().size(), 1UL);
+      PADDLE_ENFORCE_EQ(est_mean->dims()[0], C);
+      PADDLE_ENFORCE_EQ(est_var->dims()[0], C);
+
+      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardInference(
+          handle,
+          // Note: PERSISTENT not implemented for inference
+          CUDNN_BATCHNORM_SPATIAL, CudnnDataType<T>::kOne(),
+          CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+          data_desc_, y->template mutable_data<T>(ctx.GetPlace()),
+          bn_param_desc_, scale->template data<T>(), bias->template data<T>(),
+          est_mean->template data<T>(), est_var->template data<T>(), epsilon));
+    } else {
+      // Run training mode.
+      // cuDNN updates the running mean and variance in place and saves
+      // the batch statistics for the backward pass.
+      double this_factor = 1. - momentum;
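+      // cuDNN computes running = running * (1 - factor) + batch * factor,
+      // so factor = 1 - momentum matches the CPU kernel's momentum rule.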
+
+      CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationForwardTraining(
+          handle, mode_, CudnnDataType<T>::kOne(), CudnnDataType<T>::kZero(),
+          data_desc_, x->template data<T>(), data_desc_,
+          y->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
+          scale->template data<T>(), bias->template data<T>(), this_factor,
+          mean_out->template mutable_data<T>(ctx.GetPlace()),
+          variance_out->template mutable_data<T>(ctx.GetPlace()), epsilon,
+          saved_mean->template mutable_data<T>(ctx.GetPlace()),
+          saved_variance->template mutable_data<T>(ctx.GetPlace())));
+    }
+
+    // clean when exit.
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+template <typename T>
+class BatchNormGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    double epsilon = static_cast<double>(ctx.Attr<float>("epsilon"));
+    const std::string data_layout_str = ctx.Attr<std::string>("data_layout");
+    const DataLayout data_layout =
+        framework::StringToDataLayout(data_layout_str);
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+
+    const auto &x_dims = x->dims();
+
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "The Input dim size should be between 2 and 5");
+    int N, C, H, W, D;
+    ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D);
+
+    PADDLE_ENFORCE_EQ(scale->dims().size(), 1UL);
+    PADDLE_ENFORCE_EQ(scale->dims()[0], C);
+
+    // ------------------- cudnn descriptors ---------------------
+    cudnnTensorDescriptor_t data_desc_;
+    cudnnTensorDescriptor_t bn_param_desc_;
+    cudnnBatchNormMode_t mode_;
+
+    CUDNN_ENFORCE(platform::dynload::cudnnCreateTensorDescriptor(&data_desc_));
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_));
+    if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) {
+      LOG(ERROR) << "Provided epsilon is smaller than "
+                 << "CUDNN_BN_MIN_EPSILON. Setting it to "
+                 << "CUDNN_BN_MIN_EPSILON instead.";
+    }
+    epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
+#if CUDNN_VERSION_MIN(7, 0, 0)
+    mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
+#else
+    mode_ = CUDNN_BATCHNORM_SPATIAL;
+#endif
+
+    std::vector<int> dims;
+    std::vector<int> strides;
+    if (data_layout == DataLayout::kNCHW) {
+      dims = {N, C, H, W, D};
+      strides = {C * H * W * D, H * W * D, W * D, D, 1};
+    } else {
+      dims = {N, C, H, W, D};
+      strides = {H * W * D * C, 1, W * D * C, D * C, C};
+    }
+    CUDNN_ENFORCE(platform::dynload::cudnnSetTensorNdDescriptor(
+        data_desc_, CudnnDataType<T>::type,
+        x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data()));
+    CUDNN_ENFORCE(platform::dynload::cudnnDeriveBNTensorDescriptor(
+        bn_param_desc_, data_desc_, mode_));
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    d_x->mutable_data<T>(ctx.GetPlace());
+    d_scale->mutable_data<T>(ctx.GetPlace());
+    d_bias->mutable_data<T>(ctx.GetPlace());
+
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    const auto *saved_var = ctx.Input<Tensor>("SavedVariance");
+    const void *saved_mean_data = saved_mean->template data<T>();
+    const void *saved_var_data = saved_var->template data<T>();
+
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward(
+        dev_ctx.cudnn_handle(), mode_, CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(), CudnnDataType<T>::kOne(),
+        CudnnDataType<T>::kZero(), data_desc_, x->template data<T>(),
+        data_desc_, d_y->template data<T>(), data_desc_,
+        d_x->template mutable_data<T>(ctx.GetPlace()), bn_param_desc_,
+        scale->template data<T>(),
+        d_scale->template mutable_data<T>(ctx.GetPlace()),
+        d_bias->template mutable_data<T>(ctx.GetPlace()), epsilon,
+        saved_mean_data, saved_var_data));
+
+    // clean when exit.
+    CUDNN_ENFORCE(platform::dynload::cudnnDestroyTensorDescriptor(data_desc_));
+    CUDNN_ENFORCE(
+        platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    batch_norm,
+    ops::BatchNormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    batch_norm_grad,
+    ops::BatchNormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h
new file mode 100644
index 0000000000..a817ef41fc
--- /dev/null
+++ b/paddle/operators/batch_norm_op.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class BatchNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class BatchNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op.cc b/paddle/operators/beam_search_decode_op.cc
new file mode 100644
index 0000000000..72e05607b0
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.cc
@@ -0,0 +1,144 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+struct BeamSearchDecodeFunctor {
+  BeamSearchDecodeFunctor(const LoDTensorArray& step_ids,
+                          const LoDTensorArray& step_scores,
+                          LoDTensor* id_tensor, LoDTensor* score_tensor)
+      : step_ids_(step_ids),
+        step_scores_(step_scores),
+        id_tensor_(id_tensor),
+        score_tensor_(score_tensor) {}
+
+  template <typename T>
+  void operator()() const;
+
+  const LoDTensorArray& step_ids_;
+  const LoDTensorArray& step_scores_;
+  LoDTensor* id_tensor_;
+  LoDTensor* score_tensor_;
+};
+
+template <typename T>
+void BeamSearchDecodeFunctor::operator()() const {
+  BeamSearchDecoder<T> beam_search_decoder;
+  beam_search_decoder.PackAllSteps(step_ids_, step_scores_, id_tensor_,
+                                   score_tensor_);
+}
+
+template <>
+void BeamSearchDecodeFunctor::operator()<bool>() const {
+  PADDLE_THROW("beam search decode op does not support bool!");
+}
+
+class BeamSearchDecodeOp : public framework::OperatorBase {
+ public:
+  BeamSearchDecodeOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& dev_ctx = *pool.Get(dev_place);
+
+    framework::ExecutionContext ctx(*this, scope, dev_ctx);
+
+    const LoDTensorArray* ids = ctx.Input<LoDTensorArray>("Ids");
+    const LoDTensorArray* scores = ctx.Input<LoDTensorArray>("Scores");
+    const size_t step_num = ids->size();
+    PADDLE_ENFORCE_GT(step_num, 0UL,
+                      "beam search steps should be larger than 0");
+    const size_t source_num = ids->at(0).lod().at(0).size() - 1;
+    PADDLE_ENFORCE_GT(source_num, 0UL, "source num should be larger than 0");
+
+    for (size_t i = 0; i < step_num; ++i) {
+      PADDLE_ENFORCE_EQ(ids->at(i).lod().size(), 2UL,
+                        "Each step's LoDTensor should have 2 LoD levels");
+    }
+
+    // prepare output
+    LoDTensor* sentence_ids = ctx.Output<LoDTensor>("SentenceIds");
+    LoDTensor* sentence_scores = ctx.Output<LoDTensor>("SentenceScores");
+
+    framework::VisitDataType(
+        framework::ToDataType(scores->at(0).type()),
+        BeamSearchDecodeFunctor(*ids, *scores, sentence_ids, sentence_scores));
+  }
+};
+
+class BeamSearchDecodeOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchDecodeOpProtoMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ids",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddInput("Scores",
+             "(LodTensorArray)"
+             "score of the candidate words in each step");
+    AddOutput("SentenceIds",
+              "(LodTensor)"
+              "All possible result sentences of word ids");
+    AddOutput("SentenceScores",
+              "(LodTensor)"
+              "All possible result sentences of word scores");
+    AddComment(R"DOC(
+Pack the result of Beam search op into SentenceIds and SentenceScores.
+)DOC");
+  }
+};
+
+class BeamSearchDecodeInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("Ids"),
+                   "BeamSearchDecodeOp must has input Ids");
+    PADDLE_ENFORCE(context->HasInput("Scores"),
+                   "BeamSearchDecodeOp must has input Scores");
+    PADDLE_ENFORCE(context->HasOutput("SentenceIds"),
+                   "BeamSearchDecodeOp must has output SentenceIds");
+    PADDLE_ENFORCE(context->HasOutput("SentenceScores"),
+                   "BeamSearchDecodeOp must has output SentenceScores");
+  }
+};
+
+class BeamSearchDecodeInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    for (auto& o : op_desc.Output("SentenceIds")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+    for (auto& o : op_desc.Output("SentenceScores")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(beam_search_decode, paddle::operators::BeamSearchDecodeOp,
+                  paddle::operators::BeamSearchDecodeOpProtoMaker,
+                  paddle::operators::BeamSearchDecodeInferShape,
+                  paddle::operators::BeamSearchDecodeInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h
new file mode 100644
index 0000000000..3b1c6cd7a1
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op.h
@@ -0,0 +1,280 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using LoDTensorArray = framework::LoDTensorArray;
+
+// All the LoDs have 2 levels.
+// The first is the source level, the second is the sentence level.
+// The source level describes how many candidates each source sentence has;
+// the sentence level describes which prefix these candidates belong to.
+const size_t kSourceLevel = 0;
+const size_t kSentenceLevel = 1;
+
+template <typename T>
+struct BeamNode {
+  BeamNode(int64_t word_id, T score) : word_id_(word_id), score_(score) {}
+
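+  // Deleting a node detaches it from its parent; a parent left with no
+  // kids is deleted as well, so releasing all leaves reclaims the whole
+  // beam tree.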
+  ~BeamNode() {
+    if (parent_) {
+      parent_->DropKid(this);
+      if (parent_->kids_.size() == 0UL) {
+        delete parent_;
+      }
+    }
+    VLOG(3) << "Delete BeamNode root with word_id:" << this->word_id_;
+  }
+
+  void AppendTo(BeamNode* parent) {
+    parent_ = parent;
+    parent->kids_.insert(this);
+  }
+
+  void DropKid(BeamNode* kid) { kids_.erase(kid); }
+
+  BeamNode* parent_ = nullptr;
+  std::unordered_set<BeamNode*> kids_;
+  int64_t word_id_;
+  T score_;
+};
+
+template <typename T>
+using BeamNodeVector = std::vector<std::unique_ptr<BeamNode<T>>>;
+
+template <typename T>
+struct Sentence {
+  std::vector<int64_t> word_ids;
+  std::vector<T> scores;
+};
+
+template <typename T>
+using SentenceVector = std::vector<Sentence<T>>;
+
+template <typename T>
+struct BeamSearchDecoder {
+  /**
+   * Pack a BeamNode and all of its prefix BeamNodes into a Sentence.
+   */
+  Sentence<T> MakeSentence(const BeamNode<T>* node) const;
+
+  /**
+   * Param:
+   *  cur_ids: LoDTensor of the word ids for one step
+   *  cur_scores: LoDTensor of the word scores for one step
+   *  prefixes_list: prefixes for each source sentence.
+   *  sentence_vector_list: result sentence_vector for each source sentence.
+   * Return:
+   *  the new prefix list of each source sentence after this step
+   */
+  std::vector<BeamNodeVector<T>> PackTwoSteps(
+      const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+      std::vector<BeamNodeVector<T>>& prefixes_list,
+      std::vector<SentenceVector<T>>* sentence_vector_list) const;
+
+  /**
+   * Convert the result sentence_vector of each source sentence into two
+   * LoDTensors:
+   * one holds the word ids of all candidate sentences, the other holds
+   * their word scores.
+   * Param:
+   *  sentence_vector_list: sentence_vector for each source sentence.
+   *  id_tensor: result LoDTensor for sentences of id.
+   *  score_tensor: result LoDTensor for sentences of score.
+   */
+  void ConvertSentenceVectorToLodTensor(
+      std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+      LoDTensor* score_tensor) const;
+
+  /**
+   * Pack all steps of id/score LoDTensors into the sentence LoDTensors.
+   * Its main logic is:
+   * ```python
+   *   prefix
+   *   result_sentence
+   *   result_lod_tensor
+   *
+   *   for (step in steps):
+   *     prefix = PackTwoSteps(prefix, step, &result_sentence)
+   *   ConvertSentenceVector<T>ToLodTensor(result_sentence, &result_lod_tensor)
+   * ```
+   */
+  void PackAllSteps(const LoDTensorArray& step_ids,
+                    const LoDTensorArray& step_scores, LoDTensor* id_tensor,
+                    LoDTensor* score_tensor) const;
+};
+
+template <typename T>
+Sentence<T> BeamSearchDecoder<T>::MakeSentence(const BeamNode<T>* node) const {
+  Sentence<T> sentence;
+  while (node != nullptr) {
+    sentence.word_ids.emplace_back(node->word_id_);
+    sentence.scores.emplace_back(node->score_);
+    node = node->parent_;
+  }
+
+  std::reverse(std::begin(sentence.word_ids), std::end(sentence.word_ids));
+  std::reverse(std::begin(sentence.scores), std::end(sentence.scores));
+
+  return sentence;
+}
+
+template <typename T>
+std::vector<BeamNodeVector<T>> BeamSearchDecoder<T>::PackTwoSteps(
+    const LoDTensor& cur_ids, const LoDTensor& cur_scores,
+    std::vector<BeamNodeVector<T>>& prefixes_list,
+    std::vector<SentenceVector<T>>* sentence_vector_list) const {
+  std::vector<BeamNodeVector<T>> result;
+
+  for (size_t src_idx = 0; src_idx < cur_ids.lod()[kSourceLevel].size() - 1;
+       ++src_idx) {
+    size_t src_start = cur_ids.lod().at(kSourceLevel)[src_idx];
+    size_t src_end = cur_ids.lod().at(kSourceLevel)[src_idx + 1];
+
+    BeamNodeVector<T> beam_nodes;
+
+    // if prefixes_list is empty, this is the first step, and every
+    // candidate id starts a new candidate sentence.
+    if (prefixes_list.empty()) {
+      PADDLE_ENFORCE_EQ(cur_ids.lod().at(kSourceLevel).back(),
+                        cur_ids.lod().at(kSentenceLevel).back(),
+                        "source and sentence level LoDs should match "
+                        "in the first step");
+      for (size_t id_idx = src_start; id_idx < src_end; ++id_idx) {
+        beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(new BeamNode<T>(
+            cur_ids.data<int64_t>()[id_idx], cur_scores.data<T>()[id_idx])));
+      }
+    } else {
+      BeamNodeVector<T>& prefixes = prefixes_list[src_idx];
+      SentenceVector<T>& sentence_vector = (*sentence_vector_list)[src_idx];
+
+      PADDLE_ENFORCE_EQ(src_end - src_start, prefixes.size(),
+                        "the numbers of prefixes and candidate sets "
+                        "should be the same");
+
+      auto candidate_offset = cur_ids.lod()[kSentenceLevel];
+      for (size_t prefix_idx = 0; prefix_idx < prefixes.size(); ++prefix_idx) {
+        std::unique_ptr<BeamNode<T>>& prefix = prefixes[prefix_idx];
+        size_t candidate_start = candidate_offset[src_start + prefix_idx];
+        size_t candidate_end = candidate_offset[src_start + prefix_idx + 1];
+        if (candidate_start == candidate_end) {
+          VLOG(3) << "this sentence has no more candidate, "
+                     "add to result sentence and rm it from beam tree";
+          sentence_vector.push_back(MakeSentence(prefix.get()));
+          prefix.reset();
+        } else {
+          for (size_t candidate_idx = candidate_start;
+               candidate_idx < candidate_end; ++candidate_idx) {
+            auto* candidate =
+                new BeamNode<T>(cur_ids.data<int64_t>()[candidate_idx],
+                                cur_scores.data<T>()[candidate_idx]);
+            candidate->AppendTo(prefix.get());
+            beam_nodes.push_back(std::unique_ptr<BeamNode<T>>(candidate));
+          }
+          prefix.release();
+        }
+      }
+    }
+    result.push_back(std::move(beam_nodes));
+  }
+  return result;
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::ConvertSentenceVectorToLodTensor(
+    std::vector<SentenceVector<T>> sentence_vector_list, LoDTensor* id_tensor,
+    LoDTensor* score_tensor) const {
+  size_t src_num = sentence_vector_list.size();
+
+  PADDLE_ENFORCE_NE(src_num, 0UL, "src_num should not be 0");
+
+  std::vector<size_t> source_level_lod = {0};
+  std::vector<size_t> sentence_level_lod = {0};
+  std::vector<int64_t> id_data;
+  std::vector<T> score_data;
+
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (Sentence<T>& sentence : sentence_vector_list[src_idx]) {
+      id_data.insert(id_data.end(), sentence.word_ids.begin(),
+                     sentence.word_ids.end());
+      score_data.insert(score_data.end(), sentence.scores.begin(),
+                        sentence.scores.end());
+      sentence_level_lod.push_back(sentence_level_lod.back() +
+                                   sentence.word_ids.size());
+    }
+    source_level_lod.push_back(source_level_lod.back() +
+                               sentence_vector_list[src_idx].size());
+  }
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext cpu_ctx(cpu_place);
+
+  framework::LoD lod;
+  lod.push_back(source_level_lod);
+  lod.push_back(sentence_level_lod);
+
+  id_tensor->set_lod(lod);
+  id_tensor->Resize({static_cast<int64_t>(id_data.size())});
+  id_tensor->mutable_data<int64_t>(paddle::platform::CPUPlace());
+  framework::CopyFromVector<int64_t>(id_data, cpu_ctx, id_tensor);
+
+  score_tensor->set_lod(lod);
+  score_tensor->Resize({static_cast<int64_t>(score_data.size())});
+  score_tensor->mutable_data<T>(paddle::platform::CPUPlace());
+  framework::CopyFromVector<T>(score_data, cpu_ctx, score_tensor);
+}
+
+template <typename T>
+void BeamSearchDecoder<T>::PackAllSteps(const LoDTensorArray& step_ids,
+                                        const LoDTensorArray& step_scores,
+                                        LoDTensor* id_tensor,
+                                        LoDTensor* score_tensor) const {
+  PADDLE_ENFORCE(!step_ids.empty(), "step num should be larger than 0");
+  PADDLE_ENFORCE_EQ(step_ids.size(), step_scores.size(),
+                    "step_ids and step_scores should be the same");
+  const size_t step_num = step_ids.size();
+  const size_t src_num = step_ids.at(0).lod().at(kSourceLevel).size() - 1;
+
+  PADDLE_ENFORCE_GT(src_num, 0UL, "source num should be larger than 0");
+
+  // previous prefixes for each step;
+  // an empty list means this is the first step.
+  std::vector<BeamNodeVector<T>> beamnode_vector_list;
+  std::vector<SentenceVector<T>> sentence_vector_list(src_num);
+
+  // fold the steps in one at a time, carrying the per-source prefixes forward
+  for (size_t step_id = 0; step_id < step_num; ++step_id) {
+    beamnode_vector_list =
+        PackTwoSteps(step_ids.at(step_id), step_scores.at(step_id),
+                     beamnode_vector_list, &sentence_vector_list);
+  }
+  // append last beam_node to result
+  for (size_t src_idx = 0; src_idx < src_num; ++src_idx) {
+    for (auto& beam_node : beamnode_vector_list.at(src_idx)) {
+      sentence_vector_list[src_idx].push_back(MakeSentence(beam_node.get()));
+      beam_node.reset();
+    }
+  }
+
+  ConvertSentenceVectorToLodTensor(sentence_vector_list, id_tensor,
+                                   score_tensor);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_decode_op_test.cc b/paddle/operators/beam_search_decode_op_test.cc
new file mode 100644
index 0000000000..5ac23991f3
--- /dev/null
+++ b/paddle/operators/beam_search_decode_op_test.cc
@@ -0,0 +1,221 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_decode_op.h"
+#include "gtest/gtest.h"
+
+using CPUPlace = paddle::platform::CPUPlace;
+using LoD = paddle::framework::LoD;
+using LoDTensor = paddle::framework::LoDTensor;
+using LoDTensorArray = paddle::framework::LoDTensorArray;
+
+template <typename T>
+using BeamNode = paddle::operators::BeamNode<T>;
+template <typename T>
+using BeamSearchDecoder = paddle::operators::BeamSearchDecoder<T>;
+template <typename T>
+using Sentence = paddle::operators::Sentence<T>;
+template <typename T>
+using BeamNodeVector = paddle::operators::BeamNodeVector<T>;
+template <typename T>
+using SentenceVector = paddle::operators::SentenceVector<T>;
+
+namespace paddle {
+namespace test {
+
+void GenerateExample(const std::vector<size_t>& level_0,
+                     const std::vector<size_t>& level_1,
+                     const std::vector<int>& data, LoDTensorArray* ids,
+                     LoDTensorArray* scores) {
+  PADDLE_ENFORCE_EQ(level_0.back(), level_1.size() - 1,
+                    "source level is used to describe candidate set");
+  PADDLE_ENFORCE_EQ(level_1.back(), data.size(),
+                    "the lowest level is used to describe data, "
+                    "so its last element should be the data length");
+
+  CPUPlace place;
+
+  LoD lod;
+  lod.push_back(level_0);
+  lod.push_back(level_1);
+
+  // Ids
+  LoDTensor tensor_id;
+  tensor_id.set_lod(lod);
+  tensor_id.Resize({static_cast<int64_t>(data.size())});
+  // allocate memory
+  int64_t* id_ptr = tensor_id.mutable_data<int64_t>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    id_ptr[i] = static_cast<int64_t>(data.at(i));
+  }
+
+  // Scores
+  LoDTensor tensor_score;
+  tensor_score.set_lod(lod);
+  tensor_score.Resize({static_cast<int64_t>(data.size())});
+  // allocate memory
+  float* score_ptr = tensor_score.mutable_data<float>(place);
+  for (size_t i = 0; i < data.size(); ++i) {
+    score_ptr[i] = static_cast<float>(data.at(i));
+  }
+
+  ids->push_back(tensor_id);
+  scores->push_back(tensor_score);
+}
+
+}  // namespace test
+}  // namespace paddle
+
+TEST(BeamSearchDecodeOp, DeleteBeamNode) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* b2 = new BeamNode<float>(2, 2);
+  auto* b3 = new BeamNode<float>(3, 3);
+
+  b1->AppendTo(root);
+  b2->AppendTo(root);
+  b3->AppendTo(b1);
+
+  delete b3;
+  delete b2;
+}
+
+TEST(BeamSearchDecodeOp, MakeSentence) {
+  auto* root = new BeamNode<float>(0, 0);
+  auto* b1 = new BeamNode<float>(1, 1);
+  auto* end = new BeamNode<float>(2, 2);
+  b1->AppendTo(root);
+  end->AppendTo(b1);
+
+  BeamSearchDecoder<float> helper;
+  Sentence<float> sentence = helper.MakeSentence(end);
+  delete end;
+
+  std::vector<int64_t> expect_ids = {0, 1, 2};
+  ASSERT_EQ(sentence.word_ids, expect_ids);
+
+  std::vector<float> expect_scores = {0, 1, 2};
+  ASSERT_EQ(sentence.scores, expect_scores);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoStepsFirstStep) {
+  CPUPlace place;
+
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 2, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  BeamSearchDecoder<float> helper;
+  beamnode_vector_list = helper.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+  ASSERT_EQ(beamnode_vector_list.size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 2UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 4UL);
+}
+
+TEST(BeamSearchDecodeOp, PackTwoSteps) {
+  CPUPlace place;
+
+  // the first source has three prefixes
+  BeamNodeVector<float> source0_prefixes;
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(1, 1)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(0, 0)));
+  source0_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(3, 3)));
+
+  // the second source has two prefixes
+  BeamNodeVector<float> source1_prefixes;
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(4, 4)));
+  source1_prefixes.push_back(
+      std::unique_ptr<BeamNode<float>>(new BeamNode<float>(5, 5)));
+
+  std::vector<BeamNodeVector<float>> beamnode_vector_list;
+  std::vector<SentenceVector<float>> sentence_vector_list(
+      2, SentenceVector<float>());
+
+  beamnode_vector_list.push_back(std::move(source0_prefixes));
+  beamnode_vector_list.push_back(std::move(source1_prefixes));
+
+  // generate data for one step
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 5},
+                                std::vector<size_t>{0, 1, 1, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  BeamSearchDecoder<float> helper1;
+  beamnode_vector_list = helper1.PackTwoSteps(
+      ids[0], scores[0], beamnode_vector_list, &sentence_vector_list);
+
+  ASSERT_EQ(sentence_vector_list[0].size(), 1UL);
+  ASSERT_EQ(sentence_vector_list[1].size(), 0UL);
+  ASSERT_EQ(beamnode_vector_list[0].size(), 3UL);
+  ASSERT_EQ(beamnode_vector_list[1].size(), 2UL);
+}
+
+TEST(BeamSearchDecodeOp, PackAllSteps) {
+  CPUPlace place;
+
+  // we will construct sample data with 3 steps and 2 source sentences
+  LoDTensorArray ids;
+  LoDTensorArray scores;
+
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 2, 3, 4, 5, 6},
+      std::vector<int>{1, 2, 3, 4, 5, 6}, &ids, &scores);
+  paddle::test::GenerateExample(
+      std::vector<size_t>{0, 3, 6}, std::vector<size_t>{0, 1, 1, 3, 5, 5, 6},
+      std::vector<int>{0, 1, 2, 3, 4, 5}, &ids, &scores);
+  paddle::test::GenerateExample(std::vector<size_t>{0, 3, 6},
+                                std::vector<size_t>{0, 0, 1, 2, 3, 4, 5},
+                                std::vector<int>{0, 1, 2, 3, 4}, &ids, &scores);
+
+  ASSERT_EQ(ids.size(), 3UL);
+  ASSERT_EQ(scores.size(), 3UL);
+
+  BeamSearchDecoder<float> helper;
+
+  LoDTensor id_tensor;
+  LoDTensor score_tensor;
+  helper.PackAllSteps(ids, scores, &id_tensor, &score_tensor);
+
+  LoD lod = id_tensor.lod();
+  std::vector<size_t> expect_source_lod = {0, 4, 8};
+  EXPECT_EQ(lod[0], expect_source_lod);
+  std::vector<size_t> expect_sentence_lod = {0, 1, 3, 6, 9, 10, 13, 16, 19};
+  EXPECT_EQ(lod[1], expect_sentence_lod);
+  // 2| 1, 0| 3, 1, 0| 3, 2, 1| 5| 4, 3, 2| 4, 4, 3| 6, 5, 4
+  std::vector<int> expect_data = {2, 1, 0, 3, 1, 0, 3, 2, 1, 5,
+                                  4, 3, 2, 4, 4, 3, 6, 5, 4};
+  ASSERT_EQ(id_tensor.dims()[0], static_cast<int64_t>(expect_data.size()));
+  for (size_t i = 0; i < expect_data.size(); ++i) {
+    ASSERT_EQ(id_tensor.data<int64_t>()[i],
+              static_cast<int64_t>(expect_data[i]));
+  }
+  for (int64_t i = 0; i < id_tensor.dims()[0]; ++i) {
+    ASSERT_EQ(score_tensor.data<float>()[i],
+              static_cast<float>(id_tensor.data<int64_t>()[i]));
+  }
+}
diff --git a/paddle/operators/beam_search_op.cc b/paddle/operators/beam_search_op.cc
new file mode 100644
index 0000000000..844ade40eb
--- /dev/null
+++ b/paddle/operators/beam_search_op.cc
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/beam_search_op.h"
+
+#include <map>
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+void BeamSearch::operator()(const framework::LoDTensor &pre_ids,
+                            framework::LoDTensor *selected_ids,
+                            framework::LoDTensor *selected_scores) {
+  auto abs_lod = framework::ToAbsOffset(ids_->lod());
+  auto &high_level = abs_lod[lod_level_];
+
+  auto items = SelectTopBeamSizeItems();
+  auto selected_items = ToMap(items, high_level.back());
+  VLOG(3) << "selected_items:";
+  for (size_t i = 0; i < selected_items.size(); ++i) {
+    VLOG(3) << "offset:" << i;
+    for (auto &item : selected_items[i]) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
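+  // drop the candidate sets whose prefix already ended with end_id_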
+  PruneEndidCandidates(pre_ids, &selected_items);
+  // calculate the output tensor's height
+  size_t num_instances = std::accumulate(
+      std::begin(selected_items), std::end(selected_items), 0,
+      [](size_t a, std::vector<Item> &b) { return a + b.size(); });
+  // the output tensor shape should be [num_instances, 1]
+  auto dims = framework::make_ddim(
+      std::vector<int64_t>({static_cast<int>(num_instances), 1}));
+  selected_ids->Resize(dims);
+  selected_scores->Resize(dims);
+
+  std::map<size_t /*offset*/, std::vector<Item>> hash;
+  framework::LoD new_lod;
+  auto *ids_data = selected_ids->mutable_data<int64_t>(platform::CPUPlace());
+  auto *scores_data =
+      selected_scores->mutable_data<float>(platform::CPUPlace());
+
+  // fill in data
+  std::vector<size_t> low_level;
+  size_t low_offset = 0;
+  for (auto &items : selected_items) {
+    low_level.push_back(low_offset);
+    // order by offset, then id (a strict weak ordering)
+    sort(items.begin(), items.end(), [](const Item &a, const Item &b) {
+      if (a.offset != b.offset) {
+        return a.offset < b.offset;
+      }
+      return a.id < b.id;
+    });
+    for (auto &item : items) {
+      ids_data[low_offset] = item.id;
+      scores_data[low_offset] = item.score;
+      low_offset++;
+    }
+  }
+  low_level.push_back(low_offset);
+
+  // fill lod
+  framework::LoD lod(2);
+  lod[0].assign(high_level.begin(), high_level.end());
+  lod[1].assign(low_level.begin(), low_level.end());
+  if (!framework::CheckLoD(lod)) {
+    PADDLE_THROW("lod %s is not right", framework::LoDToString(lod));
+  }
+  selected_ids->set_lod(lod);
+  selected_scores->set_lod(lod);
+}
+
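+// Clear the candidate set of every prefix whose previous id is end_id_;
+// return the number of prefixes that are still alive.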
+int BeamSearch::PruneEndidCandidates(const framework::LoDTensor &pre_ids,
+                                     std::vector<std::vector<Item>> *items) {
+  auto *pre_ids_data = pre_ids.data<int64_t>();
+
+  int res = 0;
+  for (size_t offset = 0; offset < items->size(); offset++) {
+    auto prefix_id = pre_ids_data[offset];
+    if (prefix_id == end_id_) {
+      items->at(offset).clear();
+    } else {
+      res++;
+    }
+  }
+
+  return res;
+}
+
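+// Regroup the selected items by their offset (prefix index), so that
+// result[offset] holds all candidates kept for that prefix.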
+std::vector<std::vector<BeamSearch::Item>> BeamSearch::ToMap(
+    const std::vector<std::vector<Item>> &items, size_t element_num) {
+  std::vector<std::vector<Item>> result;
+  result.resize(element_num);
+  for (auto &entries : items) {
+    for (const auto &item : entries) {
+      result[item.offset].push_back(item);
+    }
+  }
+  return result;
+}
+
+std::vector<std::vector<BeamSearch::Item>>
+BeamSearch::SelectTopBeamSizeItems() {
+  std::vector<std::vector<Item>> result;
+  std::vector<Item> items;
+  // for each source sentence, select the top beam_size items across all
+  // candidate sets.
+  while (NextItemSet(&items)) {
+    if (items.size() > beam_size_) {
+      std::nth_element(std::begin(items), std::begin(items) + beam_size_,
+                       std::end(items), [](const Item &a, const Item &b) {
+                         // TODO(superjom) make score's comparison customizable.
+                         // partial sort in descending order
+                         return a.score > b.score;
+                       });
+      // keep only the top beam_size items
+      items.resize(beam_size_);
+    }
+    result.emplace_back(items);
+  }
+  VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
+  for (auto &items : result) {
+    VLOG(3) << "item set:";
+    for (auto &item : items) {
+      VLOG(3) << ItemToString(item);
+    }
+  }
+
+  return result;
+}
+
+// read the next source sentence's candidate set into *items;
+// return false when all source sentences have been consumed.
+bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
+  if (sent_offset_ >= ids_->NumElements(lod_level_)) {
+    return false;
+  }
+  // find the current candidates
+  auto ids = *ids_;
+  auto scores = *scores_;
+
+  auto abs_lod = framework::ToAbsOffset(ids.lod());
+
+  auto *ids_data = ids.data<int64_t>();
+  auto *scores_data = scores.data<float>();
+
+  size_t instance_dim = 1;
+  for (int i = 1; i < ids.dims().size(); i++) {
+    instance_dim *= ids.dims()[i];
+  }
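+  // Each sequence element is a row of `instance_dim` candidates; the loops
+  // below flatten the current source's [seq_len, instance_dim] block into
+  // Items, recording each candidate's prefix offset along the way.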
+
+  items->clear();
+  items->reserve(framework::product(ids.dims()));
+  for (size_t offset = abs_lod[lod_level_][sent_offset_];
+       offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
+    for (size_t d = 0; d < instance_dim; d++) {
+      const size_t dim_offset = offset * instance_dim + d;
+      items->emplace_back(offset, ids_data[dim_offset],
+                          scores_data[dim_offset]);
+    }
+  }
+
+  sent_offset_++;
+  return true;
+}
+
+std::ostream &operator<<(std::ostream &os, const BeamSearch::Item &item) {
+  os << "{";
+  os << "offset: " << item.offset << ", ";
+  os << "id: " << item.id << ", ";
+  os << "score: " << item.score << "";
+  os << "}";
+
+  return os;
+}
+
+std::string ItemToString(const BeamSearch::Item &item) {
+  std::ostringstream stream;
+  stream << item;
+  return stream.str();
+}
+
+class BeamSearchProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  BeamSearchProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // inputs and outputs stored in proto
+    AddInput("pre_ids", "ids in previous step");
+    AddInput("ids", "a LoDTensor of shape of [None,k]");
+    AddInput("scores",
+             "a LoDTensor that has the same shape and LoD with `ids`");
+    AddOutput("selected_ids",
+              "a LoDTensor that stores the IDs selected by beam search");
+    AddOutput(
+        "selected_scores",
+        "a LoDTensor that has the same shape and LoD with `selected_ids`");
+
+    // Attributes stored in AttributeMap
+    AddAttr<int>("level", "the level of LoDTensor");
+    AddAttr<int>("beam_size", "beam size for beam search");
+    AddAttr<int>("end_id",
+                 "the token id which indicates the end of a sequence");
+
+    AddComment(
+        "This is a beam search operator that help to generate sequences.");
+  }
+};
+
+class BeamSearchInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    for (const std::string &arg :
+         std::vector<std::string>({"pre_ids", "ids", "scores"})) {
+      PADDLE_ENFORCE(context->HasInput(arg),
+                     "BeamSearch needs the input argument '%s'", arg);
+    }
+    for (const std::string &arg :
+         std::vector<std::string>({"selected_ids", "selected_scores"})) {
+      PADDLE_ENFORCE(context->HasOutput(arg),
+                     "BeamSearch needs the output argument '%s'", arg);
+    }
+  }
+};
+
+class BeamSearchInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o : op_desc.Output("selected_ids")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+    for (auto &o : op_desc.Output("selected_scores")) {
+      block->Var(o)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(beam_search, paddle::operators::BeamSearchOp,
+                  paddle::operators::BeamSearchProtoAndCheckerMaker,
+                  paddle::operators::BeamSearchInferShape,
+                  paddle::operators::BeamSearchInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/beam_search_op.h b/paddle/operators/beam_search_op.h
new file mode 100644
index 0000000000..7ad85874fc
--- /dev/null
+++ b/paddle/operators/beam_search_op.h
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_TESTING
+#include "gtest/gtest.h"
+#endif
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * This is an implementation of beam search.
+ *
+ * To explain the details, let's take the machine translation task as an
+ * example. In this task, one source sentence is translated into multiple
+ * target sentences, so at any point there are multiple translation prefixes
+ * (target sentences that have not yet ended) per source. In each time step,
+ * every prefix has some candidates; given the candidate ids and their
+ * corresponding scores (probabilities), this operator sorts them, selects
+ * the top beam_size candidates for each source sentence, and stores the
+ * selected candidates' scores and ids in LoDTensors.
+ *
+ * A detailed example:
+ *
+ * Input
+ *
+ * ids:
+ * LoD (should have 2 levels)
+ * first level: [0, 1, 4]
+ * second level: [0, 1, 2, 3, 4]
+ *
+ * tensor's data
+ * [
+ * [4, 2, 5]
+ * [2, 1, 3]
+ * [3, 5, 2]
+ * [8, 2, 1]
+ * ]
+ *
+ * scores:
+ * LoD same as `ids`
+ * tensor's data
+ * [
+ * [0.5, 0.3, 0.2]
+ * [0.6, 0.3, 0.1]
+ * [0.9, 0.5, 0.1]
+ * [0.7, 0.5, 0.1]
+ * ]
+ *
+ * The inputs mean that there are 2 source sentences to translate; the first
+ * source has 1 prefix and the second source has 3 prefixes.
+ *
+ * Let's assume the beam size is 2; then the beam search's output should be
+ * LoD
+ * first level:
+ * [0, 1, 2]
+ * second level:
+ * [0, 2, 4]
+ *
+ * id tensor's data
+ * [[
+ * 4,
+ * 1,
+ * 3,
+ * 8,
+ * ]]
+ *
+ * score tensor's data
+ * [[
+ * 0.5,
+ * 0.3,
+ * 0.9,
+ * 0.7
+ * ]]
+ *
+ * TODO: currently all the prune operations have to live inside beam search
+ * itself; it would be better to split the beam search algorithm into a
+ * sequence of smaller operators, so that prune operators could be inserted
+ * into this sequence wherever needed.
+ */
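+//
+// A minimal usage sketch (mirroring beam_search_op_test.cc below): with
+// `ids`, `scores` and `pre_ids` prepared as in the example above,
+//
+//   BeamSearch alg(ids, scores, 0 /*level*/, 2 /*beam_size*/, 0 /*end_id*/);
+//   framework::LoDTensor selected_ids, selected_scores;
+//   alg(pre_ids, &selected_ids, &selected_scores);
+//
+// fills `selected_ids` and `selected_scores` with the outputs shown above.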
+class BeamSearch {
+ public:
+  // TODO(superjom) make type customizable
+  using id_t = size_t;
+  using score_t = float;
+  /*
+   * Input the arguments that needed by this class.
+   */
+  BeamSearch(const framework::LoDTensor& ids,
+             const framework::LoDTensor& scores, size_t level, size_t beam_size,
+             int end_id)
+      : beam_size_(beam_size),
+        ids_(&ids),
+        scores_(&scores),
+        lod_level_(level),
+        end_id_(end_id) {}
+
+  /*
+   * The main function of beam search.
+   *
+   * @selected_ids: a [None, 1]-shaped tensor with LoD.
+   *   In a machine translation model, it might be the candidate term id sets,
+   *   each set stored as a variable-length sequence.
+   *   The format might be described with a two-level LoD
+   *   - [[0 1]
+   *   -  [0 1 2]]
+   *   - [[]
+   *   -  [0 1]]
+   *   The first level of the LoD tells that there are two source sentences;
+   *   the second level describes the offsets of the candidate id sets within
+   *   the source sentences.
+   *
+   * @selected_scores: a LoDTensor with the same shape and LoD as
+   *   selected_ids. It stores the scores corresponding to the candidate ids
+   *   in selected_ids.
+   *
+   * If all the input tensors are empty (in a machine translation task, that
+   * means no candidates are provided), the generation stops.
+   */
+  void operator()(const framework::LoDTensor& pre_ids,
+                  framework::LoDTensor* selected_ids,
+                  framework::LoDTensor* selected_scores);
+  /*
+   * The basic item that selection and sorting operate on.
+   */
+  struct Item {
+    Item() {}
+    Item(size_t offset, size_t id, float score)
+        : offset(offset), id(id), score(score) {}
+    // offset in the higher lod level.
+    size_t offset;
+    // the candidate id
+    id_t id;
+    // the corresponding score
+    score_t score;
+  };
+
+ protected:
+  /*
+   * Delete all the records that follow the end token.
+   */
+  int PruneEndidCandidates(const framework::LoDTensor& pre_ids,
+                           std::vector<std::vector<Item>>* items);
+
+  /*
+   * Transform the items into a map whose key is the offset and whose value
+   * is the items at that offset.
+   * NOTE low performance.
+   */
+  std::vector<std::vector<Item>> ToMap(
+      const std::vector<std::vector<Item>>& inputs, size_t element_num);
+
+  /*
+   * For each source, select top beam_size records.
+   */
+  std::vector<std::vector<Item>> SelectTopBeamSizeItems();
+
+  /*
+   * Get the items of next source sequence, return false if no remaining items.
+   */
+  bool NextItemSet(std::vector<Item>* items);
+
+ private:
+  size_t beam_size_;
+  const framework::LoDTensor* ids_;
+  const framework::LoDTensor* scores_;
+  size_t lod_level_{0};
+  size_t sent_offset_{0};
+  int end_id_{0};
+};
+
+std::ostream& operator<<(std::ostream& os, const BeamSearch::Item& item);
+
+std::string ItemToString(const BeamSearch::Item& item);
+
+class BeamSearchOp : public framework::OperatorBase {
+ public:
+  BeamSearchOp(const std::string& type,
+               const framework::VariableNameMap& inputs,
+               const framework::VariableNameMap& outputs,
+               const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  BeamSearchOp(const BeamSearchOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not Implemented");
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
+    auto ids_var = scope.FindVar(Input("ids"));
+    auto scores_var = scope.FindVar(Input("scores"));
+    auto pre_ids_var = scope.FindVar(Input("pre_ids"));
+    PADDLE_ENFORCE_NOT_NULL(ids_var);
+    PADDLE_ENFORCE_NOT_NULL(scores_var);
+    PADDLE_ENFORCE_NOT_NULL(pre_ids_var);
+
+    auto& ids = ids_var->Get<framework::LoDTensor>();
+    auto& scores = scores_var->Get<framework::LoDTensor>();
+    auto& pre_ids = pre_ids_var->Get<framework::LoDTensor>();
+    size_t level = Attr<int>("level");
+    size_t beam_size = Attr<int>("beam_size");
+    int end_id = Attr<int>("end_id");
+    BeamSearch alg(ids, scores, level, beam_size, end_id);
+
+    auto selected_ids_var = scope.FindVar(Output("selected_ids"));
+    auto selected_scores_var = scope.FindVar(Output("selected_scores"));
+    PADDLE_ENFORCE_NOT_NULL(selected_ids_var);
+    PADDLE_ENFORCE_NOT_NULL(selected_scores_var);
+    auto& selected_ids_tensor =
+        *selected_ids_var->GetMutable<framework::LoDTensor>();
+    auto& selected_scores_tensor =
+        *selected_scores_var->GetMutable<framework::LoDTensor>();
+    alg(pre_ids, &selected_ids_tensor, &selected_scores_tensor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/beam_search_op_test.cc b/paddle/operators/beam_search_op_test.cc
new file mode 100644
index 0000000000..d4beb64a85
--- /dev/null
+++ b/paddle/operators/beam_search_op_test.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/beam_search_op.h"
+
+#include <gtest/gtest.h>
+#include <vector>
+
+namespace paddle {
+namespace test {
+
+using std::vector;
+using framework::LoDTensor;
+using framework::LoD;
+using operators::BeamSearch;
+using paddle::platform::CPUPlace;
+using std::cout;
+using std::endl;
+
+void CreateInput(LoDTensor* ids, LoDTensor* scores) {
+  LoD lod;
+  vector<size_t> level0({0, 1, 4});
+  vector<size_t> level1({0, 1, 2, 3, 4});
+  lod.push_back(level0);
+  lod.push_back(level1);
+  ids->set_lod(lod);
+  scores->set_lod(lod);
+
+  auto dims = framework::make_ddim(vector<int64_t>({4, 3}));
+  ids->Resize(dims);
+  scores->Resize(dims);
+  CPUPlace place;
+
+  auto* ids_data = ids->mutable_data<int64_t>(place);
+  auto* scores_data = scores->mutable_data<float>(place);
+  vector<int64_t> _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1});
+  vector<float> _scores(
+      {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1});
+
+  for (int i = 0; i < 12; i++) {
+    ids_data[i] = _ids[i];
+    scores_data[i] = _scores[i];
+  }
+}
+
+TEST(beam_search_op, run) {
+  CPUPlace place;
+  LoDTensor ids, scores;
+  CreateInput(&ids, &scores);
+
+  LoDTensor pre_ids;
+  pre_ids.Resize(framework::make_ddim(vector<int64_t>({4, 1})));
+  for (int i = 0; i < 4; i++) {
+    pre_ids.mutable_data<int64_t>(place)[i] = i + 1;
+  }
+
+  BeamSearch beamsearch(ids, scores, 0 /*level*/, 2 /*beam_size*/, 0 /*end_id*/);
+  LoDTensor sids, sscores;
+  beamsearch(pre_ids, &sids, &sscores);
+
+  LOG(INFO) << "score: " << sscores << endl;
+
+  ASSERT_EQ(sids.lod(), sscores.lod());
+
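+  // How the expected values arise (beam_size = 2): the first source has a
+  // single prefix (row 0), so its top-2 candidates by score are id 4 (0.5)
+  // and id 2 (0.3); the second source selects the two best candidates across
+  // rows 1-3, id 3 (0.9) and id 8 (0.7). Items are written out sorted by
+  // prefix offset and then by id, giving {2, 4, 3, 8}.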
+  vector<int> tids({2, 4, 3, 8});
+  vector<float> tscores({0.3, 0.5, 0.9, 0.7});
+
+  for (int i = 0; i < 4; i++) {
+    ASSERT_EQ(tids[i], sids.data<int64_t>()[i]);
+    ASSERT_EQ(tscores[i], sscores.data<float>()[i]);
+  }
+}
+
+}  // namespace test
+}  // namespace paddle
diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc
new file mode 100644
index 0000000000..7640147a12
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cc
@@ -0,0 +1,169 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class BilinearTensorProductOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The input(X) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The input(Y) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(weight_dims.size(), 3UL,
+                      "The input(Weight) must be a 3D tensor.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The first dimension(batch_size) of input(X) must be "
+                      "equal to the first dimension of the input(Y).");
+    PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1],
+                      "The second dimension of input(X) must be equal to "
+                      "the second dimension of the input(Weight).");
+    PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2],
+                      "The second dimension of input(Y) must be equal to "
+                      "the third dimension of the input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE(bias_dims.size() == 2UL && bias_dims[0] == 1UL,
+                     "The Input(Bias) must be a 2-D tensor with "
+                     "the first dimension fixed to 1 (a row vector).");
+      PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0],
+                        "The second dimension of input(Bias) must be equal "
+                        "to the first dimension of the input(Weight).");
+    }
+
+    ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BilinearTensorProductOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of bilinear_tensor_product operator.");
+    AddInput("Y", "The second input of bilinear_tensor_product operator.");
+    AddInput("Weight",
+             "The learnable parameters of bilinear_tensor_product operator.");
+    AddInput("Bias", "The learnable bias of bilinear_tensor_product operator.")
+        .AsDispensable();
+    AddOutput("Out", "The output of bilinear_tensor_product operator.");
+    AddComment(R"DOC(
+Bilinear Tensor Product operator.
+Given inputs X and Y, a 3D tensor Weight, and an optional Bias, each column
+of the output is computed by one slice $i = 1, \ldots, k$ of the tensor:
+
+$$
+M = (X W_i) * Y \\
+Out_i = \sum_j {M_j} + Bias_i
+$$
+
+where $*$ denotes the element-wise product;
+      $W_i$ is the $i$-th slice of Input(Weight);
+      $M_j$ is the $j$-th column of $M$;
+      $Out_i$ is the $i$-th column of Output(Out);
+      $Bias_i$ is a column vector, each element of which equals
+        the $i$-th element of $Bias$.
+
+)DOC");
+  }
+};
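+
+// A tiny worked example of the formula above (an illustration only, not code
+// exercised by this operator): with batch_size = 1, X = [1, 2], Y = [3, 4],
+// k = 1 and W_1 = [[1, 0], [0, 1]], we get X W_1 = [1, 2],
+// M = (X W_1) * Y = [3, 8] element-wise, and Out_1 = 3 + 8 + Bias_1.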
+
+class BilinearTensorProductOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(out_dims.size(), 2UL,
+                      "The input(Out@GRAD) must be a 2D Tensor.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[0], out_dims[0],
+        "The first dimension(batch_size) of input(Out@GRAD) must be "
+        "equal to the first dimension of the Input(X).");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[0], out_dims[1],
+        "The second dimension of Input(Out@GRAD) must be equal to "
+        "the first dimension of Input(Weight).");
+
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      PADDLE_ENFORCE_EQ(
+          bias_dims[1], out_dims[1],
+          "The second dimension of input(Out@GRAD) must be equal to "
+          "the second dimension of the Input(Bias).");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    auto weight_grad_name = framework::GradVarName("Weight");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+    if (ctx->HasOutput(weight_grad_name)) {
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp,
+            ops::BilinearTensorProductOpMaker, bilinear_tensor_product_grad,
+            ops::BilinearTensorProductOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
+REGISTER_OP_CPU_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CPUDeviceContext,
+                                         double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu
new file mode 100644
index 0000000000..0f48010716
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/bilinear_tensor_product_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    bilinear_tensor_product,
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::BilinearTensorProductKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
+REGISTER_OP_CUDA_KERNEL(
+    bilinear_tensor_product_grad,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
+                                         float>,
+    ops::BilinearTensorProductGradKernel<paddle::platform::CUDADeviceContext,
+                                         double>);
diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h
new file mode 100644
index 0000000000..ba9a2c5ce3
--- /dev/null
+++ b/paddle/operators/bilinear_tensor_product_op.h
@@ -0,0 +1,185 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class BilinearTensorProductKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto output_mat = EigenMatrix<T>::From(*out);
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+
+    // Create the intermediate variable to calculate the result of
+    // Input(X) multiplied by Input(Weight_i); the formula is:
+    // left_mul = X Weight_i.
+    Tensor left_mul;
+    left_mul.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                             ctx.GetPlace());
+    auto left_mul_mat = EigenMatrix<T>::From(left_mul);
+
+    for (int i = 0; i < out_dim; ++i) {
+      auto output_col_vec = output_mat.chip(i, 1);
+      Tensor weight_mat =
+          weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
+      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
+                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
+                                   weight_mat.data<T>(), 0, left_mul.data<T>());
+      output_col_vec.device(place) =
+          (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
+    }
+    if (bias) {
+      auto bias_vec = EigenMatrix<T>::From(*bias);
+      Eigen::DSizes<int, 2> bcast(batch_size, 1);
+      output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor* weight = ctx.Input<Tensor>("Weight");
+    Tensor* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* d_y = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    Tensor* d_weight = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    Tensor* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+    const Tensor* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto batch_size = x->dims()[0];
+    auto weight_dims = weight->dims();
+    int out_dim = weight_dims[0];
+    auto x_dim = weight_dims[1];
+    auto y_dim = weight_dims[2];
+
+    auto x_mat = EigenMatrix<T>::From(*x);
+    auto y_mat = EigenMatrix<T>::From(*y);
+    auto d_out_mat = EigenMatrix<T>::From(*d_out);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
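+    // For each batch row, Out_i = x W_i y^T, so the gradients computed
+    // below are, per slice i:
+    //   d_x += d_out_i * (y W_i^T),   d_y += d_out_i * (x W_i),
+    //   d_W_i = sum over the batch of d_out_i * (x^T y),
+    // and d_bias is the column-wise sum of d_out.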
+    // Create the intermediate variable to calculate the Output(Y@Grad).
+    Tensor x_scale;
+    x_scale.mutable_data<T>(framework::make_ddim({batch_size, x_dim}),
+                            ctx.GetPlace());
+    auto x_scale_mat = EigenMatrix<T>::From(x_scale);
+
+    // Create the intermediate variable to calculate the Output(X@Grad).
+    Tensor y_scale;
+    y_scale.mutable_data<T>(framework::make_ddim({batch_size, y_dim}),
+                            ctx.GetPlace());
+    auto y_scale_mat = EigenMatrix<T>::From(y_scale);
+
+    math::SetConstant<DeviceContext, T> set_zero;
+
+    // Set Output(X@Grad) to zero.
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
+    }
+
+    // Set Output(Y@Grad) to zero.
+    if (d_y) {
+      d_y->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_y, static_cast<T>(0));
+    }
+
+    // Calculate the Output(X@Grad) and Output(Y@Grad).
+    if (d_x || d_y) {
+      Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);
+      Eigen::DSizes<int, 2> bcast_for_y(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor weight_i = weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        if (d_x) {
+          y_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_x) *
+              y_mat;
+          math::gemm<DeviceContext, T>(
+              dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
+              y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
+        }
+        if (d_y) {
+          x_scale_mat.device(place) =
+              output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                  .broadcast(bcast_for_y) *
+              x_mat;
+          math::gemm<DeviceContext, T>(
+              dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+              x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
+        }
+      }
+    }
+
+    // Calculate the gradient of Input(Weight).
+    if (d_weight) {
+      d_weight->mutable_data<T>(ctx.GetPlace());
+      Eigen::DSizes<int, 2> bcast_for_weight(1, x_dim);
+      for (int i = 0; i < out_dim; ++i) {
+        Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize(
+            framework::make_ddim({x_dim, y_dim}));
+        auto output_vec = d_out_mat.chip(i, 1);
+        x_scale_mat.device(place) =
+            output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
+                .broadcast(bcast_for_weight) *
+            x_mat;
+        math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
+                                     y_dim, batch_size, 1, x_scale.data<T>(),
+                                     y->data<T>(), 0, d_weight_i.data<T>());
+      }
+    }
+
+    // Calculate the gradient of Input(Bias).
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_mat = framework::EigenVector<T>::Flatten(*d_bias);
+      d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes<int, 1>(0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc
new file mode 100644
index 0000000000..83c8778fe4
--- /dev/null
+++ b/paddle/operators/bipartite_match_op.cc
@@ -0,0 +1,189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+class BipartiteMatchOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
+                   "Input(DistMat) of BipartiteMatch should not be null.");
+
+    auto dims = ctx->GetInputDim("DistMat");
+    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
+
+    ctx->SetOutputDim("ColToRowMatchIndices", dims);
+    ctx->SetOutputDim("ColToRowMatchDis", dims);
+  }
+};
+
+template <typename T>
+class BipartiteMatchKernel : public framework::OpKernel<T> {
+ public:
+  // The match_indices must be initialized to -1 at first.
+  // The match_dist must be initialized to 0 at first.
+  void BipartiteMatch(const Tensor& dist, int* match_indices,
+                      T* match_dist) const {
+    constexpr T kEPS = static_cast<T>(1e-6);
+    PADDLE_ENFORCE_EQ(dist.dims().size(), 2, "The rank of dist must be 2.");
+    int64_t row = dist.dims()[0];
+    int64_t col = dist.dims()[1];
+    auto* dist_data = dist.data<T>();
+    std::vector<int> row_pool;
+    for (int i = 0; i < row; ++i) {
+      row_pool.push_back(i);
+    }
+    while (row_pool.size() > 0) {
+      int max_idx = -1;
+      int max_row_idx = -1;
+      T max_dist = -1;
+      for (int64_t j = 0; j < col; ++j) {
+        if (match_indices[j] != -1) {
+          continue;
+        }
+        for (size_t k = 0; k < row_pool.size(); ++k) {
+          int m = row_pool[k];
+          // distance is 0 between m-th row and j-th column
+          if (dist_data[m * col + j] < kEPS) {
+            continue;
+          }
+          if (dist_data[m * col + j] > max_dist) {
+            max_idx = j;
+            max_row_idx = m;
+            max_dist = dist_data[m * col + j];
+          }
+        }
+      }
+      if (max_idx == -1) {
+        // Cannot find good match.
+        break;
+      } else {
+        PADDLE_ENFORCE_EQ(match_indices[max_idx], -1);
+        match_indices[max_idx] = max_row_idx;
+        match_dist[max_idx] = max_dist;
+        // Erase the row index.
+        row_pool.erase(
+            std::find(row_pool.begin(), row_pool.end(), max_row_idx));
+      }
+    }
+  }
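+
+  // A worked example of the greedy matching above: with the 2 x 3 matrix
+  //   dist = [[0.5, 0.1, 0.3],
+  //           [0.2, 0.4, 0.9]]
+  // the first pass picks the global maximum 0.9 at (row 1, col 2) and
+  // removes row 1 from the pool; the second pass picks 0.5 at (row 0,
+  // col 0). The result is match_indices = [0, -1, 1] and
+  // match_dist = [0.5, 0, 0.9].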
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dist_mat = context.Input<LoDTensor>("DistMat");
+    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+
+    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
+
+    auto col = dist_mat->dims()[1];
+
+    int64_t n = dist_mat->lod().size() == 0UL
+                    ? 1
+                    : static_cast<int64_t>(dist_mat->lod().back().size() - 1);
+    if (dist_mat->lod().size()) {
+      PADDLE_ENFORCE_EQ(dist_mat->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    match_indices->mutable_data<int>({n, col}, context.GetPlace());
+    match_dist->mutable_data<T>({n, col}, context.GetPlace());
+
+    math::SetConstant<platform::CPUDeviceContext, int> iset;
+    iset(dev_ctx, match_indices, static_cast<int>(-1));
+    math::SetConstant<platform::CPUDeviceContext, T> tset;
+    tset(dev_ctx, match_dist, static_cast<T>(0));
+
+    int* indices = match_indices->data<int>();
+    T* dist = match_dist->data<T>();
+    if (n == 1) {
+      BipartiteMatch(*dist_mat, indices, dist);
+    } else {
+      auto lod = dist_mat->lod().back();
+      for (size_t i = 0; i < lod.size() - 1; ++i) {
+        Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
+        BipartiteMatch(one_ins, indices + i * col, dist + i * col);
+      }
+    }
+  }
+};
+
+class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BipartiteMatchOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "DistMat",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[K, M]. It is pair-wise distance matrix between the entities "
+        "represented by each row and each column. For example, assumed one "
+        "entity is A with shape [K], another entity is B with shape [M]. The "
+        "DistMat[i][j] is the distance between A[i] and B[j]. The bigger "
+        "the distance is, the better macthing the pairs are. Please note, "
+        "This tensor can contain LoD information to represent a batch of "
+        "inputs. One instance of this batch can contain different numbers of "
+        "entities.");
+    AddOutput("ColToRowMatchIndices",
+              "(Tensor) A 2-D Tensor with shape [N, M] in int type. "
+              "N is the batch size. If ColToRowMatchIndices[i][j] is -1, it "
+              "means B[j] does not match any entity in i-th instance. "
+              "Otherwise, it means B[j] is matched to row "
+              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
+              "i-th instance is saved in ColToRowMatchIndices[i][j].");
+    AddOutput("ColToRowMatchDis",
+              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
+              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
+              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
+              "instance are called LoD. Then "
+              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+    AddComment(R"DOC(
+This operator is a greedy bipartite matching algorithm, which is used to
+obtain the matching with the maximum distance based on the input
+distance matrix. For input 2D matrix, the bipartite matching algorithm can
+find the matched column for each row, also can find the matched row for
+each column. And this operator only calculate matched indices from column
+to row. For each instance, the number of matched indices is the number of
+of columns of the input ditance matrix.
+
+There are two outputs to save matched indices and distance.
+A simple description, this algothrim matched the best (maximum distance)
+row entity to the column entity and the matched indices are not duplicated
+in each row of ColToRowMatchIndices. If the column entity is not matched
+any row entity, set -1 in ColToRowMatchIndices.
+
+Please note that the input DistMat can be LoDTensor (with LoD) or Tensor.
+If LoDTensor with LoD, the height of ColToRowMatchIndices is batch size.
+If Tensor, the height of ColToRowMatchIndices is 1.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(bipartite_match, ops::BipartiteMatchOp,
+                  ops::BipartiteMatchOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(bipartite_match, ops::BipartiteMatchKernel<float>,
+                       ops::BipartiteMatchKernel<double>);
diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc
new file mode 100644
index 0000000000..446976edaf
--- /dev/null
+++ b/paddle/operators/cast_op.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cast_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class CastOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CastOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of cast op");
+    AddOutput("Out", "The output tensor of cast op");
+    AddAttr<int>("out_dtype", "output data type");
+    AddAttr<int>("in_dtype", "input data type");
+    AddComment(R"DOC(
+Cast Operator.
+
+This Operator casts the input tensor to another data type and
+returns the output tensor.
+
+)DOC");
+  }
+};
+
+class CastOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"), "The input of cast op must be set");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "The output of cast op must be set");
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class CastOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
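+    // The gradient of a cast is just the reverse cast: Out@GRAD is cast
+    // back from out_dtype to in_dtype, hence the swapped dtype attributes
+    // below.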
+    auto grad = new framework::OpDesc();
+    grad->SetType("cast");
+    grad->SetInput("X", OutputGrad("Out"));
+    grad->SetOutput("Out", InputGrad("X"));
+    grad->SetAttr("out_dtype", GetAttr("in_dtype"));
+    grad->SetAttr("in_dtype", GetAttr("out_dtype"));
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+using CPU = paddle::platform::CPUDeviceContext;
+REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
+                        ops::CastOpProtoMaker);
+REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
+                       ops::CastOpKernel<CPU, double>,
+                       ops::CastOpKernel<CPU, int>,
+                       ops::CastOpKernel<CPU, int64_t>,
+                       ops::CastOpKernel<CPU, bool>);
diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu
new file mode 100644
index 0000000000..d68bbe6e39
--- /dev/null
+++ b/paddle/operators/cast_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cast_op.h"
+
+template <typename T>
+using CastOpKernel =
+    paddle::operators::CastOpKernel<paddle::platform::CUDADeviceContext, T>;
+
+REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel<float>, CastOpKernel<double>,
+                        CastOpKernel<int>, CastOpKernel<int64_t>,
+                        CastOpKernel<bool>);
diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h
new file mode 100644
index 0000000000..9f39d91edd
--- /dev/null
+++ b/paddle/operators/cast_op.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename InT, typename OutT>
+struct CastOpTransformFunctor {
+  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
+};
+
+template <typename DeviceContext, typename InT>
+struct CastOpFunctor {
+  const framework::Tensor* in_;
+  framework::Tensor* out_;
+  const DeviceContext& ctx_;
+  CastOpFunctor(const framework::Tensor* in, framework::Tensor* out,
+                const DeviceContext& ctx)
+      : in_(in), out_(out), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* in_begin = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* in_end = in_begin + numel;
+    auto* out_begin = out_->mutable_data<OutT>(ctx_.GetPlace());
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, in_begin, in_end, out_begin,
+          CastOpTransformFunctor<InT, OutT>());
+  }
+};
+
+template <typename DeviceContext, typename InT>
+class CastOpKernel : public framework::OpKernel<InT> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
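+    // The input type InT is fixed at kernel registration time; the output
+    // type is chosen at run time: VisitDataType dispatches on the
+    // "out_dtype" attribute and invokes CastOpFunctor's templated
+    // operator() with the matching OutT.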
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("out_dtype")),
+        CastOpFunctor<DeviceContext, InT>(
+            in, out, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc
new file mode 100644
index 0000000000..44f667aead
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.cc
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/chunk_eval_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ChunkEvalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Inference"),
+                   "Input(Inference) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Precision"),
+                   "Output(Precision) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Recall"),
+                   "Output(Recall) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("F1-Score"),
+                   "Output(F1-Score) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumInferChunks"),
+                   "Output(NumInferChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("NumLabelChunks"),
+                   "Output(NumLabelChunks) of ChunkEvalOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NumCorrectChunks"),
+        "Output(NumCorrectChunks) of ChunkEvalOp should not be null.");
+
+    auto inference_dim = ctx->GetInputDim("Inference");
+    auto label_dim = ctx->GetInputDim("Label");
+
+    PADDLE_ENFORCE(inference_dim == label_dim,
+                   "Inference's shape must be the same as Label's shape.");
+
+    ctx->SetOutputDim("Precision", {1});
+    ctx->SetOutputDim("Recall", {1});
+    ctx->SetOutputDim("F1-Score", {1});
+    ctx->SetOutputDim("NumInferChunks", {1});
+    ctx->SetOutputDim("NumLabelChunks", {1});
+    ctx->SetOutputDim("NumCorrectChunks", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(framework::proto::DataType::FP32,
+                                   platform::CPUPlace());
+  }
+};
+
+class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ChunkEvalOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Inference",
+             "(Tensor, default: Tensor<int64_t>). "
+             "Predictions from the network.");
+    AddInput("Label",
+             "(Tensor, default: Tensor<int64_t>). The true tag sequences.");
+    AddOutput("Precision",
+              "(float). The evaluated precision (called positive predictive "
+              "value) of chunks on the given mini-batch.");
+    AddOutput("Recall",
+              "(float). The evaluated recall (true positive rate or "
+              "sensitivity) of chunks on the given mini-batch.");
+    AddOutput("F1-Score",
+              "(float). The evaluated F1-Score on the given mini-batch.");
+    AddOutput("NumInferChunks",
+              "(int64_t). The number of chunks in Inference on the given "
+              "mini-batch.");
+    AddOutput(
+        "NumLabelChunks",
+        "(int64_t). The number of chunks in Label on the given mini-batch.");
+    AddOutput(
+        "NumCorrectChunks",
+        "(int64_t). The number of chunks both in Inference and Label on the "
+        "given mini-batch.");
+    AddAttr<int>("num_chunk_types",
+                 "(int). The number of chunk type. See below for details.");
+    AddAttr<std::string>(
+        "chunk_scheme",
+        "(string, default IOB). The labeling scheme indicating "
+        "how to encode the chunks. Must be IOB, IOE, IOBES or plain. See below "
+        "for details.")
+        .SetDefault("IOB");
+    AddAttr<std::vector<int>>("excluded_chunk_types",
+                              "(list<int>) A list including chunk type ids "
+                              "indicating chunk types that are not counted. "
+                              "See below for details.")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(
+For some basics of chunking, please refer to
+‘Chunking with Support Vector Machines <https://aclanthology.info/pdf/N/N01/N01-1025.pdf>’.
+
+
+ChunkEvalOp computes the precision, recall, and F1-score of chunk detection,
+and supports IOB, IOE, IOBES and IO (also known as plain) tagging schemes.
+Here is a NER example of labeling for these tagging schemes:
+
+         Li     Ming    works  at  Agricultural   Bank   of    China  in  Beijing.
+  IO:    I-PER  I-PER   O      O   I-ORG          I-ORG  I-ORG I-ORG  O   I-LOC
+  IOB:   B-PER  I-PER   O      O   B-ORG          I-ORG  I-ORG I-ORG  O   B-LOC
+  IOE:   I-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   E-LOC
+  IOBES: B-PER  E-PER   O      O   I-ORG          I-ORG  I-ORG E-ORG  O   S-LOC
+
+There are three chunk types (named entity types), namely PER (person),
+ORG (organization) and LOC (location), and we can see that the labels have
+the form <tag type>-<chunk type>.
+
+Since the calculations actually use label ids rather than labels, extra
+attention should be paid when mapping labels to ids to make ChunkEvalOp
+work. The key point is that the listed equations are satisfied by the ids.
+
+    tag_type = label % num_tag_type
+    chunk_type = label / num_tag_type
+
+where `num_tag_type` is the number of tag types in the tagging scheme,
+`num_chunk_type` is the number of chunk types, and `tag_type` gets its value
+from the following table.
+
+    Scheme Begin Inside End   Single
+     plain   0     -      -     -
+     IOB     0     1      -     -
+     IOE     -     0      1     -
+     IOBES   0     1      2     3
+
+Still using NER as the example, assume the tagging scheme is IOB and the chunk
+types are ORG, PER and LOC. To satisfy the above equations, the label map can
+be like this:
+
+    B-ORG  0
+    I-ORG  1
+    B-PER  2
+    I-PER  3
+    B-LOC  4
+    I-LOC  5
+    O      6
+
+It's not hard to verify the equations, noting that the number of chunk types
+is 3 and the number of tag types in the IOB scheme is 2. For example, the
+label id of I-LOC is 5, the tag type id of I-LOC is 1, and the chunk type id
+of I-LOC is 2, which is consistent with the results from the equations.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(chunk_eval, ops::ChunkEvalOp,
+                             ops::ChunkEvalOpMaker);
+REGISTER_OP_CPU_KERNEL(chunk_eval,
+                       ops::ChunkEvalKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h
new file mode 100644
index 0000000000..300aff90c0
--- /dev/null
+++ b/paddle/operators/chunk_eval_op.h
@@ -0,0 +1,236 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <set>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class ChunkEvalKernel : public framework::OpKernel<T> {
+ public:
+  struct Segment {
+    int begin;
+    int end;
+    int type;
+    bool operator==(const Segment& y) const {
+      return begin == y.begin && end == y.end && type == y.type;
+    }
+  };
+
+  void GetSegments(const int64_t* label, int length,
+                   std::vector<Segment>& segments, int num_chunk_types,
+                   int num_tag_types, int other_chunk_type, int tag_begin,
+                   int tag_inside, int tag_end, int tag_single) const {
+    segments.clear();
+    segments.reserve(length);
+    int chunk_start = 0;
+    bool in_chunk = false;
+    int tag = -1;
+    int type = other_chunk_type;
+    for (int i = 0; i < length; ++i) {
+      int prev_tag = tag;
+      int prev_type = type;
+      PADDLE_ENFORCE_LE(label[i], num_chunk_types * num_tag_types);
+      tag = label[i] % num_tag_types;
+      type = label[i] / num_tag_types;
+      if (in_chunk && ChunkEnd(prev_tag, prev_type, tag, type, other_chunk_type,
+                               tag_begin, tag_inside, tag_end, tag_single)) {
+        Segment segment{
+            chunk_start,  // begin
+            i - 1,        // end
+            prev_type,
+        };
+        segments.push_back(segment);
+        in_chunk = false;
+      }
+      if (ChunkBegin(prev_tag, prev_type, tag, type, other_chunk_type,
+                     tag_begin, tag_inside, tag_end, tag_single)) {
+        chunk_start = i;
+        in_chunk = true;
+      }
+    }
+    if (in_chunk) {
+      Segment segment{
+          chunk_start,  // begin
+          length - 1,   // end
+          type,
+      };
+      segments.push_back(segment);
+    }
+  }
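+
+  // A small trace of GetSegments under the IOB scheme (num_tag_types = 2,
+  // using the label map from the op comment: B-PER = 2, I-PER = 3, O = 6):
+  // the labels {2, 3, 6} decode to (tag, type) pairs (0,1), (1,1), (0,3).
+  // A chunk begins at position 0 (tag_begin with a non-"other" type) and
+  // ends before position 2 (the type changes to other_chunk_type), so one
+  // segment {begin = 0, end = 1, type = 1} is produced.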
+
+  bool ChunkEnd(int prev_tag, int prev_type, int tag, int type,
+                int other_chunk_type, int tag_begin, int tag_inside,
+                int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return false;
+    if (type == other_chunk_type) return true;
+    if (type != prev_type) return true;
+    if (prev_tag == tag_begin) return tag == tag_begin || tag == tag_single;
+    if (prev_tag == tag_inside) return tag == tag_begin || tag == tag_single;
+    if (prev_tag == tag_end) return true;
+    if (prev_tag == tag_single) return true;
+    return false;
+  }
+
+  bool ChunkBegin(int prev_tag, int prev_type, int tag, int type,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single) const {
+    if (prev_type == other_chunk_type) return type != other_chunk_type;
+    if (type == other_chunk_type) return false;
+    if (type != prev_type) return true;
+    if (tag == tag_begin) return true;
+    if (tag == tag_inside) return prev_tag == tag_end || prev_tag == tag_single;
+    if (tag == tag_end) return prev_tag == tag_end || prev_tag == tag_single;
+    if (tag == tag_single) return true;
+    return false;
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    // parse the evaluation configuration from attributes
+    int num_chunk_types, num_tag_types;
+    int other_chunk_type;
+    int tag_begin, tag_inside, tag_end, tag_single;
+    std::vector<Segment> label_segments;
+    std::vector<Segment> output_segments;
+    std::set<int> excluded_chunk_types;
+
+    if (context.Attr<std::string>("chunk_scheme") == "IOB") {
+      num_tag_types = 2;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = -1;
+      tag_single = -1;
+    } else if (context.Attr<std::string>("chunk_scheme") == "IOE") {
+      num_tag_types = 2;
+      tag_begin = -1;
+      tag_inside = 0;
+      tag_end = 1;
+      tag_single = -1;
+    } else if (context.Attr<std::string>("chunk_scheme") == "IOBES") {
+      num_tag_types = 4;
+      tag_begin = 0;
+      tag_inside = 1;
+      tag_end = 2;
+      tag_single = 3;
+    } else if (context.Attr<std::string>("chunk_scheme") == "plain") {
+      num_tag_types = 1;
+      tag_begin = -1;
+      tag_inside = -1;
+      tag_end = -1;
+      tag_single = -1;
+    } else {
+      PADDLE_THROW("Unknown chunk scheme.");
+    }
+    other_chunk_type = num_chunk_types = context.Attr<int>("num_chunk_types");
+    excluded_chunk_types.insert(
+        context.Attr<std::vector<int>>("excluded_chunk_types").begin(),
+        context.Attr<std::vector<int>>("excluded_chunk_types").end());
+
+    auto* inference = context.Input<LoDTensor>("Inference");
+    auto place = inference->place();
+    auto* label = context.Input<LoDTensor>("Label");
+    auto* precision = context.Output<Tensor>("Precision");
+    auto* recall = context.Output<Tensor>("Recall");
+    auto* f1 = context.Output<Tensor>("F1-Score");
+    auto* num_infer_chunks = context.Output<Tensor>("NumInferChunks");
+    auto* num_label_chunks = context.Output<Tensor>("NumLabelChunks");
+    auto* num_correct_chunks = context.Output<Tensor>("NumCorrectChunks");
+
+    const int64_t* inference_data = inference->data<int64_t>();
+    const int64_t* label_data = label->data<int64_t>();
+    T* precision_data = precision->mutable_data<T>(place);
+    T* recall_data = recall->mutable_data<T>(place);
+    T* f1_data = f1->mutable_data<T>(place);
+    int64_t* num_infer_chunks_data =
+        num_infer_chunks->mutable_data<int64_t>(place);
+    int64_t* num_label_chunks_data =
+        num_label_chunks->mutable_data<int64_t>(place);
+    int64_t* num_correct_chunks_data =
+        num_correct_chunks->mutable_data<int64_t>(place);
+    *num_infer_chunks_data = 0;
+    *num_label_chunks_data = 0;
+    *num_correct_chunks_data = 0;
+
+    auto lod = label->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only one-level LoD is supported now.");
+    PADDLE_ENFORCE(lod == inference->lod(),
+                   "LoD must be the same for Inference and Label.");
+    int num_sequences = lod[0].size() - 1;
+    for (int i = 0; i < num_sequences; ++i) {
+      int seq_length = lod[0][i + 1] - lod[0][i];
+      EvalOneSeq(inference_data + lod[0][i], label_data + lod[0][i], seq_length,
+                 output_segments, label_segments, *num_infer_chunks_data,
+                 *num_label_chunks_data, *num_correct_chunks_data,
+                 num_chunk_types, num_tag_types, other_chunk_type, tag_begin,
+                 tag_inside, tag_end, tag_single, excluded_chunk_types);
+    }
+    *precision_data = !(*num_infer_chunks_data)
+                          ? 0
+                          : static_cast<T>(*num_correct_chunks_data) /
+                                (*num_infer_chunks_data);
+    *recall_data = !(*num_label_chunks_data)
+                       ? 0
+                       : static_cast<T>(*num_correct_chunks_data) /
+                             (*num_label_chunks_data);
+    *f1_data = !(*num_correct_chunks_data)
+                   ? 0
+                   : 2 * (*precision_data) * (*recall_data) /
+                         ((*precision_data) + (*recall_data));
+  }
+
+  void EvalOneSeq(const int64_t* output, const int64_t* label, int length,
+                  std::vector<Segment>& output_segments,
+                  std::vector<Segment>& label_segments,
+                  int64_t& num_output_segments, int64_t& num_label_segments,
+                  int64_t& num_correct, int num_chunk_types, int num_tag_types,
+                  int other_chunk_type, int tag_begin, int tag_inside,
+                  int tag_end, int tag_single,
+                  const std::set<int>& excluded_chunk_types) const {
+    GetSegments(output, length, output_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    GetSegments(label, length, label_segments, num_chunk_types, num_tag_types,
+                other_chunk_type, tag_begin, tag_inside, tag_end, tag_single);
+    size_t i = 0, j = 0;
+    while (i < output_segments.size() && j < label_segments.size()) {
+      if (output_segments[i] == label_segments[j] &&
+          excluded_chunk_types.count(output_segments[i].type) != 1) {
+        ++num_correct;
+      }
+      if (output_segments[i].end < label_segments[j].end) {
+        ++i;
+      } else if (output_segments[i].end > label_segments[j].end) {
+        ++j;
+      } else {
+        ++i;
+        ++j;
+      }
+    }
+    for (auto& segment : label_segments) {
+      if (excluded_chunk_types.count(segment.type) != 1) ++num_label_segments;
+    }
+    for (auto& segment : output_segments) {
+      if (excluded_chunk_types.count(segment.type) != 1) ++num_output_segments;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
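
A note on the decoding above: each label packs a tag and a chunk type as `tag = label % num_tag_types` and `type = label / num_tag_types`, and a label whose type equals `num_chunk_types` plays the role of the "outside" ("O") tag. The following standalone sketch (illustrative only, not part of the patch) extracts chunks the same way for the "IOB" scheme configured in Compute, where `num_tag_types` is 2:

    // Standalone sketch mirroring GetSegments for the "IOB" scheme: tag is
    // label % 2 (0 = Begin, 1 = Inside), type is label / 2, and a label whose
    // type equals num_chunk_types is the "O" (outside) tag.
    #include <cstdio>
    #include <vector>

    struct Chunk { int begin, end, type; };

    std::vector<Chunk> ExtractIOBChunks(const std::vector<int>& labels,
                                        int num_chunk_types) {
      const int other = num_chunk_types;  // the "outside" type
      std::vector<Chunk> chunks;
      int start = 0, cur_type = other;
      bool in_chunk = false;
      for (int i = 0; i < static_cast<int>(labels.size()); ++i) {
        int tag = labels[i] % 2;
        int type = labels[i] / 2;
        // A running chunk ends on "O", on a type switch, or on a fresh B tag.
        if (in_chunk && (type == other || type != cur_type || tag == 0)) {
          chunks.push_back({start, i - 1, cur_type});
          in_chunk = false;
        }
        // A chunk begins on any non-"O" label when no chunk is running; like
        // the kernel, an I tag right after "O" counts as an implicit begin.
        if (type != other && !in_chunk) {
          start = i;
          in_chunk = true;
        }
        cur_type = type;
      }
      if (in_chunk)
        chunks.push_back({start, static_cast<int>(labels.size()) - 1, cur_type});
      return chunks;
    }

    int main() {
      // Two chunk types: labels 0/1 = B-0/I-0, 2/3 = B-1/I-1, 4 = "O".
      std::vector<int> labels = {0, 1, 4, 2, 3, 3};
      for (const auto& c : ExtractIOBChunks(labels, 2))
        std::printf("type %d: [%d, %d]\n", c.type, c.begin, c.end);
      // Prints "type 0: [0, 1]" and "type 1: [3, 5]"; precision/recall/F1 in
      // Compute are then just counts of matching chunks between two such lists.
      return 0;
    }
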
diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc
new file mode 100644
index 0000000000..b90921d79b
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipByNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipByNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipByNormOp should not be null.");
+    auto max_norm = ctx->Attrs().Get<float>("max_norm");
+    PADDLE_ENFORCE_GT(max_norm, 0, "max_norm should be greater than 0.");
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ClipByNormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of clip_by_norm op."
+             "The number of dimensions must be between [1, 9].");
+    AddOutput("Out",
+              "(Tensor) The output of clip_by_norm op with shape as input(X)");
+    AddAttr<float>("max_norm", "(float) The maximum norm value.");
+    AddComment(R"DOC(
+ClipByNorm Operator.
+
+This operator limits the L2 norm of the input $X$ within $max\_norm$.
+If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be
+the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will
+be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as
+shown in the following formula:
+
+$$
+Out = \frac{max\_norm * X}{norm(X)},
+$$
+
+where $norm(X)$ represents the L2 norm of $X$.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp,
+                             ops::ClipByNormOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CPUDeviceContext, float>);
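
As a quick sanity check of the formula in the DOC string above, here is a minimal host-side sketch of the same rule (illustrative only; the actual kernel lives in clip_by_norm_op.h):

    // Standalone sketch of the clip-by-norm rule documented above:
    //   out = x                        if ||x||_2 <= max_norm
    //   out = x * max_norm / ||x||_2   otherwise
    #include <cmath>
    #include <cstdio>
    #include <vector>

    std::vector<float> ClipByNorm(const std::vector<float>& x, float max_norm) {
      float sq = 0.f;
      for (float v : x) sq += v * v;
      float norm = std::sqrt(sq);
      float scale = (norm <= max_norm) ? 1.f : max_norm / norm;
      std::vector<float> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = x[i] * scale;
      return out;
    }

    int main() {
      // ||{3, 4}||_2 = 5; with max_norm = 2.5 every element is halved.
      auto out = ClipByNorm({3.f, 4.f}, 2.5f);
      std::printf("%g %g\n", out[0], out[1]);  // prints: 1.5 2
      return 0;
    }
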
diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu
new file mode 100644
index 0000000000..cbf8fa4413
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/clip_by_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    clip_by_norm,
+    ops::ClipByNormKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h
new file mode 100644
index 0000000000..87956a707c
--- /dev/null
+++ b/paddle/operators/clip_by_norm_op.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ClipByNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max_norm = context.Attr<T>("max_norm");
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto x = EigenVector<T>::Flatten(*input);
+    auto out = EigenVector<T>::Flatten(*output);
+    auto x_norm = x.square().sum().sqrt();
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto temp = (x_norm <= max_norm).template cast<T>().eval();
+    auto scaling = temp + (static_cast<T>(1) - temp) * max_norm / x_norm;
+    Eigen::array<int, 1> one_dim{{1}};
+    Eigen::DSizes<int, 1> m_dsize(input->numel());
+    out.device(place) = x * scaling.reshape(one_dim).broadcast(m_dsize);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
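
The `temp` expression in the kernel above is a branchless select: casting the comparison `x_norm <= max_norm` to `T` yields 1 when the norm is within bounds and 0 otherwise, so the blend picks a scale of either 1 or `max_norm / x_norm` with no conditional. A scalar sketch of the trick, using plain floats in place of Eigen expressions:

    // Scalar sketch of the branchless blend used in ClipByNormKernel above:
    // temp is 1 when the norm is within max_norm and 0 otherwise, so the
    // resulting scale is either 1 (keep x) or max_norm / norm (shrink x).
    #include <cstdio>

    float BlendScale(float norm, float max_norm) {
      float temp = (norm <= max_norm) ? 1.f : 0.f;  // cast of the comparison
      return temp + (1.f - temp) * max_norm / norm;
    }

    int main() {
      std::printf("%g %g\n", BlendScale(4.f, 5.f),  // 1: within bounds
                  BlendScale(10.f, 5.f));           // 0.5: shrink by half
      return 0;
    }
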
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
new file mode 100644
index 0000000000..7adb74eab7
--- /dev/null
+++ b/paddle/operators/clip_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/clip_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ClipOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ClipOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ClipOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
+    PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename AttrType>
+class ClipOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ClipOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor)The input of clip op."
+             "The number of dimensions must be between [1, 9].");
+    AddOutput("Out", "(Tensor)The output of clip op with shape as input(X)");
+    AddAttr<AttrType>(
+        "min", "(float)Minimum value, under which element is replaced by min.");
+    AddAttr<AttrType>(
+        "max", "(float)Maximum value, above which element is replaced by max");
+    AddComment(R"DOC(
+Clip Operator.
+
+The clip operator limits the value of the given input within an interval. The
+interval is specified with the arguments 'min' and 'max':
+
+$$
+Out = \min(\max(X, min), max)
+$$
+
+)DOC");
+  }
+};
+
+class ClipOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker<float>, clip_grad,
+            ops::ClipOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>);
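
A tiny standalone illustration of the formula in the DOC string above (the operator's actual element-wise functor is ClipFunctor in clip_op.h below):

    // Standalone sketch of the clip rule from the DOC: out = min(max(x, lo), hi).
    #include <algorithm>
    #include <cstdio>

    float Clip(float x, float lo, float hi) {
      return std::min(std::max(x, lo), hi);
    }

    int main() {
      // Values below lo saturate to lo, above hi to hi; in-range values pass.
      std::printf("%g %g %g\n", Clip(-2.f, -1.f, 1.f), Clip(0.5f, -1.f, 1.f),
                  Clip(3.f, -1.f, 1.f));  // prints: -1 0.5 1
      return 0;
    }
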
diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu
new file mode 100644
index 0000000000..5ccbc96434
--- /dev/null
+++ b/paddle/operators/clip_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/clip_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    clip, ops::ClipKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    clip_grad, ops::ClipGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h
new file mode 100644
index 0000000000..51db185dff
--- /dev/null
+++ b/paddle/operators/clip_op.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Transform;
+
+template <typename T>
+class ClipFunctor {
+ public:
+  explicit ClipFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x < min_)
+      return min_;
+    else if (x > max_)
+      return max_;
+    else
+      return x;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename T>
+class ClipGradFunctor {
+ public:
+  explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {}
+  HOSTDEVICE T operator()(const T& x, const T& y) const {
+    return (y > min_ && y < max_) ? x : 0;
+  }
+
+ private:
+  T min_;
+  T max_;
+};
+
+template <typename DeviceContext, typename T>
+class ClipKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    const T* x_data = x->data<T>();
+    int64_t numel = x->numel();
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_data,
+          x_data + numel, out_data, ClipFunctor<T>(min, max));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ClipGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto max = context.Attr<T>("max");
+    auto min = context.Attr<T>("min");
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+    if (d_x != nullptr) {
+      auto* x = context.Input<Tensor>("X");
+      int64_t numel = d_out->numel();
+      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
+      const T* d_out_data = d_out->data<T>();
+      const T* x_data = x->data<T>();
+      Transform<DeviceContext> trans;
+      trans(context.template device_context<DeviceContext>(), d_out_data,
+            d_out_data + numel, x_data, d_x_data, ClipGradFunctor<T>(min, max));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
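
ClipGradFunctor above zeroes the gradient wherever the forward input lay outside the open interval (min, max), since clipped elements are constant with respect to the input. A standalone sketch of that masking rule (illustrative only):

    // Standalone sketch of ClipGradFunctor's masking rule: the gradient flows
    // only where the forward input was strictly inside (min, max).
    #include <cstdio>

    float ClipGrad(float d_out, float x, float lo, float hi) {
      return (x > lo && x < hi) ? d_out : 0.f;
    }

    int main() {
      // x = -2 was clipped to lo, so its gradient is zeroed; x = 0.5 passes.
      std::printf("%g %g\n", ClipGrad(1.f, -2.f, -1.f, 1.f),
                  ClipGrad(1.f, 0.5f, -1.f, 1.f));  // prints: 0 1
      return 0;
    }
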
diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc
new file mode 100644
index 0000000000..930c295a9c
--- /dev/null
+++ b/paddle/operators/compare_op.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CompareOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) the left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y", string::Sprintf(
+                      "(LoDTensor) the right hand operand of %s operator",
+                      comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns Out. Each of them is an
+N-dim tensor. X and Y could be of any type. Each element of the Out tensor
+is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+    AddAttr<int>("axis",
+                 "(int, default -1). The start dimension index "
+                 "for broadcasting Y onto X.")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+  }
+};
+
+template <typename OpComment>
+class CompareOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"), "%s operator must has input X",
+                   comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"), "%s operator must has input Y",
+                   comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class CompareOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // CompareOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_OP(op_type, _equation)                      \
+  struct _##op_type##Comment {                                       \
+    static char type[];                                              \
+    static char equation[];                                          \
+  };                                                                 \
+  char _##op_type##Comment::type[]{#op_type};                        \
+  char _##op_type##Comment::equation[]{_equation};                   \
+  REGISTER_OPERATOR(                                                 \
+      op_type, ::paddle::operators::CompareOp,                       \
+      ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::CompareOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
+REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
+REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_OP(equal, "Out = X == Y");
+REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);
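
The kernels registered here (defined in compare_op.h) apply a comparison functor pairwise over X and Y through ElementwiseComputeEx, which also handles the `axis` broadcasting attribute. A minimal standalone sketch of the non-broadcast case (illustrative only, plain vectors instead of tensors):

    // Standalone sketch of element-wise comparison: apply a functor pairwise
    // over X and Y to produce a bool output, as the registered kernels do.
    #include <cstdio>
    #include <vector>

    template <typename T, typename Functor>
    std::vector<bool> CompareElementwise(const std::vector<T>& x,
                                         const std::vector<T>& y, Functor f) {
      std::vector<bool> out(x.size());
      for (size_t i = 0; i < x.size(); ++i) out[i] = f(x[i], y[i]);
      return out;
    }

    struct LessThan {
      bool operator()(int a, int b) const { return a < b; }
    };

    int main() {
      auto out = CompareElementwise<int>({1, 5, 3}, {2, 4, 3}, LessThan{});
      for (bool b : out) std::printf("%d ", b);  // prints: 1 0 0
      return 0;
    }
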
diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu
new file mode 100644
index 0000000000..f625824dbc
--- /dev/null
+++ b/paddle/operators/compare_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/compare_op.h"
+
+REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor);
+REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor);
+REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor);
diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
new file mode 100644
index 0000000000..9c655d6c0d
--- /dev/null
+++ b/paddle/operators/compare_op.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LessThanFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a < b; }
+};
+
+template <typename T>
+struct LessEqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a <= b; }
+};
+
+template <typename T>
+struct EqualFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    if (std::is_floating_point<T>::value) {
+      // This branch is optimized away at compile time when T is an integer
+      // type, so it is safe to cast a and b to double.
+      return fabs(static_cast<double>(a - b)) < 1e-8;
+    } else {
+      return (a == b);
+    }
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class CompareOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor)                    \
+  REGISTER_OP_##dev##_KERNEL(                                             \
+      op_type, ::paddle::operators::CompareOpKernel<                      \
+                   ::paddle::platform::dev##DeviceContext, functor<int>>, \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<int64_t>>,      \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<float>>,        \
+      ::paddle::operators::CompareOpKernel<                               \
+          ::paddle::platform::dev##DeviceContext, functor<double>>);
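
The floating-point branch of EqualFunctor above deliberately avoids exact comparison. A standalone sketch of the same tolerance rule (illustrative; here the operands are cast before subtracting, which is equivalent for this purpose):

    // Standalone sketch of EqualFunctor's rule: floating-point values count
    // as equal when they differ by less than 1e-8; integers compare exactly.
    #include <cmath>
    #include <cstdio>
    #include <type_traits>

    template <typename T>
    bool AlmostEqual(T a, T b) {
      if (std::is_floating_point<T>::value) {
        return std::fabs(static_cast<double>(a) - static_cast<double>(b)) < 1e-8;
      }
      return a == b;
    }

    int main() {
      std::printf("%d %d %d\n",
                  AlmostEqual(1.0, 1.0 + 1e-9),  // 1: difference below 1e-8
                  AlmostEqual(1.0, 1.0001),      // 0: difference too large
                  AlmostEqual(3, 3));            // 1: exact integer compare
      return 0;
    }
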
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
new file mode 100644
index 0000000000..32b61edfd0
--- /dev/null
+++ b/paddle/operators/concat_op.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/concat_op.h"
+#include <vector>
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class ConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
+                      "Inputs(X) of ConcatOp should be empty.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ConcatOp should not be null.");
+
+    auto ins = ctx->GetInputsDim("X");
+    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
+    const size_t n = ins.size();
+
+    PADDLE_ENFORCE_GT(n, 1, "Input tensor count should be greater than 1.");
+
+    auto out_dims = ins[0];
+    size_t in_zero_dims_size = out_dims.size();
+    for (size_t i = 1; i < n; i++) {
+      for (size_t j = 0; j < in_zero_dims_size; j++) {
+        if (j == axis) {
+          out_dims[axis] += ins[i][j];
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                            "Input tensors should have the same "
+                            "elements except the specify axis.");
+        }
+      }
+    }
+    if (out_dims[axis] < 0) {
+      out_dims[axis] = -1;
+    }
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConcatOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input tensors of concat operator.").AsDuplicable();
+    AddOutput("Out", "Output tensor of concat operator.");
+    AddAttr<int>("axis",
+                 "The axis along which the input tensors will be concatenated.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+Concat Operator.
+
+Concatenate the input tensors along the axis dimension.
+Examples:
+  Input[0] = [[1,2],[3,4]]
+  Input[1] = [[5,6]]
+  axis = 0
+  Output = [[1,2],
+            [3,4],
+            [5,6]]
+
+)DOC");
+  }
+};
+
+class ConcatOpGrad : public framework::OperatorWithKernel {
+ public:
+  ConcatOpGrad(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
+               ops::ConcatOpGrad, false)
+REGISTER_OP_CPU_KERNEL(concat,
+                       ops::ConcatKernel<paddle::platform::CPUPlace, float>)
+REGISTER_OP_CPU_KERNEL(concat_grad,
+                       ops::ConcatGradKernel<paddle::platform::CPUPlace, float>)
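
The InferShape loop above sums the sizes along `axis` and requires all other dimensions to match. A minimal standalone sketch of that shape rule (illustrative, plain int vectors in place of DDim):

    // Standalone sketch of concat shape inference: dims agree everywhere
    // except `axis`, where the output size is the sum of the input sizes.
    #include <cassert>
    #include <cstdio>
    #include <vector>

    std::vector<int> ConcatShape(const std::vector<std::vector<int>>& ins,
                                 size_t axis) {
      std::vector<int> out = ins[0];
      for (size_t i = 1; i < ins.size(); ++i) {
        for (size_t j = 0; j < out.size(); ++j) {
          if (j == axis)
            out[axis] += ins[i][j];
          else
            assert(out[j] == ins[i][j] && "non-axis dims must match");
        }
      }
      return out;
    }

    int main() {
      // [[2, 2], [1, 2]] concatenated along axis 0 -> [3, 2], as in the DOC.
      auto out = ConcatShape({{2, 2}, {1, 2}}, 0);
      std::printf("[%d, %d]\n", out[0], out[1]);  // prints: [3, 2]
      return 0;
    }
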
diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc
new file mode 100644
index 0000000000..7b46452d3d
--- /dev/null
+++ b/paddle/operators/concat_op.cu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/concat_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    concat, ops::ConcatKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    concat_grad,
+    ops::ConcatGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
new file mode 100644
index 0000000000..de4011585a
--- /dev/null
+++ b/paddle/operators/concat_op.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ConcatKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    const size_t n = ins.size();
+    size_t output_offset = 0;
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    for (size_t i = 0; i < n; i++) {
+      auto& in = ins[i];
+      auto axis_dim = in->dims()[axis];
+      auto in_stride = framework::stride(in->dims());
+      StridedMemcpy<T>(ctx.device_context(), in->data<T>(), in_stride,
+                       in->dims(), out_stride, out->data<T>() + output_offset);
+      output_offset += axis_dim * in_stride[axis];
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ConcatGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    const size_t n = outs.size();
+    size_t input_offset = 0;
+    auto in_stride = framework::stride(in->dims());
+    for (size_t i = 0; i < n; i++) {
+      auto& out = outs[i];
+      out->mutable_data<T>(ctx.GetPlace());
+      size_t axis_dim = out->dims()[axis];
+      auto out_stride = framework::stride(out->dims());
+      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
+                       in_stride, out->dims(), out_stride, out->data<T>());
+      input_offset += axis_dim * in_stride[axis];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
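
Both kernels above walk the tensors once, advancing a flat offset by `axis_dim * stride[axis]` per tensor. A rough standalone sketch of the forward copy for contiguous row-major matrices concatenated along axis 0 (an assumption made for simplicity; StridedMemcpy also handles non-contiguous layouts):

    // Standalone sketch of the forward concat copy for contiguous row-major
    // matrices along axis 0: each input occupies rows[i] * cols values, and
    // the running offset advances by exactly that (axis_dim * stride[axis]).
    #include <cstdio>
    #include <cstring>
    #include <vector>

    void ConcatRows(const std::vector<std::vector<float>>& ins,
                    const std::vector<int>& rows, int cols, float* out) {
      size_t offset = 0;
      for (size_t i = 0; i < ins.size(); ++i) {
        size_t count = static_cast<size_t>(rows[i]) * cols;
        std::memcpy(out + offset, ins[i].data(), count * sizeof(float));
        offset += count;  // axis_dim * stride[axis] for axis 0
      }
    }

    int main() {
      std::vector<float> out(6);
      ConcatRows({{1, 2, 3, 4}, {5, 6}}, {2, 1}, 2, out.data());
      for (float v : out) std::printf("%g ", v);  // prints: 1 2 3 4 5 6
      return 0;
    }
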
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
new file mode 100644
index 0000000000..e333002bfd
--- /dev/null
+++ b/paddle/operators/cond_op.cc
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cond_op.h"
+#include "paddle/operators/gather.h"
+#include "paddle/operators/scatter.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DDim = framework::DDim;
+
+framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
+  auto& sub_scope = scope.NewScope();
+  sub_scopes->push_back(&sub_scope);
+  return sub_scope;
+}
+
+std::vector<framework::Scope*>& CondOp::GetSubScopes(
+    const framework::Scope& scope) const {
+  auto sub_scopes_var = scope.FindVar("SubScopes");
+  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
+                          "Output(SubScopes) of CondOp should not be null.");
+  return *sub_scopes_var->GetMutable<std::vector<framework::Scope*>>();
+}
+
+LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
+  auto index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
+  auto& index_tensors =
+      *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
+  index_tensors.push_back(LoDTensor());
+  return index_tensors.back();
+}
+
+std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
+    const framework::Scope& scope) const {
+  auto* index_tensors_var = scope.FindVar("IndexTensors");
+  PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
+                          "Output(IndexTensors) of CondOp should not be null.");
+  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
+}
+
+void CondOp::PrepareDataForSubnet(
+    const framework::Scope& scope,
+    const platform::DeviceContext& dev_ctx) const {
+  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    // Create two sub scopes for true and false branches
+    //   sub_scopes[0] for the true branch
+    //   sub_scopes[1] for the false branch
+    AddSubScope(scope);
+    // Create two tensors for true and false indices:
+    //   index_tensors[0] for the true branch
+    //   index_tensors[1] for the false branch
+    AddIndexTensor(scope);
+  }
+
+  Variable* cond_var = scope.FindVar(Input("Cond"));
+  PADDLE_ENFORCE_NOT_NULL(cond_var,
+                          "Input(Cond) of CondOp should not be null.");
+  const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
+
+  // get the true/false index at runtime according to cond tensor
+  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
+  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
+  std::vector<std::vector<int>> index_vectors;
+  index_vectors.resize(BRANCH_NUM);
+
+  const int* cond_data = cond->data<int>();
+  for (int i = 0; i < cond->dims()[0]; ++i) {
+    if (cond_data[i])
+      index_vectors[TRUE_BRANCH].push_back(i);
+    else
+      index_vectors[FALSE_BRANCH].push_back(i);
+  }
+
+  // put index_vectors[0] and index_vectors[1] into two tensors:
+  // index_tensors[0] and index_tensors[1]
+  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
+    int* index_tensor_data_ptr =
+        index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
+    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
+           dim[0] * sizeof(int));
+  }
+
+  // create input in subscopes according to index_vectors
+  for (auto& input : Inputs("Xs")) {
+    Variable* var_parent = scope.FindVar(input);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
+
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = var_child->GetMutable<LoDTensor>();
+
+      // Resize child
+      DDim dim = tensor_parent->dims();
+      dim[0] = index_tensors[i].dims()[0];
+      tensor_child->mutable_data<float>(dim, platform::CPUPlace());
+
+      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
+    }
+  }
+
+  // create output_tensors in subscope for sub_net
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->Var(var_name);
+      }
+    }
+  }
+}
+
+void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
+                                 const platform::DeviceContext& dev_ctx) const {
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  const std::vector<framework::LoDTensor>& index_tensors =
+      GetIndexTensors(scope);
+
+  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
+  PADDLE_ENFORCE(!Outputs("Outs").empty(),
+                 "Outputs(Outs) of CondOp can't be empty.");
+  for (auto& output : Outputs("Outs")) {
+    const LoDTensor* tensor_t_out =
+        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
+    const LoDTensor* tensor_f_out =
+        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
+
+    auto* var_out = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
+    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_out,
+                            "Output tensor should not be NULL");
+
+    DDim true_dim = tensor_t_out->dims();
+    DDim false_dim = tensor_f_out->dims();
+    true_dim[0] = 0;
+    false_dim[0] = 0;
+    PADDLE_ENFORCE_EQ(true_dim, false_dim,
+                      "Outputs not of the same shape except the first dim");
+
+    DDim out_dim = tensor_t_out->dims();
+    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
+    tensor_out->Resize(out_dim);
+    tensor_out->mutable_data<float>(platform::CPUPlace());
+  }
+
+  // merge output results:
+  // output_tensor = true_output_tensor + false_output_tensor
+  for (auto& output : Outputs("Outs")) {
+    Variable* var_parent = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
+
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = &var_child->Get<LoDTensor>();
+      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
+                           tensor_parent);
+    }
+  }
+}
+
+void CondOp::Run(const Scope& scope, const platform::Place& place) const {
+  // get device context from pool
+  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+  auto& dev_ctx = *pool.Get(place);
+
+  PrepareDataForSubnet(scope, dev_ctx);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    sub_net_op_[i]->Run(*sub_scopes[i], place);
+  }
+  MergeDataFromSubnet(scope, dev_ctx);
+}
+
+class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CondOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Cond", "The condition, which is a bool vector");
+    AddInput("Xs", "Inputs of Subnets").AsDuplicable();
+    AddOutput("Outs", "Outputs of Cond_Op after merge").AsDuplicable();
+
+    AddOutput("SubScopes", "sub scopes for true and false branches");
+    AddOutput("IndexTensors", "Index Tensors contains indices for true/false");
+
+    AddComment(R"DOC(
+Sample Dependent Conditional Operator.
+
+Given Cond[i] as a 1/0 vector to indicate true/false:
+Out[i] = subnet_true[i], if Cond[i] == true
+Out[i] = subnet_false[i], if Cond[i] == false
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(cond, paddle::operators::CondOp,
+                             paddle::operators::CondOpProtoAndCheckerMaker);
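
PrepareDataForSubnet above partitions instance indices by the cond vector, then gathers rows into each branch's sub-scope; MergeDataFromSubnet scatters results back with the same indices. A toy sketch of the partition step (illustrative only):

    // Standalone sketch of the index partition in PrepareDataForSubnet:
    // indices with cond[i] != 0 go to the true branch, the rest to false.
    #include <cstdio>
    #include <vector>

    void PartitionIndices(const std::vector<int>& cond,
                          std::vector<int>* true_idx,
                          std::vector<int>* false_idx) {
      for (int i = 0; i < static_cast<int>(cond.size()); ++i) {
        (cond[i] ? *true_idx : *false_idx).push_back(i);
      }
    }

    int main() {
      std::vector<int> t, f;
      PartitionIndices({1, 0, 1, 1, 0}, &t, &f);
      // true branch gathers rows {0, 2, 3}; false branch gathers rows {1, 4};
      // scattering with the same indices reassembles the merged output.
      std::printf("true: %zu rows, false: %zu rows\n", t.size(), f.size());
      return 0;
    }
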
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
new file mode 100644
index 0000000000..7dcdc47e0b
--- /dev/null
+++ b/paddle/operators/cond_op.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * @brief CondOp is a dynamic if-else Operator
+ *
+ * It has an input tensor named cond indicating which netop each instance
+ * will run.
+ *
+ * if cond == 1, it will run true_net, which is a NetOp.
+ *
+ * if cond == 0, it will run false_net, which is another NetOp.
+ */
+class CondOp : public framework::OperatorBase {
+ public:
+  CondOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    sub_net_op_.resize(BRANCH_NUM);
+  }
+
+  CondOp(const CondOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    // TODO(yuyang18): Implement copy ctor well.
+    PADDLE_THROW("Not implemented");
+  }
+
+  framework::Scope& AddSubScope(const framework::Scope& scope) const;
+  std::vector<framework::Scope*>& GetSubScopes(
+      const framework::Scope& scope) const;
+
+  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
+  std::vector<framework::LoDTensor>& GetIndexTensors(
+      const framework::Scope& scope) const;
+
+  void PrepareDataForSubnet(const framework::Scope& scope,
+                            const platform::DeviceContext& dev_ctx) const;
+  void MergeDataFromSubnet(const framework::Scope& scope,
+                           const platform::DeviceContext& dev_ctx) const;
+
+  /*
+   * Set True Block
+   */
+  void set_truenet(std::unique_ptr<OperatorBase>&& net) {
+    sub_net_op_[TRUE_BRANCH] = std::move(net);
+  }
+
+  /*
+   * Set False Block
+   */
+  void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
+    sub_net_op_[FALSE_BRANCH] = std::move(net);
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override;
+
+ private:
+  const int TRUE_BRANCH = 0;
+  const int FALSE_BRANCH = 1;
+  const int BRANCH_NUM = 2;
+
+  // sub_net_op_[0]: subnet_t
+  // sub_net_op_[1]: subnet_f
+  std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
+};
+
+}  // namespace operators
+}  // namespace paddle
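
A toy model of the two-branch dispatch declared above (sketch only; real subnets are OperatorBase instances run against sub-scopes, modeled here as callables on index lists):

    // Toy model of CondOp's dispatch: mirror TRUE_BRANCH/FALSE_BRANCH above,
    // route each instance index by cond, then run both "subnets".
    #include <cstdio>
    #include <functional>
    #include <vector>

    int main() {
      const int kTrueBranch = 0, kFalseBranch = 1;
      std::vector<std::function<void(const std::vector<int>&)>> subnet(2);
      subnet[kTrueBranch] = [](const std::vector<int>& idx) {
        std::printf("true net on %zu instances\n", idx.size());
      };
      subnet[kFalseBranch] = [](const std::vector<int>& idx) {
        std::printf("false net on %zu instances\n", idx.size());
      };

      std::vector<int> cond = {1, 0, 1};
      std::vector<std::vector<int>> index(2);
      for (int i = 0; i < static_cast<int>(cond.size()); ++i)
        index[cond[i] ? kTrueBranch : kFalseBranch].push_back(i);
      for (int b = 0; b < 2; ++b) subnet[b](index[b]);  // as Run() iterates
      return 0;
    }
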
diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc
new file mode 100644
index 0000000000..3cae61a438
--- /dev/null
+++ b/paddle/operators/conditional_block_op.cc
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <algorithm>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class ConditionalOp : public framework::OperatorBase {
+ public:
+  ConditionalOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  std::vector<const framework::LoDTensor *> InputTensors(
+      const framework::Scope &scope) const {
+    std::vector<const framework::LoDTensor *> retv;
+    auto xs = Inputs("X");
+    retv.resize(xs.size(), nullptr);
+    std::transform(
+        xs.begin(), xs.end(), retv.begin(),
+        [&scope](const std::string &var_name) -> const framework::LoDTensor * {
+          auto *var = scope.FindVar(var_name);
+          PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", var_name);
+          return &var->Get<framework::LoDTensor>();
+        });
+    return retv;
+  }
+};
+
+class ConditionalBlockOp : public ConditionalOp {
+ public:
+  ConditionalBlockOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto xs = InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Output("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto *scopes = scope_var->GetMutable<std::vector<framework::Scope *>>();
+      scopes->resize(1);
+      scopes->front() = &scope.NewScope();
+      auto &cur_scope = *scopes->front();
+
+      framework::Executor exec(dev_place);
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+    }
+  }
+};
+
+class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConditionalBlockOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The conditional variable of this operator. If X is empty, the "
+             "whole sub-block will not be executed.")
+        .AsDuplicable();
+    AddInput("Params", "The input variables of the sub-block.").AsDuplicable();
+    AddOutput("Out", "The output variables of the sub-block.").AsDuplicable();
+    AddOutput("Scope",
+              "(std::vector<Scope*>) The step scope of conditional block. To "
+              "unify the conditional block, rnn and while op, the type of "
+              "scope is std::vector<Scope*>");
+    AddAttr<framework::BlockDesc *>(
+        "sub_block", "The step block of conditional block operator");
+    AddComment(R"DOC(Conditional block operator
+
+Run the sub-block if X is not empty. Params holds the other inputs and Out
+holds the outputs of the sub-block.
+)DOC");
+  }
+};
+
+class ConditionalBlockGradOp : public ConditionalOp {
+ public:
+  ConditionalBlockGradOp(const std::string &type,
+                         const framework::VariableNameMap &inputs,
+                         const framework::VariableNameMap &outputs,
+                         const framework::AttributeMap &attrs)
+      : ConditionalOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto xs = this->InputTensors(scope);
+    bool need_run = std::all_of(
+        xs.begin(), xs.end(),
+        [](const framework::LoDTensor *t) { return t->numel() != 0; });
+
+    if (need_run) {
+      auto *scope_var = scope.FindVar(Input("Scope"));
+      PADDLE_ENFORCE(scope_var != nullptr, "Must set scope");
+      auto &scopes = scope_var->Get<std::vector<framework::Scope *>>();
+      framework::Scope &cur_scope = *scopes[0];
+
+      framework::Executor exec(dev_place);
+      auto *block = Attr<framework::BlockDesc *>("sub_block");
+      exec.Run(*block->Program(), &cur_scope, block->ID(), false);
+
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("Params"),
+                                  Outputs(framework::GradVarName("Params")));
+
+      AssignLocalGradientToGlobal(dev_place, cur_scope, Inputs("X"),
+                                  Outputs(framework::GradVarName("X")));
+    }
+  }
+
+ private:
+  void AssignLocalGradientToGlobal(
+      const platform::Place &place, const framework::Scope &cur_scope,
+      const std::vector<std::string> &p_names,
+      const std::vector<std::string> &pg_names) const {
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto out_grad_name = pg_names[i];
+      auto in_grad_name = framework::GradVarName(p_names[i]);
+      auto *in_var = cur_scope.FindVar(in_grad_name);
+      if (in_var == nullptr) {
+        continue;
+      }
+      auto new_in_grad_name = cur_scope.Rename(in_grad_name);
+      auto assign = framework::OpRegistry::CreateOp(
+          "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}},
+          framework::AttributeMap{});
+      assign->Run(cur_scope, place);
+      cur_scope.Rename(new_in_grad_name, in_grad_name);
+    }
+  }
+};
+
+class ConditionalBlockGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInputs("X"));
+    if (context->HasInputs("Params")) {
+      PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("Params")));
+      context->SetOutputsDim(framework::GradVarName("Params"),
+                             context->GetInputsDim("Params"));
+    }
+    PADDLE_ENFORCE(context->HasOutputs(framework::GradVarName("X")));
+    context->SetOutputsDim(framework::GradVarName("X"),
+                           context->GetInputsDim("X"));
+  }
+};
+
+class ConditionalBlockGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto grad_op = new framework::OpDesc();
+    grad_op->SetType("conditional_block_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput("Params", Input("Params"));
+    grad_op->SetInput("Out", Output("Out"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetInput("Scope", Output("Scope"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X", false));
+    grad_op->SetOutput(framework::GradVarName("Params"),
+                       InputGrad("Params", false));
+    grad_op->SetBlockAttr("sub_block", *this->grad_block_[0]);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(conditional_block, ops::ConditionalBlockOp,
+                  ops::ConditionalBlockOpProtoMaker,
+                  ops::ConditionalBlockGradMaker);
+REGISTER_OPERATOR(conditional_block_grad, ops::ConditionalBlockGradOp,
+                  ops::ConditionalBlockGradInferShape);
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
new file mode 100644
index 0000000000..3a5409a7e3
--- /dev/null
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -0,0 +1,330 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memory.h"
+#include "paddle/operators/conv_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
+using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
+using DataLayout = platform::DataLayout;
+
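+// Default limit on the cudnn workspace: 1 GiB (1024^3 bytes); it can be
+// raised per op via the workspace_size_MB attribute below.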
+static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
+    static_cast<size_t>(1024) * 1024 * 1024;
+
+template <typename T>
+class CUDNNConvOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+    int64_t user_workspace_size =
+        static_cast<int64_t>(ctx.Attr<int>("workspace_size_MB"));
+
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedFilterDescriptor filter_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 supports groups natively, so there is no need to do it manually.
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
+
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
+
+    int input_channels = input->dims()[1];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
+
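+    // Per-group offsets (in elements) into the input, output and filter
+    // buffers; the per-group loops below advance by these strides.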
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out =
+        output_channels / groups * output_height * output_width * output_depth;
+    int group_offset_filter = filter->numel() / groups;
+    // ------------------- cudnn conv workspace ---------------------
+    void* cudnn_workspace = nullptr;
+    size_t workspace_size_in_bytes;  // final workspace to allocate.
+    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    }
+    // ------------------- cudnn conv algorithm ---------------------
+    cudnnConvolutionFwdAlgo_t algo;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+
+    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+    // Get the workspace size required by the chosen algorithm.
+    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // Allocate the workspace on the GPU.
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    // ------------------- cudnn conv forward ---------------------
+    T alpha = 1.0f, beta = 0.0f;
+    for (int i = 0; i < groups; i++) {
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+          cudnn_filter_desc, filter_data + i * group_offset_filter,
+          cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
+          &beta, cudnn_output_desc, output_data + i * group_offset_out));
+    }
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+template <typename T>
+class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto input = ctx.Input<Tensor>("Input");
+    auto filter = ctx.Input<Tensor>("Filter");
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+
+    const T* input_data = input->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+    const T* filter_data = filter->data<T>();
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+    int64_t user_workspace_size =
+        static_cast<int64_t>(ctx.Attr<int>("workspace_size_MB"));
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_grad_desc;
+
+    ScopedFilterDescriptor filter_desc;
+    ScopedFilterDescriptor filter_grad_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 supports groups natively, so there is no need to do it manually.
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
+
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_grad_desc =
+        output_grad_desc.descriptor<T>(
+            layout, framework::vectorize2int(output_grad->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()), groups);
+
+    int input_channels = input->dims()[1];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+
+    int output_grad_channels = filter->dims()[0];
+    int output_grad_height, output_grad_width, output_grad_depth;
+    if (input->dims().size() == 5) {
+      output_grad_depth = output_grad->dims()[2];
+      output_grad_height = output_grad->dims()[3];
+      output_grad_width = output_grad->dims()[4];
+    } else {
+      output_grad_depth = 1;
+      output_grad_height = output_grad->dims()[2];
+      output_grad_width = output_grad->dims()[3];
+    }
+
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
+    int group_offset_out = output_grad_channels / groups * output_grad_height *
+                           output_grad_width * output_grad_depth;
+    int group_offset_filter = filter->numel() / groups;
+    // ------------------- cudnn backward algorithm ---------------------
+    cudnnConvolutionBwdDataAlgo_t data_algo;
+    cudnnConvolutionBwdFilterAlgo_t filter_algo;
+    size_t workspace_size_in_bytes = 0, tmp_size = 0;
+    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    }
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    if (input_grad) {
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              handle, cudnn_filter_desc,
+              // dyDesc: Handle to the previously initialized input differential
+              // tensor descriptor.
+              cudnn_output_grad_desc, cudnn_conv_desc,
+              // dxDesc: Handle to the previously initialized output tensor
+              // descriptor.
+              cudnn_input_desc,
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+              handle, cudnn_filter_desc, cudnn_output_grad_desc,
+              cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+    }
+
+    if (filter_grad) {
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
+              cudnn_filter_desc,
+              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &filter_algo));
+
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
+              cudnn_filter_desc, filter_algo, &tmp_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+    }
+    // ------------------- cudnn conv workspace ---------------------
+    // One workspace, sized by the max over the chosen algorithms, is shared
+    // by the backward-data and backward-filter passes below.
+    void* cudnn_workspace = nullptr;
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    // ------------------- cudnn conv backward data ---------------------
+    T alpha = 1.0f, beta = 0.0f;
+    if (input_grad) {
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      // Because beta is zero, it is unnecessary to reset input_grad.
+
+      for (int i = 0; i < groups; i++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+            handle, &alpha, cudnn_filter_desc,
+            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
+            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+            input_grad_data + i * group_offset_in));
+      }
+    }
+    // ------------------- cudnn conv backward filter ---------------------
+    if (filter_grad) {
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      // Because beta is zero, it is unnecessary to reset filter_grad.
+      for (int i = 0; i < groups; i++) {
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
+            cudnn_conv_desc, filter_algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_filter_desc,
+            filter_grad_data + i * group_offset_filter));
+      }
+    }
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<float>,
+                   paddle::operators::CUDNNConvOpKernel<double>);
+REGISTER_OP_KERNEL(conv2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvGradOpKernel<float>,
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
+
+REGISTER_OP_KERNEL(conv3d, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvOpKernel<float>,
+                   paddle::operators::CUDNNConvOpKernel<double>);
+REGISTER_OP_KERNEL(conv3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   paddle::operators::CUDNNConvGradOpKernel<float>,
+                   paddle::operators::CUDNNConvGradOpKernel<double>);
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
new file mode 100644
index 0000000000..d6882b275b
--- /dev/null
+++ b/paddle/operators/conv_op.cc
@@ -0,0 +1,338 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_op.h"
+
+namespace paddle {
+namespace operators {
+
+void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of ConvOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of ConvOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  int groups = ctx->Attrs().Get<int>("groups");
+  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "Conv intput should be 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(
+      in_dims.size(), filter_dims.size(),
+      "Conv input dimension and filter dimension should be the same.");
+  PADDLE_ENFORCE(
+      in_dims.size() - strides.size() == 2U,
+      "Conv input dimension and strides dimension should be consistent.");
+  PADDLE_ENFORCE_EQ(
+      paddings.size(), strides.size(),
+      "Conv paddings dimension and Conv strides dimension should be the same.");
+
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
+                    "The number of input channels should be equal to filter "
+                    "channels * groups.");
+
+  PADDLE_ENFORCE_EQ(
+      filter_dims[0] % groups, 0,
+      "The number of output channels should be divided by groups.");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    PADDLE_ENFORCE(in_dims[i + 2] + 2 * paddings[i] -
+                           (dilations[i] * (filter_dims[i + 2] - 1) + 1) >
+                       0,
+                   "Due to the settings of paddings, filter_dims and "
+                   "dilations, the output size is less than 0, please check "
+                   "again.");
+    output_shape.push_back(OutputSize(in_dims[i + 2], filter_dims[i + 2],
+                                      dilations[i], paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  ctx->ShareLoD("Input", "Output");
+}
+
+framework::OpKernelType ConvOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "H is the height of the filter, and W is the width of the filter. "
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator. "
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int> default:{1, 1}), the "
+                            "strides(h_stride, w_stride) of "
+                            "convolution operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int> default:{0, 0}), the "
+                            "paddings(h_pad, w_pad) of "
+                            "convolution operator.")
+      .SetDefault({0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the groups number of the convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1}), the "
+                            "dilations(h_dilation, w_dilation) of "
+                            "convolution operator.")
+      .SetDefault({1, 1});
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+  AddAttr<int>("workspace_size_MB",
+               "Only used in cudnn kernel. Need set use_cudnn to true."
+               "workspace size for cudnn, in MB, "
+               "workspace is a section of GPU memory which will be "
+               "allocated/freed each time the operator runs, larger "
+               "workspace size can increase performance but also requires "
+               "better hardware. This size should be chosen carefully.")
+      .SetDefault(4096);
+  AddComment(R"DOC(
+Convolution Operator.
+
+The convolution operation calculates the output based on the input, the filter,
+and the strides, paddings, dilations and groups parameters. The size of each
+dimension of the parameters is checked during shape inference.
+Input(Input) and Output(Output) are in NCHW format, where N is the batch
+size, C is the number of channels, H is the height of the feature, and W is
+the width of the feature.
+The filter is in MCHW format, where M is the number of output image channels,
+C is the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+The parameters (strides, paddings, dilations) each contain two elements,
+which represent height and width, respectively.
+The input(Input) size and output(Output) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+$$
+       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
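+  For example (illustrative), with $H_{in} = 5$, $H_f = 3$, paddings[0] = 1,
+  dilations[0] = 1 and strides[0] = 1:
+  $$H_{out} = \frac{(5 + 2 * 1 - (1 * (3 - 1) + 1))}{1} + 1 = 5$$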
+)DOC");
+}
+
+Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is the "
+      "number of channels, D is the depth of the feature, H is the height of "
+      "the feature, "
+      "and W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution operator. "
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "D is the depth of the filter, H is the height of the filter, and W "
+           "is the width of the filter."
+           "If the groups attribute is greater than 1, C equals the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution operator."
+            "The format of output tensor is also NCDHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default:{1, 1, 1}), the "
+                            "strides(d_stride, h_stride, w_stride) of "
+                            "convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int>, default:{0, 0, 0}), the "
+                            "paddings(d_pad, h_pad, w_pad) of convolution "
+                            "operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<int>(
+      "groups",
+      "(int default:1), the groups number of the convolution operator. "
+      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
+      "when group=2, the first half of the filters is only connected to the "
+      "first half of the input channels, while the second half of the filters "
+      "is only connected to the second half of the input channels.")
+      .SetDefault(1);
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1, 1}), the "
+                            "dilations(d_dilation, h_dilation, w_dilation) of "
+                            "convolution operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+  AddAttr<int>("workspace_size_MB",
+               "Only used in cudnn kernel. workspace size for cudnn, in MB, "
+               "workspace is a section of GPU memory which will be "
+               "allocated/freed each time the operator runs, larger "
+               "workspace size can increase performance but also requires "
+               "better hardware. This size should be chosen carefully.")
+      .SetDefault(4096);
+
+  AddComment(R"DOC(
+Convolution3D Operator.
+
+The convolution operation calculates the output based on the input, the filter,
+and the strides, paddings, dilations and groups parameters. The size of each
+dimension of the parameters is checked during shape inference.
+Input(Input) and Output(Output) are in NCDHW format, where N is the batch
+size, C is the number of channels, D is the depth of the feature, H is the
+height of the feature, and W is the width of the feature.
+The filter is in MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+The parameters (strides, paddings, dilations) each contain three elements,
+which represent depth, height and width, respectively.
+The input(Input) size and output(Output) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
+       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+  $$
+)DOC");
+}
+
+void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+            ops::ConvOpGrad);
+REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
+            ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
new file mode 100644
index 0000000000..4f942444f3
--- /dev/null
+++ b/paddle/operators/conv_op.cu.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv2d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    conv3d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv3d_grad,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
new file mode 100644
index 0000000000..5a8933e791
--- /dev/null
+++ b/paddle/operators/conv_op.h
@@ -0,0 +1,354 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+// Base convolution operator definitions, for other conv-like
+// operators to reuse the implementation.
+inline int OutputSize(int input_size, int filter_size, int dilation,
+                      int padding, int stride) {
+  const int dkernel = dilation * (filter_size - 1) + 1;
+  const int output_size = (input_size + 2 * padding - dkernel) / stride + 1;
+  return output_size;
+}
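+// e.g. (illustrative): input_size = 7, filter_size = 3, dilation = 1,
+// padding = 0 and stride = 1 give dkernel = 3 and
+// output_size = (7 + 0 - 3) / 1 + 1 = 5.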
+inline bool IsExpand(std::vector<int64_t>& filter_dim,
+                     std::vector<int>& strides, std::vector<int>& paddings,
+                     std::vector<int>& dilations) {
+  bool filter_1 = true, strides_1 = true, padding_0 = true, dilation_1 = true;
+  for (size_t j = 0; j < strides.size(); ++j) {
+    filter_1 = filter_1 && (static_cast<int>(filter_dim[j + 2]) == 1);
+    strides_1 = strides_1 && (strides[j] == 1);
+    padding_0 = padding_0 && (paddings[j] == 0);
+    dilation_1 = dilation_1 && (dilations[j] == 1);
+  }
+  return !(filter_1 && strides_1 && padding_0 && dilation_1);
+}
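+// e.g. a 1x1 filter with unit strides, zero paddings and unit dilations can
+// use the input data directly, so no im2col/vol2col expansion is needed and
+// IsExpand returns false; any other combination requires the expansion.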
+
+// Define Op classes in .h file so that other conv
+// operator implementations can reuse the code.
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+class Conv3DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+class ConvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class ConvOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class GemmConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // The filter will be reshaped in the calculations,
+    // so here we use an assignment (a copy of the Tensor header)
+    // to avoid modifying the variable in the Scope.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+
+    int groups = context.Attr<int>("groups");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(framework::vectorize(output->dims()));
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (i_c/g * k_h * k_w, o_h * o_w) or (i_c/g * k_d * k_h * k_w, o_d *
+    // o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    if (is_expand) {
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+    }
+
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    framework::DDim output_matrix_shape = {
+        output->dims()[1],
+        output->numel() / (output->dims()[0] * output->dims()[1])};
+
+    // convolution operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output->dims()[1]) / groups;
+
+    math::Vol2ColFunctor<DeviceContext, T> vol2col;
+    math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    for (int i = 0; i < batch_size; i++) {
+      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
+
+      for (int g = 0; g < groups; g++) {
+        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+        if (!is_expand) {
+          col.ShareDataWith(in_slice);
+          col_matrix.ShareDataWith(col);
+          col_matrix.Resize(col_matrix_shape);
+        } else if (data_dim == 2U) {
+          // im2col
+          im2col(dev_ctx, in_slice, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &col);
+        } else if (data_dim == 3U) {
+          // vol2col
+          vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
+        }
+
+        // gemm
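+        // Shapes: out_slice(o_c/g, o_h*o_w) = filter_slice(o_c/g,
+        // i_c/g*k_h*k_w) x col_matrix(i_c/g*k_h*k_w, o_h*o_w).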
+        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+        math::matmul<DeviceContext, T>(dev_ctx, filter_slice, false, col_matrix,
+                                       false, T(1.0), &out_slice, T(0.0));
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GemmConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+    // The filter and filter_grad will be reshaped in the calculations,
+    // so here we use an assignment (a copy of the Tensor header)
+    // to avoid modifying the variable in the Scope.
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    if (!input_grad && !filter_grad) return;
+
+    int groups = context.Attr<int>("groups");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // filter_shape_vec: {k_o, k_i, k_h, k_w} or {k_o, k_i, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
+    // output_shape_vec: {o_n, o_c, o_h, o_w} or {o_n, o_c, o_d, o_h, o_w}
+    std::vector<int64_t> output_shape_vec(
+        framework::vectorize(output_grad->dims()));
+
+    // use col_shape in the im2col calculation
+    // col_shape_vec: {i_c/g, k_h, k_w, o_h, o_w} or {i_c/g, k_d, k_h, k_w, o_d,
+    // o_h, o_w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = input->dims()[1] / groups;
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = output_shape_vec[j + 2];
+    }
+    framework::DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (i_c/g * k_h * k_w, o_h * o_w)
+    // or
+    // (i_c/g * k_d * k_h * k_w, o_d * o_h * o_w)
+    framework::DDim col_matrix_shape =
+        framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    framework::DDim input_shape = framework::slice_ddim(
+        input->dims(), 1, static_cast<int>(input->dims().size()));
+
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    framework::DDim output_matrix_shape = {
+        output_grad->dims()[1],
+        output_grad->numel() /
+            (output_grad->dims()[0] * output_grad->dims()[1])};
+
+    // convolution backward input operator:  gemm + col2im(or col2vol)
+    // convolution backward weight operator: im2col(or vol2col) + gemm
+    int in_step = static_cast<int>(input->dims()[1]) / groups;
+    int out_step = static_cast<int>(output_grad->dims()[1]) / groups;
+
+    bool is_expand = IsExpand(filter_shape_vec, strides, paddings, dilations);
+    Tensor col;
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    if (is_expand) {
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+    }
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+
+      // if is_expand is false, the operation of set_zero is unnecessary,
+      // because math::matmul will reset input_grad.
+      if (is_expand) {
+        set_zero(dev_ctx, input_grad, static_cast<T>(0));
+      }
+      math::Col2VolFunctor<DeviceContext, T> col2vol;
+      math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
+
+      for (int i = 0; i < batch_size; i++) {
+        Tensor out_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
+        for (int g = 0; g < groups; g++) {
+          // gemm
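+          // Shapes: col_matrix(i_c/g*k_h*k_w, o_h*o_w) =
+          // filter_slice^T(i_c/g*k_h*k_w, o_c/g) x
+          // out_grad_slice(o_c/g, o_h*o_w).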
+          Tensor out_grad_slice =
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
+
+          Tensor in_grad_slice =
+              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (!is_expand) {
+            col_matrix.ShareDataWith(in_grad_slice);
+            col_matrix.Resize(col_matrix_shape);
+          }
+          math::matmul<DeviceContext, T>(dev_ctx, filter_slice, true,
+                                         out_grad_slice, false, T(1.0),
+                                         &col_matrix, T(0.0));
+
+          if (is_expand && data_dim == 2U) {
+            col2im(dev_ctx, col, dilations, strides,
+                   std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
+                   &in_grad_slice);
+          } else if (is_expand && data_dim == 3U) {
+            col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice);
+          }
+        }
+      }
+    }
+
+    if (filter_grad) {
+      filter_grad->mutable_data<T>(context.GetPlace());
+      Tensor filter_grad_ = *filter_grad;
+      filter_grad_.Resize(filter_matrix_shape);
+      set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+      math::Vol2ColFunctor<DeviceContext, T> vol2col;
+      for (int i = 0; i < batch_size; i++) {
+        Tensor out_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+        for (int g = 0; g < groups; g++) {
+          // im2col
+          Tensor out_grad_slice =
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
+
+          if (!is_expand) {
+            col.ShareDataWith(in_slice);
+            col_matrix.ShareDataWith(col);
+            col_matrix.Resize(col_matrix_shape);
+          } else if (data_dim == 2U) {
+            im2col(dev_ctx, in_slice, dilations, strides,
+                   std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
+                   &col);
+          } else if (data_dim == 3U) {
+            vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col);
+          }
+
+          // gemm
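+          // filter_grad_slice(o_c/g, i_c/g*k_h*k_w) +=
+          // out_grad_slice(o_c/g, o_h*o_w) x col_matrix^T; the trailing
+          // T(1.0) makes matmul accumulate across batch samples.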
+          Tensor filter_grad_slice =
+              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
+          math::matmul<DeviceContext, T>(dev_ctx, out_grad_slice, false,
+                                         col_matrix, true, T(1.0),
+                                         &filter_grad_slice, T(1.0));
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc
new file mode 100644
index 0000000000..106b68a0a0
--- /dev/null
+++ b/paddle/operators/conv_shift_op.cc
@@ -0,0 +1,202 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/framework/eigen.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class ConvShiftOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2, "Input(Y)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], y_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(y_dims[1] % 2, 1,
+                      "The 2nd dimension of Input(Y) should be odd.");
+    PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
+                      "The 2nd dimension of Input(Y) should be less than or "
+                      "equal to the 2nd dimension of Input(X).");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ConvShiftGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto x_dims = ctx->GetInputDim("X");
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(y_grad_name)) {
+      auto y_dims = ctx->GetInputDim("Y");
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ConvShiftOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+             "where B is the batch size and M is the data dimension.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x N, "
+             "where B is the batch size and N is the data dimension. N must "
+             "be odd.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+              "i.e., the same shape as X.");
+    AddComment(R"DOC(
+ConvShift Operator.
+
+A layer for circular convolution of two vectors,
+as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
+
+The equation is:
+
+$$Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}$$
+
+where X's index is computed modulo M, and Y's index is computed modulo N.
+
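+For example (illustrative), with M = 3, N = 3, X = [x_0, x_1, x_2] and
+Y = [y_0, y_1, y_2] (zero-based), Out[0] = y_0 * x_2 + y_1 * x_0 + y_2 * x_1.
+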
+Both inputs X and Y can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input X.
+
+)DOC");
+  }
+};
+
+template <typename T>
+class ConvShiftKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<Tensor>("X");
+    auto *Y = context.Input<Tensor>("Y");
+    auto *Out = context.Output<Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x = EigenMatrix<T>::From(*X);
+    auto y = EigenMatrix<T>::From(*Y);
+    auto out = EigenMatrix<T>::From(*Out);
+    out.setZero();
+
+    size_t batch_size = X->dims()[0];
+    size_t x_width = X->dims()[1];
+    size_t y_width = Y->dims()[1];
+    size_t y_half_width = (y_width - 1) / 2;
+
+    for (size_t k = 0; k < batch_size; ++k) {
+      for (size_t i = 0; i < x_width; ++i) {
+        for (size_t j = 0; j < y_width; ++j) {
+          int index = (i + j - y_half_width + x_width) % x_width;
+          out(k, i) += x(k, index) * y(k, j);
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class ConvShiftGradKernel<platform::CPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<Tensor>("X");
+    auto *Y = context.Input<Tensor>("Y");
+    auto *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto *dY = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto x = EigenMatrix<T>::From(*X);
+    auto y = EigenMatrix<T>::From(*Y);
+    auto dout = EigenMatrix<T>::From(*dOut);
+
+    auto x_dims = X->dims();
+    auto y_dims = Y->dims();
+    size_t batch_size = x_dims[0];
+    size_t x_width = x_dims[1];
+    size_t y_width = y_dims[1];
+    size_t y_half_width = (y_width - 1) / 2;
+
+    // The code below trades duplication for efficiency (keeping the if
+    // statement outside of the loops).
+    if (dX) {
+      dX->mutable_data<T>(context.GetPlace());
+      auto dx = EigenMatrix<T>::From(*dX);
+      dx.setZero();
+      for (size_t k = 0; k < batch_size; ++k) {
+        for (size_t i = 0; i < x_width; ++i) {
+          for (size_t j = 0; j < y_width; ++j) {
+            int index = (i + j - y_half_width + x_width) % x_width;
+            dx(k, index) += dout(k, i) * y(k, j);
+          }
+        }
+      }
+    }
+
+    if (dY) {
+      dY->mutable_data<T>(context.GetPlace());
+      auto dy = EigenMatrix<T>::From(*dY);
+      dy.setZero();
+      for (size_t k = 0; k < batch_size; ++k) {
+        for (size_t i = 0; i < x_width; ++i) {
+          for (size_t j = 0; j < y_width; ++j) {
+            int index = (i + j - y_half_width + x_width) % x_width;
+            dy(k, j) += x(k, index) * dout(k, i);
+          }
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OP_CPU_KERNEL(conv_shift,
+                       ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
new file mode 100644
index 0000000000..cf7abc196e
--- /dev/null
+++ b/paddle/operators/conv_shift_op.cu
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+namespace {
+
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
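+// e.g. DivUp(10, 4) == 3: ceiling division, used below to size the grid.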
+
+// Some notes on the design:
+//
+// Each thread is responsible for computing a single output out[k, i].
+// Thread blocks are based on tiles of x with height 1 in the batch dimension.
+//
+// This design is based on the typical use case where the filter
+// y is fairly small. For large y, it would probably be more efficient
+// to also tile across y.
+template <typename T>
+__global__ void ConvShiftForward(const T *x, const T *y, int x_width,
+                                 int y_width, int y_half_width, int batch_size,
+                                 T *out) {
+  extern __shared__ T mem[];
+
+  int tx = threadIdx.x;
+  int i = blockIdx.x * blockDim.x + tx;  // global x index
+  int k = blockIdx.y;                    // batch index
+
+  // Check if we are in a boundary block with fewer x's to process than
+  // blockDim.x. (When x_width divides blockDim.x evenly, the last block
+  // still processes a full blockDim.x elements.)
+  int num_x = (blockIdx.x == gridDim.x - 1 && x_width % blockDim.x != 0)
+                  ? (x_width % blockDim.x)
+                  : blockDim.x;
+
+  T *sx = mem;
+  T *sx_pad = &mem[num_x];
+  T *sy = &mem[blockDim.x + y_width];
+
+  // Collaboratively load y[k, :] and length-y padding of x into shared memory.
+  int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width;
+  for (int j = tx; j < y_width; j += blockDim.x) {
+    sy[j] = y[k * y_width + j];
+    sx_pad[j] = x[k * x_width + (pad_start + j) % x_width];
+  }
+
+  // Load a cyclically shifted slice of x into shared memory.
+  if (tx < num_x) {
+    int load_i = (i - y_half_width + x_width) % x_width;
+    sx[tx] = x[k * x_width + load_i];
+  }
+  __syncthreads();
+
+  if (tx < num_x) {
+    // Compute dot product of sx[tx:tx + y_width] and sy.
+    T sum = 0;
+    for (int j = 0; j < y_width; ++j) {
+      sum += sx[tx + j] * sy[j];
+    }
+
+    // Save to out[k, i].
+    out[k * x_width + i] = sum;
+  }
+}
+
+// Compute x gradient - initial naive implementation with atomic add.
+template <typename T>
+__global__ void ConvShiftGradX(const T *dout, const T *y, int x_width,
+                               int y_width, int y_half_width, int batch_size,
+                               T *dx) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  int j = blockIdx.y;                             // y index
+  int k = blockIdx.z;                             // batch index
+
+  if (i < x_width) {
+    int index = (i + j - y_half_width + x_width) % x_width;
+    atomicAdd(&dx[k * x_width + index],
+              dout[k * x_width + i] * y[k * y_width + j]);
+  }
+}
+
+// Compute y gradient - initial naive implementation with atomic add.
+template <typename T>
+__global__ void ConvShiftGradY(const T *x, const T *dout, int x_width,
+                               int y_width, int y_half_width, int batch_size,
+                               T *dy) {
+  int i = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  int j = blockIdx.y;                             // y index
+  int k = blockIdx.z;                             // batch index
+
+  if (i < x_width) {
+    int index = (i + j - y_half_width + x_width) % x_width;
+    atomicAdd(&dy[k * y_width + j],
+              x[k * x_width + index] * dout[k * x_width + i]);
+  }
+}
+}  // namespace
+
+template <typename T>
+class ConvShiftKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Y = context.Input<Tensor>("Y");
+    Tensor *Out = context.Output<Tensor>("Out");
+    const T *x_data = X->data<T>();
+    const T *y_data = Y->data<T>();
+    T *out_data = Out->mutable_data<T>(context.GetPlace());
+
+    int batch_size = X->dims()[0];
+    int x_width = X->dims()[1];
+    int y_width = Y->dims()[1];
+    int y_half_width = (y_width - 1) / 2;
+
+    const int x_per_block = 256;
+    int num_x_blocks = DivUp(x_width, x_per_block);
+    int mem_per_block = (x_per_block + 2 * y_width) * sizeof(T);
+
+    dim3 grid_dim(num_x_blocks, batch_size);
+
+    auto stream =
+        context.template device_context<platform::CUDADeviceContext>().stream();
+
+    ConvShiftForward<T><<<grid_dim, x_per_block, mem_per_block, stream>>>(
+        x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data);
+  }
+};
+
+template <typename T>
+class ConvShiftGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Y = context.Input<Tensor>("Y");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    const T *x_data = X->data<T>();
+    const T *y_data = Y->data<T>();
+    const T *dout_data = dOut->data<T>();
+
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor *dY = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    int batch_size = X->dims()[0];
+    int x_width = X->dims()[1];
+    int y_width = Y->dims()[1];
+    int y_half_width = (y_width - 1) / 2;
+
+    auto &device_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+
+    const int x_per_block = 256;
+    int num_x_blocks = DivUp(x_width, x_per_block);
+    dim3 grid_dim(num_x_blocks, y_width, batch_size);
+
+    if (dX) {
+      T *dx_data = dX->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, dX, static_cast<T>(0.0));
+      ConvShiftGradX<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
+          dout_data, y_data, x_width, y_width, y_half_width, batch_size,
+          dx_data);
+    }
+    if (dY) {
+      T *dy_data = dY->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, dY, static_cast<T>(0.0));
+      ConvShiftGradY<T><<<grid_dim, x_per_block, 0, device_ctx.stream()>>>(
+          x_data, dout_data, x_width, y_width, y_half_width, batch_size,
+          dy_data);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    conv_shift,
+    ops::ConvShiftKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::CUDADeviceContext, float>);
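The forward launch above assigns one thread per output element within a batch row and reserves `x_per_block + 2 * y_width` shared values per block: the shifted slice of x, its cyclic padding, and the filter y. A small host-side sketch of that arithmetic (made-up sizes; `float` assumed for T, matching the registered kernel):

```cpp
#include <cstdio>

int main() {
  const int x_per_block = 256;  // threads per block, each owning one out[k, i]
  int batch_size = 32, x_width = 1000, y_width = 7;

  // DivUp: number of tiles needed to cover x_width.
  int num_x_blocks = (x_width + x_per_block - 1) / x_per_block;

  // Shared memory layout per block: sx (up to x_per_block values of the
  // shifted x slice), sx_pad (y_width wrap-around values), and sy (y_width
  // filter values) -> x_per_block + 2 * y_width elements in total.
  size_t mem_per_block = (x_per_block + 2 * y_width) * sizeof(float);

  std::printf("grid = (%d, %d), block = %d, smem = %zu bytes\n",
              num_x_blocks, batch_size, x_per_block, mem_per_block);
  return 0;
}
```
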
diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h
new file mode 100644
index 0000000000..6781d87ef0
--- /dev/null
+++ b/paddle/operators/conv_shift_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ConvShiftKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+
+template <typename DeviceContext, typename T>
+class ConvShiftGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+}  // namespace operators
+}  // namespace paddle
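This header only declares `Compute`; the CPU and CUDA sources above supply the definitions by specializing the kernel class for their device. A toy sketch of that declare-then-specialize pattern (illustrative stand-in types, not Paddle's real classes):

```cpp
#include <cstdio>

struct CPUPlace {};  // stand-ins for the platform device tags
struct GPUPlace {};

// Primary template: Compute() is declared but never defined here.
template <typename Place, typename T>
struct ToyKernel {
  void Compute() const;
};

// Per-device definition, as a .cc (or .cu) file would provide it.
template <typename T>
struct ToyKernel<CPUPlace, T> {
  void Compute() const { std::printf("CPU kernel\n"); }
};

int main() {
  ToyKernel<CPUPlace, float> kernel;
  kernel.Compute();  // only the specialization actually used must be defined
  return 0;
}
```
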
diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc
new file mode 100644
index 0000000000..23bc97e13c
--- /dev/null
+++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc
@@ -0,0 +1,251 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memory.h"
+#include "paddle/operators/conv_transpose_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
+using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
+using DataLayout = platform::DataLayout;
+
+static constexpr size_t kConvCUDNNWorkspaceLimitBytes = 1024 * 1024 * 1024;
+
+template <typename T>
+class CUDNNConvTransposeOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    // cudnn v5 does not support dilations
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedFilterDescriptor filter_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
+
+    // (N, M, H, W) or (N, M, D, H, W)
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+    // (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()));
+    // (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()));
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+    // ------------------- cudnn conv workspace ---------------------
+    void* cudnn_workspace = nullptr;
+    size_t workspace_size_in_bytes;  // final workspace to allocate.
+    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    }
+    // ------------------- cudnn conv algorithm ---------------------
+    cudnnConvolutionBwdDataAlgo_t algo;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    // Get the algorithm
+    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+        handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+        // dxDesc: Handle to the previously initialized output tensor
+        // descriptor.
+        cudnn_output_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+
+    // Get the required workspace size for the chosen algorithm.
+    PADDLE_ENFORCE(
+        platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+            handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc,
+            cudnn_output_desc, algo, &workspace_size_in_bytes));
+
+    // Allocate on GPU memory
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+
+    // ------------------- cudnn conv transpose forward ---------------------
+    T alpha = 1.0f, beta = 0.0f;
+    PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+        handle, &alpha, cudnn_filter_desc, filter_data, cudnn_input_desc,
+        input_data, cudnn_conv_desc, algo, cudnn_workspace,
+        workspace_size_in_bytes, &beta, cudnn_output_desc, output_data));
+
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+template <typename T>
+class CUDNNConvTransposeGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto input = ctx.Input<Tensor>("Input");
+    auto filter = ctx.Input<Tensor>("Filter");
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+    const T* input_data = input->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+    const T* filter_data = filter->data<T>();
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    // cudnn v5 does not support dilations
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedFilterDescriptor filter_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout;
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
+
+    // Input: (N, M, H, W) or (N, M, D, H, W)
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+    // Output: (N, C, O_h, O_w) or (N, C, O_d, O_h, O_w)
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output_grad->dims()));
+    // Filter: (M, C, K_h, K_w) or (M, C, K_d, K_h, K_w)
+    cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
+        layout, framework::vectorize2int(filter->dims()));
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+    // ------------------- cudnn backward algorithm ---------------------
+    cudnnConvolutionFwdAlgo_t data_algo;
+    cudnnConvolutionBwdFilterAlgo_t filter_algo;
+    size_t bwd_filter_ws_size, fwd_ws_size;
+    size_t workspace_size_in_bytes = 0;
+    size_t workspace_size_limit = kConvCUDNNWorkspaceLimitBytes;
+    if (user_workspace_size > 0) {
+      workspace_size_limit = user_workspace_size * 1024 * 1024;
+    }
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto handle = dev_ctx.cudnn_handle();
+    if (input_grad) {
+      // Choose the (forward-convolution) algorithm used to compute the
+      // data gradient.
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_input_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+          workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+          handle, cudnn_output_desc, cudnn_filter_desc, cudnn_conv_desc,
+          cudnn_input_desc, data_algo, &fwd_ws_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, fwd_ws_size);
+    }
+
+    if (filter_grad) {
+      // choose backward algorithm for filter
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
+              cudnn_filter_desc,
+              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &filter_algo));
+
+      // get workspace for backwards filter algorithm
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+              handle, cudnn_output_desc, cudnn_input_desc, cudnn_conv_desc,
+              cudnn_filter_desc, filter_algo, &bwd_filter_ws_size));
+      workspace_size_in_bytes =
+          std::max(workspace_size_in_bytes, bwd_filter_ws_size);
+    }
+
+    // ------------------- cudnn conv workspace ---------------------
+    // The workspace is allocated directly in GPU memory.
+    void* cudnn_workspace = nullptr;
+    platform::CUDAPlace gpu = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    // ------------------- cudnn conv backward data ---------------------
+    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    T alpha = 1.0f, beta = 0.0f;
+    if (input_grad) {
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      // Because beta is zero, it is unnecessary to reset input_grad.
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_output_desc, output_grad_data,
+          cudnn_filter_desc, filter_data, cudnn_conv_desc, data_algo,
+          cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc,
+          input_grad_data));
+    }
+
+    // ------------------- cudnn conv backward filter ---------------------
+    if (filter_grad) {
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      // Because beta is zero, it is unnecessary to reset filter_grad.
+      // Gradient with respect to the filter
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+          handle, &alpha, cudnn_output_desc, output_grad_data, cudnn_input_desc,
+          input_data, cudnn_conv_desc, filter_algo, cudnn_workspace,
+          workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data));
+    }
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(conv2d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::CUDNNConvTransposeOpKernel<float>,
+                   ops::CUDNNConvTransposeOpKernel<double>);
+REGISTER_OP_KERNEL(conv2d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::CUDNNConvTransposeGradOpKernel<float>,
+                   ops::CUDNNConvTransposeGradOpKernel<double>);
+
+REGISTER_OP_KERNEL(conv3d_transpose, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::CUDNNConvTransposeOpKernel<float>,
+                   ops::CUDNNConvTransposeOpKernel<double>);
+REGISTER_OP_KERNEL(conv3d_transpose_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::CUDNNConvTransposeGradOpKernel<float>,
+                   ops::CUDNNConvTransposeGradOpKernel<double>);
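The kernels above lean on a duality: the forward pass of a transposed convolution is exactly the data-gradient (backward-data) pass of an ordinary convolution, which is why `cudnnConvolutionBackwardData` computes the output and `cudnnConvolutionForward` computes `Input@GRAD`. A 1-D toy check of that identity (illustrative values only; scatter form vs. gather form):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const int stride = 2, pad = 1, k = 3, in = 4;
  const std::vector<float> x = {1.f, 2.f, 3.f, 4.f};  // conv_transpose input
  const std::vector<float> w = {0.5f, 1.f, -1.f};     // shared filter
  const int out = (in - 1) * stride - 2 * pad + k;    // InferShape formula: 7

  // (a) Transposed convolution in scatter form: x[i] contributes
  // x[i] * w[j] to output position i * stride - pad + j.
  std::vector<float> ya(out, 0.f);
  for (int i = 0; i < in; ++i)
    for (int j = 0; j < k; ++j) {
      int o = i * stride - pad + j;
      if (o >= 0 && o < out) ya[o] += x[i] * w[j];
    }

  // (b) Backward-data of an ordinary strided convolution: gather every
  // (i, j) pair whose forward read would have touched position o.
  std::vector<float> yb(out, 0.f);
  for (int o = 0; o < out; ++o)
    for (int j = 0; j < k; ++j) {
      int num = o + pad - j;
      if (num >= 0 && num % stride == 0 && num / stride < in)
        yb[o] += x[num / stride] * w[j];
    }

  for (int o = 0; o < out; ++o) assert(std::fabs(ya[o] - yb[o]) < 1e-5f);
  std::printf("transposed conv matches conv backward-data on this toy\n");
  return 0;
}
```
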
diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc
new file mode 100644
index 0000000000..089290a506
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.cc
@@ -0,0 +1,323 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of ConvTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of ConvTransposeOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
+
+  PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5,
+                 "ConvTransposeOp input should be a 4-D or 5-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims.size(), filter_dims.size(),
+                    "The ranks of ConvTransposeOp input and filter should "
+                    "be the same.");
+  PADDLE_ENFORCE(in_dims.size() - strides.size() == 2U,
+                 "The rank of ConvTransposeOp input should be 2 greater "
+                 "than the size of strides.");
+  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
+                    "The sizes of the ConvTransposeOp paddings and strides "
+                    "attributes should be the same.");
+  PADDLE_ENFORCE_EQ(paddings.size(), dilations.size(),
+                    "The sizes of the ConvTransposeOp paddings and dilations "
+                    "attributes should be the same.");
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
+                    "In ConvTransposeOp, the number of input channels should "
+                    "be equal to the number of filters.");
+
+  std::vector<int64_t> output_shape({in_dims[0], filter_dims[1]});
+  for (size_t i = 0; i < strides.size(); ++i) {
+    auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1;
+    output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - 2 * paddings[i] +
+                           filter_extent);
+  }
+  ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+}
+
+framework::OpKernelType ConvTransposeOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution transpose operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H is the height of the feature, and "
+      "W is the width of the feature.");
+  AddInput(
+      "Filter",
+      "(Tensor) The filter tensor of convolution transpose operator. "
+      "The format of the filter tensor is MCHW, where M is the number of "
+      "input feature channels, C is the number of "
+      "output feature channels,"
+      "H is the height of the filter, and W is the width of the filter. "
+      "We enforce groups number == 1 in the convolution transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCHW.");
+
+  AddAttr<std::vector<int>>("dilations",
+                            "(vector<int> default:{1, 1}), the "
+                            "dilations(h_dilation, w_dilation) of convolution "
+                            "transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
+      "convolution transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "transpose operator.")
+      .SetDefault({0, 0});
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+  AddAttr<int>("workspace_size_MB",
+               "Used in cudnn kernel only. workspace size for cudnn, in MB, "
+               "workspace is a section of GPU memory which will be "
+               "allocated/freed each time the operator runs, larger "
+               "workspace size can increase performance but also requires "
+               "better hardward. This size should be carefully setted.")
+      .SetDefault(4096);
+  AddComment(R"DOC(
+Convolution2D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input,
+filter, and the dilations, strides, paddings, and groups parameters. The size of
+each dimension of the parameters is checked during shape inference.
+Input(Input) and output(Output) are in NCHW format, where N is the batch size,
+C is the number of channels, H is the height of the feature, and W is the width
+of the feature.
+Filter(Input) is in MCHW format, where M is the number of input feature
+channels, C is the number of output feature channels, H is the height of the
+filter, and W is the width of the filter.
+Parameters(strides, paddings) each contain two elements, representing height
+and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\
+       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
+  $$
+)DOC");
+}
+
+Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(OpProto* proto,
+                                               OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("Input",
+           "(Tensor) The input tensor of convolution transpose operator."
+           "The format of input tensor is NCDHW. Where N is batch size, C is "
+           "the number of channels, D is the depth of the feature, H is the "
+           "height of the feature, and "
+           "W is the width of the feature.");
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator."
+           "The format of the filter tensor is MCDHW, where M is the number of "
+           "input feature channels, C is the number of "
+           "output feature channels, D "
+           "is the depth of the filter, H is the height of the filter, and "
+           "W is the width of the filter."
+           "We enforce groups number == 1 and padding == 0 in "
+           "the convolution3d transpose scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator."
+            "The format of output tensor is also NCDHW."
+            "Where N is batch size, C is "
+            "the number of channels, D is the depth of the feature, H is the "
+            "height of the feature, and W is the width of the feature.");
+
+  AddAttr<std::vector<int>>(
+      "dilations",
+      "(vector<int> default:{1, 1, 1}), the "
+      "dilations(d_dilation,h_dilation, w_dilation) of convolution "
+      "transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int> default:{1, 1, 1}), the "
+                            "strides{d_stride, h_stride, w_stride} of "
+                            "convolution transpose operator.")
+      .SetDefault({1, 1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
+                            "h_pad, w_pad) of convolution transpose operator.")
+      .SetDefault({0, 0, 0});
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+  AddAttr<int>("workspace_size_MB",
+               "Used in cudnn kernel only. workspace size for cudnn, in MB, "
+               "workspace is a section of GPU memory which will be "
+               "allocated/freed each time the operator runs, larger "
+               "workspace size can increase performance but also requires "
+               "better hardward. This size should be carefully setted.")
+      .SetDefault(4096);
+  AddComment(R"DOC(
+Convolution3D Transpose Operator.
+
+The convolution transpose operation calculates the output based on the input,
+filter, and the dilations, strides, paddings, and groups parameters. The size of
+each dimension of the parameters is checked during shape inference.
+Input(Input) and output(Output) are in NCDHW format, where N is the batch size,
+C is the number of channels, D is the depth of the feature, H is the height of
+the feature, and W is the width of the feature.
+Filter(Input) is in MCDHW format, where M is the number of input feature
+channels, C is the number of output feature channels, D is the depth of the
+filter, H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) each contain three elements, representing depth,
+height, and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
+       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
+  Output:
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (D_f - 1) + 1 \\
+       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (H_f - 1) + 1 \\
+       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (W_f - 1) + 1
+  $$
+)DOC");
+}
+
+void ConvTransposeOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker,
+            conv2d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
+
+REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker,
+            conv3d_transpose_grad, ops::ConvTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CPUDeviceContext,
+                                     double>);
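A quick numeric instance of the InferShape output formula above, `o = (i - 1) * stride - 2 * pad + dilation * (k - 1) + 1` (made-up sizes, for illustration only):

```cpp
#include <cstdio>

int main() {
  // One spatial dimension: input extent 5, stride 2, padding 1,
  // dilation 1, filter extent 3.
  int in = 5, stride = 2, pad = 1, dilation = 1, k = 3;
  int out = (in - 1) * stride - 2 * pad + dilation * (k - 1) + 1;
  std::printf("H_out = %d\n", out);  // (5 - 1) * 2 - 2 + 3 = 9
  return 0;
}
```
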
diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc
new file mode 100644
index 0000000000..f1d827c606
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.cu.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/conv_transpose_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    conv2d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv2d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    conv3d_transpose,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GemmConvTransposeKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    conv3d_transpose_grad,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     float>,
+    ops::GemmConvTransposeGradKernel<paddle::platform::CUDADeviceContext,
+                                     double>);
diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h
new file mode 100644
index 0000000000..8c0d57afcd
--- /dev/null
+++ b/paddle/operators/conv_transpose_op.h
@@ -0,0 +1,291 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Define Op classes in .h file so that other conv transpose
+// operator implementations can reuse the code.
+class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+class Conv3DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv3DTransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+class ConvTransposeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class ConvTransposeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class GemmConvTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // The filter will be reshaped, so it should not be a constant pointer.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+    // groups will always be disabled in conv transpose.
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape =
+        framework::slice_ddim(output->dims(), 1, output->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    output->mutable_data<T>(context.GetPlace());
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, output, static_cast<T>(0));
+
+    math::Col2ImFunctor<math::ColFormat::kCFO, DeviceContext, T> col2im;
+    math::Col2VolFunctor<DeviceContext, T> col2vol;
+
+    // convolution transpose: gemm + col2im or col2vol (similar to conv-backward
+    // on input)
+    for (int i = 0; i < batch_size; i++) {
+      // batch with size (m, h * w) or (m, d * h * w)
+      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+
+      // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
+
+      // col_matrix = filter * input_batch
+      // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+      math::matmul<DeviceContext, T>(dev_ctx, filter, true, input_batch, false,
+                                     static_cast<T>(1.0), &col_matrix,
+                                     static_cast<T>(0.0));
+
+      if (data_dim == 2U) {
+        // col2im: col (sharing col_matrix's data) -> output_batch
+        // from (c * k_h * k_w, h * w) to (c, o_h, o_w)
+        col2im(dev_ctx, col, dilations, strides,
+               std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                paddings[1]},
+               &output_batch);
+      } else if (data_dim == 3U) {
+        // col2vol: col (sharing col_matrix's data) -> output_batch
+        // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w)
+        col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch);
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+    // For filter, we do not use a const pointer because we will reshape it,
+    // but we should avoid modifying its value.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    if ((!input_grad) && (!filter_grad)) return;
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
+
+    const int batch_size = static_cast<int>(input->dims()[0]);
+
+    // input_shape_vec: {n, c, h, w} or {n, c, d, h, w}
+    std::vector<int64_t> input_shape_vec = framework::vectorize(input->dims());
+    // filter_shape_vec: {k_o, k_c, k_h, k_w} or {k_o, k_c, k_d, k_h, k_w}
+    std::vector<int64_t> filter_shape_vec = framework::vectorize(filter.dims());
+
+    // use col_shape in the im2col and col2im (or vol2col and col2vol)
+    // calculation
+    // col_shape_vec: {c, k_h, k_w, h, w} or {c, k_d, k_h, k_w, d, h, w}
+    size_t data_dim = filter_shape_vec.size() - 2;
+    std::vector<int64_t> col_shape_vec(1 + 2 * data_dim);
+    col_shape_vec[0] = output_grad->dims()[1];
+    for (size_t j = 0; j < data_dim; ++j) {
+      col_shape_vec[j + 1] = filter_shape_vec[j + 2];
+      col_shape_vec[j + 1 + data_dim] = input_shape_vec[j + 2];
+    }
+    DDim col_shape(framework::make_ddim(col_shape_vec));
+
+    // use col_matrix_shape in the gemm calculation
+    // size: (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w)
+    DDim col_matrix_shape = framework::flatten_to_2d(col_shape, data_dim + 1);
+
+    // output size: (c, o_h, o_w) or (c, o_d, o_h, o_w)
+    DDim output_shape = framework::slice_ddim(output_grad->dims(), 1,
+                                              output_grad->dims().size());
+
+    // input matrix size: (m, h * w) or (m, d * h * w)
+    DDim input_matrix_shape = {input->dims()[1], col_matrix_shape[1]};
+
+    // filter size: (m, c * k_h * k_w) or (m, c * k_d * k_h * k_w)
+    DDim filter_matrix_shape = {input->dims()[1], col_matrix_shape[0]};
+    filter.Resize(filter_matrix_shape);
+
+    // convolution transpose grad on input:
+    // im2col + gemm (similar to conv-forward)
+    // input need to compute gradient
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (input_grad || filter_grad) {
+      Tensor col;
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      // col_matrix shares the same piece of data with col,
+      // but will be reshaped into a two-dimensional matrix shape
+      // to call the matrix multiplication interface.
+      Tensor col_matrix;
+      col_matrix.ShareDataWith(col);
+      col_matrix.Resize(col_matrix_shape);
+
+      Tensor filter_grad_;
+      math::SetConstant<DeviceContext, T> set_zero;
+
+      math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
+      math::Vol2ColFunctor<DeviceContext, T> vol2col;
+
+      if (input_grad) {
+        input_grad->mutable_data<T>(context.GetPlace());
+      }
+      if (filter_grad) {  // filter size (m, c, k_h, k_w)
+        filter_grad->mutable_data<T>(context.GetPlace());
+        set_zero(dev_ctx, filter_grad, static_cast<T>(0));
+        filter_grad_ = *filter_grad;
+        filter_grad_.Resize(filter_matrix_shape);
+      }
+
+      for (int i = 0; i < batch_size; i++) {
+        // batch with size (c, o_h, o_w) or (c, o_d, o_h, o_w)
+        Tensor output_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_shape);
+
+        if (data_dim == 2U) {
+          // im2col: dy -> col matrix
+          // from (c, o_h, o_w) to (c * k_h * k_w, h * w)
+          im2col(dev_ctx, output_grad_batch, dilations, strides,
+                 std::vector<int>{paddings[0], paddings[1], paddings[0],
+                                  paddings[1]},
+                 &col);
+        } else if (data_dim == 3U) {
+          // vol2col: dy -> col_matrix
+          // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w)
+          vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings,
+                  &col);
+        }
+
+        if (input_grad) {
+          // batch with size (m, h * w) or (m, d * h * w)
+          Tensor input_grad_batch =
+              input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: dx = filter * dy
+          // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
+          // or
+          // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w)
+          // -> (m, d * h * w)
+          math::matmul<DeviceContext, T>(
+              dev_ctx, filter, false, col_matrix, false, static_cast<T>(1.0),
+              &input_grad_batch, static_cast<T>(0.0));
+        }
+        if (filter_grad) {
+          // input batch
+          Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+          // gemm: d_filter = x * dy^T
+          // (m, h * w) * (h * w, c * k_h * k_w) -> (m, c * k_h * k_w)
+          // or
+          // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d *
+          // k_h * k_w)
+          math::matmul<DeviceContext, T>(dev_ctx, in_batch, false, col_matrix,
+                                         true, static_cast<T>(1.0),
+                                         &filter_grad_, static_cast<T>(1.0));
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
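The GEMM + col2im path above is mostly shape bookkeeping: the filter, viewed as a `(m, c * k_h * k_w)` matrix, is multiplied transposed against an input batch `(m, h * w)` to produce the col matrix `(c * k_h * k_w, h * w)`, which col2im then scatters into the `(c, o_h, o_w)` output. A sketch of those shapes with made-up sizes (illustration only, no Paddle dependencies):

```cpp
#include <cstdio>

int main() {
  int m = 8;                  // input channels
  int c = 4, kh = 3, kw = 3;  // output channels and filter size
  int h = 5, w = 5;           // input spatial size

  int filter_rows = m, filter_cols = c * kh * kw;  // filter: (m, c*k_h*k_w)
  int input_rows = m, input_cols = h * w;          // batch:  (m, h*w)

  // col_matrix = filter^T * input_batch
  int col_rows = filter_cols, col_cols = input_cols;

  std::printf("filter^T (%d x %d) * input (%d x %d) -> col (%d x %d)\n",
              filter_cols, filter_rows, input_rows, input_cols, col_rows,
              col_cols);
  return 0;
}
```
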
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
new file mode 100644
index 0000000000..9019a1edb3
--- /dev/null
+++ b/paddle/operators/cos_sim_op.cc
@@ -0,0 +1,162 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cos_sim_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class CosSimOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // not-null checks
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("XNorm"),
+                   "Output(XNorm) of CosSimOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("YNorm"),
+                   "Output(YNorm) of CosSimOp should not be null.");
+
+    // shape check
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                      "Ranks of Input(X) and Input(Y) must be equal.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) must not be less than 2.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
+                   " just 1 (which will be broadcasted to match Input(X)).");
+
+    // resize tensor
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->SetOutputDim("XNorm", {x_dims[0], 1});
+    ctx->SetOutputDim("YNorm", {y_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class CosSimOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CosSimOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The 1st input of cos_sim op.");
+    AddInput("Y", "The 2nd input of cos_sim op.");
+    AddOutput("Out", "The output of cos_sim op.");
+    AddOutput("XNorm",
+              "Norm of the first input, reduced along the 1st "
+              "dimension.")
+        .AsIntermediate();
+    AddOutput("YNorm",
+              "Norm of the second input, reduced along the 1st "
+              "dimension.")
+        .AsIntermediate();
+
+    AddComment(R"DOC(
+Cosine Similarity Operator.
+
+$Out = X^T * Y / (\sqrt{X^T * X} * \sqrt{Y^T * Y})$
+
+The input X and Y must have the same shape, except that the 1st dimension
+of input Y could be just 1 (different from input X), which will be
+broadcasted to match the shape of input X before computing their cosine
+similarity.
+
+Both the input X and Y can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
+)DOC");
+  }
+};
+
+class CosSimOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    // not-null checks
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("XNorm"), "Input(XNorm) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("YNorm"), "Input(YNorm) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) must not be null.");
+
+    // shape check
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto xnorm_dims = ctx->GetInputDim("XNorm");
+    auto ynorm_dims = ctx->GetInputDim("YNorm");
+    auto out_dims = ctx->GetInputDim("Out");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
+                      "Ranks of Input(X) and Input(Y) must be equal.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) must not be less than 2.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 1, x_dims.size()),
+                      framework::slice_ddim(y_dims, 1, y_dims.size()),
+                      "All dimensions except the 1st of Input(X) and Input(Y) "
+                      "must be equal.");
+    PADDLE_ENFORCE(x_dims[0] == y_dims[0] || y_dims[0] == 1,
+                   "The 1st dimension of Input(Y) must be equal to Input(X) or"
+                   " just 1 (which will be broadcasted to match Input(X)).");
+    auto target_xnorm_dims = framework::make_ddim({x_dims[0], 1});
+    auto target_ynorm_dims = framework::make_ddim({y_dims[0], 1});
+    PADDLE_ENFORCE_EQ(xnorm_dims, target_xnorm_dims,
+                      "Shape of Input(XNorm) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(ynorm_dims, target_ynorm_dims,
+                      "Shape of Input(YNorm) must be [Y.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_dims, target_xnorm_dims,
+                      "Shape of Input(Out) must be [X.Dim(0), 1].");
+    PADDLE_ENFORCE_EQ(out_grad_dims, target_xnorm_dims,
+                      "Shape of Input(Out@Grad) must be [X.Dim(0), 1].");
+
+    // resize tensor
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad,
+            ops::CosSimOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    cos_sim, ops::CosSimKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    cos_sim_grad,
+    ops::CosSimGradKernel<paddle::platform::CPUDeviceContext, float>);
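For reference, the per-row gradients that `cos_sim_grad` must produce follow from the quotient rule applied to the formula in the DOC string. With $z = x \cdot y / (\lVert x\rVert\,\lVert y\rVert)$ for one row pair, the standard expressions (stated here as a derivation aid, not quoted from the kernel) are:

```latex
\frac{\partial z}{\partial x}
  = \frac{y}{\lVert x\rVert\,\lVert y\rVert} - z\,\frac{x}{\lVert x\rVert^{2}},
\qquad
\frac{\partial z}{\partial y}
  = \frac{x}{\lVert x\rVert\,\lVert y\rVert} - z\,\frac{y}{\lVert y\rVert^{2}}.
```

The `XNorm` and `YNorm` intermediate outputs exist precisely so the backward pass can evaluate these expressions without recomputing the norms.
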
diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu
new file mode 100644
index 0000000000..9e5d1b6e4f
--- /dev/null
+++ b/paddle/operators/cos_sim_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/cos_sim_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    cos_sim, ops::CosSimKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    cos_sim_grad,
+    ops::CosSimGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
new file mode 100644
index 0000000000..eadcca55f9
--- /dev/null
+++ b/paddle/operators/cos_sim_op.h
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/for_range.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class CosSimKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get Tensor
+    auto* in_x = context.Input<Tensor>("X");
+    auto* in_y = context.Input<Tensor>("Y");
+    auto* out_z = context.Output<Tensor>("Out");
+    auto* out_x_norm = context.Output<Tensor>("XNorm");
+    auto* out_y_norm = context.Output<Tensor>("YNorm");
+    out_z->mutable_data<T>(context.GetPlace());
+    out_x_norm->mutable_data<T>(context.GetPlace());
+    out_y_norm->mutable_data<T>(context.GetPlace());
+
+    int rows_x = in_x->dims()[0];
+    int rows_y = in_y->dims()[0];
+
+    int cols = framework::product(in_x->dims()) / rows_x;
+
+    if (rows_x == rows_y) {
+      math::CosSimFunctor<T, true> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
+    } else {
+      math::CosSimFunctor<T, false> functor(
+          in_x->data<T>(), in_y->data<T>(), out_x_norm->data<T>(),
+          out_y_norm->data<T>(), out_z->data<T>(), cols);
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(context.device_context()), rows_x);
+      for_range(functor);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CosSimGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    // get Tensor
+    auto* in_x = context.Input<Tensor>("X");
+    auto* in_y = context.Input<Tensor>("Y");
+    auto* in_z = context.Input<Tensor>("Out");
+    auto* in_x_norm = context.Input<Tensor>("XNorm");
+    auto* in_y_norm = context.Input<Tensor>("YNorm");
+    auto* out_grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out_grad_y = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto* in_grad_z = context.Input<Tensor>(framework::GradVarName("Out"));
+
+    // compute gradient
+    int rows_x = in_x->dims()[0];
+    int rows_y = in_y->dims()[0];
+    int cols = framework::product(in_x->dims()) / rows_x;
+
+    if (rows_x == rows_y) {
+      if (out_grad_x) {
+        math::CosSimGradFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
+      }
+      if (out_grad_y) {
+        math::CosSimGradFunctor<T> functor(
+            in_y_norm->data<T>(), in_x_norm->data<T>(), in_y->data<T>(),
+            in_x->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_y->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
+      }
+    } else {
+      if (out_grad_x) {
+        math::CosSimDxFunctor<T> functor(
+            in_x_norm->data<T>(), in_y_norm->data<T>(), in_x->data<T>(),
+            in_y->data<T>(), in_z->data<T>(), in_grad_z->data<T>(),
+            out_grad_x->mutable_data<T>(context.GetPlace()), cols);
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(context.device_context()),
+            rows_x);
+        for_range(functor);
+      }
+      if (out_grad_y) {
+        out_grad_y->mutable_data<T>(context.GetPlace());
+        math::SetConstant<DeviceContext, T> set_zero;
+        auto& dev_ctx = context.template device_context<DeviceContext>();
+        set_zero(dev_ctx, out_grad_y, static_cast<T>(0));
+
+        math::CosSimDyFunctor<DeviceContext, T> functor;
+        functor(dev_ctx, in_x_norm->data<T>(), in_y_norm->data<T>(),
+                in_x->data<T>(), in_y->data<T>(), in_z->data<T>(),
+                in_grad_z->data<T>(), static_cast<size_t>(rows_x),
+                static_cast<size_t>(cols), out_grad_y->data<T>());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
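
For readers tracing the kernels above: math::CosSimFunctor computes, per row,
the cosine similarity out = dot(x, y) / (||x|| * ||y||), and also writes the
two norms out so the backward kernels can reuse them. A minimal standalone
sketch of that per-row computation (illustrative plain C++ with a hypothetical
helper name, not the Paddle functor itself):

    #include <cmath>
    #include <cstddef>

    // Cosine similarity of one row pair; the norms are written out as well,
    // since the gradient kernels read XNorm/YNorm instead of recomputing them.
    void CosSimRow(const float* x, const float* y, size_t cols, float* x_norm,
                   float* y_norm, float* out) {
      float xy = 0.f, xx = 0.f, yy = 0.f;
      for (size_t i = 0; i < cols; ++i) {
        xy += x[i] * y[i];
        xx += x[i] * x[i];
        yy += y[i] * y[i];
      }
      *x_norm = std::sqrt(xx);
      *y_norm = std::sqrt(yy);
      *out = xy / ((*x_norm) * (*y_norm));
    }

When rows_y == 1, the same y row is paired with every row of X, which is the
broadcast case the kernels dispatch to CosSimFunctor<T, false>.
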
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
new file mode 100644
index 0000000000..30626028c1
--- /dev/null
+++ b/paddle/operators/crf_decoding_op.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/crf_decoding_op.h"
+
+namespace paddle {
+namespace operators {
+class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CRFDecodingOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Emission",
+             "(LoDTensor, default: LoDTensor<float>). A LoDTensor with shape "
+             "[N x D] where N is the size of the mini-batch and D is the total "
+             "tag number. This input is the unscaled emission weight matrix of "
+             "the linear_chain_crf operator.");
+    AddInput(
+        "Transition",
+        "(Tensor, default: Tensor<float>). A Tensor with shape [(D + 2) x D]. "
+        "This input is the transition weights learned by the linear_chain_crf "
+        "operator, denoted as w. The 1st row of w are transition weights for "
+        "the start mask. The 2nd row of w are transition weights for the end "
+        "mask. Transition weights between other tags begin from the 3rd row of "
+        "w. See more details in comments of the linear_chain_crf operator.");
+    AddInput(
+        "Label",
+        "(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape "
+        "[N x 1]. This input is optional. See more details in the operator's "
+        "comments.")
+        .AsDispensable();
+    AddOutput(
+        "ViterbiPath",
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
+        "return changes depending on whether the Input(Label) (the ground "
+        "truth) is given. See more details in the operator's comment.");
+    AddComment(R"DOC(
+The crf_decoding operator reads the emission feature weights and the transition
+feature weights learned by the linear_chain_crf operator. It implements the
+Viterbi algorithm which is a dynamic programming algorithm for finding the most
+likely sequence of hidden states, called the Viterbi path, that results in a
+sequence of observed tags.
+
+The output of this operator changes according to whether Input(Label) is given:
+
+1. Input(Label) is given:
+
+This happens in training. This operator is used together with the chunk_eval
+operator.
+
+When Input(Label) is given, the crf_decoding operator returns a row vector
+with shape [N x 1] whose values are either 0, indicating an incorrectly
+predicted tag, or 1, indicating a correctly predicted tag. Such an output is
+the input to the chunk_eval operator.
+
+2. Input(Label) is not given:
+
+This is the standard decoding process.
+
+The crf_decoding operator returns a row vector with shape [N x 1] whose values
+range from 0 to the maximum tag number minus 1. Each element is the index of
+the predicted tag.
+)DOC");
+  }
+};
+
+class CRFDecodingOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Emission"),
+                   "Input(Emission) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Transition"),
+                   "Input(Transition) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ViterbiPath"),
+                   "Output(ViterbiPath) should be not null.");
+
+    auto emission_dims = ctx->GetInputDim("Emission");
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+                      "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
+    auto transition_dims = ctx->GetInputDim("Transition");
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+                      "The Input(Transition) should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        transition_dims[0] - 2, transition_dims[1],
+        "An invalid dimension for the Input(Transition), which should "
+        "be a 2-D tensor with shape [(D + 2) x D].");
+    PADDLE_ENFORCE_EQ(
+        emission_dims[1], transition_dims[1],
+        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+        "should be equal to the tag number.");
+
+    if (ctx->HasInput("Label")) {
+      auto label_dims = ctx->GetInputDim("Label");
+      PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                     "The Input(Label) should be a 2-D tensor with the 2nd "
+                     "dimensions fixed to 1.");
+      PADDLE_ENFORCE_EQ(
+          emission_dims[0], label_dims[0],
+          "The height of Input(Emission) and the height of Input(Label) "
+          "should be the same.");
+    }
+
+    ctx->ShareLoD("Emission", /*->*/ "ViterbiPath");
+    ctx->SetOutputDim("ViterbiPath", {emission_dims[0], 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        platform::CPUPlace());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp,
+                             ops::CRFDecodingOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    crf_decoding,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CRFDecodingOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h
new file mode 100644
index 0000000000..ce2f4e6622
--- /dev/null
+++ b/paddle/operators/crf_decoding_op.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+using framework::LoD;
+using framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class CRFDecodingOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* emission_weights = ctx.Input<LoDTensor>("Emission");
+    auto* transition_weights = ctx.Input<Tensor>("Transition");
+    auto* label = ctx.Input<LoDTensor>("Label");
+    auto* decoded_path = ctx.Output<Tensor>("ViterbiPath");
+
+    PADDLE_ENFORCE_EQ(emission_weights->NumLevels(), 1UL,
+                      "The Input(Emission) should be a sequence.");
+    auto lod = emission_weights->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Emission) must be a sequence.");
+    const size_t level = 0;
+    const size_t seq_num = lod[level].size() - 1;
+
+    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
+    math::SetConstant<DeviceContext, int64_t>()(
+        ctx.template device_context<DeviceContext>(), decoded_path, 0);
+    for (size_t i = 0; i < seq_num; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor decoded_path_one_seq = decoded_path->Slice(start_pos, end_pos);
+      Decode(emission_weights->Slice(start_pos, end_pos), *transition_weights,
+             &decoded_path_one_seq);
+    }
+
+    if (label) {
+      PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
+                        "The Input(Label) should be a sequence.");
+      const int64_t* label_value = label->data<int64_t>();
+      size_t batch_size = emission_weights->dims()[0];
+      for (size_t i = 0; i < batch_size; ++i) {
+        path[i] = label_value[i] == path[i] ? 1 : 0;
+      }
+    }
+  }
+
+ private:
+  void Decode(const Tensor& emission_weights, const Tensor& transition_weights,
+              Tensor* decoded_path) const {
+    auto emission_dims = emission_weights.dims();
+    const size_t seq_len = emission_dims[0];
+    const size_t tag_num = emission_dims[1];
+
+    const size_t state_trans_base_idx = 2;
+
+    const T* x = emission_weights.data<T>();
+    const T* w = transition_weights.data<T>();
+    int64_t* path = decoded_path->data<int64_t>();
+
+    // alpha is a memo table. An element alpha(k, v) records the score of the
+    // best sequence of tags from position 1 to position k with v being the end
+    // tag.
+    Tensor alpha;
+    T* alpha_value = alpha.mutable_data<T>(emission_dims, platform::CPUPlace());
+    Tensor track;
+    int* track_value =
+        track.mutable_data<int>(emission_dims, platform::CPUPlace());
+
+    for (size_t i = 0; i < tag_num; ++i) alpha_value[i] = w[i] + x[i];
+
+    for (size_t k = 1; k < seq_len; ++k) {
+      for (size_t i = 0; i < tag_num; ++i) {
+        T max_score = -std::numeric_limits<T>::max();
+        int max_j = 0;
+        for (size_t j = 0; j < tag_num; ++j) {
+          T score = alpha_value[(k - 1) * tag_num + j] +
+                    w[(j + state_trans_base_idx) * tag_num + i];
+          if (score > max_score) {
+            max_score = score;
+            max_j = j;
+          }
+        }
+
+        alpha_value[k * tag_num + i] = max_score + x[k * tag_num + i];
+        track_value[k * tag_num + i] = max_j;
+      }
+    }
+
+    T max_score = -std::numeric_limits<T>::max();
+    int max_i = 0;
+    for (size_t i = 0; i < tag_num; ++i) {
+      T score = alpha_value[(seq_len - 1) * tag_num + i] + w[tag_num + i];
+      if (score > max_score) {
+        max_score = score;
+        max_i = i;
+      }
+    }
+    path[seq_len - 1] = max_i;
+    for (int k = seq_len - 1; k >= 1; --k) {
+      path[k - 1] = max_i = track_value[k * tag_num + max_i];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
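
The Decode member above is a textbook max-sum Viterbi pass: alpha holds the
best score ending in each tag, track records the argmax for backtracking, and
the first two rows of w carry the start/end transition weights. A
self-contained sketch of the same recurrence, without the start/end rows
(illustrative code with a hypothetical helper name, not part of the patch):

    #include <cstddef>
    #include <limits>
    #include <vector>

    // Viterbi over seq_len steps and tag_num tags: x[k][i] is the emission
    // score of tag i at step k, w[j][i] the transition score from tag j to i.
    std::vector<int> Viterbi(const std::vector<std::vector<float>>& x,
                             const std::vector<std::vector<float>>& w) {
      size_t seq_len = x.size(), tag_num = x[0].size();
      std::vector<std::vector<float>> alpha(seq_len,
                                            std::vector<float>(tag_num));
      std::vector<std::vector<int>> track(seq_len,
                                          std::vector<int>(tag_num, 0));
      alpha[0] = x[0];
      for (size_t k = 1; k < seq_len; ++k) {
        for (size_t i = 0; i < tag_num; ++i) {
          float best = -std::numeric_limits<float>::max();
          int best_j = 0;
          for (size_t j = 0; j < tag_num; ++j) {
            float score = alpha[k - 1][j] + w[j][i];
            if (score > best) {
              best = score;
              best_j = static_cast<int>(j);
            }
          }
          alpha[k][i] = best + x[k][i];
          track[k][i] = best_j;
        }
      }
      int tag = 0;  // best final tag
      for (size_t i = 1; i < tag_num; ++i)
        if (alpha[seq_len - 1][i] > alpha[seq_len - 1][tag])
          tag = static_cast<int>(i);
      std::vector<int> path(seq_len);
      for (size_t k = seq_len; k-- > 0;) {  // backtrack
        path[k] = tag;
        if (k > 0) tag = track[k][tag];
      }
      return path;
    }
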
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
new file mode 100644
index 0000000000..310e351443
--- /dev/null
+++ b/paddle/operators/crop_op.cc
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/crop_op.h"
+#include <boost/lexical_cast.hpp>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class CropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of CropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of CropOp should not be null.");
+    auto x_dim = ctx->GetInputDim("X");
+    if (!ctx->HasInput("Y")) {
+      auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+      PADDLE_ENFORCE_EQ(
+          int64_t(shape.size()), x_dim.size(),
+          "Shape size should be equal to dimention size of input tensor.");
+      std::vector<int64_t> tensor_shape(shape.size());
+      for (size_t i = 0; i < shape.size(); ++i) {
+        tensor_shape[i] = static_cast<int64_t>(shape[i]);
+      }
+      ctx->SetOutputDim("Out", framework::make_ddim(tensor_shape));
+    } else {
+      auto y_dim = ctx->GetInputDim("Y");
+      PADDLE_ENFORCE_EQ(framework::arity(x_dim), framework::arity(y_dim),
+                        "Tensor rank of both CropOp's "
+                        "inputs must be same.");
+      ctx->SetOutputDim("Out", y_dim);
+    }
+  }
+};
+
+class CropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CropOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input of pad op. "
+             "The input should be a k-D tensor(k > 0 and k < 7).");
+    AddInput("Y",
+             "The input used as reference for cropping, "
+             "which is of the same dimensions as X.")
+        .AsDispensable();
+    AddOutput("Out",
+              "The output of crop op, "
+              "which is of the same dimensions as X.");
+    AddAttr<std::vector<int>>("offsets",
+                              "A list<int> describing offsets to be cropped. "
+                              "The size of offsets list should be the same as "
+                              "the dimension size of input X.");
+    AddAttr<std::vector<int>>("shape",
+                              "A list<int> describing the shape of output. "
+                              "The size of shape list should be the same as "
+                              "the dimension size of input X.")
+        .SetDefault(std::vector<int>());
+    AddComment(R"DOC(
+Crop Operator.
+
+Crop input into output, as specified by offsets and shape.
+
+There are two ways to set shape:
+1. reference input: crop input X into the same shape as reference input.
+                    The dimension of reference input should
+                    be the same as the dimension of input X.
+2. shape list: crop input X into the shape described by a list<int>.
+               The size of shape list should be the same as
+               the dimension size of input X.
+
+The input should be a k-D tensor(k > 0 and k < 7). As an example:
+
+Case 1:
+Given
+
+    X = [[0, 1, 2, 0, 0]
+         [0, 3, 4, 0, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    shape = [2, 2],
+
+we get:
+
+    Out = [[1, 2],
+           [3, 4]].
+
+
+Case 2:
+Given
+
+    X = [[0, 1, 2, 5, 0]
+         [0, 3, 4, 6, 0]
+         [0, 0, 0, 0, 0]],
+
+and
+
+    offsets = [0, 1],
+
+and
+
+    Y = [[0, 0, 0]
+         [0, 0, 0]],
+
+we get:
+
+    Out = [[1, 2, 5],
+           [3, 4, 6]].
+)DOC");
+  }
+};
+
+class CropOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CPU_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu
new file mode 100644
index 0000000000..bba5db4c6c
--- /dev/null
+++ b/paddle/operators/crop_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/crop_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel<float>);
+REGISTER_OP_CUDA_KERNEL(
+    crop_grad, ops::CropGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
new file mode 100644
index 0000000000..69d1a92977
--- /dev/null
+++ b/paddle/operators/crop_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {  // Internal
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using framework::Tensor;
+
+template <typename T>
+class CropKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto x_stride = framework::stride(x->dims());
+    auto out_stride = framework::stride(out->dims());
+    auto offsets = context.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        x->dims().size(), static_cast<int64_t>(offsets.size()),
+        "Offsets size should be equal to dimension size of input tensor.");
+    int64_t offset = 0;
+    for (size_t i = 0; i < offsets.size(); ++i) {
+      offset += (x_stride[i] * offsets[i]);
+    }
+    StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
+                     out->dims(), out_stride, out_data);
+  }
+};
+
+template <typename DeviceContext, typename T, size_t D>
+void CropGradFunction(const framework::ExecutionContext& context) {
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+  if (d_x != nullptr) {
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+    d_x->mutable_data<T>(context.GetPlace());
+    auto offsets = context.Attr<std::vector<int>>("offsets");
+    Eigen::array<std::pair<int, int>, D> paddings;
+    for (size_t i = 0; i < D; ++i) {
+      paddings[i].first = offsets[i];
+      paddings[i].second = d_x->dims()[i] - d_out->dims()[i] - offsets[i];
+    }
+    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
+    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
+    d_x_tensor.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        d_out_tensor.pad(paddings, 0);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class CropGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    size_t rank =
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
+    switch (rank) {
+      case 1:
+        CropGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        CropGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        CropGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        CropGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        CropGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        CropGradFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "CropOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
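
The offset computed in CropKernel is a dot product of the input strides with
the offsets attribute, i.e. the flat index of the crop window's start corner;
StridedMemcpy then walks the window using the two stride sets. A 2-D row-major
sketch of the same idea (illustrative, hypothetical helper, not the Paddle
API):

    #include <cstddef>

    // Copy a rows_out x cols_out window starting at (row_off, col_off) out of
    // a row-major matrix with cols_in columns -- the same strided copy that
    // CropKernel delegates to StridedMemcpy.
    void Crop2D(const float* in, size_t cols_in, size_t row_off, size_t col_off,
                size_t rows_out, size_t cols_out, float* out) {
      const float* start = in + row_off * cols_in + col_off;  // stride . offsets
      for (size_t r = 0; r < rows_out; ++r)
        for (size_t c = 0; c < cols_out; ++c)
          out[r * cols_out + c] = start[r * cols_in + c];
    }
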
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
new file mode 100644
index 0000000000..7abd5b1c61
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.cc
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+                      "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Y");
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class CrossEntropyGradientOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(X) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  // Explicitly set that the data type of computation kernel of cross_entropy
+  // is determined by its input "X".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             " where N is the batch size and D is the number of classes. "
+             "This input is a probability computed by the previous operator, "
+             "which is almost always the result of a softmax operator.");
+    AddInput("Label",
+             "(Tensor), the ground truth which is a 2-D tensor. When "
+             "soft_label is set to false, Label is a Tensor<int64> with shape "
+             "[N x 1]. When soft_label is set to true, Label is a "
+             "Tensor<float/double> with shape [N x D].");
+    AddOutput("Y",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "[N x 1]. The cross entropy loss.");
+    AddAttr<bool>("soft_label",
+                  "(bool, default false), a flag indicating whether to "
+                  "interpretate the given labels as soft labels.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+CrossEntropy Operator.
+
+It supports both standard cross-entropy and soft-label cross-entropy loss
+computation.
+1) One-hot cross-entropy:
+    soft_label = false, Label[i, 0] indicates the class index for sample i:
+
+                $Y[i] = -\log(X[i, Label[i]])$
+
+2) Soft-label cross-entropy:
+    soft_label = true, Label[i, j] indicates the soft label of class j
+    for sample i:
+
+                $Y[i] = \sum_j{-Label[i, j] * \log(X[i, j])}$
+
+   Please make sure that in this case the summation of each row of Label
+   equals one.
+
+3) One-hot cross-entropy with vectorized Input(Label):
+     As a special case of 2), when each row of Input(Label) has only one
+     non-zero element (equals 1), soft-label cross-entropy degenerates to a
+     one-hot cross-entropy with one-hot label representation.
+
+Both the input X and Label can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input X.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(cross_entropy, ops::CrossEntropyOp, ops::CrossEntropyOpMaker,
+            cross_entropy_grad, ops::CrossEntropyGradientOp);
+REGISTER_OP_CPU_KERNEL(cross_entropy, ops::CrossEntropyOpKernel<float>,
+                       ops::CrossEntropyOpKernel<double>);
+REGISTER_OP_CPU_KERNEL(cross_entropy_grad,
+                       ops::CrossEntropyGradientOpKernel<float>,
+                       ops::CrossEntropyGradientOpKernel<double>);
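
A quick scalar rendering of the two loss formulas in the comment above, handy
for sanity-checking the kernels (illustrative C++ with hypothetical helper
names, assuming X already holds probabilities):

    #include <cmath>
    #include <cstddef>

    // One-hot case:    Y[i] = -log(X[i, Label[i]])
    float CrossEntropyHard(const float* x_row, size_t label) {
      return -std::log(x_row[label]);
    }

    // Soft-label case: Y[i] = -sum_j Label[i, j] * log(X[i, j])
    float CrossEntropySoft(const float* x_row, const float* label_row,
                           size_t class_num) {
      float loss = 0.f;
      for (size_t j = 0; j < class_num; ++j)
        loss -= label_row[j] * std::log(x_row[j]);
      return loss;
    }
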
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
new file mode 100644
index 0000000000..3b04894e6c
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.cu
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
+template <typename T>
+__global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                           const int64_t* label, const int N,
+                                           const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    int idx = i * D + label[i];
+    dX[idx] = -dY[i] / X[idx];
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
+                                               const T* label, const int N,
+                                               const int D) {
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < N * D) {
+    int row_ids = ids / D;
+    dX[ids] = -label[ids] * dY[row_ids] / X[ids];
+  }
+}
+}  // namespace
+
+template <typename T>
+class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+        ctx.template device_context<platform::CUDADeviceContext>(), y, x, label,
+        ctx.Attr<bool>("soft_label"));
+  }
+};
+
+template <typename T>
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
+
+    const T* dy_data =
+        ctx.Input<Tensor>(framework::GradVarName("Y"))->data<T>();
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    const T* x_data = x->data<T>();
+
+    int64_t batch_size = x->dims()[0];
+    int64_t class_num = x->dims()[1];
+
+    int block = 512;
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    auto stream = dev_ctx.stream();
+
+    if (ctx.Attr<bool>("soft_label")) {
+      auto* label_data = label->data<T>();
+      SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
+    } else {
+      math::SetConstant<platform::CUDADeviceContext, T> functor;
+      functor(dev_ctx, dx, 0);
+      auto* label_data = label->data<int64_t>();
+      grid = (batch_size + block - 1) / block;
+      CrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
+          dx_data, dy_data, x_data, label_data, batch_size, class_num);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel<float>,
+                        ops::CrossEntropyOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
+                        ops::CrossEntropyGradientOpCUDAKernel<float>,
+                        ops::CrossEntropyGradientOpCUDAKernel<double>);
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
new file mode 100644
index 0000000000..5623d2ded1
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class CrossEntropyOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* y = ctx.Output<Tensor>("Y");
+    y->mutable_data<T>(ctx.GetPlace());
+
+    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
+        ctx.template device_context<platform::CPUDeviceContext>(), y, x, labels,
+        ctx.Attr<bool>("soft_label"));
+  }
+};
+
+template <typename T>
+class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const Tensor* label = ctx.Input<Tensor>("Label");
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+
+    int64_t class_num = x->dims()[1];
+    if (ctx.Attr<bool>("soft_label")) {
+      auto x_mat = EigenMatrix<T>::From(*x);
+      auto dy_mat = EigenMatrix<T>::From(*dy);
+      auto lbl_mat = EigenMatrix<T>::From(*label);
+      auto dx_mat = EigenMatrix<T>::From(*dx);
+
+      dx_mat.device(*ctx.template device_context<platform::CPUDeviceContext>()
+                         .eigen_device()) =
+          -(lbl_mat *
+            dy_mat.broadcast(Eigen::DSizes<int64_t, 2>(1, class_num)) / x_mat);
+    } else {
+      int64_t batch_size = x->dims()[0];
+      const T* dy_data = dy->data<T>();
+      const T* x_data = x->data<T>();
+      const int64_t* label_data = label->data<int64_t>();
+
+      math::SetConstant<platform::CPUDeviceContext, T> functor;
+      functor(ctx.template device_context<platform::CPUDeviceContext>(), dx, 0);
+
+      for (int64_t i = 0; i < batch_size; ++i) {
+        PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num);
+        int64_t index = i * class_num + label_data[i];
+        dx_data[index] = -dy_data[i] / x_data[index];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/ctc_align_op.cc b/paddle/operators/ctc_align_op.cc
new file mode 100644
index 0000000000..eeecbd3212
--- /dev/null
+++ b/paddle/operators/ctc_align_op.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CTCAlignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input of CTCAlignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                   "Output of CTCAlignOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("Input");
+
+    // TODO(wanghaoshuang): it is tricky to set the wrong dimension here.
+    ctx->SetOutputDim("Output", input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class CTCAlignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  CTCAlignOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LodTensor, default: LoDTensor<int>), Its shape is "
+             "[Lp, 1], where Lp is the sum of all input sequences' length.");
+    AddOutput("Output", "(Tensor, default: Tensor<int>), The align result.");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label setted in Connectionist "
+                 "Temporal Classification (CTC) op.")
+        .SetDefault(0);
+    AddAttr<bool>("merge_repeated",
+                  "(bool, default: true), whether to "
+                  "merge repeated elements between two blanks. ")
+        .SetDefault(true);
+    AddComment(R"DOC(
+CTCAlign op is used to merge repeated elements between two blanks
+and then delete all blanks in sequence.
+
+Given:
+    Input.data = [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6,
+                  6, 0, 0, 7, 7, 7, 0]
+    Input.dims = {18, 1}
+    Input.LoD = [[0, 11, 18]]
+
+And:
+    blank = 0
+    merge_repeated = True
+
+Then:
+    Output.data = [1, 2, 4, 4, 5, 6,
+                   6, 7]
+    Output.dims = {8, 1}
+    Output.LoD = [[0, 6, 8]]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ctc_align, ops::CTCAlignOp, ops::CTCAlignOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ctc_align, ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::CTCAlignKernel<paddle::platform::CPUDeviceContext, int64_t>);
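
The merge-and-delete rule in the comment above reduces to a single pass with
one token of lookback. A standalone sketch that reproduces the DOC example
(with blank = 0 and merge_repeated = true, {0,1,2,2,0,4,0,4,5,0,6} becomes
{1,2,4,4,5,6}); the helper name is illustrative, not Paddle API:

    #include <vector>

    // Collapse repeats between blanks, then drop all blanks.
    std::vector<int> CtcAlign(const std::vector<int>& tokens, int blank,
                              bool merge_repeated) {
      std::vector<int> out;
      int prev = -1;
      for (int t : tokens) {
        if (t != blank && !(merge_repeated && t == prev)) out.push_back(t);
        prev = t;
      }
      return out;
    }
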
diff --git a/paddle/operators/ctc_align_op.cu b/paddle/operators/ctc_align_op.cu
new file mode 100644
index 0000000000..45635f1674
--- /dev/null
+++ b/paddle/operators/ctc_align_op.cu
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdio.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/ctc_align_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
+                                      const size_t num_seq, size_t* lod0,
+                                      const int blank, const int merge_repeated,
+                                      size_t* out_lod0, T* output) {
+  int output_idx = 0;
+  out_lod0[0] = 0;
+
+  for (int i = 0; i < num_seq; ++i) {
+    T pre_token = -1;
+    for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
+      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
+        output[output_idx] = tokens[j];
+        ++output_idx;
+      }
+      pre_token = tokens[j];
+    }
+    out_lod0[i + 1] = output_idx;
+  }
+}
+
+template <typename T>
+class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    const size_t level = 0;
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    const T* tokens = input->data<T>();
+    const int64_t num_tokens = input->dims()[0];
+    const size_t num_seq = input_lod[level].size() - 1;
+
+    const int blank = ctx.Attr<int>("blank");
+    const int merge_repeated =
+        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
+
+    // copy the input lod to device memory; the kernel cannot dereference the
+    // host-side lod pointer
+    thrust::device_vector<size_t> dev_input_lod0(input_lod[level].begin(),
+                                                 input_lod[level].end());
+    size_t* dev_input_lod0_ptr =
+        thrust::raw_pointer_cast(dev_input_lod0.data());
+
+    // prepare a lod to record lod information while merging elements
+    thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
+    size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
+
+    // merge elements and delete blank
+    T* output_data = output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
+
+    auto stream = ctx.cuda_device_context().stream();
+    MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
+        num_tokens, tokens, num_seq, dev_input_lod0_ptr, blank,
+        merge_repeated, dev_out_lod0_ptr, output_data);
+
+    // set output lod
+    thrust::host_vector<size_t> host_out_lod0(dev_out_lod0.begin(),
+                                              dev_out_lod0.end());
+    framework::LoD out_lod;
+    out_lod.push_back(host_out_lod0);
+    output->set_lod(out_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(ctc_align, paddle::operators::CTCAlignOpCUDAKernel<int>,
+                        paddle::operators::CTCAlignOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/ctc_align_op.h b/paddle/operators/ctc_align_op.h
new file mode 100644
index 0000000000..fed89aa1e8
--- /dev/null
+++ b/paddle/operators/ctc_align_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string.h>
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class CTCAlignKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* output = ctx.Output<LoDTensor>("Output");
+    const size_t level = 0;
+    auto input_lod = framework::ToAbsOffset(input->lod());
+
+    // check input dims and lod
+    auto input_dims = input->dims();
+    PADDLE_ENFORCE_EQ(input_dims[0],
+                      static_cast<int64_t>(input_lod[level].back()),
+                      "The first dimension of Input(Input) should be equal to "
+                      "the sum of all sequences' lengths.");
+
+    const size_t num_sequences = input_lod[level].size() - 1;
+    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
+    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
+
+    // merge repeated tokens and delete blank
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    size_t output_idx = 0;
+    std::vector<size_t> output_lod0(1, 0);
+    const T* input_data = input->data<T>();
+    for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
+      T prev_token = -1;
+      for (size_t i = input_lod[level][seq_idx];
+           i < input_lod[level][seq_idx + 1]; ++i) {
+        if ((unsigned)input_data[i] != blank &&
+            !(merge_repeated && input_data[i] == prev_token)) {
+          output_data[output_idx] = input_data[i];
+          ++output_idx;
+        }
+        prev_token = input_data[i];
+      }
+      output_lod0.push_back(output_idx);
+    }
+
+    // set output lod
+    framework::LoD output_lod;
+    output_lod.push_back(output_lod0);
+    output->set_lod(output_lod);
+
+    // resize output dims
+    output->Resize({static_cast<int64_t>(output_lod0.back()), 1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc
new file mode 100644
index 0000000000..739a8d881c
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/decayed_adagrad_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DecayedAdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of DecayedAdagradOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of DecayedAdagradOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
+                      "Param and Grad input of DecayedAdagradOp should have "
+                      "the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
+                      "Param and Moment input of DecayedAdagradOp should have "
+                      "the same dimension.");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+  }
+};
+
+class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DecayedAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Moment", "(Tensor) Second moment");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output second moment");
+
+    AddAttr<float>("decay",
+                   "(float, default 0.95) "
+                   "Discounting factor for coming gradient")
+        .SetDefault(0.95);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+Decayed Adagrad Optimizer.
+
+The update is done as follows:
+
+$$
+moment\_out = decay * moment + (1 - decay) * grad * grad \\
+param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + epsilon}
+$$
+
+The original paper (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+does not have an epsilon attribute. It is added here for numerical
+stability to avoid division by zero.
+
+)DOC");
+  }
+};
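+// A minimal scalar sketch of the update rule above (illustrative only; the
+// actual kernel in decayed_adagrad_op.h applies it element-wise over whole
+// tensors via Eigen):
+//
+//   float moment_out = decay * moment + (1 - decay) * grad * grad;
+//   float param_out = param - lr * grad / (std::sqrt(moment_out) + epsilon);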
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
+                             ops::DecayedAdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    decayed_adagrad,
+    ops::DecayedAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu
new file mode 100644
index 0000000000..7bc8161f23
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/decayed_adagrad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    decayed_adagrad,
+    ops::DecayedAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h
new file mode 100644
index 0000000000..fec9705cfc
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DecayedAdagradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float decay = ctx.Attr<float>("decay");
+    float epsilon = ctx.Attr<float>("epsilon");
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    auto moment = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Moment"));
+    auto lr = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("LearningRate"));
+
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    moment_out.device(place) = decay * moment + (1 - decay) * grad * grad;
+    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+    param_out.device(place) =
+        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
new file mode 100644
index 0000000000..571a75c9dc
--- /dev/null
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -0,0 +1 @@
+grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
new file mode 100644
index 0000000000..9b5f7afc6a
--- /dev/null
+++ b/paddle/operators/detail/grpc_client.cc
@@ -0,0 +1,189 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "grpc_client.h"
+#include "paddle/framework/threadpool.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RPCClient::AsyncSendVariable(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& var_name,
+                                  int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] {
+    auto* var = p_scope->FindVar(var_name_val);
+    sendrecv::VariableMessage req;
+    SerializeToMessage(var_name_val, var, *p_ctx, &req);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    SendProcessor* s = new SendProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = NULL;
+
+    auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });
+
+  req_count_++;
+
+  return true;
+}
+
+void ProcGetResponse(const VarHandle& var_h,
+                     const sendrecv::VariableMessage& ret_msg) {
+  auto* outvar = var_h.scope->FindVar(var_h.name);
+  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
+}
+
+bool RPCClient::AsyncGetVariable(const std::string& ep,
+                                 const platform::DeviceContext& ctx,
+                                 const framework::Scope& scope,
+                                 const std::string& var_name,
+                                 int64_t time_out) {
+  const platform::DeviceContext* p_ctx = &ctx;
+  const std::string ep_val = ep;
+  const std::string var_name_val = var_name;
+  const framework::Scope* p_scope = &scope;
+  const auto ch = GetChannel(ep_val);
+
+  framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {
+    sendrecv::VariableMessage req;
+    req.set_varname(var_name_val);
+
+    // varhandle
+    VarHandle var_h;
+    var_h.ep = ep_val;
+    var_h.scope = p_scope;
+    var_h.name = var_name_val;
+    var_h.ctx = p_ctx;
+
+    // stub context
+    GetProcessor* s = new GetProcessor(ch);
+    s->Prepare(var_h, time_out);
+    s->response_call_back_ = ProcGetResponse;
+
+    auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+    rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  });
+
+  req_count_++;
+
+  return true;
+}
+
+bool RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(BATCH_BARRIER_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+  req_count_++;
+
+  return true;
+}
+
+bool RPCClient::Wait() {
+  if (req_count_ <= 0) {
+    return true;
+  }
+  const size_t kReqCnt = req_count_;
+  // NOTE: a variable-length array (bool a[kReqCnt]) is not standard C++;
+  // use std::vector<char> instead (std::vector<bool> packs bits and is not
+  // safe for concurrent writes to distinct elements).
+  std::vector<char> a(kReqCnt);
+  std::vector<std::future<void>> waits(req_count_);
+
+  for (int i = 0; i < req_count_; i++) {
+    waits[i] = framework::Async([i, &a, this] { a[i] = Proceed(); });
+  }
+
+  for (int i = 0; i < req_count_; i++) {
+    waits[i].wait();
+  }
+
+  int last_req_count = req_count_;
+  req_count_ = 0;
+
+  for (int i = 0; i < last_req_count; i++) {
+    if (!a[i]) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+bool RPCClient::Proceed() {
+  void* tag = NULL;
+  bool ok = false;
+
+  // Block until the next completed event is available on the queue.
+  if (!cq_.Next(&tag, &ok)) {
+    LOG(ERROR) << "Get meets CompletionQueue error";
+    return false;
+  }
+
+  GPR_ASSERT(ok);
+  PADDLE_ENFORCE(tag);
+
+  // TODO(gongwb): add more retries.
+  ClientBase* c = static_cast<ClientBase*>(tag);
+  if (!c->status_.ok()) {
+    LOG(ERROR) << "proc param error:" << c->var_h_.String()
+               << " grpc error:" << c->status_.error_message();
+    delete c;
+    return false;
+  }
+
+  c->Process();
+  delete c;
+  return true;
+}
+
+std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+  auto it = channels_.find(ep);
+  if (it != channels_.end()) {
+    return it->second;
+  }
+
+  grpc::ChannelArguments args;
+  args.SetMaxSendMessageSize(std::numeric_limits<int>::max());
+  args.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+
+  auto ch = std::shared_ptr<grpc::Channel>(
+      grpc::CreateCustomChannel(ep, grpc::InsecureChannelCredentials(), args));
+
+  channels_[ep] = ch;
+  return ch;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h
new file mode 100644
index 0000000000..f9499f6dc7
--- /dev/null
+++ b/paddle/operators/detail/grpc_client.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <grpc++/grpc++.h>
+#include <grpc/support/log.h>
+#include <time.h>
+#include <chrono>
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+struct VarHandle {
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  std::string name;
+
+  std::string String() const {
+    std::ostringstream s;
+    s << "name:[" << name << "] ep:[" << ep << "]";
+    return s.str();
+  }
+};
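+// For example, String() renders a handle as: name:[param] ep:[127.0.0.1:6164]
+// (the variable name and endpoint here are illustrative).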
+
+void ProcGetResponse(const VarHandle& var_h,
+                     const sendrecv::VariableMessage& msg);
+
+class ClientBase {
+ public:
+  explicit ClientBase(std::shared_ptr<grpc::Channel> ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+    context_ = NULL;
+  }
+
+  virtual ~ClientBase() {}
+
+  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+    var_h_ = var_info;
+
+    std::chrono::system_clock::time_point deadline =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+    context_->set_deadline(deadline);
+  }
+
+  virtual void Prepare(int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+
+    std::chrono::system_clock::time_point deadline =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+    context_->set_deadline(deadline);
+  }
+
+  virtual void Process() = 0;
+
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
+  std::unique_ptr<grpc::ClientContext> context_;
+  grpc::Status status_;
+  VarHandle var_h_;
+};
+
+typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
+    RequestSendCallBack;
+
+class SendProcessor : public ClientBase {
+ public:
+  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
+
+  virtual ~SendProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  sendrecv::VoidMessage reply_;
+  RequestSendCallBack response_call_back_ = NULL;
+};
+
+typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
+    RequestGetCallBack;
+
+class GetProcessor : public ClientBase {
+ public:
+  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
+
+  virtual ~GetProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  sendrecv::VariableMessage reply_;
+  RequestGetCallBack response_call_back_ = ProcGetResponse;
+};
+
+class BatchBarrierProcessor : public ClientBase {
+ public:
+  explicit BatchBarrierProcessor(std::shared_ptr<grpc::Channel> ch)
+      : ClientBase(ch) {}
+
+  virtual ~BatchBarrierProcessor() {}
+
+  virtual void Process() {}
+  sendrecv::VoidMessage reply_;
+};
+
+class RPCClient {
+ public:
+  bool AsyncSendVariable(const std::string& ep,
+                         const platform::DeviceContext& ctx,
+                         const framework::Scope& scope,
+                         const std::string& var_name,
+                         int64_t time_out = 600 * 1000);
+
+  bool AsyncGetVariable(const std::string& ep,
+                        const platform::DeviceContext& ctx,
+                        const framework::Scope& scope,
+                        const std::string& var_name,
+                        int64_t time_out = 600 * 1000);
+
+  bool AsyncSendBatchBarrier(const std::string& ep,
+                             int64_t time_out = 600 * 1000);
+
+  bool Wait();
+
+ private:
+  bool Proceed();
+  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+
+ private:
+  grpc::CompletionQueue cq_;
+  std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
+  int64_t req_count_ = 0;
+};
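+// Typical usage (a sketch; the endpoint and variable names are assumptions,
+// not taken from this patch):
+//
+//   RPCClient client;
+//   client.AsyncSendVariable("127.0.0.1:6164", dev_ctx, scope, "x@GRAD");
+//   client.AsyncSendBatchBarrier("127.0.0.1:6164");
+//   client.AsyncGetVariable("127.0.0.1:6164", dev_ctx, scope, "x");
+//   PADDLE_ENFORCE(client.Wait());  // blocks until all pending RPCs finish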
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc
new file mode 100644
index 0000000000..4f94e1315f
--- /dev/null
+++ b/paddle/operators/detail/grpc_server.cc
@@ -0,0 +1,256 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detail/grpc_server.h"
+
+using grpc::ServerAsyncResponseWriter;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+enum CallStatus { PROCESS = 0, FINISH };
+
+// reference:
+// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
+class RequestBase {
+ public:
+  explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
+                       grpc::ServerCompletionQueue* cq)
+      : service_(service), cq_(cq), status_(PROCESS) {
+    PADDLE_ENFORCE(cq_);
+  }
+  virtual ~RequestBase() {}
+  virtual void Process() { assert(false); }
+
+  CallStatus Status() { return status_; }
+  void SetStatus(CallStatus status) { status_ = status; }
+  virtual std::string GetReqName() {
+    assert(false);
+    return "";
+  }
+
+ protected:
+  grpc::ServerContext ctx_;
+  sendrecv::SendRecvService::AsyncService* service_;
+  grpc::ServerCompletionQueue* cq_;
+  CallStatus status_;
+};
+
+typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
+
+class RequestSend final : public RequestBase {
+ public:
+  explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
+                       grpc::ServerCompletionQueue* cq,
+                       SimpleBlockQueue<MessageWithName>* queue)
+      : RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
+    service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
+                                  this);
+  }
+
+  virtual ~RequestSend() {}
+
+  virtual std::string GetReqName() { return request_.varname(); }
+
+  virtual void Process() {
+    MessageWithName msg_with_name =
+        std::make_pair(request_.varname(), std::move(request_));
+    queue_->Push(std::move(msg_with_name));
+    responder_.Finish(reply_, grpc::Status::OK, this);
+    status_ = FINISH;
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VoidMessage reply_;
+  SimpleBlockQueue<MessageWithName>* queue_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+};
+
+class RequestGet final : public RequestBase {
+ public:
+  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
+                      grpc::ServerCompletionQueue* cq, framework::Scope* scope,
+                      const platform::DeviceContext* dev_ctx,
+                      SimpleBlockQueue<char>* queue)
+      : RequestBase(service, cq),
+        responder_(&ctx_),
+        scope_(scope),
+        dev_ctx_(dev_ctx),
+        queue_(queue) {
+    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
+  }
+
+  virtual ~RequestGet() {}
+
+  virtual std::string GetReqName() { return request_.varname(); }
+
+  virtual void Process() {
+    // proc request.
+    std::string var_name = request_.varname();
+    auto* var = scope_->FindVar(var_name);
+    SerializeToMessage(var_name, var, *dev_ctx_, &reply_);
+    // TODO(gongwb): check var's info.
+    responder_.Finish(reply_, grpc::Status::OK, this);
+    status_ = FINISH;
+    queue_->Push('c');
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VariableMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
+  framework::Scope* scope_;
+  const platform::DeviceContext* dev_ctx_;
+  SimpleBlockQueue<char>* queue_;
+};
+
+void AsyncGRPCServer::WaitClientGet(int count) {
+  for (int i = 0; i < count; ++i) {
+    var_get_queue_.Pop();
+  }
+}
+
+void AsyncGRPCServer::RunSyncUpdate() {
+  grpc::ServerBuilder builder;
+  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
+  builder.SetMaxSendMessageSize(std::numeric_limits<int>::max());
+  builder.SetMaxReceiveMessageSize(std::numeric_limits<int>::max());
+  builder.RegisterService(&service_);
+
+  cq_send_ = builder.AddCompletionQueue();
+  cq_get_ = builder.AddCompletionQueue();
+
+  server_ = builder.BuildAndStart();
+  LOG(INFO) << "Server listening on " << address_ << std::endl;
+
+  std::function<void()> send_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
+  std::function<void()> get_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+
+  t_send_.reset(
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
+                                cq_send_.get(), "cq_send", send_register)));
+
+  t_get_.reset(
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this,
+                                cq_get_.get(), "cq_get", get_register)));
+
+  // wait server
+  server_->Wait();
+  t_send_->join();
+  t_get_->join();
+}
+
+void AsyncGRPCServer::ShutdownQueue() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  cq_send_->Shutdown();
+  cq_get_->Shutdown();
+  is_shut_down_ = true;
+}
+
+// Shutting down an async gRPC server is complicated: the server must be shut
+// down before its completion queues.
+void AsyncGRPCServer::ShutDown() {
+  server_->Shutdown();
+  ShutdownQueue();
+}
+
+void AsyncGRPCServer::TryToRegisterNewSendOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestSend* send =
+      new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
+  VLOG(4) << "Create RequestSend status:" << send->Status();
+}
+
+void AsyncGRPCServer::TryToRegisterNewGetOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_,
+                                   &var_get_queue_);
+  VLOG(4) << "Create RequestGet status:" << get->Status();
+}
+
+// FIXME(typhoonzero): change cq_name to enum.
+void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq,
+                                    std::string cq_name,
+                                    std::function<void()> TryToRegisterNewOne) {
+  TryToRegisterNewOne();
+
+  void* tag = NULL;
+  bool ok = false;
+  while (true) {
+    if (!cq->Next(&tag, &ok)) {
+      LOG(INFO) << cq_name << " get CompletionQueue shutdown!";
+      break;
+    }
+
+    PADDLE_ENFORCE(tag);
+    // FIXME(typhoonzero): de-couple the barriers with recv_op
+    if (cq_name == "cq_get") WaitCond(1);
+    if (cq_name == "cq_send") WaitCond(0);
+
+    RequestBase* base = (RequestBase*)tag;
+    // reference:
+    // https://github.com/tensorflow/tensorflow/issues/5596
+    // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM
+    // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I
+    if (!ok) {
+      LOG(WARNING) << cq_name << " recv no regular event: argument name "
+                   << base->GetReqName();
+      TryToRegisterNewOne();
+      delete base;
+      continue;
+    }
+
+    switch (base->Status()) {
+      case PROCESS: {
+        VLOG(4) << cq_name << " status:" << base->Status();
+        TryToRegisterNewOne();
+        base->Process();
+        break;
+      }
+      case FINISH: {
+        VLOG(4) << cq_name << " status:" << base->Status();
+        delete base;
+        break;
+      }
+      default: { assert(false); }
+    }
+  }
+}
+
+void AsyncGRPCServer::WaitCond(int cond) {
+  std::unique_lock<std::mutex> lock(this->barrier_mutex_);
+  barrier_condition_.wait(lock,
+                          [=] { return this->barrier_cond_step_ == cond; });
+}
+
+void AsyncGRPCServer::SetCond(int cond) {
+  {
+    std::lock_guard<std::mutex> lock(this->barrier_mutex_);
+    barrier_cond_step_ = cond;
+  }
+  barrier_condition_.notify_all();
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h
new file mode 100644
index 0000000000..3f8b9d9317
--- /dev/null
+++ b/paddle/operators/detail/grpc_server.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/var_type.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+#include <grpc/support/log.h>
+#include <condition_variable>
+#include <functional>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <thread>
+#include "paddle/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
+class RequestBase;
+
+class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
+ public:
+  explicit AsyncGRPCServer(const std::string &address) : address_(address) {}
+
+  void RunSyncUpdate();
+
+  // functions to sync server barrier status.
+  void WaitCond(int cond);
+  void SetCond(int cond);
+  void WaitClientGet(int count);
+
+  void SetScope(framework::Scope *scope) { scope_ = scope; }
+
+  void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; }
+
+  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
+
+  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
+
+  void ShutDown();
+
+ protected:
+  void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name,
+                     std::function<void()> TryToRegisterNewOne);
+  void TryToRegisterNewSendOne();
+  void TryToRegisterNewGetOne();
+  void ShutdownQueue();
+
+ private:
+  std::mutex cq_mutex_;
+  volatile bool is_shut_down_ = false;
+  std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
+  std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
+
+  sendrecv::SendRecvService::AsyncService service_;
+  std::unique_ptr<grpc::Server> server_;
+
+  std::string address_;
+  framework::Scope *scope_;
+  const platform::DeviceContext *dev_ctx_;
+  // received variable from RPC, operators fetch variable from this queue.
+  SimpleBlockQueue<MessageWithName> var_recv_queue_;
+  SimpleBlockQueue<char> var_get_queue_;
+
+  // barrier state used to synchronize with the sub-program
+  std::mutex barrier_mutex_;
+  mutable int barrier_cond_step_;
+  std::condition_variable barrier_condition_;
+
+  std::unique_ptr<std::thread> t_send_;
+  std::unique_ptr<std::thread> t_get_;
+};
+
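+// Typical server-side flow (a sketch; the endpoint is an assumption and the
+// surrounding driver code is not part of this patch):
+//
+//   AsyncGRPCServer server("127.0.0.1:6164");
+//   server.SetScope(&scope);
+//   server.SetDevCtx(&dev_ctx);
+//   std::thread t([&] { server.RunSyncUpdate(); });  // blocks until ShutDown
+//   ...
+//   server.ShutDown();
+//   t.join();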
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/safe_ref.h b/paddle/operators/detail/safe_ref.h
new file mode 100644
index 0000000000..ff2a156f3d
--- /dev/null
+++ b/paddle/operators/detail/safe_ref.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace operators {
+namespace detail {
+/**
+ * Get a reference from a pointer, with a null check. The error message is a
+ * printf-style format string whose arguments are passed via `args`.
+ */
+template <typename T, typename... ARGS>
+inline T &Ref(T *ptr, ARGS &&... args) {
+  PADDLE_ENFORCE(ptr != nullptr, args...);
+  return *ptr;
+}
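+// Example (illustrative; the variable name is an assumption):
+//   auto& var = detail::Ref(scope.FindVar("param"),
+//                           "Cannot find variable %s", "param");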
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
new file mode 100644
index 0000000000..8f962b4c69
--- /dev/null
+++ b/paddle/operators/detail/send_recv.proto
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under
+the Apache License, Version 2.0 (the "License"); you may not use this file
+except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto3";
+package sendrecv;
+
+service SendRecvService {
+  // For parameter server round-robin like hashing, do not split tensors.
+  // Send and recv only one tensor
+  // TODO(typhoonzero): add streaming API
+  rpc SendVariable(VariableMessage) returns (VoidMessage) {}
+  // Argument VariableMessage for GetVariable should only contain varname.
+  rpc GetVariable(VariableMessage) returns (VariableMessage) {}
+}
+
+// VariableMessage is serialized paddle variable message.
+// It can be:
+// LoDTensor
+// SelectedRows
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+}
+
+message VariableMessage {
+  string varname = 1;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  VarType type = 2;
+  bytes serialized = 3;
+}
+
+message VoidMessage {}
diff --git a/paddle/operators/detail/sendrecvop_utils.cc b/paddle/operators/detail/sendrecvop_utils.cc
new file mode 100644
index 0000000000..7635b9e8db
--- /dev/null
+++ b/paddle/operators/detail/sendrecvop_utils.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+void SerializeToMessage(const std::string& name, const framework::Variable* var,
+                        const platform::DeviceContext& ctx,
+                        sendrecv::VariableMessage* msg) {
+  msg->set_varname(name);
+  std::ostringstream oss;
+  switch (framework::ToVarType(var->Type())) {
+    case framework::proto::VarDesc_VarType_LOD_TENSOR:
+      msg->set_type(sendrecv::VarType::LOD_TENSOR);
+      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
+      break;
+    case framework::proto::VarDesc_VarType_SELECTED_ROWS:
+      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
+      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
+                                   ctx);
+      break;
+    default: {
+      PADDLE_THROW("Serialize does not support type: %s",
+                   typeid(var->Type()).name());
+      break;
+    }
+  }
+  msg->set_serialized(oss.str());
+}
+
+void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
+                            const platform::DeviceContext& ctx,
+                            framework::Variable* var) {
+  std::istringstream iss(msg.serialized());
+  switch (msg.type()) {
+    case sendrecv::VarType::LOD_TENSOR:
+      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
+      break;
+    case sendrecv::VarType::SELECTED_ROWS: {
+      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
+                            ctx);
+      break;
+    }
+    default: {
+      PADDLE_THROW("Deserialize does not support type: %s",
+                   typeid(var->Type()).name());
+      break;
+    }
+  }
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/operators/detail/sendrecvop_utils.h
new file mode 100644
index 0000000000..8e66f7299c
--- /dev/null
+++ b/paddle/operators/detail/sendrecvop_utils.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/var_type.h"
+
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
+#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
+
+void SerializeToMessage(const std::string& name, const framework::Variable* var,
+                        const platform::DeviceContext& ctx,
+                        sendrecv::VariableMessage* msg);
+
+void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
+                            const platform::DeviceContext& ctx,
+                            framework::Variable* var);
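+
+// Round-trip sketch (illustrative; variable and scope names are assumptions):
+//
+//   sendrecv::VariableMessage msg;
+//   SerializeToMessage("param", scope.FindVar("param"), dev_ctx, &msg);
+//   DeserializeFromMessage(msg, dev_ctx, remote_scope.Var("param"));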
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h
new file mode 100644
index 0000000000..c7f5ff4b5f
--- /dev/null
+++ b/paddle/operators/detail/simple_block_queue.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <condition_variable>
+#include <deque>
+#include <mutex>
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T>
+class SimpleBlockQueue {
+ private:
+  std::mutex mutex_;
+  std::condition_variable condition_;
+  std::deque<T> queue_;
+
+ public:
+  void Push(T const& value) {
+    {
+      std::unique_lock<std::mutex> lock(this->mutex_);
+      queue_.push_front(value);
+    }
+    this->condition_.notify_one();
+  }
+
+  T Pop() {
+    std::unique_lock<std::mutex> lock(this->mutex_);
+    this->condition_.wait(lock, [=] { return !this->queue_.empty(); });
+    T rc(std::move(this->queue_.back()));
+    this->queue_.pop_back();
+    return rc;
+  }
+};
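+// Example (illustrative): a producer thread pushes while the consumer blocks
+// in Pop() until an element arrives:
+//
+//   SimpleBlockQueue<int> q;
+//   std::thread producer([&] { q.Push(42); });
+//   int v = q.Pop();  // blocks until the push above lands
+//   producer.join();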
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
new file mode 100644
index 0000000000..9ed524d4dc
--- /dev/null
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/ddim.h"
+#include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+template <typename T, int Rank>
+struct StridedMemcpyFunctor;
+
+template <typename T>
+struct StridedMemcpyFunctor<T, 1> {
+  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
+                  framework::Dim<1> src_stride, framework::Dim<1> dst_dim,
+                  framework::Dim<1> dst_stride, T* dst) const {
+    auto place = dev_ctx.GetPlace();
+    if (platform::is_cpu_place(place)) {
+      auto& cpu_place = boost::get<platform::CPUPlace>(place);
+      memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+      auto& gpu_place = boost::get<platform::CUDAPlace>(place);
+      auto& cuda_ctx =
+          reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
+      memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head,
+                   cuda_ctx.stream());
+#else
+      PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+    }
+  }
+};
+
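+// The general case peels off the outermost dimension and recurses on the
+// remaining Rank - 1 dimensions, advancing src and dst by their outermost
+// strides after each copied slice.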
+template <typename T, int Rank>
+struct StridedMemcpyFunctor {
+  void operator()(const platform::DeviceContext& dev_ctx, const T* src,
+                  framework::Dim<Rank> src_stride, framework::Dim<Rank> dst_dim,
+                  framework::Dim<Rank> dst_stride, T* dst) const {
+    for (int64_t i = 0; i < dst_dim.head; ++i) {
+      StridedMemcpyFunctor<T, Rank - 1> func;
+      func(dev_ctx, src, src_stride.tail, dst_dim.tail, dst_stride.tail, dst);
+      src += src_stride.head;
+      dst += dst_stride.head;
+    }
+  }
+};
+
+template <typename T>
+struct StridedCopyDimVisitor : public boost::static_visitor<void> {
+  StridedCopyDimVisitor(const platform::DeviceContext& dev_ctx, const T* src,
+                        const framework::DDim& src_stride,
+                        const framework::DDim& dst_stride, T* dst)
+      : dev_ctx_(dev_ctx),
+        src_(src),
+        src_stride_(src_stride),
+        dst_stride_(dst_stride),
+        dst_(dst) {}
+
+  template <typename Dim>
+  void operator()(Dim dst_dim) const {
+    Dim src_stride = boost::get<Dim>(src_stride_);
+    Dim dst_stride = boost::get<Dim>(dst_stride_);
+    constexpr int dim = Dim::dimensions;
+    StridedMemcpyFunctor<T, dim> functor;
+    functor(dev_ctx_, src_, src_stride, dst_dim, dst_stride, dst_);
+  }
+
+  const platform::DeviceContext& dev_ctx_;
+  const T* src_;
+  const framework::DDim& src_stride_;
+  const framework::DDim& dst_stride_;
+  T* dst_;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detection_output_op.cc b/paddle/operators/detection_output_op.cc
new file mode 100644
index 0000000000..ea44cd3267
--- /dev/null
+++ b/paddle/operators/detection_output_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+namespace paddle {
+namespace operators {
+
+class DetectionOutputOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DetectionOutputOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Loc",
+             "(Tensor) The input tensor of detection_output operator."
+             "The input predict locations"
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers,"
+             "N is How many boxes are there on each point, "
+             "C is 4, H and W both are 1.");
+    AddInput("Conf",
+             "(Tensor) The input tensor of detection_output operator."
+             "The input priorbox confidence."
+             "The format of input tensor is kNCHW. Where K is priorbox point "
+             "numbers,"
+             "N is How many boxes are there on each point, "
+             "C is the number of classes, H and W both are 1.");
+    AddInput("PriorBox",
+             "(Tensor) The input tensor of detection_output operator."
+             "The format of input tensor is the position and variance "
+             "of the boxes");
+    AddOutput("Out",
+              "(Tensor) The output tensor of detection_output operator.");
+    AddAttr<int>("background_label_id", "(int), The background class index.");
+    AddAttr<int>("num_classes", "(int), The number of the classification.");
+    AddAttr<float>("nms_threshold",
+                   "(float), The Non-maximum suppression threshold.");
+    AddAttr<float>("confidence_threshold",
+                   "(float), The classification confidence threshold.");
+    AddAttr<int>("top_k", "(int), The bbox number kept of the layer’s output.");
+    AddAttr<int>("nms_top_k",
+                 "(int), The bbox number kept of the NMS’s output.");
+    AddComment(R"DOC(
+          detection output for SSD(single shot multibox detector)
+          Apply the NMS to the output of network and compute the predict
+          bounding box location. The output’s shape of this layer could
+          be zero if there is no valid bounding box.
+        )DOC");
+  }
+};
+
+class DetectionOutputOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Loc"),
+                   "Input(X) of DetectionOutputOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Conf"),
+                   "Input(X) of DetectionOutputOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(X) of DetectionOutputOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of DetectionOutputOp should not be null.");
+    std::vector<int64_t> output_shape({1, 7});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(detection_output, ops::DetectionOutputOp,
+                             ops::DetectionOutputOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.cu.cc b/paddle/operators/detection_output_op.cu.cc
new file mode 100644
index 0000000000..4a6560e049
--- /dev/null
+++ b/paddle/operators/detection_output_op.cu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detection_output_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    detection_output,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DetectionOutputKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/detection_output_op.h b/paddle/operators/detection_output_op.h
new file mode 100644
index 0000000000..86285b748a
--- /dev/null
+++ b/paddle/operators/detection_output_op.h
@@ -0,0 +1,167 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/math/detection_util.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/softmax.h"
+#include "paddle/operators/strided_memcpy.h"
+namespace paddle {
+namespace operators {
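+// Transposes each slice of `src` from (k)NCHW to NHWC layout and packs the
+// transposed slices into `dst`, concatenating them along the last dimension.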
+template <typename DeviceContext, typename T>
+inline void transpose_fun(const framework::ExecutionContext& context,
+                          const framework::Tensor& src,
+                          framework::Tensor* dst) {
+  int input_nums = src.dims()[0];
+  int offset = 0;
+  for (int j = 0; j < input_nums; ++j) {
+    framework::Tensor in_p_tensor = src.Slice(j, j + 1);
+    std::vector<int64_t> shape_vec(
+        {in_p_tensor.dims()[0], in_p_tensor.dims()[1], in_p_tensor.dims()[3],
+         in_p_tensor.dims()[4], in_p_tensor.dims()[2]});
+    framework::DDim shape(framework::make_ddim(shape_vec));
+    framework::Tensor in_p_tensor_transpose;
+    in_p_tensor_transpose.mutable_data<T>(shape, context.GetPlace());
+    std::vector<int> shape_axis({0, 1, 3, 4, 2});
+    math::Transpose<DeviceContext, T, 5> trans5;
+    trans5(context.template device_context<DeviceContext>(), in_p_tensor,
+           &in_p_tensor_transpose, shape_axis);
+    auto dst_stride = framework::stride(dst->dims());
+    auto src_stride = framework::stride(in_p_tensor_transpose.dims());
+    StridedMemcpy<T>(context.device_context(), in_p_tensor_transpose.data<T>(),
+                     src_stride, in_p_tensor_transpose.dims(), dst_stride,
+                     dst->data<T>() + offset);
+    offset += in_p_tensor_transpose.dims()[4] * src_stride[4];
+  }
+}
+template <typename DeviceContext, typename T>
+class DetectionOutputKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_loc = context.Input<framework::Tensor>("Loc");
+    const framework::Tensor* in_conf = context.Input<framework::Tensor>("Conf");
+    const framework::Tensor* in_priorbox =
+        context.Input<framework::Tensor>("PriorBox");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int num_classes = context.template Attr<int>("num_classes");
+    int top_k = context.template Attr<int>("top_k");
+    int nms_top_k = context.template Attr<int>("nms_top_k");
+    int background_label_id = context.template Attr<int>("background_label_id");
+    float nms_threshold = context.template Attr<float>("nms_threshold");
+    float confidence_threshold =
+        context.template Attr<float>("confidence_threshold");
+    size_t batch_size = in_conf->dims()[1];
+    int conf_sum_size = in_conf->numel();
+    // for softmax
+    std::vector<int64_t> conf_shape_softmax_vec(
+        {conf_sum_size / num_classes, num_classes});
+    framework::DDim conf_shape_softmax(
+        framework::make_ddim(conf_shape_softmax_vec));
+    // for knchw => nhwc
+    std::vector<int64_t> loc_shape_vec({1, in_loc->dims()[1], in_loc->dims()[3],
+                                        in_loc->dims()[4],
+                                        in_loc->dims()[2] * in_loc->dims()[0]});
+    std::vector<int64_t> conf_shape_vec(
+        {1, in_conf->dims()[1], in_conf->dims()[3], in_conf->dims()[4],
+         in_conf->dims()[2] * in_conf->dims()[0]});
+    framework::DDim loc_shape(framework::make_ddim(loc_shape_vec));
+    framework::DDim conf_shape(framework::make_ddim(conf_shape_vec));
+    framework::Tensor loc_tensor;
+    framework::Tensor conf_tensor;
+    loc_tensor.mutable_data<T>(loc_shape, context.GetPlace());
+    conf_tensor.mutable_data<T>(conf_shape, context.GetPlace());
+    // for cpu
+    framework::Tensor loc_cpu;
+    framework::Tensor conf_cpu;
+    framework::Tensor priorbox_cpu;
+    const T* priorbox_data = in_priorbox->data<T>();
+    transpose_fun<DeviceContext, T>(context, *in_loc, &loc_tensor);
+    transpose_fun<DeviceContext, T>(context, *in_conf, &conf_tensor);
+    conf_tensor.Resize(conf_shape_softmax);
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), &conf_tensor,
+        &conf_tensor);
+    T* loc_data = loc_tensor.data<T>();
+    T* conf_data = conf_tensor.data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
+      framework::Copy(loc_tensor, platform::CPUPlace(),
+                      context.device_context(), &loc_cpu);
+      loc_data = loc_cpu.data<T>();
+      conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
+      framework::Copy(conf_tensor, platform::CPUPlace(),
+                      context.device_context(), &conf_cpu);
+      conf_data = conf_cpu.data<T>();
+      priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
+      framework::Copy(*in_priorbox, platform::CPUPlace(),
+                      context.device_context(), &priorbox_cpu);
+      priorbox_data = priorbox_cpu.data<T>();
+    }
+    // get decode bboxes
+    size_t num_priors = in_priorbox->numel() / 8;
+    std::vector<std::vector<operators::math::BBox<T>>> all_decoded_bboxes;
+    for (size_t n = 0; n < batch_size; ++n) {
+      std::vector<operators::math::BBox<T>> decoded_bboxes;
+      for (size_t i = 0; i < num_priors; ++i) {
+        size_t prior_offset = i * 8;
+        size_t loc_pred_offset = n * num_priors * 4 + i * 4;
+        std::vector<math::BBox<T>> prior_bbox_vec;
+        math::GetBBoxFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                      prior_bbox_vec);
+        std::vector<std::vector<T>> prior_bbox_var;
+        math::GetBBoxVarFromPriorData<T>(priorbox_data + prior_offset, 1,
+                                         prior_bbox_var);
+        std::vector<T> loc_pred_data;
+        for (size_t j = 0; j < 4; ++j)
+          loc_pred_data.push_back(*(loc_data + loc_pred_offset + j));
+        math::BBox<T> bbox = math::DecodeBBoxWithVar<T>(
+            prior_bbox_vec[0], prior_bbox_var[0], loc_pred_data);
+        decoded_bboxes.push_back(bbox);
+      }
+      all_decoded_bboxes.push_back(decoded_bboxes);
+    }
+    std::vector<std::map<size_t, std::vector<size_t>>> all_indices;
+    int num_kept = math::GetDetectionIndices<T>(
+        conf_data, num_priors, num_classes, background_label_id, batch_size,
+        confidence_threshold, nms_top_k, nms_threshold, top_k,
+        all_decoded_bboxes, &all_indices);
+
+    if (num_kept <= 0) {
+      std::vector<int64_t> out_shape_vec({0, 0});
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out->Resize(out_shape);
+      return;
+    }
+    std::vector<int64_t> out_shape_vec({num_kept, 7});
+    framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+    out->mutable_data<T>(out_shape, context.GetPlace());
+    framework::Tensor out_cpu;
+    T* out_data = out->data<T>();
+    if (platform::is_gpu_place(context.GetPlace())) {
+      out_cpu.mutable_data<T>(out->dims(), platform::CPUPlace());
+      out_data = out_cpu.data<T>();
+    }
+    math::GetDetectionOutput<T>(conf_data, num_kept, num_priors, num_classes,
+                                batch_size, all_indices, all_decoded_bboxes,
+                                out_data);
+    if (platform::is_gpu_place(context.GetPlace())) {
+      framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(),
+                      out);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
new file mode 100644
index 0000000000..5274aa204e
--- /dev/null
+++ b/paddle/operators/dropout_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/dropout_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class DropoutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", x_dims);
+    if (ctx->Attrs().Get<bool>("is_test") == false) {
+      ctx->SetOutputDim("Mask", x_dims);
+    }
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename AttrType>
+class DropoutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DropoutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of dropout op.");
+    AddOutput("Out", "The output of dropout op.");
+    AddOutput("Mask", "The random sampled dropout mask.").AsIntermediate();
+
+    AddAttr<float>("dropout_prob", "Probability of setting units to zero.")
+        .SetDefault(.5f)
+        .AddCustomChecker([](const float& drop_p) {
+          PADDLE_ENFORCE(drop_p >= 0.0f && drop_p <= 1.0f,
+                         "'dropout_prob' must be between 0.0 and 1.0.");
+        });
+    AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
+    AddAttr<bool>("fix_seed",
+                  "A flag indicating whether to use a fixed seed to generate "
+                  "random mask. NOTE: DO NOT set this flag to true in "
+                  "training. Setting this flag to true is only useful in "
+                  "unittest or for debug that always the same output units "
+                  "will be dropped.")
+        .SetDefault(false);
+    AddAttr<int>("seed", "Dropout random seed.").SetDefault(0);
+
+    AddComment(R"DOC(
+Dropout Operator.
+
+Dropout refers to randomly dropping out units in a neural network. It is a
+regularization technique for reducing overfitting by preventing neuron
+co-adaptation during training. The dropout operator randomly sets (according
+to the given dropout probability) the outputs of some units to zero, while
+the others are set equal to their corresponding inputs.
+
+)DOC");
+  }
+};
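+// A per-unit sketch of the forward pass at training time (illustrative; the
+// real kernels operate on whole tensors, see dropout_op.h and dropout_op.cu):
+//
+//   float u = uniform_0_1();                       // one draw per unit
+//   float mask = (u < dropout_prob) ? 0.0f : 1.0f;
+//   float out = x * mask;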
+
+template <typename AttrType>
+class DropoutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_test"), false,
+                      "GradOp is only callable when is_test is false");
+
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Mask must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) must not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE_EQ(x_dims, out_dims,
+                      "Dimensions of Input(X) and Out@Grad must be the same.");
+    auto mask_dims = ctx->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(x_dims, mask_dims,
+                      "Dimensions of Input(X) and Mask must be the same.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker<float>, dropout_grad,
+            ops::DropoutOpGrad<float>);
+REGISTER_OP_CPU_KERNEL(
+    dropout,
+    ops::CPUDropoutKernel<paddle::platform::CPUDeviceContext, float, float>);
+REGISTER_OP_CPU_KERNEL(
+    dropout_grad,
+    ops::DropoutGradKernel<paddle::platform::CPUDeviceContext, float>);
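To make the documented semantics concrete, here is a minimal standalone sketch of the forward pass: drop units by a Bernoulli mask at training time, and scale by (1 - dropout_prob) at test time. The helper name and plain-vector interface are hypothetical and only for illustration; they are not part of the operator code.

#include <random>
#include <vector>

// Hypothetical helper mirroring the CPU dropout forward semantics.
std::vector<float> DropoutForward(const std::vector<float>& x,
                                  float dropout_prob, bool is_test, int seed,
                                  std::vector<float>* mask) {
  std::vector<float> y(x.size());
  if (is_test) {
    // At test time nothing is dropped; outputs are scaled instead.
    for (size_t i = 0; i < x.size(); ++i) y[i] = x[i] * (1.0f - dropout_prob);
    return y;
  }
  std::minstd_rand engine(seed);
  std::uniform_real_distribution<float> dist(0, 1);
  mask->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*mask)[i] = dist(engine) < dropout_prob ? 0.0f : 1.0f;
    y[i] = x[i] * (*mask)[i];
  }
  return y;
}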
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
new file mode 100644
index 0000000000..84d78445a4
--- /dev/null
+++ b/paddle/operators/dropout_op.cu
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/operators/dropout_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, typename AttrType>
+struct MaskGenerator {
+  AttrType dropout_prob;
+  int seed;
+
+  __host__ __device__ MaskGenerator(AttrType dropout_prob, int seed)
+      : dropout_prob(dropout_prob), seed(seed) {}
+
+  inline __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<AttrType> dist(0, 1);
+    rng.discard(n);
+    if (dist(rng) < dropout_prob) {
+      return static_cast<T>(0);
+    }
+    return static_cast<T>(1);
+  }
+};
+
+// It seems that Eigen::Tensor::setRandom on the GPU will segfault.
+// Use std::random and thrust::random (Thrust is CUDA's counterpart of the
+// STL) to implement the uniform random sampling instead.
+template <typename Place, typename T, typename AttrType>
+class GPUDropoutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    y->mutable_data<T>(context.GetPlace());
+    AttrType dropout_prob = context.Attr<AttrType>("dropout_prob");
+
+    auto X = EigenMatrix<T>::Reshape(*x, 1);
+    auto Y = EigenMatrix<T>::Reshape(*y, 1);
+
+    auto& place = *context.template device_context<Place>().eigen_device();
+    if (!context.Attr<bool>("is_test")) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+      int size = framework::product(mask->dims());
+
+      std::random_device rnd;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+
+      thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+      thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                        thrust::device_ptr<T>(mask_data),
+                        MaskGenerator<T, AttrType>(dropout_prob, seed));
+      auto M = EigenMatrix<T>::Reshape(*mask, 1);
+      Y.device(place) = X * M;
+    } else {
+      Y.device(place) = X * (1.0f - dropout_prob);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    dropout,
+    ops::GPUDropoutKernel<paddle::platform::CUDADeviceContext, float, float>);
+REGISTER_OP_CUDA_KERNEL(
+    dropout_grad,
+    ops::DropoutGradKernel<paddle::platform::CUDADeviceContext, float>);
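A note on the Thrust pattern above: each output index n re-creates the engine with the same seed and calls discard(n), so element n reads the n-th draw of one logical random stream. This keeps the mask reproducible for a fixed seed without sharing RNG state across GPU threads. A host-side sketch of the same idea (hypothetical helper, for illustration only):

#include <random>

// The n-th mask value is the n-th draw of a single logical stream: re-seed,
// skip ahead n draws, then sample once.
float MaskAt(unsigned int n, int seed, float dropout_prob) {
  std::minstd_rand rng(seed);
  rng.discard(n);
  std::uniform_real_distribution<float> dist(0, 1);
  return dist(rng) < dropout_prob ? 0.0f : 1.0f;
}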
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
new file mode 100644
index 0000000000..46e5dbc64f
--- /dev/null
+++ b/paddle/operators/dropout_op.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, typename AttrType>
+class CPUDropoutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Output<Tensor>("Out");
+    const auto* x_data = x->data<T>();
+    auto* y_data = y->mutable_data<T>(context.GetPlace());
+    float dropout_prob = context.Attr<float>("dropout_prob");
+
+    if (!context.Attr<bool>("is_test")) {
+      auto* mask = context.Output<Tensor>("Mask");
+      auto* mask_data = mask->mutable_data<T>(context.GetPlace());
+
+      // NOTE: A fixed seed should only be used in unit tests or for
+      // debugging. Always use a random seed in training.
+      std::random_device rnd;
+      std::minstd_rand engine;
+      int seed =
+          context.Attr<bool>("fix_seed") ? context.Attr<int>("seed") : rnd();
+      engine.seed(seed);
+
+      std::uniform_real_distribution<float> dist(0, 1);
+      size_t size = framework::product(mask->dims());
+      for (size_t i = 0; i < size; ++i) {
+        if (dist(engine) < dropout_prob) {
+          mask_data[i] = 0;
+          y_data[i] = 0;
+        } else {
+          mask_data[i] = 1;
+          y_data[i] = x_data[i];
+        }
+      }
+    } else {
+      auto X = EigenMatrix<T>::Reshape(*x, 1);
+      auto Y = EigenMatrix<T>::Reshape(*y, 1);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      Y.device(place) = X * (1.0f - dropout_prob);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class DropoutGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(!context.Attr<bool>("is_test"),
+                   "GradOp is only callable when is_test is false");
+
+    auto* grad_x = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* grad_y = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* mask = context.Input<Tensor>("Mask");
+    grad_x->mutable_data<T>(context.GetPlace());
+
+    auto M = EigenMatrix<T>::Reshape(*mask, 1);
+    auto dX = EigenMatrix<T>::Reshape(*grad_x, 1);
+    auto dY = EigenMatrix<T>::Reshape(*grad_y, 1);
+
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    dX.device(place) = dY * M;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
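The gradient kernel follows directly from the forward definition: during training, Out = X * Mask elementwise, so dX = dOut * Mask and units that were dropped receive zero gradient. A plain-loop equivalent of the Eigen expression above, for illustration only:

// dx[i] = dy[i] * mask[i]; dropped units (mask == 0) get zero gradient.
void DropoutGrad(const float* dy, const float* mask, float* dx, size_t size) {
  for (size_t i = 0; i < size; ++i) dx[i] = dy[i] * mask[i];
}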
diff --git a/paddle/operators/edit_distance_op.cc b/paddle/operators/edit_distance_op.cc
new file mode 100644
index 0000000000..7e7dfc79eb
--- /dev/null
+++ b/paddle/operators/edit_distance_op.cc
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/edit_distance_op.h"
+
+namespace paddle {
+namespace operators {
+
+class EditDistanceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Hyps"), "Input(Hyps) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Refs"), "Input(Refs) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SequenceNum"),
+                   "Output(SequenceNum) shouldn't be null.");
+    auto hyp_dims = ctx->GetInputDim("Hyps");
+    auto ref_dims = ctx->GetInputDim("Refs");
+    PADDLE_ENFORCE(hyp_dims.size() == 2 && hyp_dims[1] == 1,
+                   "Input(Hyps) must be a 2-D LoDTensor with the 2nd dimension "
+                   "equal to 1.");
+    PADDLE_ENFORCE(ref_dims.size() == 2 && ref_dims[1] == 1,
+                   "Input(Refs) must be a 2-D LoDTensor with the 2nd dimension "
+                   "equal to 1.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("Refs"));
+    ctx->SetOutputDim("SequenceNum", {1});
+  }
+
+ protected:
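+  // The computed distances are always returned as FP32, regardless of the
+  // int64 index inputs.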
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(framework::proto::DataType::FP32,
+                                   ctx.device_context());
+  }
+};
+
+class EditDistanceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  EditDistanceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Hyps",
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
+             "The indices for hypothesis strings.");
+    AddInput("Refs",
+             "(2-D LoDTensor<int64_t>, 2nd dim. equal to 1) "
+             "The indices for reference strings.");
+    AddOutput("SequenceNum", "The sequence count of current batch");
+    AddAttr<bool>("normalized",
+                  "(bool, default false) Indicated whether to normalize "
+                  "the edit distance by the length of reference string.")
+        .SetDefault(false);
+    AddOutput("Out",
+              "(2-D Tensor with shape [`batch_size` x 1]) "
+              "The output edit distances of EditDistance operator.");
+    AddComment(R"DOC(
+
+EditDistance operator computes the edit distances between a batch of hypothesis
+strings and their references.
+
+Edit distance, also called Levenshtein distance, measures how dissimilar two strings
+are by counting the minimum number of operations needed to transform one string into
+another. Here the operations include insertion, deletion, and substitution. For
+example, given hypothesis string A = "kitten" and reference B = "sitting", the edit
+distance is 3, since transforming A into B requires at least two substitutions and
+one insertion:
+
+   "kitten" -> "sitten" -> "sittin" -> "sitting"
+
+Input(Hyps) is a LoDTensor consisting of all the hypothesis strings, whose total
+number is denoted by `batch_size`; the boundaries between strings are given by the
+LoD information. The `batch_size` reference strings are arranged in the same way
+in the LoDTensor Input(Refs).
+
+Output(Out) contains the `batch_size` results, each being the edit distance of one
+pair of strings. If Attr(normalized) is true, the edit distance is divided by the
+length of the reference string.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(edit_distance, ops::EditDistanceOp, ops::EditDistanceOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    edit_distance, ops::EditDistanceKernel<paddle::platform::CPUPlace, float>);
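As a concrete check of the recurrence the kernels implement, a minimal standalone Levenshtein function (hypothetical, operating on strings rather than index tensors) yields 3 for the "kitten"/"sitting" pair cited in the comment above:

#include <algorithm>
#include <string>
#include <vector>

// dist[i][j] = edit distance between the first i chars of a and first j of b.
int Levenshtein(const std::string& a, const std::string& b) {
  const size_t m = a.size(), n = b.size();
  std::vector<std::vector<int>> dist(m + 1, std::vector<int>(n + 1, 0));
  for (size_t i = 0; i <= m; ++i) dist[i][0] = static_cast<int>(i);
  for (size_t j = 0; j <= n; ++j) dist[0][j] = static_cast<int>(j);
  for (size_t i = 1; i <= m; ++i) {
    for (size_t j = 1; j <= n; ++j) {
      int cost = a[i - 1] == b[j - 1] ? 0 : 1;
      dist[i][j] = std::min({dist[i - 1][j] + 1,           // deletion
                             dist[i][j - 1] + 1,           // insertion
                             dist[i - 1][j - 1] + cost});  // substitution
    }
  }
  return dist[m][n];  // Levenshtein("kitten", "sitting") == 3
}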
diff --git a/paddle/operators/edit_distance_op.cu b/paddle/operators/edit_distance_op.cu
new file mode 100644
index 0000000000..c3e116af08
--- /dev/null
+++ b/paddle/operators/edit_distance_op.cu
@@ -0,0 +1,156 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T>
+__global__ void FillFirstRow(T* dist, const int N) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx < N + 1) {
+    dist[idx] = idx;
+  }
+}
+
+template <typename T>
+__global__ void FillFirstColumn(T* dist, const int M, const int N) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx < M + 1) {
+    dist[idx * (N + 1)] = idx;
+  }
+}
+
+template <typename T>
+__global__ void Levenshtein(T* dist, const int64_t* x1, const int64_t* x2,
+                            const int M, const int N, const int start) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  int offset = N;
+  int index = start + idx * offset;
+  int row = index / (N + 1);
+  int col = index % (N + 1);
+  if (row > 0 && col > 0 && row < M + 1 && col < N + 1) {
+    int cost = x1[row - 1] == x2[col - 1] ? 0 : 1;
+    int dels = dist[(row - 1) * (N + 1) + col] + 1;
+    int ins = dist[row * (N + 1) + col - 1] + 1;
+    int subs = dist[(row - 1) * (N + 1) + (col - 1)] + cost;
+    dist[index] = min(dels, min(ins, subs));
+  }
+}
+
+template <typename T>
+__global__ void SetOutput(T* out, const T* dist, const int M, const int N,
+                          bool normalized) {
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+  if (idx == 0) {
+    out[0] = normalized ? dist[M * (N + 1) + N] / N : dist[M * (N + 1) + N];
+  }
+}
+
+template <typename Place, typename T>
+class EditDistanceGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
+
+    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
+    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");
+    auto* sequence_num = ctx.Output<framework::Tensor>("SequenceNum");
+    sequence_num->mutable_data<int64_t>(ctx.GetPlace());
+
+    auto normalized = ctx.Attr<bool>("normalized");
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+
+    auto hyp_lod = x1_t->lod()[0];
+    auto ref_lod = x2_t->lod()[0];
+    PADDLE_ENFORCE(
+        hyp_lod.size() == ref_lod.size(),
+        "Input(Hyps) and Input(Refs) must have the same batch size.");
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
+                     "Reference string %d is empty.", i);
+    }
+
+    const size_t num_strs = hyp_lod.size() - 1;
+    math::SetConstant<platform::CUDADeviceContext, int64_t> set_constant;
+    set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
+                 sequence_num, static_cast<int64_t>(num_strs));
+
+    out_t->Resize({static_cast<int64_t>(num_strs), 1});
+    out_t->mutable_data<T>(ctx.GetPlace());
+    auto out = out_t->data<T>();
+
+    T distance = 0.0;
+    for (size_t num = 0; num < num_strs; num++) {
+      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
+      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);
+      if (m == 0 || n == 0) {
+        distance = std::max(m, n);
+        if (normalized) {
+          PADDLE_ENFORCE(n > 0,
+                         "The reference string (#%d) cannot be empty "
+                         "when Attr(normalized) is enabled.",
+                         n);
+          distance = distance / n;
+        }
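+        // The empty-string case is resolved on the host; copy the scalar
+        // result directly into the GPU output buffer.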
+        memory::Copy(boost::get<Place>(ctx.GetPlace()), out + num,
+                     platform::CPUPlace(), &distance, sizeof(T), stream);
+      } else {
+        framework::Tensor dist_t;
+        dist_t.Resize({m + 1, n + 1});
+        dist_t.mutable_data<T>(ctx.GetPlace());
+        auto dist = dist_t.data<T>();
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
+
+        FillFirstColumn<T><<<1 + m / PADDLE_CUDA_NUM_THREADS,
+                             PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, m, n);
+
+        FillFirstRow<T><<<1 + n / PADDLE_CUDA_NUM_THREADS,
+                          PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, n);
+        // Compute the elements of the distance matrix along the
+        // anti-diagonal direction.
+        for (int64_t slice = 2; slice < m + n + 1; ++slice) {
+          int z_m = slice < m + 1 ? 0 : slice - m;
+          int z_n = slice < n + 1 ? 0 : slice - n;
+          int size = slice - (z_m + z_n) + 1;  // number of elements on the
+                                               // same anti-diagonal to update
+          // the index of the first element on this anti-diagonal
+          int start = slice < n + 1 ? slice : (z_n + 1) * (n + 1) - 1;
+          Levenshtein<T><<<1 + (size - 1) / PADDLE_CUDA_NUM_THREADS,
+                           PADDLE_CUDA_NUM_THREADS, 0, stream>>>(dist, x1, x2,
+                                                                 m, n, start);
+        }
+        SetOutput<T><<<1, 1, 0, stream>>>(out + num, dist, m, n, normalized);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    edit_distance,
+    ops::EditDistanceGPUKernel<paddle::platform::CUDAPlace, float>);
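The anti-diagonal loop in the GPU kernel is a wavefront schedule: cell (i, j) depends only on (i-1, j), (i, j-1), and (i-1, j-1), so all cells sharing the same i + j are mutually independent and can be updated by a single kernel launch. A host-side sketch of the same traversal order over an (m+1) x (n+1) matrix, for illustration only:

#include <algorithm>
#include <cstdint>

// Visit cells diagonal by diagonal; within one diagonal, every (row, col)
// with row + col == slice could be processed in parallel.
void Wavefront(int64_t m, int64_t n) {
  for (int64_t slice = 2; slice < m + n + 1; ++slice) {
    int64_t row_begin = std::max<int64_t>(1, slice - n);
    int64_t row_end = std::min(m, slice - 1);
    for (int64_t row = row_begin; row <= row_end; ++row) {
      int64_t col = slice - row;  // dist[row][col] is updated here
      (void)col;
    }
  }
}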
diff --git a/paddle/operators/edit_distance_op.h b/paddle/operators/edit_distance_op.h
new file mode 100644
index 0000000000..974299e604
--- /dev/null
+++ b/paddle/operators/edit_distance_op.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class EditDistanceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
+
+    auto* x1_t = ctx.Input<framework::LoDTensor>("Hyps");
+    auto* x2_t = ctx.Input<framework::LoDTensor>("Refs");
+    auto* sequence_num = ctx.Output<framework::Tensor>("SequenceNum");
+    int64_t* seq_num_data = sequence_num->mutable_data<int64_t>(ctx.GetPlace());
+
+    auto normalized = ctx.Attr<bool>("normalized");
+
+    auto hyp_lod = x1_t->lod()[0];
+    auto ref_lod = x2_t->lod()[0];
+    PADDLE_ENFORCE(
+        hyp_lod.size() == ref_lod.size(),
+        "Input(Hyps) and Input(Refs) must have the same batch size.");
+    for (size_t i = 1; i < ref_lod.size(); ++i) {
+      PADDLE_ENFORCE(ref_lod[i] > ref_lod[i - 1],
+                     "Reference string %d is empty.", i);
+    }
+    auto num_strs = hyp_lod.size() - 1;
+    *seq_num_data = static_cast<int64_t>(num_strs);
+
+    out_t->Resize({static_cast<int64_t>(num_strs), 1});
+    out_t->mutable_data<T>(ctx.GetPlace());
+    auto out = out_t->data<T>();
+
+    T distance = 0.0;
+    for (size_t num = 0; num < num_strs; ++num) {
+      auto m = static_cast<int64_t>(hyp_lod[num + 1] - hyp_lod[num]);
+      auto n = static_cast<int64_t>(ref_lod[num + 1] - ref_lod[num]);
+
+      if (m == 0) {
+        distance = n;
+      } else if (n == 0) {
+        distance = m;
+      } else {
+        framework::Tensor dist_t;
+        dist_t.Resize({m + 1, n + 1});
+        dist_t.mutable_data<T>(ctx.GetPlace());
+        auto dist = dist_t.data<T>();
+        auto x1 = x1_t->data<int64_t>() + hyp_lod[num];
+        auto x2 = x2_t->data<int64_t>() + ref_lod[num];
+        for (int64_t i = 0; i < m + 1; ++i) {
+          dist[i * (n + 1)] = i;
+        }
+        for (int64_t j = 0; j < n + 1; ++j) {
+          dist[j] = j;
+        }
+        for (int64_t i = 1; i < m + 1; ++i) {
+          for (int64_t j = 1; j < n + 1; ++j) {
+            int cost = x1[i - 1] == x2[j - 1] ? 0 : 1;
+            int dels = dist[(i - 1) * (n + 1) + j] + 1;
+            int ins = dist[i * (n + 1) + (j - 1)] + 1;
+            int subs = dist[(i - 1) * (n + 1) + (j - 1)] + cost;
+            dist[i * (n + 1) + j] = std::min(dels, std::min(ins, subs));
+          }
+        }
+        distance = dist[m * (n + 1) + n];
+      }
+
+      if (normalized) {
+        PADDLE_ENFORCE(n > 0,
+                       "The reference string (#%d) cannot be empty "
+                       "when Attr(normalized) is enabled.",
+                       n);
+        distance = distance / n;
+      }
+      out[num] = distance;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
new file mode 100644
index 0000000000..37951fa758
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_add_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseAddOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Add", "Out = X + Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker,
+            elementwise_add_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu
new file mode 100644
index 0000000000..641cea323a
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_add_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_add,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_add_grad,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseAddGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
new file mode 100644
index 0000000000..a8389429f2
--- /dev/null
+++ b/paddle/operators/elementwise_add_op.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct AddFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseAddKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseAddGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseAddBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseAddGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
+                           ElementwiseAddBroadCastGradFunctor<T>,
+                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
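The broadcast gradient functors above encode one rule: addition passes dOut through to dX unchanged, while dY must sum dOut over every axis along which Y was broadcast. A plain-loop equivalent of the second Eigen reduction, with pre, n, and post as in the functors (the first reduction is the special case post == 1); this is an illustration only:

#include <vector>

// dz is viewed as [pre, n, post] while y has shape [n]; dy[j] accumulates
// dz over the two broadcast axes.
std::vector<float> AddGradForY(const std::vector<float>& dz, int pre, int n,
                               int post) {
  std::vector<float> dy(n, 0.0f);
  for (int i = 0; i < pre; ++i)
    for (int j = 0; j < n; ++j)
      for (int k = 0; k < post; ++k) dy[j] += dz[(i * n + j) * post + k];
  return dy;
}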
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
new file mode 100644
index 0000000000..6ebd58b1b3
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_div_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseDivOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Div", "Out = X / Y");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker,
+            elementwise_div_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu
new file mode 100644
index 0000000000..a0372123d6
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_div_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_div,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_div_grad,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
new file mode 100644
index 0000000000..ef26cb6c91
--- /dev/null
+++ b/paddle/operators/elementwise_div_op.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct DivFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a / b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseDivKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseDivGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto z_e = framework::EigenVector<T>::Flatten(*z);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseDivBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e / y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseDivGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
+                           ElementwiseDivBroadCastGradFunctor<T>,
+                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
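The dY expressions in the functors above come from a short derivation: with Z = X / Y, we have dZ/dY = -X / Y^2 = -Z / Y, so dY = -dOut * Z / Y. The plain functor reuses the already-computed Z, while the broadcast variants fall back to the equivalent -dOut * X / Y^2 form, since Z is not reshaped alongside the broadcast copy of Y.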
diff --git a/paddle/operators/elementwise_max_op.cc b/paddle/operators/elementwise_max_op.cc
new file mode 100644
index 0000000000..53c27ae5be
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_max_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseMaxOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseMaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Max", "Out = max(X, Y)");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_max, ops::ElementwiseOp, ops::ElementwiseMaxOpMaker,
+            elementwise_max_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_max,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMaxKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_max_grad,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_max_op.cu b/paddle/operators/elementwise_max_op.cu
new file mode 100644
index 0000000000..5ff4af1747
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_max_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_max,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMaxKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_max_grad,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMaxGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h
new file mode 100644
index 0000000000..255728e8e6
--- /dev/null
+++ b/paddle/operators/elementwise_max_op.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct MaxFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a > b ? a : b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseMaxGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e).template cast<T>() * dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e <= y_e).template cast<T>() * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMaxBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMaxBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
+                           ElementwiseMaxBroadCastGradFunctor<T>,
+                           ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
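The max gradient above is the standard subgradient choice: each element of dOut is routed to X where x > y and to Y where x <= y, so ties send the gradient to Y and the two masks sum to one everywhere (the min kernels below mirror this with the opposite comparisons). A plain-loop equivalent, for illustration only:

#include <vector>

// Route each gradient element to whichever input attained the maximum;
// ties (x == y) go to y, matching the (x_e <= y_e) mask above.
void MaxGrad(const std::vector<float>& x, const std::vector<float>& y,
             const std::vector<float>& dz, std::vector<float>* dx,
             std::vector<float>* dy) {
  dx->assign(x.size(), 0.0f);
  dy->assign(x.size(), 0.0f);
  for (size_t i = 0; i < x.size(); ++i) {
    if (x[i] > y[i]) {
      (*dx)[i] = dz[i];
    } else {
      (*dy)[i] = dz[i];
    }
  }
}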
diff --git a/paddle/operators/elementwise_min_op.cc b/paddle/operators/elementwise_min_op.cc
new file mode 100644
index 0000000000..99482e1bf6
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_min_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseMinOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseMinOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Max", "Out = min(X, Y)");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_min, ops::ElementwiseOp, ops::ElementwiseMinOpMaker,
+            elementwise_min_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_min,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMinKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_min_grad,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_min_op.cu b/paddle/operators/elementwise_min_op.cu
new file mode 100644
index 0000000000..3547e6ccb7
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_min_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_min,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMinKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_min_grad,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMinGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h
new file mode 100644
index 0000000000..e6627a0f1b
--- /dev/null
+++ b/paddle/operators/elementwise_min_op.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct MinFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a < b ? a : b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMinKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+template <typename T>
+struct ElementwiseMinGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e < y_e).template cast<T>() * dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e >= y_e).template cast<T>() * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMinBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMinBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMinGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
+                           ElementwiseMinBroadCastGradFunctor<T>,
+                           ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
new file mode 100644
index 0000000000..450dd05c79
--- /dev/null
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_mul_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ElementwiseMulOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Mul", "Out = X \\odot\\ Y");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
+            elementwise_mul_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mul,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_mul_grad,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
new file mode 100644
index 0000000000..f73e8afda9
--- /dev/null
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_mul_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mul,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_mul_grad,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
new file mode 100644
index 0000000000..4b86b00b5a
--- /dev/null
+++ b/paddle/operators/elementwise_mul_op.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct MulFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a * b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
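+// Gradient of Out = X * Y (product rule): dX = dZ * Y and dY = X * dZ.
+// In the broadcast variants below, Y is first tiled to X's shape, and dY is
+// then reduced (summed) back over the broadcast dimensions.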
+template <typename T>
+struct ElementwiseMulGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = x_e * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMulBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseMulBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto x_e = framework::EigenVector<T>::Flatten(*x);
+    auto y_e = framework::EigenVector<T>::Flatten(*y);
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+
+    auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e * y_e_bcast;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (x_e * dz_e)
+                           .reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseMulGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
+                           ElementwiseMulBroadCastGradFunctor<T>,
+                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
new file mode 100644
index 0000000000..1a0131d8b9
--- /dev/null
+++ b/paddle/operators/elementwise_op.h
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+class ElementwiseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  using Tensor = framework::Tensor;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of elementwise op should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of elementwise op should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of elementwise op should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto y_dim = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
+                      "Rank of first input must >= rank of second input.");
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ElementwiseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), The first input tensor of elementwise op.");
+    AddInput("Y", "(Tensor), The second input tensor of elementwise op.");
+    AddOutput("Out", "The output of elementwise op.");
+    AddAttr<int>("axis",
+                 "(int, default -1). The start dimension index "
+                 "for broadcasting Y onto X.")
+        .SetDefault(-1)
+        .EqualGreaterThan(-1);
+    comment_ = R"DOC(
+Limited Elementwise {name} Operator.
+
+The equation is:
+
+$${equation}$$
+
+$X$ is a tensor of arbitrary rank, and the rank of tensor $Y$ must be
+smaller than or equal to the rank of $X$.
+
+There are two cases for this operator:
+1. The shape of $Y$ is the same as that of $X$;
+2. The shape of $Y$ is a contiguous subsequence of the shape of $X$.
+
+For case 2:
+$Y$ will be broadcast to match the shape of $X$, and axis should be
+set to the index of the first dimension along which $Y$ is broadcast onto $X$.
+
+For example:
+  .. code-block:: python
+
+    shape(X) = (2, 3, 4, 5), shape(Y) = (1,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2,), with axis=0
+
+Each of the inputs $X$ and $Y$ can carry LoD (Level of Details) information,
+or neither can. However, the output only shares the LoD information with
+input $X$.
+
+)DOC";
+    AddComment(comment_);
+  }
+
+ protected:
+  std::string comment_;
+
+  void Replace(std::string& src, std::string from, std::string to) {
+    std::size_t len_from = from.size();
+    std::size_t len_to = to.size();
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + len_to)) {
+      src.replace(pos, len_from, to);
+    }
+  }
+
+  void SetComment(std::string name, std::string equation) {
+    Replace(comment_, "{name}", name);
+    Replace(comment_, "{equation}", equation);
+  }
+};
+
+class ElementwiseOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  using Tensor = framework::Tensor;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
new file mode 100644
index 0000000000..d749b8e875
--- /dev/null
+++ b/paddle/operators/elementwise_op_function.h
@@ -0,0 +1,414 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/platform/transform.h"
+
+#ifdef __NVCC__
+#include <thrust/iterator/iterator_adaptor.h>
+#endif
+
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+/*
+ * Out = X ⊙ Y
+ * If Y's shape does not match X's shape, they will be reshaped.
+ * For example:
+ * 1. shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+ *    pre=2, n=3*4, post=5
+ *    x.shape(2, 12, 5) * y.shape(1,12,1).broadcast(2,12,5)
+ * 2. shape(X) = (2, 3, 4, 5), shape(Y) = (4,5)
+ *    pre=2*3, n=4*5, post=1
+ *    x.shape(2, 3, 20) * y.shape(1,1,20).broadcast(2,3,20)
+ */
+inline void get_mid_dims(const framework::DDim& x_dims,
+                         const framework::DDim& y_dims, const int axis,
+                         int& pre, int& n, int& post) {
+  pre = 1;
+  n = 1;
+  post = 1;
+  for (int i = 0; i < axis; ++i) {
+    pre *= x_dims[i];
+  }
+
+  for (int i = 0; i < y_dims.size(); ++i) {
+    PADDLE_ENFORCE_EQ(x_dims[i + axis], y_dims[i],
+                      "Broadcast dimension mismatch.");
+    n *= y_dims[i];
+  }
+
+  for (int i = axis + y_dims.size(); i < x_dims.size(); ++i) {
+    post *= x_dims[i];
+  }
+}
+
+template <typename T, typename DeviceContext>
+class RowwiseTransformIterator;
+template <typename T, typename DeviceContext>
+class MidWiseTransformIterator;
+
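+// CPU iterator that cycles through the n elements of y over and over, so
+// that pairing it with x in Transform effectively tiles y `pre` times
+// (the post == 1 broadcast case).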
+template <typename T>
+class RowwiseTransformIterator<T, platform::CPUDeviceContext> {
+ public:
+  RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
+
+  RowwiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
+    ++i_;
+    if (UNLIKELY(i_ == n_)) {
+      i_ = 0;
+    }
+    return *this;
+  }
+
+  bool operator==(const RowwiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+
+  bool operator!=(const RowwiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+
+  const T& operator*() const { return ptr_[i_]; }
+
+ private:
+  const T* ptr_;
+  int i_;
+  int64_t n_;
+};
+
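+// CPU iterator for the general broadcast case (post > 1): each of the n
+// elements of y is repeated `post` times before moving to the next one, and
+// the whole sequence wraps around every n * post steps.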
+template <typename T>
+class MidWiseTransformIterator<T, platform::CPUDeviceContext> {
+ public:
+  MidWiseTransformIterator(const T* ptr, int n, int post)
+      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
+
+  MidWiseTransformIterator<T, platform::CPUDeviceContext>& operator++() {
+    ++j_;
+    if (UNLIKELY(j_ == post_)) {
+      ++i_;
+      j_ = 0;
+      if (UNLIKELY(i_ == n_)) {
+        i_ = 0;
+      }
+    }
+    return *this;
+  }
+
+  bool operator==(const MidWiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+
+  bool operator!=(const MidWiseTransformIterator<T, platform::CPUDeviceContext>&
+                      rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+
+  const T& operator*() const { return ptr_[i_]; }
+
+ private:
+  const T* ptr_;
+  int64_t i_;
+  int64_t j_;
+  int64_t n_;
+  int64_t post_;
+};
+
+#ifdef __NVCC__
+template <typename T>
+class RowwiseTransformIterator<T, platform::CUDADeviceContext>
+    : public thrust::iterator_adaptor<
+          RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      RowwiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
+      super_t;
+  HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
+      : super_t(x), begin_(x), n_(n) {}
+  friend class thrust::iterator_core_access;
+
+ private:
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (this->base() - begin_) % n_);
+  }
+};
+
+template <typename T>
+class MidWiseTransformIterator<T, platform::CUDADeviceContext>
+    : public thrust::iterator_adaptor<
+          MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      MidWiseTransformIterator<T, platform::CUDADeviceContext>, const T*>
+      super_t;
+  HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
+      : super_t(x), begin_(x), n_(n), post_(post) {}
+  friend class thrust::iterator_core_access;
+
+ private:
+  unsigned int post_;
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (((this->base() - begin_) / post_) % n_));
+  }
+};
+#endif
+
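+// Applies `func` element-wise over x and y into z. Run() handles identical
+// shapes; RunRowWise()/RunMidWise() pair x with a broadcasting iterator over
+// y for the post == 1 and post > 1 cases computed by get_mid_dims().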
+template <typename Functor, typename T, typename DeviceContext,
+          typename OutType = T>
+class TransformFunctor {
+ public:
+  TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
+                   framework::Tensor* z, const DeviceContext& ctx, Functor func)
+      : x_(x->data<T>()),
+        y_(y->data<T>()),
+        z_(z->mutable_data<OutType>(ctx.GetPlace())),
+        nx_(x->numel()),
+        ctx_(ctx),
+        func_(func) {}
+
+  inline void Run() const {
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, x_, x_ + nx_, y_, z_, func_);
+  }
+
+  inline void RunRowWise(int n, int pre) const {
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, DeviceContext>(y_, n),
+          z_, func_);
+  }
+
+  inline void RunMidWise(int n, int pre, int post) const {
+    platform::Transform<DeviceContext> trans;
+    trans(ctx_, x_, x_ + nx_,
+          MidWiseTransformIterator<T, DeviceContext>(y_, n, post), z_, func_);
+  }
+
+ private:
+  const T* x_;
+  const T* y_;
+  OutType* z_;
+  int64_t nx_;
+  const DeviceContext& ctx_;
+  Functor func_;
+};
+
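+// Generates an Eigen##name##Functor with three entry points mirroring the
+// broadcast cases above: Run() for equal shapes, RunBroadCast() for
+// post == 1, and RunBroadCast2() for the general (pre, n, post) case.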
+#define EIGEN_FUNCTOR(name, eigen_op)                                          \
+  struct Eigen##name##Functor {                                                \
+    template <typename DeviceContext, typename T>                              \
+    inline void Run(const framework::Tensor* x, const framework::Tensor* y,    \
+                    framework::Tensor* z,                                      \
+                    const framework::ExecutionContext& ctx) {                  \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_e);                                                  \
+    }                                                                          \
+    template <typename DeviceContext, typename T>                              \
+    inline void RunBroadCast(const framework::Tensor* x,                       \
+                             const framework::Tensor* y, framework::Tensor* z, \
+                             const framework::ExecutionContext& ctx, int pre,  \
+                             int n) {                                          \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))                  \
+                         .broadcast(Eigen::DSizes<int, 2>(pre, 1))             \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_bcast);                                              \
+    }                                                                          \
+    template <typename DeviceContext, typename T>                              \
+    inline void RunBroadCast2(const framework::Tensor* x,                      \
+                              const framework::Tensor* y,                      \
+                              framework::Tensor* z,                            \
+                              const framework::ExecutionContext& ctx, int pre, \
+                              int n, int post) {                               \
+      auto x_e = framework::EigenVector<T>::Flatten(*x);                       \
+      auto y_e = framework::EigenVector<T>::Flatten(*y);                       \
+      auto z_e = framework::EigenVector<T>::Flatten(*z);                       \
+      auto y_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))               \
+                         .broadcast(Eigen::DSizes<int, 3>(pre, 1, post))       \
+                         .reshape(Eigen::DSizes<int, 1>(x_e.size()));          \
+      z_e.device(                                                              \
+          *ctx.template device_context<DeviceContext>().eigen_device()) =      \
+          eigen_op(x_e, y_bcast);                                              \
+    }                                                                          \
+  }
+
+template <class functor, typename DeviceContext, typename T>
+void ElementwiseCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* z = ctx.Output<Tensor>("Out");
+  z->mutable_data<T>(ctx.GetPlace());
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                    "Rank of first input must >= rank of second input.");
+
+  if (x_dims == y_dims) {
+    functor f;
+    f.template Run<DeviceContext, T>(x, y, z, ctx);
+    return;
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    functor f;
+    f.template RunBroadCast<DeviceContext, T>(x, y, z, ctx, pre, n);
+    return;
+  } else {
+    functor f;
+    f.template RunBroadCast2<DeviceContext, T>(x, y, z, ctx, pre, n, post);
+    return;
+  }
+}
+
+#define EIGEN_ADD(x, y) ((x) + (y))
+EIGEN_FUNCTOR(Add, EIGEN_ADD);
+
+#define EIGEN_SUB(x, y) ((x) - (y))
+EIGEN_FUNCTOR(Sub, EIGEN_SUB);
+
+#define EIGEN_MUL(x, y) ((x) * (y))
+EIGEN_FUNCTOR(Mul, EIGEN_MUL);
+
+#define EIGEN_DIV(x, y) ((x) / (y))
+EIGEN_FUNCTOR(Div, EIGEN_DIV);
+
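+// Dispatches the backward pass: `functor` handles equal shapes,
+// `broadcastfunctor` the post == 1 case, and `broadcast2functor` the general
+// (pre, n, post) case, mirroring the forward dispatch above.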
+template <typename DeviceContext, typename T, typename functor,
+          typename broadcastfunctor, typename broadcast2functor>
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* out = ctx.Input<Tensor>("Out");
+  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+  auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+
+  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+  if (dx) {
+    dx->mutable_data<T>(ctx.GetPlace());
+  }
+  if (dy) {
+    dy->mutable_data<T>(ctx.GetPlace());
+  }
+
+  if (x_dims == y_dims) {
+    functor f;
+    f(place, x, y, out, dx, dy, dout);
+    return;
+  }
+
+  if (y_dims.size() == 1 && y_dims[0] == 1) {
+    // y is a scalar
+    auto extended_dims = framework::vectorize(x_dims);
+    extended_dims.push_back(1);
+    x_dims = framework::make_ddim(extended_dims);
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+
+  if (post == 1) {
+    broadcastfunctor f;
+    f(place, x, y, out, dx, dy, dout, pre, n);
+    return;
+  } else {
+    broadcast2functor f;
+    f(place, x, y, out, dx, dy, dout, pre, n, post);
+    return;
+  }
+}
+
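+// Forward entry point used by the elementwise kernels: picks Run(),
+// RunRowWise() or RunMidWise() on a TransformFunctor according to how Y
+// broadcasts onto X. A scalar Y (shape [1]) is handled by extending X's
+// dims with a trailing 1 so the same broadcast machinery applies.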
+template <typename Functor, typename DeviceContext, typename T,
+          typename OutType = T>
+void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
+  using Tensor = framework::Tensor;
+
+  auto* x = ctx.Input<Tensor>("X");
+  auto* y = ctx.Input<Tensor>("Y");
+  auto* z = ctx.Output<Tensor>("Out");
+  z->mutable_data<OutType>(ctx.GetPlace());
+  TransformFunctor<Functor, T, DeviceContext, OutType> functor(
+      x, y, z, ctx.template device_context<DeviceContext>(), Functor());
+
+  auto x_dims = x->dims();
+  auto y_dims = y->dims();
+  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                    "Rank of first input must >= rank of second input.");
+
+  if (x_dims == y_dims) {
+    functor.Run();
+    return;
+  }
+
+  if (y_dims.size() == 1 && y_dims[0] == 1) {
+    // y is a scalar
+    auto extended_dims = framework::vectorize(x_dims);
+    extended_dims.push_back(1);
+    x_dims = framework::make_ddim(extended_dims);
+  }
+
+  int axis = ctx.Attr<int>("axis");
+  axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+  PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                 "Axis should be in range [0, x_dims)");
+
+  int pre, n, post;
+  get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+  if (post == 1) {
+    functor.RunRowWise(n, pre);
+    return;
+  } else {
+    functor.RunMidWise(n, pre, post);
+    return;
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_pow_op.cc b/paddle/operators/elementwise_pow_op.cc
new file mode 100644
index 0000000000..5293cc7dd3
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cc
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_pow_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwisePowOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwisePowOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Pow", "Out = X ^ Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(elementwise_pow, ops::ElementwiseOp,
+                             ops::ElementwisePowOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.cu b/paddle/operators/elementwise_pow_op.cu
new file mode 100644
index 0000000000..643c978e63
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_pow_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_pow,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwisePowKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h
new file mode 100644
index 0000000000..6019e709e0
--- /dev/null
+++ b/paddle/operators/elementwise_pow_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cmath>
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct PowFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return std::pow(a, b); }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwisePowKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
new file mode 100644
index 0000000000..d3c51f0a69
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/elementwise_sub_op.h"
+#include "paddle/operators/elementwise_op.h"
+
+namespace paddle {
+namespace operators {
+class ElementwiseSubOpMaker : public ElementwiseOpMaker {
+ public:
+  ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : ElementwiseOpMaker(proto, op_checker) {
+    SetComment("Sub", "Out = X - Y");
+    AddComment(comment_);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker,
+            elementwise_sub_grad, ops::ElementwiseOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu
new file mode 100644
index 0000000000..7a2516ef6a
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.cu
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/elementwise_sub_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_sub,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    elementwise_sub_grad,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
+                                  int64_t>);
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
new file mode 100644
index 0000000000..a2aca79302
--- /dev/null
+++ b/paddle/operators/elementwise_sub_op.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/elementwise_op_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct SubFunctor {
+  inline HOSTDEVICE T operator()(T a, T b) const { return a - b; }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseSubKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
+  }
+};
+
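+// Gradient of Out = X - Y: dX = dZ and dY = -dZ; the broadcast variants
+// below additionally sum dY over the broadcast dimensions.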
+template <typename T>
+struct ElementwiseSubGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) * dz_e;
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCastGradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
+                           .sum(Eigen::array<int, 1>{{0}});
+    }
+  }
+};
+
+template <typename T>
+struct ElementwiseSubBroadCast2GradFunctor {
+  template <typename Device, typename X, typename Y, typename Z, typename dX,
+            typename dY, typename dZ, typename Pre, typename N, typename Post>
+  void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
+                  Post post) {
+    auto dz_e = framework::EigenVector<T>::Flatten(*dz);
+    if (dx) {
+      auto dx_e = framework::EigenVector<T>::Flatten(*dx);
+      dx_e.device(d) = dz_e;
+    }
+
+    if (dy) {
+      auto dy_e = framework::EigenVector<T>::Flatten(*dy);
+      dy_e.device(d) = (-1.0) *
+                       dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
+                           .sum(Eigen::array<int, 2>{{0, 2}});
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ElementwiseSubGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
+                           ElementwiseSubBroadCastGradFunctor<T>,
+                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc
new file mode 100644
index 0000000000..043c93654d
--- /dev/null
+++ b/paddle/operators/expand_op.cc
@@ -0,0 +1,137 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
+
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto x_dims = ctx->GetInputDim("X");
+
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims.size()), expand_times.size(),
+                      "The number of Attr(expand_times)'s value must be equal "
+                      "to the rank of Input(X).");
+    PADDLE_ENFORCE_LE(x_dims.size(), 6,
+                      "The rank of Input(X) must not be greater than 6.");
+
+    std::vector<int64_t> out_shape(x_dims.size());
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_GE(expand_times[i], 1,
+                        "Each value of Attr(expand_times) should not be "
+                        "less than 1.");
+      out_shape[i] = x_dims[i] * expand_times[i];
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(out_shape));
+    if (out_shape[0] == x_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+};
+
+class ExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+             "X is the input to be expanded.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>). A tensor with rank in [1, 6]."
+              "The rank of Output(Out) have the same with Input(X). "
+              "After expanding, size of each dimension of Output(Out) is equal "
+              "to size of the corresponding dimension of Input(X) multiplying "
+              "the corresponding value given by Attr(expand_times).");
+    AddAttr<std::vector<int>>("expand_times",
+                              "Expand times number for each dimension.");
+    AddComment(R"DOC(
+Expand operator tiles the input by the given number of times. Set the times
+number for each dimension through the attribute 'expand_times'. The rank of X
+should be in [1, 6]. Please note that the size of 'expand_times' must be the
+same as X's rank. Following is a usage example:
+
+Input(X) is a 3-D tensor with shape [2, 3, 1]:
+
+        [
+           [[1], [2], [3]],
+           [[4], [5], [6]]
+        ]
+
+Attr(expand_times):  [1, 2, 2]
+
+Output(Out) is a 3-D tensor with shape [2, 6, 2]:
+
+        [
+            [[1, 1], [2, 2], [3, 3], [1, 1], [2, 2], [3, 3]],
+            [[4, 4], [5, 5], [6, 6], [4, 4], [5, 5], [6, 6]]
+        ]
+
+)DOC");
+  }
+};
+
+class ExpandGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> expand_times =
+        ctx->Attrs().Get<std::vector<int>>("expand_times");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      PADDLE_ENFORCE_EQ(x_dims[i] * expand_times[i], out_dims[i],
+                        "Each dimension size of Input(Out@GRAD) should be "
+                        "equal to multiplication of crroresponding dimension "
+                        "size of Input(X) and Attr(expand_times) value.");
+    }
+
+    auto x_grad_name = framework::GradVarName("X");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad,
+            ops::ExpandGradOp);
+REGISTER_OP_CPU_KERNEL(
+    expand, ops::ExpandKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    expand_grad,
+    ops::ExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu
new file mode 100644
index 0000000000..84e8fa567b
--- /dev/null
+++ b/paddle/operators/expand_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    expand, ops::ExpandKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    expand_grad,
+    ops::ExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h
new file mode 100644
index 0000000000..a4994cf3a5
--- /dev/null
+++ b/paddle/operators/expand_op.h
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <boost/preprocessor/arithmetic/div.hpp>
+#include <boost/preprocessor/arithmetic/mod.hpp>
+#include <boost/preprocessor/comparison/greater.hpp>
+#include <boost/preprocessor/comparison/greater_equal.hpp>
+#include <boost/preprocessor/control/if.hpp>
+#include <boost/preprocessor/repetition/repeat.hpp>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+#define MAX_RANK_SUPPORTED 6
+
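+// BOOST_PP_REPEAT instantiates EXPAND_TEMPLATE for n = 0..5, generating
+// `case 1: Expand<1>(context);` through `case 6: Expand<6>(context);`.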
+#define EXPAND_TEMPLATE(z, n, data) \
+  case n + 1: {                     \
+    Expand<n + 1>(context);         \
+    break;                          \
+  }
+#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~)
+#define COND(n)                                               \
+  BOOST_PP_GREATER_EQUAL(BOOST_PP_DIV(n, MAX_RANK_SUPPORTED), \
+                         BOOST_PP_MOD(n, MAX_RANK_SUPPORTED))
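+// The grad switch value encodes (reshape_size, reduce_size) as
+// (reshape_size - 1) * MAX_RANK_SUPPORTED + (reduce_size - 1); COND(n) keeps
+// only the decodable cases where reshape_size >= reduce_size.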
+#define EXPAND_GRAD_CASE(n)                                        \
+  case n: {                                                        \
+    ExpandBackward<n>(context, reshape_dims_vec, reduce_dims_vec); \
+    break;                                                         \
+  }
+#define EXPAND_GRAD_TEMPLATE(z, n, data) \
+  BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), )
+#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~)
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED)
+      default:
+        PADDLE_ENFORCE(false,
+                       "Only support tensor with rank being between 1 and 6.");
+    }
+  }
+
+ protected:
+  template <int Rank>
+  void Expand(const framework::ExecutionContext& context) const {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto* out0 = context.Output<Tensor>("Out");
+    Eigen::DSizes<int, Rank> bcast_dims;
+    auto x_dims = in0->dims();
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      bcast_dims[i] = expand_times[i];
+    }
+    auto x = EigenTensor<T, Rank>::From(*in0);
+    out0->mutable_data<T>(context.GetPlace());
+    auto y = EigenTensor<T, Rank>::From(*out0);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    y.device(place) = x.broadcast(bcast_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto& expand_times = context.Attr<std::vector<int>>("expand_times");
+    auto x_dims = in0->dims();
+    // 1. reshape_dims_vec is the broadcast parameter. For each dimension i,
+    //    if expand_times[i] > 1 and x_dims[i] > 1, dimension i will be split
+    //    into the two dimensions [expand_times[i], x_dims[i]].
+    // 2. reduce_dims_vec is the dimension parameter used to compute the
+    //    gradients. For each expanded dimension, the gradients should be
+    //    summed back to the original size.
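+    // Example: x_dims = [2, 3, 1] and expand_times = [1, 2, 2] yield
+    // reshape_dims_vec = [2, 2, 3, 2] and reduce_dims_vec = [1, 3], i.e.
+    // Out@GRAD of shape [2, 6, 2] is viewed as [2, 2, 3, 2] and summed over
+    // dimensions 1 and 3 to recover an X@GRAD of shape [2, 3, 1].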
+    std::vector<int> reshape_dims_vec;
+    std::vector<int> reduce_dims_vec;
+    for (size_t i = 0; i < expand_times.size(); ++i) {
+      if (expand_times[i] == 1) {
+        reshape_dims_vec.push_back(x_dims[i]);
+      } else {
+        if (x_dims[i] == 1) {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+        } else {
+          reduce_dims_vec.push_back(reshape_dims_vec.size());
+          reshape_dims_vec.push_back(expand_times[i]);
+          reshape_dims_vec.push_back(x_dims[i]);
+        }
+      }
+    }
+
+    int dims = reshape_dims_vec.size() * MAX_RANK_SUPPORTED +
+               reduce_dims_vec.size() - MAX_RANK_SUPPORTED - 1;
+    // No reduction is needed; just copy the gradient through.
+    if (reduce_dims_vec.size() == 0) {
+      auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+      out0->mutable_data<T>(context.GetPlace());
+      framework::Copy(*in0, context.GetPlace(), context.device_context(), out0);
+    } else {
+      switch (dims) {
+        REP_EXPAND_GRAD_TEMPLATE(72)
+        default:
+          PADDLE_ENFORCE(
+              false, "Only support tensor with rank being between 1 and 6.");
+      }
+    }
+  }
+
+ protected:
+  template <int Dims>
+  void ExpandBackward(const framework::ExecutionContext& context,
+                      const std::vector<int>& reshape_dims_vec,
+                      const std::vector<int>& reduce_dims_vec) const {
+    size_t reshape_size = Dims / MAX_RANK_SUPPORTED + 1;
+    size_t reduce_size = Dims % MAX_RANK_SUPPORTED + 1;
+    PADDLE_ENFORCE_EQ(reshape_size, reshape_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reshape dimensions.");
+    PADDLE_ENFORCE_EQ(reduce_size, reduce_dims_vec.size(),
+                      "Inconsistent size between template Dims and "
+                      "reduce dimensions.");
+    auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto x = EigenVector<T>::Flatten(*(context.Input<Tensor>("X")));
+    out0->mutable_data<T>(context.GetPlace());
+    auto x_grad = EigenVector<T>::Flatten(*out0);
+    Eigen::DSizes<int, Dims / MAX_RANK_SUPPORTED + 1> reshape_dims;
+    for (size_t i = 0; i < reshape_size; ++i) {
+      reshape_dims[i] = reshape_dims_vec[i];
+    }
+    Eigen::DSizes<int, Dims % MAX_RANK_SUPPORTED + 1> reduce_dims;
+    for (size_t i = 0; i < reduce_size; ++i) {
+      reduce_dims[i] = reduce_dims_vec[i];
+    }
+    auto out_grad = EigenVector<T>::Flatten(*in0);
+    x_grad.device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
new file mode 100644
index 0000000000..d738e1850c
--- /dev/null
+++ b/paddle/operators/feed_op.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
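+// FeedOp has no kernel; it runs directly on the given place and copies
+// column `col` of the global feed list (variable Input("X")) into the
+// output variable, preserving LoD.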
+class FeedOp : public framework::OperatorBase {
+ public:
+  FeedOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto feed_var_name = Input("X");
+    auto *feed_var = scope.FindVar(feed_var_name);
+
+    PADDLE_ENFORCE(feed_var != nullptr,
+                   "Cannot find feed_var in scope, feed_var_name is %s",
+                   feed_var_name);
+
+    auto out_name = this->Output("Out");
+    auto *out_var = scope.FindVar(out_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot find out_var in scope, out_var_name is %s",
+                   out_name);
+
+    auto col = Attr<int>("col");
+
+    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
+            << out_name;
+
+    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    auto &feed_item = feed_list.at(static_cast<size_t>(col));
+    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    framework::Copy(feed_item, place, dev_ctx, out_item);
+    out_item->set_lod(feed_item.lod());
+  }
+};
+
+class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FeedOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of feed op");
+    AddOutput("Out", "The output of feed op");
+    AddAttr<int>("col", "(int) The column of feed");
+    AddComment(R"DOC(
+Feed Operator.
+
+It should not be configured by users directly.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(feed, paddle::operators::FeedOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FeedOpInfoMaker);
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
new file mode 100644
index 0000000000..7205ee2a87
--- /dev/null
+++ b/paddle/operators/fetch_op.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class FetchOp : public framework::OperatorBase {
+ public:
+  FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto fetch_var_name = Input("X");
+    auto *fetch_var = scope.FindVar(fetch_var_name);
+    PADDLE_ENFORCE(fetch_var != nullptr,
+                   "Cannot find fetch variable in scope, fetch_var_name is %s",
+                   fetch_var_name);
+
+    auto out_name = this->Output("Out");
+    auto *out_var = scope.FindVar(out_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot find out_var in scope, out_var_name is %s",
+                   out_name);
+
+    auto col = static_cast<size_t>(Attr<int>("col"));
+
+    auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
+    auto &src_item = fetch_var->Get<framework::FeedFetchType>();
+
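+    // Grow the fetch list on demand so that slot `col` always exists
+    // before the copy below writes into it.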
+    if (col >= fetch_list->size()) {
+      fetch_list->resize(col + 1);
+    }
+    auto &dst_item = fetch_list->at(col);
+
+    // FIXME(yuyang18): Should we assume the fetch operator always generate
+    // CPU outputs?
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(src_item.place());
+
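+    // Copy to CPU and block until the copy finishes, so the fetched tensor
+    // is readable as soon as Run() returns.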
+    Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
+    dev_ctx.Wait();
+    dst_item.set_lod(src_item.lod());
+
+    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
+  }
+};
+
+class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FetchOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of fetch op");
+    AddOutput("Out", "The output of fetch op");
+    AddAttr<int>("col", "(int) The column of fetch");
+    AddComment(R"DOC(
+Fetch Operator.
+
+It should not be configured by users directly.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(fetch, paddle::operators::FetchOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FetchOpInfoMaker);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc
new file mode 100644
index 0000000000..c74a5b6ced
--- /dev/null
+++ b/paddle/operators/fill_constant_batch_size_like_op.cc
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_constant_batch_size_like_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Input"),
+        "Input(Input) of FillConstantBatchSizeLikeOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("Out"),
+        "Output(Out) of FillConstantBatchSizeLikeOp should not be null.");
+
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE_GT(shape.size(), 0);
+    std::vector<int64_t> shape_int64(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+    auto output_dim = framework::make_ddim(shape_int64);
+
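+    // The static `shape` attribute supplies every dimension except one:
+    // after the bounds checks below, the output_dim_idx-th dimension is
+    // overwritten with Input's input_dim_idx-th (batch size) dimension.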
+    int input_dim_idx = ctx->Attrs().Get<int>("input_dim_idx");
+    PADDLE_ENFORCE_GE(input_dim_idx, 0);
+    PADDLE_ENFORCE_GT(ctx->GetInputDim("Input").size(), input_dim_idx);
+
+    int output_dim_idx = ctx->Attrs().Get<int>("output_dim_idx");
+    PADDLE_ENFORCE_GE(output_dim_idx, 0);
+    PADDLE_ENFORCE_GT(static_cast<int>(shape.size()), output_dim_idx);
+
+    output_dim[output_dim_idx] = ctx->GetInputDim("Input")[input_dim_idx];
+    ctx->SetOutputDim("Out", output_dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
+  }
+};
+
+class FillConstantBatchSizeLikeOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillConstantBatchSizeLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddInput("Input",
+             "(Tensor) Tensor "
+             "whose dim_idx th dimension is used to specify the batch_size");
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<int>("input_dim_idx",
+                 "(int, default 0) The index of input's batch size dimension")
+        .SetDefault(0);
+    AddAttr<int>("output_dim_idx",
+                 "(int, default 0) The index of output's batch size dimension")
+        .SetDefault(0);
+    AddAttr<float>("value", "(float, default 0) The value to be filled")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+FillConstantBatchSizeLike Operator.
+
+Fill up a variable with a specified constant value. The output_dim_idx-th
+dimension of the output is set to the batch size taken from Input.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fill_constant_batch_size_like,
+                  ops::FillConstantBatchSizeLikeOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::FillConstantBatchSizeLikeOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_constant_batch_size_like,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           float>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
+                                           int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
new file mode 100644
index 0000000000..608f4b9162
--- /dev/null
+++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_constant_batch_size_like_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fill_constant_batch_size_like,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           float>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           double>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           int>,
+    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
+                                           int64_t>);
diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h
new file mode 100644
index 0000000000..66da9d0307
--- /dev/null
+++ b/paddle/operators/fill_constant_batch_size_like_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto value = ctx.Attr<float>("value");
+
+    math::SetConstant<DeviceContext, T> setter;
+    setter(ctx.template device_context<DeviceContext>(), out,
+           static_cast<T>(value));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
new file mode 100644
index 0000000000..dcd43a30c8
--- /dev/null
+++ b/paddle/operators/fill_constant_op.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class FillConstantInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillConstantOp should not be null.");
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+};
+
+class FillConstantOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto data_type =
+        static_cast<framework::proto::DataType>(Attr<int>("dtype"));
+    auto value = Attr<float>("value");
+    auto force_cpu = Attr<bool>("force_cpu");
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
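+    // force_cpu pins the output in host memory even when the op runs on a
+    // GPU place, which is useful when the result is consumed on the CPU.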
+    if (force_cpu) {
+      auto cpu = platform::CPUPlace();
+      out.mutable_data(cpu, framework::ToTypeIndex(data_type));
+    } else {
+      out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
+    }
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    math::set_constant(dev_ctx, &out, value);
+  }
+};
+
+class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillConstantOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<float>("value", "(float, default 0) The value to be filled")
+        .SetDefault(0.0f);
+    AddAttr<bool>("force_cpu",
+                  "(bool, default false) Force fill output variable to cpu "
+                  "memory. Otherwise, fill output variable to the running "
+                  "device")
+        .SetDefault(false);
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(
+FillConstant Operator.
+
+Fill up a variable with a specified constant value.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
+                  ops::FillConstantInferShape, ops::FillConstantOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc
new file mode 100644
index 0000000000..4f5a2ed169
--- /dev/null
+++ b/paddle/operators/fill_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+struct FillOpVisitor {
+  FillOpVisitor(framework::LoDTensor *tensor, const std::vector<float> &value)
+      : tensor_(tensor), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    platform::CPUPlace cpu;
+    auto *data = tensor_->mutable_data<T>(cpu);
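+    // Note: this assumes value_.size() == tensor_->numel(); each float in
+    // value_ is cast to the tensor's element type T.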
+    std::transform(value_.data(), value_.data() + tensor_->numel(), data,
+                   [](float dat) { return static_cast<T>(dat); });
+  }
+
+  framework::LoDTensor *tensor_;
+  const std::vector<float> &value_;
+};
+
+class FillOp : public framework::OperatorBase {
+ public:
+  FillOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &out =
+        detail::Ref(detail::Ref(scope.FindVar(Output("Out")),
+                                "Cannot find variable %s", Output("Out"))
+                        .GetMutable<framework::LoDTensor>());
+    out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
+    auto dtype = static_cast<framework::proto::DataType>(Attr<int>("dtype"));
+    platform::CPUPlace cpu;
+    auto force_cpu = Attr<bool>("force_cpu");
+    out.mutable_data(force_cpu ? cpu : place, framework::ToTypeIndex(dtype));
+
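+    // Values are always written on the CPU first. If the output lives on
+    // the GPU, a host-side staging tensor is filled and copied over below.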
+    framework::LoDTensor tensor;
+
+    if (force_cpu || platform::is_cpu_place(place)) {
+      tensor.ShareDataWith(out);
+    } else {
+      // Always make tensor in CPU memory.
+      tensor.Resize(out.dims());
+      tensor.mutable_data(cpu, framework::ToTypeIndex(dtype));
+    }
+
+    framework::VisitDataType(
+        dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
+
+    if (!force_cpu && platform::is_gpu_place(place)) {
+      // Copy tensor to out
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+      framework::Copy(tensor, place, dev_ctx, &out);
+    }
+  }
+};
+
+class FillOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(Fill operator
+
+Fill a tensor with `value` and `shape`. The type of the tensor is specified
+by `dtype`.
+)DOC");
+    AddOutput("Out", "(LoDTensor) The output tensor.");
+    AddAttr<std::vector<float>>(
+        "value", "The float values of tensor, which are flatten in row major");
+    AddAttr<std::vector<int>>("shape", "The shape of output tensor");
+    AddAttr<int>("dtype", "The data type of output tensor, Default is float")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddAttr<bool>("force_cpu",
+                  "Whether the output tensor must be at CPU memory or not. "
+                  "Default is false.")
+        .SetDefault(false);
+  }
+};
+
+class FillOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim(
+        "Out",
+        framework::make_ddim(context->Attrs().Get<std::vector<int>>("shape")));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
new file mode 100644
index 0000000000..b4ae1de876
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_zeros_like_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FillZerosLikeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of FillZerosLikeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillZerosLikeOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillZerosLikeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of fill-zeros-like op.");
+    AddOutput("Out", "The variable will be filled up with zeros.");
+    AddComment(R"DOC(
+FillZerosLike Operator.
+
+Fill up a variable with zeros.
+The output will have the same size as the input.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp,
+                             ops::FillZerosLikeOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_zeros_like,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CPUDeviceContext, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc
new file mode 100644
index 0000000000..b7048e8f58
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    fill_zeros_like,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, int64_t>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::FillZerosLikeKernel<paddle::platform::CUDADeviceContext, bool>);
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
new file mode 100644
index 0000000000..351ecf8b2f
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -0,0 +1,36 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class FillZerosLikeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    math::SetConstant<DeviceContext, T> setter;
+    setter(context.template device_context<DeviceContext>(), out,
+           static_cast<T>(0));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc
new file mode 100644
index 0000000000..d00700823d
--- /dev/null
+++ b/paddle/operators/ftrl_op.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/ftrl_op.h"
+
+namespace paddle {
+namespace operators {
+
+class FTRLOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("SquaredAccumulator"),
+                   "Input(SquaredAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LinearAccumulator"),
+                   "Input(LinearAccumulator) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of FTRL should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("SquaredAccumOut"),
+                   "Output(SquaredAccumOut) of FTRL should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LinearAccumOut"),
+                   "Output(LinearAccumOut) of FTRL should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
+                      "Two input of FTRL Op's dimension must be same.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("SquaredAccumOut", param_dim);
+    ctx->SetOutputDim("LinearAccumOut", param_dim);
+  }
+};
+
+class FTRLOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FTRLOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("SquaredAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates squared gradients.");
+    AddInput("LinearAccumulator",
+             "(Tensor, default Tensor<float>) "
+             "Accumulator that accumulates linear gradients.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("SquaredAccumOut",
+              "(Tensor) Output accumulated squared"
+              " gradients.");
+    AddOutput("LinearAccumOut",
+              "(Tensor) Output accumulated linear"
+              " gradients.");
+
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("lr_power",
+                   "(float, default -0.5f) "
+                   "Learning Rate Power.")
+        .SetDefault(-0.5f);
+    AddComment(R"DOC(
+FTRL (Follow The Regularized Leader) Operator.
+
+Optimizer that implements the FTRL algorithm:
+
+$$
+new\_accum = squared\_accum + grad^2 \\
+if (lr\_power == -0.5) {
+   linear\_accum += grad - \frac{\surd(new\_accum) -
+                   \surd(squared\_accum)}{learning\_rate} * param \\
+} else {
+   linear\_accum += grad -
+                  \frac{new\_accum^{-lr\_power} - squared\_accum^{-lr\_power}}
+                  {learning\_rate} * param \\
+}
+
+x = (l1 * sign(linear\_accum) - linear\_accum)
+if (lr\_power == -0.5) {
+   y = \frac{\surd(new\_accum)}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+} else {
+   y = \frac{new\_accum^{-lr\_power}}{learning\_rate} + (2 * l2) \\
+   pre\_shrink = \frac{x}{y} \\
+   param = (abs(linear\_accum) > l1).select(pre\_shrink, 0.0) \\
+}
+squared\_accum += grad^2;
+$$
+
+The paper that proposed Follow The Regularized Leader (FTRL):
+(https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    ftrl, ops::FTRLOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu
new file mode 100644
index 0000000000..abbbe7adbe
--- /dev/null
+++ b/paddle/operators/ftrl_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/ftrl_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    ftrl, ops::FTRLOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h
new file mode 100644
index 0000000000..4eea04cd8d
--- /dev/null
+++ b/paddle/operators/ftrl_op.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class FTRLOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
+    auto* lin_accum_out = ctx.Output<Tensor>("LinearAccumOut");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    sq_accum_out->mutable_data<T>(ctx.GetPlace());
+    lin_accum_out->mutable_data<T>(ctx.GetPlace());
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+    auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto sq_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
+    auto lin_accum =
+        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
+    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
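+    // LearningRate is a size-1 tensor; broadcast it to the gradient's size
+    // so it can appear in the element-wise Eigen expressions below.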
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+    auto new_accum = sq_accum + g * g;
+    // Special case for lr_power = -0.5
+    if (lr_power == static_cast<T>(-0.5)) {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
+    } else {
+      l_acc_out.device(place) =
+          lin_accum + g -
+          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
+           lr.broadcast(grad_dsize)) *
+              p;
+    }
+
+    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
+    if (lr_power == static_cast<T>(-0.5)) {
+      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    } else {
+      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
+               l_acc_out.constant(static_cast<T>(2) * l2);
+      auto pre_shrink = x / y;
+      p_out.device(place) =
+          (l_acc_out.abs() > l_acc_out.constant(l1))
+              .select(pre_shrink, p.constant(static_cast<T>(0)));
+    }
+
+    s_acc_out.device(place) = sq_accum + g * g;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
new file mode 100644
index 0000000000..9840c066f0
--- /dev/null
+++ b/paddle/operators/gather.cu.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::DeviceContext;
+
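+// Grid-stride loop: each thread handles indices i, i + blockDim.x *
+// gridDim.x, ..., so a fixed launch configuration covers any n.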
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
+                                 size_t index_size, size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int gather_i = indices[indices_i];
+    int params_i = gather_i * slice_size + slice_i;
+    *(output + i) = *(params + params_i);
+  }
+}
+
+/**
+ * A thin wrapper for gathering on a GPU tensor.
+ * Returns a new tensor gathered from the source tensor according to index.
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  auto src_dims = src.dims();
+  framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
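+  // Launch enough 512-thread blocks to cover all n output elements; the
+  // grid-stride loop in the kernel absorbs any remainder.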
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
new file mode 100644
index 0000000000..052db49cb3
--- /dev/null
+++ b/paddle/operators/gather.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory.h>
+#include <cstring>
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+/**
+ * A thin wrapper for gathering on a CPU tensor.
+ * Returns a new tensor gathered from the source tensor according to index.
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  auto src_dims = src.dims();
+  framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const size_t slice_bytes = slice_size * sizeof(T);
+
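+  // Gather rows: for each i, copy the slice src[index[i]] into output[i].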
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
new file mode 100644
index 0000000000..597fdad079
--- /dev/null
+++ b/paddle/operators/gather_op.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gather_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class GatherOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GatherOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Index"),
+                   "Input(Index) of GatherOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of GatherOp should not be null.");
+
+    auto index_dims = ctx->GetInputDim("Index");
+    PADDLE_ENFORCE(index_dims.size() == 1);
+    int batch_size = ctx->GetInputDim("Index")[0];
+    PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >=0");
+    framework::DDim output_dims(ctx->GetInputDim("X"));
+    output_dims[0] = batch_size;
+    ctx->SetOutputDim("Out", output_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class GatherGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GatherOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The source input of gather op");
+    AddInput("Index", "The index input of gather op");
+    AddOutput("Out", "The output of gather op");
+    AddComment(R"DOC(
+Gather Operator.
+
+$Out = X[Index]$
+
+Out is obtained by gathering entries of the outer-most dimension
+of X indexed by Index and concatenating them together.
+
+Example:
+
+X = [[1, 2],
+     [3, 4],
+     [5, 6]]
+
+Index = [1, 2]
+
+Then:
+
+Out = [[3, 4],
+       [5, 6]]
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
+            ops::GatherGradOp);
+REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
new file mode 100644
index 0000000000..eec2415e1d
--- /dev/null
+++ b/paddle/operators/gather_op.cu
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GatherOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
+
+    GPUGather<T>(ctx.device_context(), *x, *index, output);
+  }
+};
+
+template <typename T>
+class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto *x = ctx.Input<Tensor>("X");
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto &place = *ctx.template device_context<platform::CUDADeviceContext>()
+                       .eigen_device();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
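+    // After zero-initializing dX, scatter the Out@GRAD rows back to the
+    // positions given by Index, i.e. the inverse of the forward gather.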
+    GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
new file mode 100644
index 0000000000..1a1ba0c41a
--- /dev/null
+++ b/paddle/operators/gather_op.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class GatherOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
+
+    CPUGather<T>(ctx.device_context(), *x, *index, output);
+  }
+};
+
+template <typename T>
+class GatherGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    dX->mutable_data<T>(ctx.GetPlace());
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
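+    // After zero-initializing dX, scatter the Out@GRAD rows back to the
+    // positions given by Index, mirroring the CUDA gradient kernel.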
+    ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
new file mode 100644
index 0000000000..cbd86b8796
--- /dev/null
+++ b/paddle/operators/gather_test.cc
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gather.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <string>
+
+TEST(Gather, GatherData) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  Tensor* src = new Tensor();
+  Tensor* index = new Tensor();
+  Tensor* output = new Tensor();
+
+  int* p_src = nullptr;
+  int* p_index = nullptr;
+  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
+  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+
+  for (int i = 0; i < 12; ++i) p_src[i] = i;
+  p_index[0] = 1;
+  p_index[1] = 0;
+
+  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
+
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  CPUGather<int>(ctx, *src, *index, output);
+
+  for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
+  for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
+
+  delete src;
+  delete index;
+  delete output;
+}
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
new file mode 100644
index 0000000000..2dca05760e
--- /dev/null
+++ b/paddle/operators/gaussian_random_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <random>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CPUGaussianRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.Attr<float>("mean");
+    float std = context.Attr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
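+    // A seed of 0 requests a non-deterministic seed from
+    // std::random_device; any non-zero seed makes the output reproducible.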
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::normal_distribution<T> dist(mean, std);
+    int64_t size = tensor->numel();
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+
+class GaussianRandomOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of GaussianRandomOp should not be null.");
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    std::vector<int64_t> temp;
+    temp.reserve(shape.size());
+    for (auto dim : shape) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
+    PADDLE_ENFORCE(shape.size() > 0UL,
+                   "shape can be one int or array. shape must be set.");
+    ctx->SetOutputDim("Out", framework::make_ddim(temp));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
+        ctx.device_context());
+  }
+};
+
+class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GaussianRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "Output matrix of gaussian random op");
+
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "The dimension of random tensor.");
+    AddAttr<float>("mean",
+                   "(float, default 0.0) "
+                   "mean of random tensor.")
+        .SetDefault(.0f);
+    AddAttr<float>("std",
+                   "(float, default 1.0) "
+                   "std of random tensor.")
+        .SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "(int, default 0) "
+                 "Random seed of generator."
+                 "0 means use system wide seed.")
+        .SetDefault(0);
+    AddAttr<int>("dtype",
+                 "(int, default 5(FP32)) "
+                 "Output data type.")
+        .SetDefault(framework::proto::DataType::FP32);
+
+    AddComment(R"DOC(
+GaussianRandom Operator.
+
+Used to initialize tensors with a Gaussian random number generator.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(gaussian_random, ops::GaussianRandomOp,
+                             ops::GaussianRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::CPUGaussianRandomKernel<float>);
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
new file mode 100644
index 0000000000..8a70db17e1
--- /dev/null
+++ b/paddle/operators/gaussian_random_op.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct GaussianGenerator {
+  T mean_, std_;
+  unsigned int seed_;
+
+  __host__ __device__ GaussianGenerator(T mean, T std, unsigned int seed)
+      : mean_(mean), std_(std), seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::normal_distribution<T> dist(mean_, std_);
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+
+template <typename T>
+class GPUGaussianRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
+    }
+    T mean = static_cast<T>(context.Attr<float>("mean"));
+    T std = static_cast<T>(context.Attr<float>("std"));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    int64_t size = tensor->numel();
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      GaussianGenerator<T>(mean, std, seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(gaussian_random,
+                        paddle::operators::GPUGaussianRandomKernel<float>);
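Note on the GPU kernel above: instead of sharing one RNG across threads, every output index gets its own engine that is re-seeded and then skipped ahead by the index via discard(n), which makes the thrust::transform order-independent. A condensed sketch of just that pattern (NormalAt and the sizes are illustrative; compile with nvcc):

    #include <thrust/device_vector.h>
    #include <thrust/iterator/counting_iterator.h>
    #include <thrust/random.h>
    #include <thrust/transform.h>

    // Each index n gets a private engine, so no state is shared
    // between threads.
    struct NormalAt {
      float mean, std;
      unsigned int seed;
      __host__ __device__ float operator()(unsigned int n) const {
        thrust::minstd_rand rng(seed);
        thrust::normal_distribution<float> dist(mean, std);
        rng.discard(n);  // jump to the n-th position of the stream
        return dist(rng);
      }
    };

    // usage, with an arbitrary size:
    //   thrust::device_vector<float> out(1024);
    //   thrust::transform(thrust::counting_iterator<unsigned int>(0),
    //                     thrust::counting_iterator<unsigned int>(1024),
    //                     out.begin(), NormalAt{0.f, 1.f, 42u});
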
diff --git a/paddle/operators/get_places_op.cc b/paddle/operators/get_places_op.cc
new file mode 100644
index 0000000000..24fafb2307
--- /dev/null
+++ b/paddle/operators/get_places_op.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thread>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/place.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/platform/gpu_info.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+static size_t CUDADevCount() {
+#ifdef PADDLE_WITH_CUDA
+  return platform::GetCUDADeviceCount();
+#else
+  return 0UL;
+#endif
+}
+
+class GetPlacesOp : public framework::OperatorBase {
+ public:
+  GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    bool is_gpu;
+    if (Attr<std::string>("device_type") == "AUTO") {
+      is_gpu = platform::is_gpu_place(place);
+    } else {
+      is_gpu = Attr<std::string>("device_type") == "CUDA";
+    }
+    auto device_count = static_cast<size_t>(Attr<int>("device_count"));
+    if (device_count == 0) {
+      device_count =
+          is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
+    }
+    PADDLE_ENFORCE_NE(device_count, 0, "Cannot determine the %s device count",
+                      is_gpu ? "GPU" : "CPU");
+
+    auto out_var_name = Output("Out");
+    auto &places =
+        *(detail::Ref(scope.FindVar(out_var_name),
+                      "Output variable %s cannot be found", out_var_name)
+              .GetMutable<platform::PlaceList>());
+    places.reserve(device_count);
+    if (is_gpu) {
+      PADDLE_ENFORCE_LE(device_count, CUDADevCount(),
+                        "Only %d CUDA devices found, cannot set to %d",
+                        CUDADevCount(), device_count);
+      for (size_t i = 0; i < device_count; ++i) {
+        places.emplace_back(platform::CUDAPlace(static_cast<int>(i)));
+      }
+    } else {
+      for (size_t i = 0; i < device_count; ++i) {
+        places.emplace_back(platform::CPUPlace());
+      }
+    }
+  }
+};
+
+class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "vector of Place");
+    AddAttr<int>("device_count", "device count").SetDefault(0);
+    AddAttr<std::string>("device_type", "device type")
+        .InEnum({"CUDA", "CPU", "AUTO"})
+        .SetDefault("AUTO");
+    AddComment(R"DOC(
+Returns a list of places based on flags. The list will be used for parallel
+execution.
+)DOC");
+  }
+};
+
+class GetPlacesInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o_name : op_desc.Output("Out")) {
+      block->FindRecursiveOrCreateVar(o_name).SetType(
+          framework::proto::VarDesc::PLACE_LIST);
+    }
+  }
+};
+
+class GetPlacesInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    // Do nothing
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker,
+                  ops::GetPlacesInferVarType, ops::GetPlacesInferShape,
+                  paddle::framework::EmptyGradOpMaker);
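Note on GetPlacesOp::Run above: the place list is resolved in two steps — device_type decides GPU vs. CPU (AUTO defers to the place the op runs on), and a device_count of 0 expands to everything detected. A standalone mirror of that logic, with hypothetical names:

    #include <cstddef>
    #include <string>
    #include <thread>

    // PlaceRequest and ResolvePlaces are illustrative names, not
    // framework API.
    struct PlaceRequest {
      bool is_gpu;
      std::size_t device_count;
    };

    PlaceRequest ResolvePlaces(const std::string& device_type, int requested,
                               bool running_on_gpu,
                               std::size_t cuda_dev_count) {
      PlaceRequest r;
      r.is_gpu = (device_type == "AUTO") ? running_on_gpu
                                         : (device_type == "CUDA");
      r.device_count = static_cast<std::size_t>(requested);
      if (r.device_count == 0) {  // 0 means "use every available device"
        r.device_count = r.is_gpu ? cuda_dev_count
                                  : std::thread::hardware_concurrency();
      }
      return r;
    }
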
diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc
new file mode 100644
index 0000000000..fb901b6394
--- /dev/null
+++ b/paddle/operators/gru_op.cc
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(%s) of GRUOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchResetHiddenPrev"),
+                   "Output(%s) of GRUOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_dims[1], frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("BatchGate", input_dims);
+    ctx->SetOutputDim("BatchResetHiddenPrev", {input_dims[0], frame_size});
+    ctx->SetOutputDim("BatchHidden", {input_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", {input_dims[0], frame_size});
+    ctx->ShareLoD("Input", "Hidden");
+  }
+};
+
+class GRUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) The first input is a LodTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTenosr is a matrix with shape (T X 3D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) The initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.")
+        .AsDispensable();
+    AddInput(
+        "Weight",
+        "(Tensor) The learnable hidden-hidden weight matrix with shape "
+        "(D x 3D), where D is the hidden size. The elements continuous in "
+        "memory can be divided into two parts. The first part are weights of "
+        "the update gate and reset gate with shape (D x 2D), and the second "
+        "part are weights of output candidate with shape (D x D).");
+    AddInput("Bias",
+             "(Tensor, optional) Bias vector with shape (1 x 3D) concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("BatchGate",
+              "(LoDTensor) To compute with batches, sequence data will be "
+              "reorganized into several successive batches each containing "
+              "data from the same time step. The LoDTensor BatchGate contains "
+              "the update gate, reset gate and output candidate values "
+              "organized in batches. The LoD size is 2. The first LoD contains "
+              "the batch offsets and the second LoD contains the indexes in "
+              "the raw sequence data.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchResetHiddenPrev",
+        "(LoDTensor) The reseted hidden state LoDTensor organized in batches. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "BatchHidden",
+        "(LoDTensor) The hidden state LoDTensor organized in batches.  "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.")
+        .AsIntermediate();
+    AddOutput(
+        "Hidden",
+        "(LoDTensor) the hidden state LoDTensor organized in sequences. "
+        "This LoDTensor is a matrix with shape (T X D) and has the same LoD "
+        "with `BatchGate`.");
+    AddAttr<std::string>("activation",
+                         "(string, default tanh) "
+                         "The activation type used for output candidate {h}_t.")
+        .SetDefault("tanh");
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default sigmoid) "
+        "The activation type used in update gate and reset gate.")
+        .SetDefault("sigmoid");
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed GRU.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+The GRU Operator implements part of the calculations of the complete GRU as follows:
+
+$$
+update\_gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset\_gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output\_candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+$$
+
+@note To implement the complete GRU, a fully-connected operator must be
+applied beforehand to feed xu, xr and xc as the Input of the GRU operator.
+)DOC");
+  }
+};
+
+class GRUGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(%s) of GRUGradOp should not be null.", "BatchGate");
+    PADDLE_ENFORCE(ctx->HasInput("BatchResetHiddenPrev"),
+                   "Input(%s) of GRUGradOp should not be null.",
+                   "BatchResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("BatchHidden"),
+                   "Input(%s) of GRUOp should not be null.", "BatchHidden");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUGradOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = weight_dims[0];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(input_size, frame_size * 3,
+                      "The input_size must be 3 times of frame_size in GRUOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("H0")) {
+      auto h0_dims = ctx->GetInputDim("H0");
+      PADDLE_ENFORCE_EQ(h0_dims[1], frame_size,
+                        "The width of H0 must be equal to frame_size.");
+      auto h0_grad_name = framework::GradVarName("H0");
+      if (ctx->HasOutput(h0_grad_name))
+        ctx->SetOutputDim(h0_grad_name, h0_dims);
+    }
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp);
+REGISTER_OP_CPU_KERNEL(
+    gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc
new file mode 100644
index 0000000000..9cb0cc42d5
--- /dev/null
+++ b/paddle/operators/gru_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gru_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru_grad, ops::GRUGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h
new file mode 100644
index 0000000000..b1957fb9ce
--- /dev/null
+++ b/paddle/operators/gru_op.h
@@ -0,0 +1,256 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    context.ShareLoD("Input", "Hidden");
+
+    auto hidden_dims = hidden->dims();
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    to_batch(dev_ctx, *input, *batch_gate, true, is_reverse);
+
+    if (bias) {
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
+    }
+
+    int frame_size = hidden_dims[1];
+    math::GRUMetaValue<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      // Since the batch computation for GRU reorders the input sequences
+      // according to their length, the initial hidden state also needs
+      // to be reordered.
+      ReorderInitState<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), *h0, order,
+          &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      gru_value.output_value = hidden_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+      math::GRUUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+          active_gate);
+      gru_value.prev_out_value = gru_value.output_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(dev_ctx, *batch_hidden, *hidden);
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GRUGradKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* batch_gate = context.Input<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Input<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Input<LoDTensor>("BatchHidden");
+    auto* hidden = context.Input<LoDTensor>("Hidden");
+    auto* hidden_grad =
+        context.Input<LoDTensor>(framework::GradVarName("Hidden"));
+    auto* input_grad =
+        context.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* h0_grad = context.Output<Tensor>(framework::GradVarName("H0"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto gate_dims = batch_gate->dims();
+    auto hidden_dims = hidden->dims();
+    int frame_size = hidden_dims[1];
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad;
+    batch_hidden_grad.mutable_data<T>(hidden_dims, context.GetPlace());
+    batch_gate_grad.mutable_data<T>(gate_dims, context.GetPlace());
+    batch_reset_hidden_prev_grad.mutable_data<T>(hidden_dims,
+                                                 context.GetPlace());
+    math::SetConstant<DeviceContext, T> zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    zero(dev_ctx, &batch_hidden_grad, static_cast<T>(0.0));
+    zero(dev_ctx, &batch_gate_grad, static_cast<T>(0.0));
+    zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast<T>(0.0));
+
+    Tensor ordered_h0, ordered_h0_grad;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (h0) {
+      ReorderInitState<DeviceContext, T>(dev_ctx, *h0, order, &ordered_h0,
+                                         true);
+    }
+    if (h0_grad) {
+      ordered_h0_grad.mutable_data<T>(h0_grad->dims(), context.GetPlace());
+      zero(context.template device_context<DeviceContext>(), &ordered_h0_grad,
+           static_cast<T>(0.0));
+    }
+
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    batch_hidden_grad.set_lod(batch_hidden->lod());
+    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
+
+    math::GRUMetaValue<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+
+    math::GRUMetaGrad<T> gru_grad;
+    if (weight_grad) {
+      gru_grad.gate_weight_grad =
+          weight_grad->mutable_data<T>(context.GetPlace());
+      zero(dev_ctx, weight_grad, static_cast<T>(0.0));
+      gru_grad.state_weight_grad =
+          weight_grad->data<T>() + 2 * frame_size * frame_size;
+    } else {
+      gru_grad.gate_weight_grad = nullptr;
+      gru_grad.state_weight_grad = nullptr;
+    }
+
+    auto batch_starts = batch_hidden_grad.lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+      int cur_batch_size = bend - bstart;
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      gru_value.gate_value = gate_t.data<T>();
+      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+
+      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
+      gru_grad.output_grad = hidden_grad_t.data<T>();
+      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
+      gru_grad.gate_grad = gate_grad_t.data<T>();
+      Tensor reset_hidden_prev_grad_t =
+          batch_reset_hidden_prev_grad.Slice(bstart, bend);
+      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
+      if (n == 0) {
+        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_grad.prev_out_grad =
+            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
+      } else {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
+        gru_value.prev_out_value = hidden_prev_t.data<T>();
+        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
+        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
+      }
+
+      math::GRUUnitGradFunctor<DeviceContext, T>::compute(
+          dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, active_node,
+          active_gate);
+    }
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+      batch_gate_grad.set_lod(batch_gate->lod());
+      to_seq(dev_ctx, batch_gate_grad, *input_grad);
+    }
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      math::ColwiseSum<DeviceContext, T> col_sum;
+      col_sum(dev_ctx, batch_gate_grad, bias_grad);
+    }
+    if (h0 && h0_grad) {
+      ReorderInitState<DeviceContext, T>(dev_ctx, ordered_h0_grad, order,
+                                         h0_grad, false);
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
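Note on the two kernels above: both walk the batched layout the same way — lod level 0 of BatchGate ("batch_starts") stores offsets such that rows [bstart, bend) contain every sequence still active at time step n, and the previous slice's output becomes prev_out_value for the next step. A sketch of that traversal (ForEachBatch is a hypothetical helper, not framework API):

    #include <cstddef>
    #include <vector>

    // Visit every time-step batch in order; `step` receives the row
    // range to slice, e.g. Slice(bstart, bend) on the gate/hidden
    // tensors as in GRUKernel::BatchCompute.
    template <typename Fn>
    void ForEachBatch(const std::vector<std::size_t>& batch_starts,
                      Fn step) {
      const std::size_t num_batch = batch_starts.size() - 1;
      for (std::size_t n = 0; n < num_batch; ++n) {
        int bstart = static_cast<int>(batch_starts[n]);
        int bend = static_cast<int>(batch_starts[n + 1]);
        step(n, bstart, bend);
      }
    }

The grad kernel runs the same offsets in reverse (n from num_batch - 1 down to 0), using the slice at n - 1 as the previous hidden state.
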
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
new file mode 100644
index 0000000000..c354293be7
--- /dev/null
+++ b/paddle/operators/gru_unit_op.cc
@@ -0,0 +1,209 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/gru_unit_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUUnitOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
+                   "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUUnitOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("Gate"),
+                   "Output(%s) of GRUUnitOp should not be null.", "Gate");
+    PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"),
+                   "Output(%s) of GRUUnitOp should not be null.",
+                   "ResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUUnitOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int batch_size = input_dims[0];
+    int input_size = input_dims[1];
+    int frame_size = hidden_prev_dims[1];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(
+        input_size, frame_size * 3,
+        "The input_size must be 3 times of frame_size in GRUUnitOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("Gate", {batch_size, frame_size * 3});
+    ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size});
+    ctx->SetOutputDim("Hidden", {batch_size, frame_size});
+  }
+};
+
+class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
+             "input.");
+    AddInput("HiddenPrev",
+             "(Tensor) Matrix with shape [batch_size, frame_size] for the "
+             "states of previous time step.");
+    AddInput(
+        "Weight",
+        "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
+        "The elements continuous in memory can be divided into two parts. "
+        "The first part are weights of the update gate and reset gate "
+        "with shape [frame_size, frame_size * 2], and the second part are "
+        "weights of output candidate with shape [frame_size, frame_size].");
+    AddInput(
+        "Bias",
+        "(Tensor) Bias vector with shape [1, frame_size * 3] concatenating "
+        "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();
+    AddOutput("Gate",
+              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
+              "output of update gate, reset gate and output candidate.")
+        .AsIntermediate();
+    AddOutput("ResetHiddenPrev",
+              "(Tensor) Matrix with shape [batch_size, frame_size] for the "
+              "reseted hidden state of previous time step.")
+        .AsIntermediate();
+    AddOutput("Hidden",
+              "(Tensor) The GRU hidden state of the current time step "
+              "with shape [batch_size, frame_size].");
+    AddAttr<int>("activation",
+                 "(enum int, default tanh) "
+                 "The activation type used for output candidate {h}_t.")
+        .SetDefault(tanh)
+        .InEnum({identity, sigmoid, tanh, relu});
+    AddAttr<int>("gate_activation",
+                 "(enum int, default sigmoid) "
+                 "The activation type used in update gate and reset gate.")
+        .SetDefault(sigmoid)
+        .InEnum({identity, sigmoid, tanh, relu});
+    AddComment(R"DOC(
+The GRUUnit Operator implements partial calculations of the GRU unit as follows:
+
+$$
+update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
+output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
+$$
+
+which is the same as one time step of the GRU Operator.
+
+@note To implement the complete GRU unit, a fully-connected operator must be
+applied beforehand to feed xu, xr and xc as the Input of the GRUUnit operator.
+
+)DOC");
+  }
+};
+
+class GRUUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
+                   "Input(%s) of GRUUnitGradOp should not be null.",
+                   "HiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("Gate"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Gate");
+    PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"),
+                   "Input(%s) of GRUUnitGradOp should not be null.",
+                   "ResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
+                   "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = hidden_prev_dims[1];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(
+        input_size, frame_size * 3,
+        "The input_size must be 3 times of frame_size in GRUUnitOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev");
+    if (ctx->HasOutput(hidden_prev_grad_name))
+      ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
+            ops::GRUUnitGradOp);
+REGISTER_OP_CPU_KERNEL(
+    gru_unit, ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUUnitKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    gru_unit_grad,
+    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GRUUnitGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu
new file mode 100644
index 0000000000..95c8c23dad
--- /dev/null
+++ b/paddle/operators/gru_unit_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gru_unit_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    gru_unit, ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUUnitKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    gru_unit_grad,
+    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GRUUnitGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
new file mode 100644
index 0000000000..a77be46718
--- /dev/null
+++ b/paddle/operators/gru_unit_op.h
@@ -0,0 +1,244 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/activation_op.h"
+#include "paddle/operators/math/math_function.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
+
+template <typename DeviceContext, typename T>
+class GRUUnitKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const int act_type, const Device& d, X x, Y y) const {
+    if (act_type == identity)
+      y.device(d) = x;
+    else if (act_type == sigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == tanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == relu)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("Input");
+    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
+    auto* weight = context.Input<Tensor>("Weight");
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* gate = context.Output<Tensor>("Gate");
+    gate->mutable_data<T>(context.GetPlace());
+    auto* reset_hidden_prev = context.Output<Tensor>("ResetHiddenPrev");
+    reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<Tensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
+    auto x = EigenMatrix<T>::From(*input);
+    auto h_p = EigenMatrix<T>::From(*hidden_prev);
+    auto g = EigenMatrix<T>::From(*gate);
+    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
+    auto h = EigenMatrix<T>::From(*hidden);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    // calculate unactivated gate outputs
+    if (bias) {
+      auto b = EigenMatrix<T>::From(*bias);
+      g.device(place) = x +
+                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+    } else {
+      g.device(place) = x;
+    }
+    const T* hidden_prev_data = hidden_prev->data<T>();
+    const T* weight_data = weight->data<T>();
+    T* gate_data = gate->data<T>();
+    T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, false,
+        batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size,
+        weight_data, frame_size * 2, 1, gate_data, frame_size * 3);
+
+    // calculate activated gates
+    Eigen::array<int, 2> extents({{batch_size, frame_size}});
+    Eigen::array<int, 2> u_offsets({{0, 0}});
+    ActCompute(context.Attr<int>("gate_activation"), place,
+               g.slice(u_offsets, extents), g.slice(u_offsets, extents));
+    auto u = g.slice(u_offsets, extents);  // update gate
+    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    ActCompute(context.Attr<int>("gate_activation"), place,
+               g.slice(r_offsets, extents), g.slice(r_offsets, extents));
+    auto r = g.slice(r_offsets, extents);  // reset gate
+    r_h_p.device(place) = r * h_p;         // reset previous hidden state
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, false,
+        batch_size, frame_size, frame_size, 1, reset_hidden_prev_data,
+        frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1,
+        gate_data + frame_size * 2, frame_size * 3);
+
+    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    ActCompute(context.Attr<int>("activation"), place,
+               g.slice(c_offsets, extents), g.slice(c_offsets, extents));
+    auto c = g.slice(c_offsets, extents);  // output candidate
+
+    // calculate final output
+    h.device(place) = u * (c - h_p) + h_p;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GRUUnitGradKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx,
+                      DY dy) const {
+    // x is a dummy input and is not used even for Relu (y is used instead)
+    if (act_type == identity)
+      dx.device(d) = dy;
+    else if (act_type == sigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == tanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == relu)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("Input");
+    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
+    auto* weight = context.Input<Tensor>("Weight");
+    auto* gate = context.Input<Tensor>("Gate");
+    auto* reset_hidden_prev = context.Input<Tensor>("ResetHiddenPrev");
+    auto* hidden_grad = context.Input<Tensor>(framework::GradVarName("Hidden"));
+    auto* input_grad = context.Output<Tensor>(framework::GradVarName("Input"));
+    auto* hidden_prev_grad =
+        context.Output<Tensor>(framework::GradVarName("HiddenPrev"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+    Tensor gate_grad;
+    Tensor reset_hidden_prev_grad;
+
+    const T* hidden_prev_data = hidden_prev->data<T>();
+    const T* weight_data = weight->data<T>();
+    T* gate_grad_data =
+        gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
+    const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.mutable_data<T>(
+        reset_hidden_prev->dims(), context.GetPlace());
+
+    auto h_p = EigenMatrix<T>::From(*hidden_prev);
+    auto g = EigenMatrix<T>::From(*gate);
+    auto d_h = EigenMatrix<T>::From(*hidden_grad);
+    auto d_g = EigenMatrix<T>::From(gate_grad);
+    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
+    Eigen::array<int, 2> extents({{batch_size, frame_size}});
+    Eigen::array<int, 2> u_offsets({{0, 0}});
+    auto u = g.slice(u_offsets, extents);  // update gate
+    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    auto r = g.slice(r_offsets, extents);  // reset gate
+    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    auto c = g.slice(c_offsets, extents);  // output candidate
+
+    // backward for unactivated update gate
+    ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                   d_g.slice(u_offsets, extents), d_h * (c - h_p));
+    // backward for unactivated output candidate
+    ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                   d_g.slice(c_offsets, extents), d_h * u);
+    // backward for reset_hidden_prev
+    math::gemm<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), false, true,
+        batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2,
+        frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size,
+        0, reset_hidden_prev_grad_data, frame_size);
+    // backward for unactivated reset gate
+    ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
+                   d_g.slice(r_offsets, extents), d_r_h_p * h_p);
+    // backward for weight
+    if (weight_grad) {
+      T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
+      // backward for state_weight
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), true, false,
+          frame_size, frame_size, batch_size, 1, reset_hidden_prev_data,
+          frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0,
+          weight_grad_data + frame_size * frame_size * 2, frame_size);
+
+      // backward for update_gate_weight and reset_gate_weight
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), true, false,
+          frame_size, frame_size * 2, batch_size, 1, hidden_prev_data,
+          frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data,
+          frame_size * 2);
+    }
+    // backward for hidden_prev
+    if (hidden_prev_grad) {
+      T* hidden_prev_grad_data =
+          hidden_prev_grad->mutable_data<T>(context.GetPlace());
+      auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+      d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
+      math::gemm<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), false, true,
+          batch_size, frame_size, frame_size * 2, 1, gate_grad_data,
+          frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data,
+          frame_size);
+    }
+    // backward for input
+    if (input_grad) {
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto d_x = EigenMatrix<T>::From(*input_grad);
+      d_x.device(place) = d_g;
+    }
+    // backward for bias
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      auto d_b = EigenVector<T>::Flatten(*bias_grad);
+      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
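Note on the slicing in GRUUnitKernel::Compute above: the Eigen slices assume a fixed column layout of the Gate tensor. For orientation, the layout and a hypothetical flat-index helper (not part of the op):

    // Column layout of Gate g (D = frame_size):
    //
    //   g : [batch_size x 3D]
    //     columns [0,   D)  -> u, the update gate      (u_offsets)
    //     columns [D,  2D)  -> r, the reset gate       (r_offsets)
    //     columns [2D, 3D)  -> c, the output candidate (c_offsets)
    //
    inline int GateIndex(int row, int part, int col, int frame_size) {
      // part: 0 = update gate, 1 = reset gate, 2 = output candidate
      return row * 3 * frame_size + part * frame_size + col;
    }
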
diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc
new file mode 100644
index 0000000000..19d2e9dc56
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.cc
@@ -0,0 +1,113 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/hinge_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class HingeLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims,
+                      "The shape of Input(Logits) must match Input(Labels).");
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Logits) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Logits) contains a real value, "
+                      "so the 2nd dimension of Input(Logits) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Logits", "Loss");
+  }
+};
+
+template <typename AttrType>
+class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HingeLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "The input value (Logits) of Hinge loss op."
+             "Logits is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Hinge loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the hinge loss.");
+    AddComment(R"DOC(
+HingeLoss Operator.
+
+Let x be a logit (prediction) and y be the actual label. The logit can
+take any value in (-inf, inf), but the label used in the loss should be
+either -1 or 1. Then, the hinge loss is computed as follows:
+
+$$
+L(x, y) = \max(1 - y \cdot x, 0)
+$$
+
+Note that the labels passed as input will have values of either 0 or 1;
+they are mapped to -1 or 1 via $2y - 1$ before the loss is computed.
+
+)DOC");
+  }
+};
+
+class HingeLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Input(Logits@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto lab_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims,
+                      "The shape of Input(Loss@GRAD) must match "
+                      "Input(Logits).");
+
+    auto pred_grad_name = framework::GradVarName("Logits");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    hinge_loss,
+    ops::HingeLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu
new file mode 100644
index 0000000000..b9cfbc50c4
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/hinge_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    hinge_loss,
+    ops::HingeLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h
new file mode 100644
index 0000000000..91369cfb8a
--- /dev/null
+++ b/paddle/operators/hinge_loss_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class HingeLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* loss = context.Output<framework::Tensor>("Loss");
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    loss->mutable_data<T>(context.GetPlace());
+    auto l = framework::EigenVector<T>::Flatten(*loss);
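+    // Map the {0, 1} input labels to {-1, +1} via 2y - 1, then take
+    // max(1 - y'x, 0) elementwise.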
+    l.device(place) =
+        (static_cast<T>(1) - x * (static_cast<T>(2) * y - static_cast<T>(1)))
+            .cwiseMax(static_cast<T>(0));
+  }
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class HingeLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* dloss =
+        context.Input<framework::Tensor>(framework::GradVarName("Loss"));
+    auto* dpred =
+        context.Output<framework::Tensor>(framework::GradVarName("Logits"));
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    auto dl = framework::EigenVector<T>::Flatten(*dloss);
+
+    if (dpred) {
+      dpred->mutable_data<T>(context.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      auto alt_labels = static_cast<T>(2) * y - static_cast<T>(1);
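+      // d/dx max(1 - y'x, 0) = -y' when y'x < 1 and 0 otherwise, with
+      // y' = 2y - 1 the {-1, +1} form of the input labels.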
+      dx.device(place) =
+          dl * ((x * alt_labels) < static_cast<T>(1)).template cast<T>() *
+          (-alt_labels);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc
new file mode 100644
index 0000000000..5c92f2c7b2
--- /dev/null
+++ b/paddle/operators/huber_loss_op.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class HuberLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must be initialized.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2,
+                      "The rank of Input(X) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1,
+                      "Each row of Input(X) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+
+    ctx->SetOutputDim("Residual", x_dims);
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+template <typename AttrType>
+class HuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input value of huber loss op."
+             "X is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Y",
+             "The target value of huber loss op."
+             "Y is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Residual",
+              "Intermediate tensor to cache residual value between Y and X."
+              "The shape is same as Input(X) and will be reused in backward.")
+        .AsIntermediate();
+    AddOutput("Out",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the huber loss.");
+    AddAttr<AttrType>("delta", "Hyper parameter in huber loss.");
+    AddComment(R"DOC(
+HuberLoss Operator.
+
+Huber loss is a loss function used in robust regression. We define X as the
+input value and Y as the target value. Huber loss can evaluate the fitness of
+X to Y. Different from MSE loss, Huber loss is more robust to outliers. The
+shapes of X and Y are both [batch_size, 1]. The equation is:
+
+$$
+Out_{\delta}(X, Y)_i =
+\begin{cases}
+0.5 * (Y_i - X_i)^2,
+\quad |Y_i - X_i| \leq \delta \\
+\delta * (|Y_i - X_i| - 0.5 * \delta),
+\quad otherwise
+\end{cases}
+$$
+
+In the above equation, $Out_\delta(X, Y)_i$, $X_i$ and $Y_i$ represent the
+$i$-th elements of Out, X and Y, respectively.
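+
+For example, with $\delta = 1.0$, a residual of 0.5 lies in the quadratic
+region and gives $0.5 \times 0.5^2 = 0.125$, while a residual of 3.0 lies in
+the linear region and gives $1.0 \times (3.0 - 0.5 \times 1.0) = 2.5$.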
+
+)DOC");
+  }
+};
+
+class HuberLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Residual"),
+                   "Input(Residual) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto residual_dims = ctx->GetInputDim("Residual");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(residual_dims, x_dims);
+    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims);
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker<float>,
+            huber_loss_grad, ops::HuberLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    huber_loss,
+    ops::HuberLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    huber_loss_grad,
+    ops::HuberLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu
new file mode 100644
index 0000000000..ccc83a16ba
--- /dev/null
+++ b/paddle/operators/huber_loss_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/huber_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    huber_loss,
+    ops::HuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    huber_loss_grad,
+    ops::HuberLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h
new file mode 100644
index 0000000000..4dd20e8b08
--- /dev/null
+++ b/paddle/operators/huber_loss_op.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
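+// Piecewise Huber loss on the residual: quadratic inside [-delta, delta]
+// and linear outside, so large residuals are penalized less than with MSE.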
+template <typename T>
+struct HuberLossForward {
+  HOSTDEVICE HuberLossForward(const T& delta) : delta(delta) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val <= delta) {
+      return static_cast<T>(0.5) * val * val;
+    } else {
+      return delta * (abs_val - static_cast<T>(0.5) * delta);
+    }
+  }
+
+  T delta;
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class HuberLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* out0 = context.Output<Tensor>("Residual");
+    auto* out1 = context.Output<Tensor>("Out");
+    auto delta = static_cast<T>(context.Attr<AttrType>("delta"));
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto x = EigenVector<T>::Flatten(*in0);
+    auto y = EigenVector<T>::Flatten(*in1);
+    out0->mutable_data<T>(context.GetPlace());
+    auto residual = EigenVector<T>::Flatten(*out0);
+    residual.device(place) = y - x;
+    out1->mutable_data<T>(context.GetPlace());
+    auto loss = EigenVector<T>::Flatten(*out1);
+    loss.device(place) = residual.unaryExpr(HuberLossForward<T>(delta));
+  }
+};
+
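+// Gradient of the Huber loss w.r.t. the residual r = y - x:
+// d/dr = r when |r| <= delta, and delta * sign(r) otherwise. The extra
+// `sign` argument (-1 for X, +1 for Y) accounts for r depending on both.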
+template <typename T>
+struct HuberLossBackward {
+  HOSTDEVICE HuberLossBackward(const T& delta, T sign)
+      : sign(sign), delta(delta) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val <= delta) {
+      return sign * val;
+    } else {
+      if (val > 0) {
+        return sign * delta;
+      } else {
+        return -1 * sign * delta;
+      }
+    }
+  }
+
+  T sign;
+  T delta;
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class HuberLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("Residual");
+    auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
+    auto delta = static_cast<T>(context.Attr<AttrType>("delta"));
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto residual = EigenVector<T>::Flatten(*in0);
+    auto out_grad = EigenVector<T>::Flatten(*in1);
+
+    if (out0) {
+      out0->mutable_data<T>(context.GetPlace());
+      auto x_grad = EigenVector<T>::Flatten(*out0);
+      x_grad.device(place) =
+          out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, -1.0));
+    }
+
+    if (out1) {
+      out1->mutable_data<T>(context.GetPlace());
+      auto y_grad = EigenVector<T>::Flatten(*out1);
+      y_grad.device(place) =
+          out_grad * residual.unaryExpr(HuberLossBackward<T>(delta, 1.0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/im2sequence_op.cc b/paddle/operators/im2sequence_op.cc
new file mode 100644
index 0000000000..31baaedf69
--- /dev/null
+++ b/paddle/operators/im2sequence_op.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/im2sequence_op.h"
+
+namespace paddle {
+namespace operators {
+
+class Im2SequenceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Im2SequenceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of Im2SequenceOp op should not be null.");
+
+    auto in_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
+                      "Input(X) format must be a 4-D tensor, e.g., NCHW.");
+
+    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    int batch_size = in_dim[0];
+    int img_channels = in_dim[1];
+    int img_height = in_dim[2];
+    int img_width = in_dim[3];
+
+    int output_height = OutputSize(img_height, kernels[0], paddings[0],
+                                   paddings[2], strides[0]);
+    int output_width =
+        OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]);
+
+    ctx->SetOutputDim("Out", {batch_size * output_height * output_width,
+                              img_channels * kernels[0] * kernels[1]});
+  }
+};
+
+class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Im2SequenceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input tensor has NCHW format."
+             "N: batch size"
+             "C: channels"
+             "H: height"
+             "W: width");
+    AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
+    AddAttr<std::vector<int>>("kernels",
+                              "(vector<int>), the "
+                              "kernels(kernel_height, kernel_width)");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int> default:{1, 1}), the "
+                              "strides(h_stride, w_stride)")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector<int> default:{0, 0, 0, 0}), the "
+                              "paddings(up_pad, left_pad, down_pad, right_pad)")
+        .SetDefault({0, 0, 0, 0});
+    AddComment(R"DOC(
+This op uses kernels to scan images and converts these images to sequences.
+After expanding, the number of time steps is output_height * output_width
+and the dimension of each time step is kernel_height * kernel_width * channels,
+in which:
+
+output_height =
+    (padding_up + padding_down + img_height - kernel_height) / stride_height + 1;
+output_width =
+    (padding_left + padding_right + img_width - kernel_width) / stride_width + 1;
+
+This op can be used after a convolutional neural network, and before a
+recurrent neural network.
+
+Given:
+
+x = [[[[ 6.  2.  1.]
+       [ 8.  3.  5.]
+       [ 0.  2.  6.]]
+
+      [[ 2.  4.  4.]
+       [ 6.  3.  0.]
+       [ 6.  4.  7.]]]
+
+     [[[ 6.  7.  1.]
+       [ 5.  7.  9.]
+       [ 2.  4.  8.]]
+
+      [[ 1.  2.  1.]
+       [ 1.  3.  5.]
+       [ 9.  0.  8.]]]]
+x.dims = {2, 2, 3, 3}
+
+And:
+
+kernels = [2, 2]
+strides = [1, 1]
+paddings = [0, 0, 0, 0]
+
+Then:
+
+output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
+               [ 2.  1.  3.  5.  4.  4.  3.  0.]
+               [ 8.  3.  0.  2.  6.  3.  6.  4.]
+               [ 3.  5.  2.  6.  3.  0.  4.  7.]
+               [ 6.  7.  5.  7.  1.  2.  1.  3.]
+               [ 7.  1.  7.  9.  2.  1.  3.  5.]
+               [ 5.  7.  2.  4.  1.  3.  9.  0.]
+               [ 7.  9.  4.  8.  3.  5.  0.  8.]]
+output.dims = {8, 8}
+output.lod = [[0, 4, 8]]
+
+)DOC");
+  }
+};
+
+class Im2SequenceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(im2sequence, ops::Im2SequenceOp, ops::Im2SequenceOpMaker,
+            im2sequence_grad, ops::Im2SequenceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    im2sequence,
+    ops::Im2SequenceKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    im2sequence_grad,
+    ops::Im2SequenceGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/im2sequence_op.cu b/paddle/operators/im2sequence_op.cu
new file mode 100644
index 0000000000..9db7529112
--- /dev/null
+++ b/paddle/operators/im2sequence_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/im2sequence_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    im2sequence,
+    ops::Im2SequenceKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    im2sequence_grad,
+    ops::Im2SequenceGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/im2sequence_op.h b/paddle/operators/im2sequence_op.h
new file mode 100644
index 0000000000..f33aec71a9
--- /dev/null
+++ b/paddle/operators/im2sequence_op.h
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/data_layout.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
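+// For example, with input_size = 3, filter_size = 2, zero paddings and
+// stride 1, this returns (3 + 0 + 0 - 2) / 1 + 1 = 2, matching the 2x2 grid
+// of sliding windows in the im2sequence DOC example.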
+inline int OutputSize(int input_size, int filter_size, int padding_0,
+                      int padding_1, int stride) {
+  const int output_size =
+      (input_size + padding_0 + padding_1 - filter_size) / stride + 1;
+  return output_size;
+}
+
+template <typename DeviceContext, typename T>
+class Im2SequenceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor* in = ctx.Input<Tensor>("X");
+    LoDTensor* out = ctx.Output<LoDTensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
+    // being available for python API
+    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
+    //                  "Input(X) layout must be NCHW");
+    auto in_dim = in->dims();
+    int batch_size = in_dim[0];
+    int img_channels = in_dim[1];
+    int img_height = in_dim[2];
+    int img_width = in_dim[3];
+
+    auto kernels = ctx.Attr<std::vector<int>>("kernels");
+    auto strides = ctx.Attr<std::vector<int>>("strides");
+    auto paddings = ctx.Attr<std::vector<int>>("paddings");
+    int output_height = OutputSize(img_height, kernels[0], paddings[0],
+                                   paddings[2], strides[0]);
+    int output_width =
+        OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]);
+
+    const std::vector<int> dilations({1, 1});
+
+    auto out_dims = out->dims();
+    out->Resize({batch_size, out->numel() / batch_size});
+    for (int i = 0; i < batch_size; i++) {
+      const Tensor src =
+          in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+      Tensor dst = out->Slice(i, i + 1).Resize(
+          {output_height, output_width, img_channels, kernels[0], kernels[1]});
+
+      math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      f(dev_ctx, src, dilations, strides, paddings, &dst);
+    }
+    out->Resize(out_dims);
+
+    // set lod information
+    // TODO(wanghaoshuang): Move this to InferShape
+    framework::LoD lod(1);
+    lod[0].reserve(batch_size + 1);
+    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+      lod[0].push_back(offset);
+      offset += output_height * output_width;
+    }
+    out->set_lod(lod);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class Im2SequenceGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    Tensor* d_out =
+        const_cast<Tensor*>(ctx.Input<Tensor>(framework::GradVarName("Out")));
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    d_x->mutable_data<T>(ctx.GetPlace());
+
+    auto x_v = framework::EigenVector<T>::Flatten(*d_x);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+    x_v.device(place) = x_v.constant(0.0);
+
+    auto in_dim = in->dims();
+    int batch_size = in_dim[0];
+    int img_channels = in_dim[1];
+    int img_height = in_dim[2];
+    int img_width = in_dim[3];
+
+    auto kernels = ctx.Attr<std::vector<int>>("kernels");
+    auto strides = ctx.Attr<std::vector<int>>("strides");
+    auto paddings = ctx.Attr<std::vector<int>>("paddings");
+    int output_height = OutputSize(img_height, kernels[0], paddings[0],
+                                   paddings[2], strides[0]);
+    int output_width =
+        OutputSize(img_width, kernels[1], paddings[1], paddings[3], strides[1]);
+
+    const std::vector<int> dilations({1, 1});
+
+    auto d_out_dims = d_out->dims();
+    d_out->Resize({batch_size, d_out->numel() / batch_size});
+    for (int i = 0; i < batch_size; i++) {
+      Tensor dst =
+          d_x->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+      const Tensor src = d_out->Slice(i, i + 1).Resize(
+          {output_height, output_width, img_channels, kernels[0], kernels[1]});
+      math::Col2ImFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+      f(dev_ctx, src, dilations, strides, paddings, &dst);
+    }
+    d_out->Resize(d_out_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/images/batch_norm_fork.dot b/paddle/operators/images/batch_norm_fork.dot
new file mode 100644
index 0000000000..4bc47713cb
--- /dev/null
+++ b/paddle/operators/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGraph {
+  subgraph cluster_before {
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 -> Fc2_2 -> After2_2;
+    label="forked";
+  }
+}
diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/operators/images/batch_norm_fork.png
new file mode 100644
index 0000000000..aded62bce5
Binary files /dev/null and b/paddle/operators/images/batch_norm_fork.png differ
diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/operators/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000..a99ce81ff3
Binary files /dev/null and b/paddle/operators/images/batch_norm_op_kernel.png differ
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
new file mode 100644
index 0000000000..e0b80cc4e7
--- /dev/null
+++ b/paddle/operators/increment_op.cc
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class IncrementInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IncrementOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of IncrementOp should not be null.");
+    PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X")));
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+};
+
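+// Type-dispatched scalar increment: reads the single element of x_ (the
+// shape is enforced to hold exactly one element), adds value_, and writes
+// the result to out_.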
+struct IncrementFunctor {
+  IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out,
+                   float value)
+      : x_(x), out_(out), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    *out_->data<T>() = *x_.data<T>() + static_cast<T>(value_);
+  }
+
+  const framework::LoDTensor &x_;
+  framework::LoDTensor *out_;
+  float value_;
+};
+
+class IncrementOp : public framework::OperatorBase {
+ public:
+  IncrementOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+
+    PADDLE_ENFORCE(platform::is_cpu_place(x.place()));
+    out.Resize(x.dims());
+    out.mutable_data(x.place(), x.type());
+    float value = Attr<float>("step");
+    VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
+             << value;
+    framework::VisitDataType(framework::ToDataType(out.type()),
+                             IncrementFunctor(x, &out, value));
+  }
+};
+
+class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IncrementOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor of increment operator");
+    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
+    AddAttr<float>("step",
+                   "(float, default 1.0) "
+                   "The step size by which the "
+                   "input tensor will be incremented.")
+        .SetDefault(1.0);
+    AddComment(R"DOC(
+Increment Operator.
+
+The equation is:
+$$Out = X + step$$
+
+)DOC");
+  }
+};
+
+class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
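+    // Increment has no tensor gradient; the generated "grad" op simply
+    // recovers X from Out by applying increment with the negated step.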
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("increment");
+    grad_op->SetInput("X", Output("Out"));
+    grad_op->SetOutput("Out", Input("X"));
+    grad_op->SetAttr("step", -boost::get<float>(GetAttr("step")));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape,
+                  ops::IncrementOpMaker, ops::IncrementGradOpMaker);
diff --git a/paddle/operators/iou_similarity_op.cc b/paddle/operators/iou_similarity_op.cc
new file mode 100755
index 0000000000..c520b28b83
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IOUSimilarityOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IOUSimilarityOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of IOUSimilarityOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "The rank of Input(X) must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 4UL, "The shape of X is [N, 4]");
+    PADDLE_ENFORCE_EQ(y_dims.size(), 2UL, "The rank of Input(Y) must be 2.");
+    PADDLE_ENFORCE_EQ(y_dims[1], 4UL, "The shape of Y is [M, 4]");
+
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", framework::make_ddim({x_dims[0], y_dims[0]}));
+  }
+};
+
+class IOUSimilarityOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IOUSimilarityOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Box list X is a 2-D LoDTensor with shape [N, 4] holds N boxes, "
+             "each box is represented as [xmin, ymin, xmax, ymax], "
+             "the shape of X is [N, 4]. [xmin, ymin] is the left top "
+             "coordinate of the box if the input is image feature map, they "
+             "are close to the origin of the coordinate system. "
+             "[xmax, ymax] is the right bottom coordinate of the box. "
+             "This tensor can contain LoD information to represent a batch "
+             "of inputs. One instance of this batch can contain different "
+             "numbers of entities.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) "
+             "Box list Y holds M boxes, each box is represented as "
+             "[xmin, ymin, xmax, ymax], the shape of X is [N, 4]. "
+             "[xmin, ymin] is the left top coordinate of the box if the "
+             "input is image feature map, and [xmax, ymax] is the right "
+             "bottom coordinate of the box.");
+
+    AddOutput("Out",
+              "(LoDTensor, the lod is same as input X) The output of "
+              "iou_similarity op, a tensor with shape [N, M] "
+              "representing pairwise iou scores.");
+
+    AddComment(R"DOC(
+IOU Similarity Operator.
+
+Computes intersection-over-union (IOU) between two box lists. Box list 'X'
+should be a LoDTensor and 'Y' a plain Tensor; the boxes in 'Y' are shared by
+all instances of the batched inputs of X. Given two boxes A and B, the IOU
+is computed as:
+
+$$
+IOU(A, B) = \frac{area(A\cap B)}{area(A)+area(B)-area(A\cap B)}
+$$
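+
+For example, for A = [0, 0, 2, 2] and B = [1, 1, 3, 3], the intersection is
+the unit square [1, 1, 2, 2], so IOU = 1 / (4 + 4 - 1) = 1/7 ≈ 0.143.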
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp,
+                             ops::IOUSimilarityOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/iou_similarity_op.cu b/paddle/operators/iou_similarity_op.cu
new file mode 100755
index 0000000000..fa50526246
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/iou_similarity_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    iou_similarity,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::IOUSimilarityKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/iou_similarity_op.h b/paddle/operators/iou_similarity_op.h
new file mode 100644
index 0000000000..e36177069d
--- /dev/null
+++ b/paddle/operators/iou_similarity_op.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/for_range.h"
+
+template <typename T>
+inline HOSTDEVICE T IOUSimilarity(T xmin1, T ymin1, T xmax1, T ymax1, T xmin2,
+                                  T ymin2, T xmax2, T ymax2) {
+  constexpr T zero = static_cast<T>(0);
+  T area1 = (ymax1 - ymin1) * (xmax1 - xmin1);
+  T area2 = (ymax2 - ymin2) * (xmax2 - xmin2);
+  T inter_xmax = xmax1 > xmax2 ? xmax2 : xmax1;
+  T inter_ymax = ymax1 > ymax2 ? ymax2 : ymax1;
+  T inter_xmin = xmin1 > xmin2 ? xmin1 : xmin2;
+  T inter_ymin = ymin1 > ymin2 ? ymin1 : ymin2;
+  T inter_height = inter_ymax - inter_ymin;
+  T inter_width = inter_xmax - inter_xmin;
+  inter_height = inter_height > zero ? inter_height : zero;
+  inter_width = inter_width > zero ? inter_width : zero;
+  T inter_area = inter_width * inter_height;
+  T union_area = area1 + area2 - inter_area;
+  T sim_score = inter_area / union_area;
+  return sim_score;
+}
+
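+// Computes, for one box of X (indexed by row_id), the IOU against every
+// box in Y, filling one row of the [N, M] output.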
+template <typename T>
+struct IOUSimilarityFunctor {
+  IOUSimilarityFunctor(const T* x, const T* y, T* z, int cols)
+      : x_(x), y_(y), z_(z), cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    T x_min1 = x_[row_id * 4];
+    T y_min1 = x_[row_id * 4 + 1];
+    T x_max1 = x_[row_id * 4 + 2];
+    T y_max1 = x_[row_id * 4 + 3];
+    for (size_t i = 0; i < cols_; ++i) {
+      T x_min2 = y_[i * 4];
+      T y_min2 = y_[i * 4 + 1];
+      T x_max2 = y_[i * 4 + 2];
+      T y_max2 = y_[i * 4 + 3];
+
+      T sim = IOUSimilarity(x_min1, y_min1, x_max1, y_max1, x_min2, y_min2,
+                            x_max2, y_max2);
+
+      z_[row_id * cols_ + i] = sim;
+    }
+  }
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class IOUSimilarityKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::LoDTensor* in_x = ctx.Input<framework::LoDTensor>("X");
+    const framework::Tensor* in_y = ctx.Input<framework::Tensor>("Y");
+    framework::LoDTensor* out = ctx.Output<framework::LoDTensor>("Out");
+
+    int x_n = in_x->dims()[0];
+    int y_n = in_y->dims()[0];
+    IOUSimilarityFunctor<T> functor(in_x->data<T>(), in_y->data<T>(),
+                                    out->mutable_data<T>(ctx.GetPlace()), y_n);
+
+    platform::ForRange<DeviceContext> for_range(
+        static_cast<const DeviceContext&>(ctx.device_context()), x_n);
+    for_range(functor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/is_empty_op.cc b/paddle/operators/is_empty_op.cc
new file mode 100644
index 0000000000..492ae48845
--- /dev/null
+++ b/paddle/operators/is_empty_op.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kInput[] = "X";
+constexpr char kOutput[] = "Out";
+
+class IsEmptyOp : public framework::OperatorBase {
+ public:
+  IsEmptyOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    // get input
+    auto *var = scope.FindVar(Input(kInput));
+    PADDLE_ENFORCE_NOT_NULL(var);
+    auto &tensor = var->Get<framework::LoDTensor>();
+    // get output
+    auto *out = scope.FindVar(Output(kOutput));
+    PADDLE_ENFORCE_NOT_NULL(out);
+    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+
+    out_tensor->Resize({1});
+    out_tensor->mutable_data<bool>(platform::CPUPlace())[0] =
+        framework::product(tensor.dims()) == 0;
+  }
+};
+
+class IsEmptyOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IsEmptyOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInput, "(Tensor) Tensor which is to be checked.");
+    AddOutput(kOutput,
+              "(Tensor) A boolean Tensor that indicates whether the input "
+              "tensor is empty.");
+    AddComment(R"DOC(
+IsEmpty Operator which checks whether a tensor is empty.
+
+It returns true iff product(tensor.dims()) == 0, i.e. the input tensor
+holds no elements.
+              )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(is_empty, paddle::operators::IsEmptyOp,
+                             paddle::operators::IsEmptyOpProtoMaker);
diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc
new file mode 100644
index 0000000000..1a5d6e1926
--- /dev/null
+++ b/paddle/operators/l1_norm_op.cc
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/l1_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class L1NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+class L1NormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+class L1NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  L1NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input of l1_norm op.");
+    AddOutput("Out", "(Scalar) The output of l1_norm op.");
+    AddComment(R"DOC(
+L1 Norm Operator.
+
+Computes the L1 norm of a tensor.
+
+$$Out = \sum_{i}{|X_i|}$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad,
+            ops::L1NormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    l1_norm, ops::L1NormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    l1_norm_grad,
+    ops::L1NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu
new file mode 100644
index 0000000000..7ecc774670
--- /dev/null
+++ b/paddle/operators/l1_norm_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/l1_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    l1_norm, ops::L1NormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    l1_norm_grad,
+    ops::L1NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h
new file mode 100644
index 0000000000..086d42705d
--- /dev/null
+++ b/paddle/operators/l1_norm_op.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out = sum(abs(X))
+template <typename DeviceContext, typename T>
+class L1NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto out = framework::EigenScalar<T>::From(*Out);
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    out.device(place) = x.abs().sum();
+  }
+};
+
+// dX = dout * sign(X)
+template <typename DeviceContext, typename T>
+class L1NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *x = context.Input<framework::Tensor>("X");
+    const framework::Tensor *d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(d_out->numel() == 1, "L1 Norm Gradient should be scalar");
+    framework::Tensor *dx =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(context.GetPlace());
+
+    auto x_eigen = framework::EigenVector<T>::Flatten(*x);
+    auto d_out_eigen = framework::EigenVector<T>::Flatten(*d_out);
+    auto dx_eigen = framework::EigenVector<T>::Flatten(*dx);
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 1> x_dsize(x->numel());
+    dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/label_smooth_op.cc b/paddle/operators/label_smooth_op.cc
new file mode 100644
index 0000000000..c89082f44b
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LabelSmoothOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LabelSmoothOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LabelSmoothOp should not be null.");
+    auto in_dims = ctx->GetInputDim("X");
+    if (ctx->HasInput("PriorDist")) {
+      auto noise_dims = ctx->GetInputDim("PriorDist");
+      auto noise_numel = paddle::framework::product(noise_dims);
+      PADDLE_ENFORCE(
+          in_dims[1] == noise_numel,
+          "The number of elements in Input(PriorDist) must be equal to the "
+          "dimension of each label.");
+    }
+    ctx->ShareLoD("X", /*->*/ "Out");
+    ctx->SetOutputDim("Out", in_dims);
+  }
+};
+
+class LabelSmoothOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LabelSmoothOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) The input labels of LabelSmooth operator. This "
+             "input can be batched labels in one-hot encoding or output from "
+             "softmax, with shape [N x K], where N is the batch size and K is "
+             "the number of classes");
+    AddInput("PriorDist",
+             "(Tensor, optional)"
+             "The prior distribution to be added to the smoothed label. It is "
+             "fixed during training and the number of elements should be equal "
+             "to the dimension K of each label. Default is uniform "
+             "distribution and each element will be set to 1/K if not provided "
+             "in input.")
+        .AsDispensable();
+    AddOutput("Out",
+              "(loDTensor) The smoothed label of LabelSmooth operator. It has"
+              "the same shape and LoD with the Input(LoDTensor).");
+    AddAttr<float>("epsilon",
+                   "(float, default 0.0f)"
+                   "The smoothing parameter of LabelSmooth operator.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+LabelSmooth Operator.
+
+Label smoothing is a mechanism to regularize the classifier layer. In machine 
+learning, optimizing the log-likelihood of the correct label directly may 
+cause two problems. First, it may result in overfitting: if the model learns 
+to assign full probability to the ground-truth label for each training example,
+it is not guaranteed to generalize. Second, it encourages the differences 
+between the largest logit and all others to become large, reducing the ability 
+of the model to adapt. Label smoothing is proposed to encourage the model to 
+be less confident, which replaces the ground-truth label $y$ with the weighted 
+sum of itself and some fixed distribution $\mu$, i.e.
+
+$$
+    \tilde{y} = (1 - \epsilon) * y + \epsilon * \mu,
+$$
+
+where $(1 - \epsilon)$ and $\epsilon$ are the respective weights, and
+$\tilde{y}$ is the smoothed label. Usually, a uniform distribution is used for
+$\mu$. This change in the ground-truth label is called label-smoothing
+regularization, or LSR.
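+
+For example, with K = 4 classes, one-hot label $y = [0, 1, 0, 0]$,
+$\epsilon = 0.1$ and the uniform prior ($\mu_k = 1/4$), the smoothed label is
+$\tilde{y} = 0.9 \, y + 0.1 \times 0.25 = [0.025, 0.925, 0.025, 0.025]$.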
+
+See more details about label smoothing in https://arxiv.org/abs/1512.00567.
+
+)DOC");
+  }
+};
+
+class LabelSmoothGradOp : public framework::OperatorWithKernel {
+ public:
+  LabelSmoothGradOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(label_smooth, ops::LabelSmoothOp, ops::LabelSmoothOpMaker,
+            label_smooth_grad, ops::LabelSmoothGradOp);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.cu b/paddle/operators/label_smooth_op.cu
new file mode 100644
index 0000000000..5a0cec12bc
--- /dev/null
+++ b/paddle/operators/label_smooth_op.cu
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/label_smooth_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    label_smooth_grad,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LabelSmoothGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/label_smooth_op.h b/paddle/operators/label_smooth_op.h
new file mode 100644
index 0000000000..87bc9f793e
--- /dev/null
+++ b/paddle/operators/label_smooth_op.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LabelSmoothKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_t = ctx.Output<framework::LoDTensor>("Out");
+    auto* in_t = ctx.Input<framework::LoDTensor>("X");
+    auto* dist_t = ctx.Input<framework::Tensor>("PriorDist");
+    auto label_dim = in_t->dims()[1];
+    out_t->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto in = framework::EigenVector<T>::Flatten(*in_t);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
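+    // Smooth towards the provided prior distribution when PriorDist is given;
+    // otherwise fall back to the uniform distribution 1 / label_dim.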
+    if (dist_t) {
+      auto dist = framework::EigenVector<T>::Flatten(*dist_t);
+      out.device(dev) =
+          static_cast<T>(1 - epsilon) * in +
+          epsilon * dist.broadcast(Eigen::DSizes<int, 1>(in_t->numel()));
+    } else {
+      out.device(dev) = static_cast<T>(1 - epsilon) * in +
+                        static_cast<T>(epsilon / label_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LabelSmoothGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_in_t = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_in_t->mutable_data<T>(ctx.GetPlace());
+
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto d_in = framework::EigenVector<T>::Flatten(*d_in_t);
+
+    auto epsilon = ctx.Attr<float>("epsilon");
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
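+    // The smoothing is affine in the input, so dX = (1 - epsilon) * dOut;
+    // the prior term contributes no gradient with respect to the input.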
+    d_in.device(dev) = static_cast<T>(1 - epsilon) * d_out;
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc
new file mode 100644
index 0000000000..1c6d2ae4d0
--- /dev/null
+++ b/paddle/operators/layer_norm_op.cc
@@ -0,0 +1,370 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/layer_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename T>
+using EigenMatrixMapRowMajor = Eigen::Map<
+    Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+template <typename T>
+using ConstEigenMatrixMapRowMajor = Eigen::Map<
+    const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>;
+
+class LayerNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of LayerNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto begin_norm_axis = ctx->Attrs().Get<int>("begin_norm_axis");
+    PADDLE_ENFORCE_LT(begin_norm_axis, x_dim.size(),
+                      "'begin_norm_axis' must be less than the rank of X.");
+
+    auto matrix_dim = framework::flatten_to_2d(x_dim, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {left});
+    ctx->SetOutputDim("Variance", {left});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LayerNormOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor.");
+    AddInput("Scale",
+             "(Tensor, optional) Scale is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "(Tensor, optional) Bias is a 1-dimensional tensor of size "
+             "H(`begin_norm_axis` splits the tensor(`X`) to a matrix [N,H])."
+             "It is applied to the output.")
+        .AsDispensable();
+    AddOutput("Y", "(LoDTensor) Result after normalization.");
+    AddOutput("Mean", "(Tensor) Mean of the current mini batch.")
+        .AsIntermediate();
+    AddOutput("Variance", "(Tensor) Variance of the current mini batch.")
+        .AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-5) Constant for "
+                   "numerical stability")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 0.001f,
+                         "'epsilon' should be between 0.0 and 0.001.");
+        });
+    AddAttr<int>("begin_norm_axis",
+                 "(int default:1), the "
+                 "axis of `begin_norm_axis ... Rank(X) - 1` will be "
+                 "normalized. `begin_norm_axis` splits the tensor(`X`) to a "
+                 "matrix [N,H].")
+        .SetDefault(1)
+        .AddCustomChecker([](const int &begin_norm_axis) {
+          PADDLE_ENFORCE_GT(begin_norm_axis, 0,
+                            "'begin_norm_axis' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Layer Normalization.
+
+Layer Norm has been implemented as discussed in the paper:
+https://arxiv.org/abs/1607.06450
+...
+)DOC");
+  }
+};
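+
+// A minimal sketch of the per-row normalization computed below, assuming the
+// input has already been flattened to an N x H row-major matrix (one row per
+// instance) and that scale/bias, when present, are applied afterwards as
+// y = y * scale + bias; the helper is illustrative only:
+//
+//   void LayerNormRow(const float* x, float* y, int H, float epsilon) {
+//     float mean = 0.f, var = 0.f;
+//     for (int i = 0; i < H; ++i) mean += x[i];
+//     mean /= H;
+//     for (int i = 0; i < H; ++i) var += (x[i] - mean) * (x[i] - mean);
+//     float inv_std = 1.f / std::sqrt(var / H + epsilon);
+//     for (int i = 0; i < H; ++i) y[i] = (x[i] - mean) * inv_std;
+//   }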
+
+template <typename T>
+class LayerNormKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+
+    auto *output = ctx.Output<Tensor>("Y");
+    auto *mean = ctx.Output<Tensor>("Mean");
+    auto *var = ctx.Output<Tensor>("Variance");
+    output->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+
+    auto input_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+
+    auto mean_map = EigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
+    auto var_map = EigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
+    auto output_map = EigenMatrixMapRowMajor<T>(output->data<T>(), left, right);
+
+    auto square = [](T ele) { return ele * ele; };
+    auto add_epsilon = [epsilon](T ele) { return ele + epsilon; };
+
+    mean_map = input_map.rowwise().mean();
+    var_map = (input_map - mean_map.replicate(1, right))
+                  .unaryExpr(square)
+                  .rowwise()
+                  .mean()
+                  .unaryExpr(add_epsilon);
+
+    auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+    // TODO(zcd): Reconsider `output_map`: is it appropriate for `output_map`
+    // and `input_map` to point to the same memory?
+    auto inv_std = var_map.unaryExpr(inv_std_func);
+    if (scale && bias) {
+      auto scale_map =
+          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right))
+                       .cwiseProduct(scale_map.replicate(left, 1)) +
+                   bias_map.replicate(left, 1);
+    } else if (scale) {
+      auto scale_map =
+          ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right))
+                       .cwiseProduct(scale_map.replicate(left, 1));
+    } else if (bias) {
+      auto bias_map = ConstEigenMatrixMapRowMajor<T>(bias->data<T>(), 1, right);
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right)) +
+                   bias_map.replicate(left, 1);
+    } else {
+      output_map = (input_map - mean_map.replicate(1, right))
+                       .cwiseProduct(inv_std.replicate(1, right));
+    }
+  }
+};
+
+class LayerNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of LayerNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of LayerNormOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+template <typename T>
+class LayerNormGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *mean = ctx.Input<Tensor>("Mean");
+    const auto *var = ctx.Input<Tensor>("Variance");
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+
+    const auto &x_dims = x->dims();
+
+    const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
+    auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
+    int left = static_cast<int>(matrix_dim[0]);
+    int right = static_cast<int>(matrix_dim[1]);
+
+    // init output
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto x_map = ConstEigenMatrixMapRowMajor<T>(x->data<T>(), left, right);
+    auto d_y_map = ConstEigenMatrixMapRowMajor<T>(d_y->data<T>(), left, right);
+    auto mean_map = ConstEigenMatrixMapRowMajor<T>(mean->data<T>(), left, 1);
+    auto var_map = ConstEigenMatrixMapRowMajor<T>(var->data<T>(), left, 1);
+
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      auto d_bias_map = EigenMatrixMapRowMajor<T>(d_bias->data<T>(), 1, right);
+      d_bias_map = d_y_map.colwise().sum();
+    }
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      auto d_scale_map =
+          EigenMatrixMapRowMajor<T>(d_scale->data<T>(), 1, right);
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // There are two equations for computing d_scale: one uses "Y" and the
+      // other does not. The computation below does not use "Y":
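+      //   d_scale = sum_rows(((X - mean) * inv_std) .* d_Y)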
+      d_scale_map =
+          ((x_map - mean_map.replicate(1, right))
+               .cwiseProduct(
+                   var_map.unaryExpr(inv_std_func).replicate(1, right))
+               .cwiseProduct(d_y_map))
+              .colwise()
+              .sum();
+    }
+
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      auto d_x_map = EigenMatrixMapRowMajor<T>(d_x->data<T>(), left, right);
+      auto triple_product_func = [](T ele) { return ele * ele * ele; };
+      auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); };
+      // TODO(zcd): this code can be refined
+      if (d_scale) {
+        auto scale_map =
+            ConstEigenMatrixMapRowMajor<T>(scale->data<T>(), 1, right);
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map)
+                          .cwiseProduct(scale_map.replicate(left, 1));
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .cwiseProduct(scale_map.replicate(left, 1))
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(scale_map.replicate(left, 1))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+
+        d_x_map = dx_end + dx_mean + dx_var;
+      } else {
+        // dy_dx
+        auto dx_end = var_map.unaryExpr(inv_std_func)
+                          .replicate(1, right)
+                          .cwiseProduct(d_y_map);
+        // dy_dmean_dx
+        auto dx_mean = (T(-1.0) / right) *
+                       var_map.unaryExpr(inv_std_func)
+                           .replicate(1, right)
+                           .cwiseProduct(d_y_map)
+                           .rowwise()
+                           .sum()
+                           .replicate(1, right);
+        // dy_var_dx
+        auto dvar_end_part = (x_map - mean_map.replicate(1, right))
+                                 .cwiseProduct(d_y_map)
+                                 .rowwise()
+                                 .sum();
+        auto dvar_end = var_map.unaryExpr(inv_std_func)
+                            .unaryExpr(triple_product_func)
+                            .cwiseProduct(dvar_end_part)
+                            .replicate(1, right);
+        auto dx_var =
+            (T(-1.0) / right) *
+            (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end);
+
+        d_x_map = dx_end + dx_mean + dx_var;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker,
+            layer_norm_grad, ops::LayerNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm,
+    ops::LayerNormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    layer_norm_grad,
+    ops::LayerNormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h
new file mode 100644
index 0000000000..bca35b91e6
--- /dev/null
+++ b/paddle/operators/layer_norm_op.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LayerNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+template <typename DeviceContext, typename T>
+class LayerNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc
new file mode 100644
index 0000000000..e24bf622b7
--- /dev/null
+++ b/paddle/operators/linear_chain_crf_op.cc
@@ -0,0 +1,269 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/linear_chain_crf_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LinearChainCRFOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LinearChainCRFOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Emission",
+             "(LoDTensor, default LoDTensor<float>) "
+             "A 2-D LoDTensor with shape [N x D], where N is the size of the "
+             "mini-batch and D is the total tag number. The unscaled emission "
+             "weight matrix for the linear chain CRF. ");
+    AddInput("Transition",
+             "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
+             "[(D + 2) x D]. The learnable parameter for the linear_chain_crf "
+             "operator. See more details in the operator's comments.");
+    AddInput("Label",
+             "(LoDTensor, default LoDTensor<int64_t>) A LoDTensor with shape "
+             "[N x 1], where N is the total element number in a mini-batch. "
+             "The ground truth.");
+    AddOutput(
+        "Alpha",
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
+        "The forward vectors for the entire batch. Denote it as $\alpha$. "
+        "$\alpha$ is a memo table used to calculate the normalization "
+        "factor in CRF. $\alpha[k, v]$ stores the unnormalized "
+        "probabilites of all possible unfinished sequences of tags that end at "
+        "position $k$ with tag $v$. For each $k$, "
+        "$\alpha[k, v]$ is a vector of length $D$ with a component for "
+        "each tag value $v$. This vector is called a forward vecotr and "
+        "will also be used in backward computations.")
+        .AsIntermediate();
+    AddOutput(
+        "EmissionExps",
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape [N x D]. "
+        "The exponentials of Input(Emission). This is an intermediate "
+        "computational result in forward computation, and will be reused in "
+        "backward computation.")
+        .AsIntermediate();
+    AddOutput(
+        "TransitionExps",
+        "(Tensor, default Tensor<float>) A 2-D Tensor with shape "
+        "[(D + 2) x D]. The exponentials of Input(Transition). This is an "
+        "intermediate computational result in forward computation, and "
+        "will be reused in backward computation.")
+        .AsIntermediate();
+    AddOutput(
+        "LogLikelihood",
+        "(Tensor, default Tensor<float>) The logarithm of the conditional "
+        "likelihood of each training sample in a mini-batch. This is a 2-D "
+        "tensor with shape [S x 1], where S is the sequence number in a "
+        "mini-batch. Note: S is equal to the sequence number in a mini-batch. "
+        "The output is no longer a LoDTensor.");
+    AddComment(R"DOC(
+LinearChainCRF Operator.
+
+Conditional Random Field defines an undirected probabilistic graph with nodes
+denoting random variables and edges denoting dependencies between these
+variables. CRF learns the conditional probability $P(Y|X)$, where
+$X = (x_1, x_2, ... , x_n)$ are structured inputs and
+$Y = (y_1, y_2, ... , y_n)$ are labels for the inputs.
+
+Linear chain CRF is a special case of CRF that is useful for sequence labeling
+tasks. Sequence labeling tasks do not assume many conditional independences
+among the inputs. The only constraint they impose is that the input
+and output must be linear sequences. Thus, the graph of such a CRF is a simple
+chain or a line, which results in the linear chain CRF.
+
+This operator implements the Forward-Backward algorithm for the linear chain
+CRF. Please refer to http://www.cs.columbia.edu/~mcollins/fb.pdf and
+http://cseweb.ucsd.edu/~elkan/250Bwinter2012/loglinearCRFs.pdf for details.
+
+Equation:
+1. Denote Input(Emission) to this operator as $x$ here.
+2. The first D values of Input(Transition) to this operator are for starting
+weights, denoted as $a$ here.
+3. The next D values of Input(Transition) of this operator are for ending
+weights, denoted as $b$ here.
+4. The remaining values of Input(Transition) are for transition weights,
+denoted as $w$ here.
+5. Denote Input(Label) as $s$ here.
+
+The probability of a sequence $s$ of length $L$ is defined as:
+$$P(s) = (1/Z) \exp(a_{s_1} + b_{s_L}
+                + \sum_{l=1}^L x_{s_l}
+                + \sum_{l=2}^L w_{s_{l-1},s_l})$$
+
+where $Z$ is a normalization value so that the sum of $P(s)$ over
+all possible sequences is 1, and $x$ is the emission feature weight
+to the linear chain CRF.
+
+Finally, the linear chain CRF operator outputs the logarithm of the conditional
+likelihood of each training sample in a mini-batch.
+
+NOTE:
+1. The feature function for a CRF is made up of the emission features and the
+transition features. The emission feature weights are NOT computed in
+this operator. They MUST be computed first before this operator is called.
+
+2. Because this operator performs global normalization over all possible
+sequences internally, it expects UNSCALED emission feature weights.
+Please do not call this op with the emission feature being output of any
+nonlinear activation.
+
+3. The 2nd dimension of Input(Emission) MUST be equal to the tag number.
+
+)DOC");
+  }
+};
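+
+// A minimal sketch of the unnormalized score inside the exponent of $P(s)$
+// above, assuming x is an L x D emission matrix, w holds the (D + 2) x D
+// transition weights with rows 0 and 1 being the start and end weights, and
+// s is a tag sequence of length L; the helper is illustrative only:
+//
+//   float SequenceScore(const float* x, const float* w, const int64_t* s,
+//                       int L, int D) {
+//     float score = w[s[0]] + w[D + s[L - 1]];  // start and end weights
+//     for (int k = 0; k < L; ++k) score += x[k * D + s[k]];
+//     for (int k = 1; k < L; ++k) score += w[(s[k - 1] + 2) * D + s[k]];
+//     return score;  // P(s) = exp(score) / Z
+//   }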
+
+class LinearChainCRFOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Emission"),
+                   "Input(Emission) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Transition"),
+                   "Input(Transition) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Alpha"),
+                   "Output(Alpha) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("EmissionExps"),
+                   "Output(EmissionExps) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("TransitionExps"),
+                   "Output(TransitionExps) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("LogLikelihood"),
+                   "Output(LogLikelihood) should be not null.");
+
+    auto emission_dims = ctx->GetInputDim("Emission");
+    PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
+                      "The Input(Emission) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
+
+    auto transition_dims = ctx->GetInputDim("Transition");
+    PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
+                      "The Input(Transition) should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        transition_dims[0] - 2, transition_dims[1],
+        "An invalid dimension for the Input(Transition), which should "
+        "be a 2-D tensor with shape [(D + 2) x D].");
+    PADDLE_ENFORCE_EQ(
+        emission_dims[1], transition_dims[1],
+        "The 2nd dimension of the Input(Emission) and the Input(Transition) "
+        "should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                   "The Input(Label) should be a 2-D tensor with the 2nd "
+                   "dimensions fixed to 1.");
+    PADDLE_ENFORCE_EQ(
+        emission_dims[0], label_dims[0],
+        "The height of Input(Emission) and the height of Input(Label) "
+        "should be the same.");
+
+    ctx->SetOutputDim("Alpha", emission_dims);
+    ctx->SetOutputDim("EmissionExps", emission_dims);
+    ctx->SetOutputDim("TransitionExps", transition_dims);
+    // TODO(caoying) This is tricky. The 1st dimension of Output(LogLikelihood)
+    // is the sequence number in a mini-batch. The dimension set here should be
+    // resized to its correct size in the function Compute. Fix this once we can
+    // get LoD information in the InferShape interface.
+    ctx->SetOutputDim("LogLikelihood", {emission_dims[0], 1});
+  }
+
+ protected:
+  // Explicitly set that the data type of the computation kernel of
+  // linear_chain_crf is determined by its input "Emission".
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class LinearChainCRFGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("EmissionExps"),
+                   "Input(EmissionExps) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("TransitionExps"),
+                   "Input(TransitionExps) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("LogLikelihood")),
+                   "Input(LogLikelihood@GRAD) shoudl be not null.");
+
+    auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
+    PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
+                      "The Input(EmissionExps) should be a 2-D tensor.");
+    PADDLE_ENFORCE(emission_exps_dims[0],
+                   "An empty mini-batch is not allowed.");
+
+    auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
+    PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
+                      "The Input(TransitionExps) should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        transition_exps_dims[0] - 2, transition_exps_dims[1],
+        "An invalid dimension for the Input(TransitionExps), which should "
+        "be a 2-D tensor with shape [(D + 2) x D].");
+    PADDLE_ENFORCE_EQ(
+        emission_exps_dims[1], transition_exps_dims[1],
+        "The 2nd dimension of the Input(EmissionExps) and the "
+        "Input(TransitionExps) should be equal to the tag number.");
+
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
+                   "The Input(Label) should be a 2-D tensor with the 2nd "
+                   "dimensions fixed to 1.");
+    PADDLE_ENFORCE_EQ(
+        emission_exps_dims[0], label_dims[0],
+        "The height of Input(EmissionExps) and the height of Input(Label) "
+        "should be the same.");
+
+    if (ctx->HasOutput(framework::GradVarName("Emission"))) {
+      ctx->SetOutputDim(framework::GradVarName("Emission"), emission_exps_dims);
+    }
+    if (ctx->HasOutput(framework::GradVarName("Transition"))) {
+      ctx->SetOutputDim(framework::GradVarName("Transition"),
+                        transition_exps_dims);
+    }
+  }
+
+ protected:
+  // Explicitly set that the data type of output of the linear_chain_crf_grad
+  // operator is determined by its input: gradients of LogLikelihood.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<LoDTensor>(framework::GradVarName("LogLikelihood"))
+                ->type()),
+        platform::CPUPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker,
+            linear_chain_crf_grad, ops::LinearChainCRFGradOp);
+REGISTER_OP_CPU_KERNEL(
+    linear_chain_crf,
+    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LinearChainCRFOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    linear_chain_crf_grad,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CPUDeviceContext,
+                                    double>);
diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu
new file mode 100644
index 0000000000..da612510b4
--- /dev/null
+++ b/paddle/operators/linear_chain_crf_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/linear_chain_crf_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    linear_chain_crf,
+    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LinearChainCRFOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    linear_chain_crf_grad,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LinearChainCRFGradOpKernel<paddle::platform::CUDADeviceContext,
+                                    double>);
diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h
new file mode 100644
index 0000000000..afc197a1c3
--- /dev/null
+++ b/paddle/operators/linear_chain_crf_op.h
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+static inline T NormalizeL1(T* x, size_t len) {
+  T sum = 0.;
+  for (size_t i = 0; i < len; ++i) sum += x[i];
+  // (This comment is from the old LinearChainCRFLayer.)
+  // Right now, we just bet that sum won't be zero. If this really happens, we
+  // will figure out what should be done then.
+  PADDLE_ENFORCE(sum,
+                 "The unnormalized probabilities of all possible unfinished "
+                 "sequences must be greater than 0.");
+  T s = 1. / sum;
+  for (size_t i = 0; i < len; ++i) x[i] *= s;
+  return sum;
+}
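+// Note: the returned (pre-normalization) sum lets callers undo the scaling
+// in the log domain, e.g. ll -= std::log(NormalizeL1<T>(row, tag_num)) below.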
+
+template <typename T>
+struct ScalarMul {
+  explicit ScalarMul(const T& scalar) : scalar(scalar) {}
+  T operator()(const T& val) const { return val * scalar; }
+
+  T scalar;
+};
+
+using framework::LoDTensor;
+using framework::LoD;
+using framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class LinearChainCRFOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // TODO(caoying) The checks related to LoD information should be
+    // moved into InferShape once after the InferShape is refactored.
+    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Emission")->NumLevels(), 1UL,
+                      "The Input(Emission) should be a sequence.");
+    PADDLE_ENFORCE_EQ(ctx.Input<LoDTensor>("Label")->NumLevels(), 1UL,
+                      "The Input(Label) should be a sequence.");
+    auto in_lod = ctx.Input<LoDTensor>("Label")->lod();
+    PADDLE_ENFORCE(in_lod.size(), "Input(Label) must be a sequence.");
+    const size_t level = 0;
+    const size_t seq_num = in_lod[level].size() - 1;
+
+    const LoDTensor* emission_weights = ctx.Input<LoDTensor>("Emission");
+    const Tensor* transition_weights = ctx.Input<Tensor>("Transition");
+    const LoDTensor* label = ctx.Input<LoDTensor>("Label");
+
+    Tensor* emission_exps = ctx.Output<Tensor>("EmissionExps");
+    Tensor* transition_exps = ctx.Output<Tensor>("TransitionExps");
+    Tensor* alpha = ctx.Output<Tensor>("Alpha");
+    Tensor* ll = ctx.Output<Tensor>("LogLikelihood");
+
+    // Because the computation code only runs on the CPU, the memory for all
+    // the outputs is FIXED to be allocated on the CPU memory.
+    emission_exps->mutable_data<T>(platform::CPUPlace());
+    transition_exps->mutable_data<T>(platform::CPUPlace());
+    alpha->mutable_data<T>(platform::CPUPlace());
+
+    // Resize the output tensor to its correct dimension.
+    ll->Resize({static_cast<int>(seq_num), 1});
+    ll->mutable_data<T>(platform::CPUPlace());
+
+    // Now, all the inputs and outputs should be on the CPU memory.
+    auto emission_dims = emission_weights->dims();
+    const size_t batch_size = emission_dims[0];
+    const size_t tag_num = emission_dims[1];
+
+    Tensor emission_row_max;
+    emission_row_max.mutable_data<T>(
+        framework::make_ddim({static_cast<int64_t>(batch_size), 1}),
+        platform::CPUPlace());
+
+    auto& place = *ctx.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    auto x = EigenMatrix<T>::From(*emission_weights);
+    auto x_row_max = EigenMatrix<T>::From(emission_row_max);
+    x_row_max.device(place) =
+        x.maximum(Eigen::DSizes<int, 1>(1))
+            .reshape(Eigen::DSizes<int, 2>(int(batch_size), 1));
+
+    auto x_exps = EigenMatrix<T>::From(*emission_exps);
+    x_exps.device(place) =
+        (x - x_row_max.broadcast(Eigen::DSizes<int, 2>(1, tag_num))).exp();
+
+    auto w = EigenMatrix<T>::From(*transition_weights);
+    auto w_exps = EigenMatrix<T>::From(*transition_exps);
+    w_exps.device(place) = w.exp();
+
+    T* log_likelihood = ll->data<T>();
+    for (size_t i = 0; i < seq_num; ++i) {
+      int start_pos = static_cast<int>(in_lod[level][i]);
+      int end_pos = static_cast<int>(in_lod[level][i + 1]);
+      if (end_pos == start_pos) {
+        // If an empty input sequence is given, pad 0 for its cost.
+        log_likelihood[i] = 0.;
+        continue;
+      }
+
+      const Tensor one_seq = emission_weights->Slice(start_pos, end_pos);
+      Tensor one_seq_row_max = emission_row_max.Slice(start_pos, end_pos);
+      Tensor one_seq_exps = emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+
+      log_likelihood[i] = ForwardOneSequence(
+          one_seq, one_seq_row_max, one_seq_exps, *transition_weights,
+          *transition_exps, one_seq_label, &one_seq_alpha);
+    }
+  }
+
+ private:
+  T ForwardOneSequence(const Tensor& emission, const Tensor& emission_row_max,
+                       const Tensor& emission_exps, const Tensor& trans_weights,
+                       const Tensor& trans_weight_exps, const Tensor& label,
+                       Tensor* alpha) const {
+    const T* x = emission.data<T>();
+    const T* x_row_max = emission_row_max.data<T>();
+    const T* x_exps = emission_exps.data<T>();
+    const T* w = trans_weights.data<T>();
+    const T* w_exps = trans_weight_exps.data<T>();
+    T* alpha_value = alpha->data<T>();
+
+    auto x_dims = emission.dims();
+    const size_t seq_length = x_dims[0];
+    const size_t tag_num = x_dims[1];
+    // The 1st row of w holds the transition weights for the start mask.
+    // The 2nd row of w holds the transition weights for the end mask.
+    // Transition weights between other tags begin from the 3rd row of w.
+    const size_t state_trans_base_idx = 2;
+
+    for (size_t i = 0; i < tag_num; ++i) {
+      alpha_value[i] = w_exps[i] * x_exps[i];
+    }
+    T ll = -x_row_max[0] - std::log(NormalizeL1<T>(alpha_value, tag_num));
+
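+    // Dynamic-programming recursion over sequence positions (the forward
+    // algorithm):
+    //   alpha[k, i] = x_exps[k, i] * sum_j alpha[k-1, j] * w_exps[j + 2, i]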
+    for (size_t k = 1; k < seq_length; ++k) {
+      for (size_t i = 0; i < tag_num; ++i) {
+        T sum = 0.;
+        for (size_t j = 0; j < tag_num; ++j) {
+          sum += alpha_value[(k - 1) * tag_num + j] *  // (*)
+                 w_exps[(j + state_trans_base_idx) * tag_num + i];
+        }
+        alpha_value[k * tag_num + i] = x_exps[k * tag_num + i] * sum;
+      }
+      // NormalizeL1 is to avoid underflow or overflow at (*).
+      ll -= x_row_max[k] +
+            std::log(NormalizeL1<T>(alpha_value + k * tag_num, tag_num));
+    }
+    T sum = 0.;
+    for (size_t i = 0; i < tag_num; ++i) {
+      sum += alpha_value[(seq_length - 1) * tag_num + i] * w_exps[tag_num + i];
+    }
+    ll -= std::log(sum);
+    // Now ll is equal to -log(Z).
+
+    const int64_t* lbl = label.data<int64_t>();
+    PADDLE_ENFORCE_LT(
+        static_cast<size_t>(*std::max_element(lbl, lbl + seq_length)), tag_num,
+        "An invalid tag label that execesses the largest tag number.");
+
+    // Calculate the numerator part, which depends on the label sequence.
+    ll += w[lbl[0]] /*start transition*/ + x[lbl[0]] +
+          w[tag_num + lbl[seq_length - 1]] /*end transition*/;
+    for (size_t k = 1; k < seq_length; ++k) {
+      ll += x[k * tag_num + lbl[k]] +
+            w[(lbl[k - 1] + state_trans_base_idx) * tag_num + lbl[k]];
+    }
+    return -ll;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const size_t level = 0;  // Currently, only sequences are supported.
+    auto lod = ctx.Input<LoDTensor>("Label")->lod();
+    PADDLE_ENFORCE(lod.size(), "Input(Label) must be a sequence.");
+
+    const Tensor* label = ctx.Input<LoDTensor>("Label");
+    const Tensor* emission_exps = ctx.Input<Tensor>("EmissionExps");
+    const Tensor* transition_exps = ctx.Input<Tensor>("TransitionExps");
+    const Tensor* alpha = ctx.Input<Tensor>("Alpha");
+    const T* ll_grad =
+        ctx.Input<Tensor>(framework::GradVarName("LogLikelihood"))->data<T>();
+
+    Tensor* emission_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Emission"));
+    Tensor* transition_grad =
+        ctx.Output<Tensor>(framework::GradVarName("Transition"));
+
+    // TODO(caoying) Fix this constraint. When the Input(Emission) is from the
+    // data reader operator, it can have no gradients.
+    PADDLE_ENFORCE(emission_grad, "Output(Emission@GRAD) should not be null.");
+    emission_grad->mutable_data<T>(platform::CPUPlace());
+    if (transition_grad) {
+      transition_grad->mutable_data<T>(platform::CPUPlace());
+      math::set_constant(ctx.device_context(), transition_grad, 0.);
+    }
+    // Now, all the inputs and outputs should be on the CPU memory.
+
+    auto emission_dims = emission_exps->dims();
+    // Beta is the memo table used in dynamic programming to calculate the
+    // backward vectors. The i-th row of beta captures the unnormalized
+    // probabilities of partial tag sequences starting at position i.
+    Tensor beta;
+    beta.mutable_data<T>(emission_dims, platform::CPUPlace());
+
+    for (size_t i = 0; i < lod[level].size() - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      if (end_pos == start_pos) continue;
+
+      const Tensor one_seq_emission_exps =
+          emission_exps->Slice(start_pos, end_pos);
+      const Tensor one_seq_label = label->Slice(start_pos, end_pos);
+      const Tensor one_seq_alpha = alpha->Slice(start_pos, end_pos);
+      Tensor one_seq_beta = beta.Slice(start_pos, end_pos);
+      Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos);
+
+      BackwardOneSequence(
+          ctx.template device_context<platform::CPUDeviceContext>(), ll_grad[i],
+          one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label,
+          &one_seq_beta, transition_grad, &one_seq_emission_grad);
+    }
+  }
+
+ private:
+  void BackwardOneSequence(const platform::CPUDeviceContext& ctx,
+                           const T ll_grad, const Tensor& emission_exps,
+                           const Tensor& transition_exps, const Tensor& alpha,
+                           const Tensor& label, Tensor* beta,
+                           Tensor* transition_grad,
+                           Tensor* emission_grad) const {
+    const T* w_exps = transition_exps.data<T>();
+    const T* x_exps = emission_exps.data<T>();
+    const int64_t* label_value = label.data<int64_t>();
+    T* beta_value = beta->data<T>();
+
+    auto x_dims = emission_exps.dims();
+    const size_t seq_length = x_dims[0];
+    const size_t tag_num = x_dims[1];
+    const size_t state_trans_base_idx = 2;
+
+    // Calculate the backward vectors: beta.
+    // First, calculate the initial state.
+    for (size_t i = 0; i < tag_num; ++i) {
+      beta_value[(seq_length - 1) * tag_num + i] = w_exps[tag_num + i];
+    }
+    NormalizeL1<T>(beta_value + (seq_length - 1) * tag_num, tag_num);
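+    // Backward recursion, mirroring the forward alpha recursion:
+    //   beta[k, i] = sum_j w_exps[i + 2, j] * x_exps[k+1, j] * beta[k+1, j]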
+    for (int k = static_cast<int>(seq_length) - 2; k >= 0; --k) {
+      for (size_t i = 0; i < tag_num; ++i) {
+        T sum = 0.;
+        for (size_t j = 0; j < tag_num; ++j) {
+          sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
+                 x_exps[(k + 1) * tag_num + j] *
+                 beta_value[(k + 1) * tag_num + j];
+        }
+        beta_value[k * tag_num + i] = sum;
+      }
+      // NormalizeL1 is to avoid underflow or overflow at (**).
+      NormalizeL1<T>(beta_value + k * tag_num, tag_num);
+    }
+
+    auto x_grad_mat = EigenMatrix<T>::From(*emission_grad);
+    auto alpha_mat = EigenMatrix<T>::From(alpha);
+    auto beta_mat = EigenMatrix<T>::From(*beta);
+
+    auto* place = ctx.eigen_device();
+    auto prob = alpha_mat * beta_mat;
+    auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
+                       .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
+                       .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
+    x_grad_mat.device(*place) =
+        (prob / row_sum).unaryExpr(ScalarMul<T>(ll_grad));
+
+    for (size_t k = 0; k < seq_length; ++k) {
+      x_grad_mat(k, label_value[k]) -= static_cast<T>(ll_grad);
+    }
+
+    if (transition_grad) {
+      T* trans_grad = transition_grad->data<T>();
+      for (size_t k = 0; k < tag_num; ++k) {
+        // Do not multiply by the output gradient here, because x_grad_mat has
+        // already done this.
+        trans_grad[k] += x_grad_mat(/*from start state*/ 0, k);
+        trans_grad[tag_num + k] +=
+            x_grad_mat(/*to end state*/ seq_length - 1, k);
+      }
+
+      auto x_exps_mat = EigenMatrix<T>::From(emission_exps);
+
+      // TODO(caoying): Profile the training process and avoid this local
+      // variable if possible.
+      Tensor tmp;
+      tmp.mutable_data<T>(beta->dims(), platform::CPUPlace());
+      auto tmp_mat = EigenMatrix<T>::From(tmp);
+      auto prob = beta_mat * x_exps_mat;
+      auto row_sum = prob.sum(Eigen::DSizes<int, 1>(1))
+                         .reshape(Eigen::DSizes<int, 2>(seq_length, 1))
+                         .broadcast(Eigen::DSizes<int, 2>(1, tag_num));
+      tmp_mat.device(*place) = prob / row_sum;
+
+      for (size_t k = 1; k < seq_length; ++k) {
+        T sum = 0.;
+        for (size_t i = 0; i < tag_num; ++i) {
+          for (size_t j = 0; j < tag_num; ++j) {
+            sum += w_exps[(i + state_trans_base_idx) * tag_num + j] *  // (**)
+                   alpha_mat(k - 1, i) * tmp_mat(k, j);
+          }
+        }
+        sum = 1. / sum;
+        for (size_t i = 0; i < tag_num; ++i) {
+          for (size_t j = 0; j < tag_num; ++j) {
+            trans_grad[(i + state_trans_base_idx) * tag_num + j] +=
+                sum * w_exps[(i + state_trans_base_idx) * tag_num + j] *
+                alpha_mat(k - 1, i) * tmp_mat(k, j) * ll_grad;
+          }
+        }
+        trans_grad[(label_value[k - 1] + state_trans_base_idx) * tag_num +
+                   label_value[k]] -= static_cast<T>(ll_grad);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/load_combine_op.cc b/paddle/operators/load_combine_op.cc
new file mode 100644
index 0000000000..f4be793d7b
--- /dev/null
+++ b/paddle/operators/load_combine_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class LoadCombineOp : public framework::OperatorBase {
+ public:
+  LoadCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+
+    std::ifstream fin(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fin),
+                   "Cannot open file %s for load_combine op", filename);
+
+    auto out_var_names = Outputs("Out");
+    PADDLE_ENFORCE_GT(
+        static_cast<int>(out_var_names.size()), 0,
+        "The number of output variables should be greater than 0.");
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
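+    // The tensors are stored back-to-back in the file, in the same order as
+    // the "Out" variable list, so they are deserialized sequentially.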
+    for (size_t i = 0; i < out_var_names.size(); i++) {
+      auto *out_var = scope.FindVar(out_var_names[i]);
+
+      PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                     out_var_names[i]);
+
+      auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+
+      // Error checking
+      PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot read more from file %s",
+                     filename);
+
+      // Get data from fin to tensor
+      DeserializeFromStream(fin, tensor, dev_ctx);
+
+      if (platform::is_gpu_place(place)) {
+        // copy CPU to GPU
+        framework::LoDTensor cpu_tensor;
+        cpu_tensor.ShareDataWith(*tensor);
+        cpu_tensor.set_lod(tensor->lod());
+
+        // reset tensor
+        out_var->Clear();
+        tensor = out_var->GetMutable<framework::LoDTensor>();
+        tensor->set_lod(cpu_tensor.lod());
+        Copy(cpu_tensor, place, dev_ctx, tensor);
+      }
+    }
+  }
+};
+
+class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput(
+        "Out",
+        "(vector) The output LoDTensors that will be read from the input file.")
+        .AsDuplicable();
+    AddAttr<std::string>("file_path",
+                         "(string) "
+                         "LoDTensors will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+LoadCombine Operator.
+
+LoadCombine operator loads LoDTensor variables from a file. The file should 
+contain one or more LoDTensors serialized using the SaveCombine operator. The 
+LoadCombine operator applies a deserialization strategy to appropriately load 
+the LoDTensors, and this strategy complements the serialization strategy used 
+in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled
+with the SaveCombine operator, and can only deserialize one or more LoDTensors 
+that were saved using the SaveCombine operator.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(load_combine, ops::LoadCombineOp,
+                  ops::LoadCombineOpProtoMaker);
diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc
new file mode 100644
index 0000000000..f886b423ac
--- /dev/null
+++ b/paddle/operators/load_op.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <fstream>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class LoadOp : public framework::OperatorBase {
+ public:
+  LoadOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    std::ifstream fin(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s for load op",
+                   filename);
+
+    auto out_var_name = Output("Out");
+    auto *out_var = scope.FindVar(out_var_name);
+    PADDLE_ENFORCE(out_var != nullptr, "Output variable %s cannot be found",
+                   out_var_name);
+
+    auto *tensor = out_var->GetMutable<framework::LoDTensor>();
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    DeserializeFromStream(fin, tensor, dev_ctx);
+
+    if (platform::is_gpu_place(place)) {
+      // copy CPU to GPU
+      framework::LoDTensor cpu_tensor;
+      cpu_tensor.ShareDataWith(*tensor);
+      cpu_tensor.set_lod(tensor->lod());
+
+      // reset tensor
+      out_var->Clear();
+      tensor = out_var->GetMutable<framework::LoDTensor>();
+      tensor->set_lod(cpu_tensor.lod());
+      Copy(cpu_tensor, place, dev_ctx, tensor);
+    }
+  }
+};
+
+class LoadOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoadOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) The tensor need to be loaded");
+    AddAttr<std::string>("file_path",
+                         "(string) "
+                         "Variable will be loaded from \"file_path\".")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+    AddComment(R"DOC(
+Load Operator.
+
+The Load operator loads a tensor variable from a disk file.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(load, ops::LoadOp, ops::LoadOpProtoMaker);
diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc
new file mode 100644
index 0000000000..d2c52745cf
--- /dev/null
+++ b/paddle/operators/lod_array_length_op.cc
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDArrayLengthOp : public framework::OperatorBase {
+ public:
+  LoDArrayLengthOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensorArray>();
+    auto &out =
+        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    out.Resize({1});
+    auto cpu = platform::CPUPlace();
+    *out.mutable_data<int64_t>(cpu) = static_cast<int64_t>(x.size());
+  }
+};
+
+class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDArrayLengthProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensorArray) The input tensor array.");
+    AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t");
+    AddComment(R"DOC(
+LoDArrayLength Operator.
+
+This operator obtains the length of a LoDTensorArray:
+
+$$Out = len(X)$$
+
+NOTE: The output is a CPU Tensor because control variables should reside on
+the CPU, and the length of a LoDTensorArray is typically used as a control
+variable.
+
+)DOC");
+  }
+};
+
+class LoDArrayLengthInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasOutput("Out"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lod_array_length, ops::LoDArrayLengthOp,
+                  ops::LoDArrayLengthInferShape, ops::LoDArrayLengthProtoMaker,
+                  paddle::framework::EmptyGradOpMaker);
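+
+// Illustrative example (assuming variables "arr" and "len" exist in the
+// scope): if "arr" is a LoDTensorArray holding three tensors, then running
+//
+//   auto op = paddle::framework::OpRegistry::CreateOp(
+//       "lod_array_length", {{"X", {"arr"}}}, {{"Out", {"len"}}}, {});
+//   op->Run(scope, paddle::platform::CPUPlace());
+//
+// leaves "len" as a 1-element int64_t CPU tensor containing 3.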
diff --git a/paddle/operators/lod_rank_table_op.cc b/paddle/operators/lod_rank_table_op.cc
new file mode 100644
index 0000000000..692b9bf371
--- /dev/null
+++ b/paddle/operators/lod_rank_table_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+
+class LoDRankTableOp : public framework::OperatorBase {
+ public:
+  LoDRankTableOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDRankTable>();
+    VLOG(10) << "Level = " << static_cast<size_t>(Attr<int>("level"));
+    out->Reset(x.lod(), static_cast<size_t>(Attr<int>("level")));
+    VLOG(10) << Input("X") << "'s lod information is " << *out;
+  }
+};
+
+class LoDRankTableOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDRankTableOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) input lod tensor, must contain lod information.");
+    AddOutput("Out", "(LoDRankTable) The rank table of specific level.");
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(R"DOC(Create LoDRanTable by LoDTensor
+
+A LoD Rank Table stores the given `level` of the input's LoD, ordered by
+sequence length in descending order. It is useful when implementing dynamic
+RNN, and is shared by the dynamic RNN memory, slice-input, and slice-output
+operators.
+)DOC");
+  }
+};
+
+class LoDRankTableInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"), "LoDRankTable must have input X");
+  }
+};
+
+class LoDRankTableInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &o : op_desc.Output("Out")) {
+      block->FindRecursiveOrCreateVar(o).SetType(
+          framework::proto::VarDesc::LOD_RANK_TABLE);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(lod_rank_table, paddle::operators::LoDRankTableOp,
+                  paddle::operators::LoDRankTableOpProtoMaker,
+                  paddle::operators::LoDRankTableInferShape,
+                  paddle::operators::LoDRankTableInferVarType,
+                  paddle::framework::EmptyGradOpMaker);
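+
+// Worked example (illustrative): for X with LoD = [[0, 3, 4, 9]] at level 0,
+// the three sequences have lengths {3, 1, 5}. The rank table orders them by
+// length in descending order, yielding (index, length) pairs:
+//   (2, 5), (0, 3), (1, 1)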
diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc
new file mode 100644
index 0000000000..3d7b15edcf
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LoDResetOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of LoDResetOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LoDResetOp should not be null.");
+    // If target LoD is not set from Input(), then it must be set from Attr().
+    if (!ctx->HasInput("TargetLoD")) {
+      auto level0 = ctx->Attrs().Get<std::vector<int>>("target_lod");
+      PADDLE_ENFORCE(level0.size() > 1,
+                     "Target LoD is not found, should be set to be a valid one "
+                     "through Input() or Attr().");
+    }
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDResetOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The input tensor of lod_reset operator.");
+    AddInput("TargetLoD",
+             "(Tensor, optional) The target level 0 LoD from Input().")
+        .AsDispensable();
+    AddOutput("Out", "(LoDTensor) The output tensor of lod_reset operator.");
+    AddAttr<std::vector<int>>("target_lod",
+                              "The target level 0 LoD from Attr().")
+        .SetDefault(std::vector<int>{});
+    AddComment(R"DOC(LoDReset operator
+
+Reset LoD of Input(X) into a new one specified by Input(TargetLoD) or
+Attr(target_lod), or set LoD for Input(X) if it doesn't have one.
+Currently the lod_reset operator only supports the reset of level 0 LoD.
+At least one of Input(TargetLoD) and Attr(target_lod) must be set,
+and if both of them are set, Input(TargetLoD) will be chosen as the
+target LoD.
+
+An example:
+Given a float LoDTensor X with shape (6, 1), its transposed form represents
+
+    [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
+
+with LoD = [[0, 2, 5, 6]] and the three (transposed) sequences look like
+
+    [1.0, 2.0], [3.0, 4.0, 5.0], [6.0].
+
+If target LoD = [0, 4, 6], the lod_reset operator will reset the LoD and
+the sequences contained in the LoDTensor Output(Out) become:
+
+    [1.0, 2.0, 3.0, 4.0], [5.0, 6.0].
+
+)DOC");
+  }
+};
+
+class LoDResetGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, lod_reset_grad,
+            ops::LoDResetGradOp);
+REGISTER_OP_CPU_KERNEL(lod_reset,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, float>,
+                       ops::LoDResetKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lod_reset_grad, ops::LoDResetGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LoDResetGradKernel<paddle::platform::CPUPlace, double>);
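+
+// Usage sketch (illustrative; variable names are hypothetical): resetting the
+// level-0 LoD through the attribute path, matching the example in the DOC
+// comment above.
+//
+//   auto op = paddle::framework::OpRegistry::CreateOp(
+//       "lod_reset", {{"X", {"x"}}}, {{"Out", {"out"}}},
+//       {{"target_lod", std::vector<int>{0, 4, 6}}});
+//   op->Run(scope, paddle::platform::CPUPlace());
+//   // "out" now shares data with "x" and carries LoD [[0, 4, 6]].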
diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu
new file mode 100644
index 0000000000..910866ea63
--- /dev/null
+++ b/paddle/operators/lod_reset_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lod_reset_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    lod_reset, ops::LoDResetKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoDResetKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lod_reset_grad,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LoDResetGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h
new file mode 100644
index 0000000000..c1bbba7a83
--- /dev/null
+++ b/paddle/operators/lod_reset_op.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class LoDResetKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* lod_t = ctx.Input<framework::Tensor>("TargetLoD");
+
+    std::vector<int> level0;
+    if (lod_t) {
+      auto* lod = lod_t->data<int>();
+      if (platform::is_gpu_place(ctx.GetPlace())) {
+        framework::Tensor lod_cpu;
+        framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(),
+                        &lod_cpu);
+        lod = lod_cpu.data<int>();
+      }
+      level0 = std::vector<int>(lod, lod + lod_t->numel());
+    } else {
+      level0 = ctx.Attr<std::vector<int>>("target_lod");
+    }
+
+    PADDLE_ENFORCE(level0.size() > 1UL,
+                   "The size of target LoD should be greater than 1.");
+    PADDLE_ENFORCE(level0[0] == 0,
+                   "Target LoD should be a vector starting from 0.");
+    PADDLE_ENFORCE(level0.back() == in->dims()[0],
+                   "Target LoD should be a vector end with the "
+                   "first dimension of Input(X).");
+    for (size_t i = 0; i < level0.size() - 1; ++i) {
+      PADDLE_ENFORCE(level0[i + 1] > level0[i],
+                     "Target LoD should be an ascending vector.");
+    }
+
+    out->ShareDataWith(*in);
+    // cast level0 to size_t
+    std::vector<size_t> ulevel0(level0.size(), 0);
+    std::transform(level0.begin(), level0.end(), ulevel0.begin(),
+                   [](int a) { return static_cast<size_t>(a); });
+    framework::LoD target_lod;
+    target_lod.push_back(ulevel0);
+    out->set_lod(target_lod);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LoDResetGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    d_x->ShareDataWith(*d_out);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc
new file mode 100644
index 0000000000..685a807a8a
--- /dev/null
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+class LoDTensorToArrayOp : public framework::OperatorBase {
+ public:
+  LoDTensorToArrayOp(const std::string &type,
+                     const framework::VariableNameMap &inputs,
+                     const framework::VariableNameMap &outputs,
+                     const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
+                          Input("X"))
+                  .Get<framework::LoDTensor>();
+    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
+                           .Get<framework::LoDRankTable>();
+    auto &out = *detail::Ref(scope.FindVar(Output("Out")))
+                     .GetMutable<framework::LoDTensorArray>();
+    auto &items = rank_table.items();
+    auto max_seq_len = items[0].length;
+    auto rank_level = rank_table.level();
+
+    PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
+                      "Input should be a LOD tensor, and size is at least %d",
+                      rank_level + 1);
+    out.resize(max_seq_len);
+    std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
+
+    // set out[i] lod
+    for (size_t t = 0; t < max_seq_len; t++) {
+      auto &lod = *out[t].mutable_lod();
+      lod.clear();
+      for (auto &item : items) {
+        if (t >= item.length) {
+          break;
+        }
+        size_t start_idx = x.lod()[rank_level][item.index] + t;
+        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+            x.lod(), start_idx, start_idx + 1, rank_level + 1);
+        auto &lod_length = lod_and_offset.first;
+        framework::AppendLoD(&lod, lod_length);
+        size_t start_offset = lod_and_offset.second.first;
+        size_t end_offset = lod_and_offset.second.second;
+        copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+      }
+    }
+    for (size_t i = 0; i < max_seq_len; ++i) {
+      auto &ranges = copy_ranges[i];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out[i].Resize(x_dim);
+      out[i].mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[i][offset: offset+len] = x[each_range.begin: each_range.end]
+        auto slice = out[i].Slice(static_cast<int>(offset),
+                                  static_cast<int>(offset + len));
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto &dev_ctx = *pool.Get(place);
+
+        framework::Copy(x.Slice(static_cast<int>(each_range.begin),
+                                static_cast<int>(each_range.end)),
+                        x.place(), dev_ctx, &slice);
+        offset += len;
+      }
+    }
+  }
+};
+
+class LoDTensorToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LoDTensorToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("RankTable", "");
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class LoDTensorToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of LoDTensorToArrayOp should not be null.");
+    PADDLE_ENFORCE(
+        context->HasInput("RankTable"),
+        "Input(RankTable) of LoDTensorToArrayOp should not be null.");
+
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "Output(Out) of LoDTensorToArrayOp should not be null.");
+
+    auto x_dim = context->GetInputDim("X");
+    // The first dim of each LoDTensor in Output can only be set at run-time;
+    // we still have to Resize each LoDTensor in Output.
+    context->SetOutputDim("Out", x_dim);
+  }
+};
+
+class LoDTensorToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    for (auto &out_var : op_desc.Output("Out")) {
+      block->Var(out_var)->SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
+    }
+  }
+};
+
+class LoDTensorToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("array_to_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lod_tensor_to_array, ops::LoDTensorToArrayOp,
+                  ops::LoDTensorToArrayOpProtoMaker,
+                  ops::LoDTensorToArrayInferShape,
+                  ops::LoDTensorToArrayInferVarType,
+                  ops::LoDTensorToArrayGradMaker);
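+
+// Illustrative note on the splitting scheme: with rank-table items
+// (index, length) = (2, 5), (0, 3), (1, 1), the output array has
+// max_seq_len = 5 entries, and out[t] concatenates the t-th step of every
+// sequence whose length exceeds t; this is the layout consumed by dynamic RNN.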
diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc
new file mode 100644
index 0000000000..f714945354
--- /dev/null
+++ b/paddle/operators/log_loss_op.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims,
+                      "The dimensions of Input(Predicted) and "
+                      "Input(Labels) must be the same.");
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op."
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
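+
+// Worked example (illustrative): with epsilon = 1e-7, predicting 0.9 for a
+// positive label (1.0) gives Loss ~= -log(0.9) ~= 0.105, while the same
+// prediction for a negative label (0.0) gives Loss ~= -log(0.1) ~= 2.303.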
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    log_loss, ops::LogLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000..be283e4700
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    log_loss, ops::LogLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    log_loss_grad,
+    ops::LogLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000..743eddb740
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* loss_out = ctx.Output<Tensor>("Loss");
+
+    loss_out->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto loss = EigenVector<T>::Flatten(*loss_out);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    loss.device(place) = (-(label * (prediction + epsilon).log()) -
+                          ((static_cast<T>(1) - label) *
+                           (static_cast<T>(1) - prediction + epsilon).log()));
+  }
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+
+    auto dl = EigenVector<T>::Flatten(*dloss);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
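+    // Gradient of the log loss w.r.t. the prediction, scaled element-wise by
+    // the incoming gradient dl (see the expression below):
+    //   dL/dPredicted = -Labels / (Predicted + epsilon)
+    //                   + (1 - Labels) / (1 - Predicted + epsilon)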
+    if (dpred) {
+      dpred->mutable_data<T>(ctx.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+                               ((static_cast<T>(1) - label) /
+                                (static_cast<T>(1) - prediction + epsilon)));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc
new file mode 100644
index 0000000000..fedd325cf4
--- /dev/null
+++ b/paddle/operators/logical_op.cc
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename OpComment>
+class BinaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BinaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X",
+             string::Sprintf("(LoDTensor) Left hand operand of %s operator",
+                             comment.type));
+    AddInput("Y",
+             string::Sprintf("(LoDTensor) Right hand operand of %s operator",
+                             comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X and Y, and returns Out. X, Y and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UnaryLogicalOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    OpComment comment;
+    AddInput("X", string::Sprintf("(LoDTensor) Operand of %s operator",
+                                  comment.type));
+    AddOutput("Out", string::Sprintf(
+                         "(LoDTensor) n-dim bool tensor. Each element is %s",
+                         comment.equation));
+    AddComment(string::Sprintf(R"DOC(%s Operator
+
+It operates element-wise on X, and returns Out. X and Out are N-dim boolean tensors.
+Each element of Out is calculated by %s
+)DOC",
+                               comment.type, comment.equation));
+  }
+};
+
+template <typename OpComment>
+class BinaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+    PADDLE_ENFORCE(context->HasInput("Y"),
+                   "Input(Y) of %s operator must not be null", comment.type);
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(framework::product(dim_x), framework::product(dim_y),
+                      "The number of elements in X and Y should be same");
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+template <typename OpComment>
+class UnaryLogicalOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    OpComment comment;
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of %s operator must not be null", comment.type);
+
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+    context->ShareLoD("X", "Out");
+  }
+};
+
+class LogicalOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
+    // LogicalOp kernel's device type is decided by input tensor place
+    kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
+    return kt;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                             \
+    static char type[];                                                    \
+    static char equation[];                                                \
+  };                                                                       \
+  char _##op_type##Comment::type[]{#op_type};                              \
+  char _##op_type##Comment::equation[]{_equation};                         \
+  REGISTER_OPERATOR(                                                       \
+      op_type, ::paddle::operators::LogicalOp,                             \
+      ::paddle::operators::BinaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::BinaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+#define REGISTER_UNARY_LOGICAL_OP(op_type, _equation)                     \
+  struct _##op_type##Comment {                                            \
+    static char type[];                                                   \
+    static char equation[];                                               \
+  };                                                                      \
+  char _##op_type##Comment::type[]{#op_type};                             \
+  char _##op_type##Comment::equation[]{_equation};                        \
+  REGISTER_OPERATOR(                                                      \
+      op_type, ::paddle::operators::LogicalOp,                            \
+      ::paddle::operators::UnaryLogicalOpProtoMaker<_##op_type##Comment>, \
+      ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \
+      ::paddle::framework::EmptyGradOpMaker);
+
+REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$");
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_OP(logical_xor,
+                           "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$");
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU,
+                               paddle::operators::LogicalXorFunctor);
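+
+// Illustrative example: for bool tensors X = [true, true, false] and
+// Y = [true, false, false]:
+//   logical_and -> [true, false, false]
+//   logical_or  -> [true, true,  false]
+//   logical_xor -> [false, true, false]
+//   logical_not(X) -> [false, false, true]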
diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu
new file mode 100644
index 0000000000..87f2287b8f
--- /dev/null
+++ b/paddle/operators/logical_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/logical_op.h"
+
+REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA,
+                               paddle::operators::LogicalAndFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA,
+                               paddle::operators::LogicalOrFunctor);
+REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA,
+                              paddle::operators::LogicalNotFunctor);
+REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA,
+                               paddle::operators::LogicalXorFunctor);
diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h
new file mode 100644
index 0000000000..4138576856
--- /dev/null
+++ b/paddle/operators/logical_op.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct LogicalAndFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a && b; }
+};
+
+template <typename T>
+struct LogicalOrFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const { return a || b; }
+};
+
+template <typename T>
+struct LogicalNotFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a) const { return !a; }
+};
+
+template <typename T>
+struct LogicalXorFunctor {
+  using ELEM_TYPE = T;
+  HOSTDEVICE bool operator()(const T& a, const T& b) const {
+    return (a || b) && !(a && b);
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class BinaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* y = context.Input<framework::Tensor>("Y");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor binary_func;
+    platform::Transform<DeviceContext> trans;
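+    // platform::Transform applies binary_func to each (x, y) element pair and
+    // writes the bool result into Out, analogous to std::transform but
+    // dispatched on the given device context.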
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
+          x->data<T>() + x->numel(), y->data<T>(),
+          out->mutable_data<bool>(context.GetPlace()), binary_func);
+  }
+};
+
+template <typename DeviceContext, typename Functor>
+class UnaryLogicalOpKernel
+    : public framework::OpKernel<typename Functor::ELEM_TYPE> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    using T = typename Functor::ELEM_TYPE;
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    Functor unary_func;
+    platform::Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x->data<T>(),
+          x->data<T>() + x->numel(),
+          out->mutable_data<bool>(context.GetPlace()), unary_func);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                 \
+      op_type, ::paddle::operators::BinaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
+
+#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \
+  REGISTER_OP_##dev##_KERNEL(                                \
+      op_type, ::paddle::operators::UnaryLogicalOpKernel<    \
+                   ::paddle::platform::dev##DeviceContext, functor<bool>>);
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
new file mode 100644
index 0000000000..2405852f53
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lookup_table_op.h"
+#include "paddle/framework/var_type_inference.h"
+
+namespace paddle {
+namespace operators {
+
+class LookupTableOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("W"),
+                   "Input(W) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                   "Input(Ids) of LookupTableOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LookupTableOp should not be null.");
+
+    auto table_dims = ctx->GetInputDim("W");
+    auto ids_dims = ctx->GetInputDim("Ids");
+
+    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+
+    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
+    ctx->ShareLoD("Ids", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
+  }
+};
+
+class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LookupTableOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("W",
+             "An input represents embedding tensors, "
+             "which is a learnable parameter.");
+    AddInput("Ids",
+             "An input with type int32 or int64 "
+             "contains the ids to be looked up in W. "
+             "Ids must be a column vector with rank = 2. "
+             "The 2nd dimension size must be 1.");
+    AddOutput("Out", "The lookup results, which have the same type as W.");
+    AddAttr<bool>("is_sparse",
+                  "(boolean, default false) "
+                  "Sparse update")
+        .SetDefault(false);
+    AddAttr<int64_t>("padding_idx",
+                     "(int64, default -1) "
+                     "If the value is -1, it makes no effect to lookup. "
+                     "Otherwise the given value indicates padding the output "
+                     "with zeros whenever lookup encounters it in Ids.")
+        .SetDefault(-1);
+    AddComment(R"DOC(
+Lookup Table Operator.
+
+This operator performs lookups on the parameter W,
+then concatenates the results into a dense tensor.
+
+The input Ids may or may not carry LoD (Level of Details) information;
+the output shares its LoD information with the input Ids.
+
+)DOC");
+  }
+};
+
+class LookupTableOpGradDescMaker
+    : public framework::DefaultGradOpDescMaker<true> {
+  using ::paddle::framework::DefaultGradOpDescMaker<
+      true>::DefaultGradOpDescMaker;
+
+ protected:
+  virtual std::string GradOpType() const { return "lookup_table_grad"; }
+};
+
+class LookupTableOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto table_dims = ctx->GetInputDim("W");
+    ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
+        ctx.device_context());
+  }
+};
+
+class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto attr = op_desc.GetAttr("is_sparse");
+    bool is_sparse = boost::get<bool>(attr);
+    if (is_sparse) {
+      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
+              << " is set to SelectedRows";
+      block->Var(out_var_name)
+          ->SetType(framework::proto::VarDesc::SELECTED_ROWS);
+    } else {
+      VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W")
+              << " is set to LoDTensor";
+      block->Var(out_var_name)->SetType(framework::proto::VarDesc::LOD_TENSOR);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(lookup_table, ops::LookupTableOp,
+                  ops::LookupTableOpGradDescMaker, ops::LookupTableOpMaker);
+REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad,
+                  ops::LookupTableOpGradVarTypeInference);
+
+REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel<float>,
+                       ops::LookupTableKernel<double>);
+REGISTER_OP_CPU_KERNEL(lookup_table_grad, ops::LookupTableGradKernel<float>,
+                       ops::LookupTableGradKernel<double>);
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
new file mode 100644
index 0000000000..d97390fa1c
--- /dev/null
+++ b/paddle/operators/lookup_table_op.cu
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/lookup_table_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX,
+          bool PaddingFlag>
+__global__ void LookupTable(T* output, const T* table, const int64_t* ids,
+                            const int64_t N, const int64_t K, const int64_t D,
+                            const int64_t padding_idx) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+
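+  // Each (blockIdx.x, threadIdx.y) pair walks the K ids with a stride of
+  // BlockDimY * GridDimX rows, while threadIdx.x strides across the D
+  // columns of each row.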
+  while (idy < K) {
+    int64_t id = ids[idy];
+    PADDLE_ASSERT(id >= 0);
+    PADDLE_ASSERT(id < N);
+    T* out = output + idy * D;
+    const T* tab = table + id * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      if (PaddingFlag) {
+        if (id == padding_idx)
+          out[i] = static_cast<T>(0);
+        else
+          out[i] = tab[i];
+      } else {
+        out[i] = tab[i];
+      }
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void LookupTableGrad(T* table, const T* output, const int64_t* ids,
+                                const int64_t N, const int64_t K,
+                                const int64_t D) {
+  int idx = threadIdx.x;
+  int idy = blockIdx.x + threadIdx.y * GridDimX;
+
+  while (idy < K) {
+    int id = ids[idy];
+    PADDLE_ASSERT(id >= 0);
+    PADDLE_ASSERT(id < N);
+    const T* out = output + idy * D;
+    T* tab = table + id * D;
+    for (int i = idx; i < D; i += BlockDimX) {
+      paddle::platform::CudaAtomicAdd(&tab[i], out[i]);
+    }
+    idy += BlockDimY * GridDimX;
+  }
+}
+
+template <typename T>
+class LookupTableCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* table_t = context.Input<LoDTensor>("W");
+    auto* ids_t = context.Input<LoDTensor>("Ids");
+    auto* output_t = context.Output<LoDTensor>("Out");
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+
+    size_t N = table_t->dims()[0];
+    size_t D = table_t->dims()[1];
+    size_t K = ids_t->numel();
+    auto* ids = ids_t->data<int64_t>();
+    auto* table = table_t->data<T>();
+    auto* output = output_t->mutable_data<T>(context.GetPlace());
+
+    dim3 threads(128, 8);
+    dim3 grids(8, 1);
+
+    if (padding_idx == -1)
+      LookupTable<
+          T, 128, 8, 8,
+          false><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+    else
+      LookupTable<
+          T, 128, 8, 8,
+          true><<<grids, threads, 0, context.cuda_device_context().stream()>>>(
+          output, table, ids, N, K, D, padding_idx);
+  }
+};
+
+template <typename T>
+class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    if (is_sparse) {
+      auto* ids = context.Input<LoDTensor>("Ids");
+      auto* table = context.Input<LoDTensor>("W");
+      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();
+
+      auto stream = dev_ctx.stream();
+      // copy GPU memory to CPU pinned memory
+      framework::Vector<int64_t> new_rows;
+      new_rows.resize(ids_dim[0]);
+      auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
+
+      memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data,
+                   ids_dim[0] * sizeof(int64_t), stream);
+
+      d_table->set_rows(new_rows);
+
+      auto* d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      d_table_value->mutable_data<T>(context.GetPlace());
+
+      auto* d_table_data = d_table_value->data<T>();
+      auto* d_output_data = d_output->data<T>();
+      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
+                   d_output->numel() * sizeof(T), stream);
+
+    } else {
+      auto ids_t = context.Input<LoDTensor>("Ids");
+      auto d_output_t = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto d_table_t = context.Output<LoDTensor>(framework::GradVarName("W"));
+
+      int N = d_table_t->dims()[0];
+      int D = d_table_t->dims()[1];
+      int K = ids_t->numel();
+      const int64_t* ids = ids_t->data<int64_t>();
+      const T* d_output = d_output_t->data<T>();
+      T* d_table = d_table_t->mutable_data<T>(context.GetPlace());
+
+      auto t = framework::EigenVector<T>::Flatten(*d_table_t);
+      t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+
+      dim3 threads(128, 8);
+      dim3 grids(8, 1);
+      LookupTableGrad<T, 128, 8, 8><<<grids, threads, 0, dev_ctx.stream()>>>(
+          d_table, d_output, ids, N, K, D);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel<float>,
+                        ops::LookupTableCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lookup_table_grad,
+                        ops::LookupTableGradCUDAKernel<float>,
+                        ops::LookupTableGradCUDAKernel<double>);
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
new file mode 100644
index 0000000000..0842c422f7
--- /dev/null
+++ b/paddle/operators/lookup_table_op.h
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/selected_rows.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+
+template <typename T>
+class LookupTableKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* table_t = context.Input<LoDTensor>("W");      // float tensor
+    auto* ids_t = context.Input<LoDTensor>("Ids");      // int tensor
+    auto* output_t = context.Output<LoDTensor>("Out");  // float tensor
+    int64_t padding_idx = context.Attr<int64_t>("padding_idx");
+
+    int N = table_t->dims()[0];
+    int D = table_t->dims()[1];
+    auto* ids = ids_t->data<int64_t>();
+    auto* table = table_t->data<T>();
+    auto* output = output_t->mutable_data<T>(context.GetPlace());
+
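+    // The forward pass is a row-wise gather: output[i, :] = table[ids[i], :].
+    // When padding_idx is set (!= -1), rows whose id equals padding_idx are
+    // zero-filled instead of read from the table.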
+    if (padding_idx == -1) {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        PADDLE_ENFORCE_LT(ids[i], N);
+        PADDLE_ENFORCE_GE(ids[i], 0);
+        memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+      }
+    } else {
+      for (int64_t i = 0; i < ids_t->numel(); ++i) {
+        if (ids[i] == padding_idx) {
+          memset(output + i * D, 0, D * sizeof(T));
+        } else {
+          PADDLE_ENFORCE_LT(ids[i], N);
+          PADDLE_ENFORCE_GE(ids[i], 0);
+          memcpy(output + i * D, table + ids[i] * D, D * sizeof(T));
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class LookupTableGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool is_sparse = context.Attr<bool>("is_sparse");
+    // Since paddings are not trainable and fixed in forward, the gradient of
+    // paddings makes no sense and we don't deal with it in backward.
+    if (is_sparse) {
+      auto* ids = context.Input<LoDTensor>("Ids");
+      auto* table = context.Input<LoDTensor>("W");
+      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
+
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();
+
+      framework::Vector<int64_t> new_rows;
+      new_rows.reserve(ids_dim[0]);
+      for (int64_t i = 0; i < ids_dim[0]; i++) {
+        new_rows.push_back(ids_data[i]);
+      }
+      d_table->set_rows(new_rows);
+
+      auto* d_table_value = d_table->mutable_value();
+      d_table_value->Resize({ids_dim[0], table->dims()[1]});
+      d_table_value->mutable_data<T>(context.GetPlace());
+
+      d_table->set_height(table->dims()[0]);
+
+      auto* d_output_data = d_output->data<T>();
+      auto* d_table_data = d_table_value->data<T>();
+
+      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
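+      // As in the GPU path, duplicated ids keep separate rows here and are
+      // expected to be reduced later when the sparse gradient is applied.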
+    } else {
+      auto* ids = context.Input<LoDTensor>("Ids");
+      auto* d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
+      auto* d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
+      auto* table = context.Input<LoDTensor>("W");
+
+      auto* ids_data = ids->data<int64_t>();
+      auto ids_dim = ids->dims();
+
+      int N = table->dims()[0];
+      int D = d_output->dims()[1];
+
+      auto* d_output_data = d_output->data<T>();
+      auto* d_table_data = d_table->mutable_data<T>(context.GetPlace());
+
+      memset(d_table_data, 0, d_table->numel() * sizeof(T));
+
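+      // Dense gradient: scatter-add each output-gradient row back into the
+      // table row it was gathered from; += accumulates repeated ids.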
+      for (int64_t i = 0; i < ids->numel(); ++i) {
+        PADDLE_ENFORCE_LT(ids_data[i], N);
+        PADDLE_ENFORCE_GE(ids_data[i], 0);
+        for (int j = 0; j < D; ++j) {
+          d_table_data[ids_data[i] * D + j] += d_output_data[i * D + j];
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc
new file mode 100644
index 0000000000..95673ba19e
--- /dev/null
+++ b/paddle/operators/lrn_op.cc
@@ -0,0 +1,236 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lrn_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+struct LRNFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    auto x_v = framework::EigenVector<T>::Flatten(input);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+
+    auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
+    e_mid = e_mid.constant(k);
+
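+    // e_mid accumulates k + alpha * sum of squares over the n neighboring
+    // channels; out = x * mid^(-beta) is computed afterwards.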
+    auto e_x = framework::EigenTensor<T, 4>::From(input);
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch >= 0 && ch < C) {
+            auto s = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            auto r = e_x.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                               Eigen::array<int, 4>({{1, 1, H, W}}));
+
+            s += alpha * r.square();
+          }
+        }
+      }
+    }
+
+    auto out_e = framework::EigenVector<T>::Flatten(*out);
+    out_e = x_v * e_mid.reshape(Eigen::DSizes<int, 1>(e_mid.size())).pow(-beta);
+  }
+};
+template struct LRNFunctor<platform::CPUDeviceContext, float>;
+template struct LRNFunctor<platform::CPUDeviceContext, double>;
+
+template <typename T>
+struct LRNGradFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    T ratio = -2 * alpha * beta;
+    auto x_g_e = framework::EigenVector<T>::Flatten(*x_g);
+    x_g_e = x_g_e.constant(0.0);
+
+    auto e_x = framework::EigenTensor<T, 4>::From(x);
+    auto e_x_g = framework::EigenTensor<T, 4>::From(*x_g);
+    auto e_out = framework::EigenTensor<T, 4>::From(out);
+    auto e_out_g = framework::EigenTensor<T, 4>::From(out_g);
+    auto e_mid = framework::EigenTensor<T, 4>::From(mid);
+
+    const int start = -(n - 1) / 2;
+    const int end = start + n;
+    for (int m = 0; m < N; m++) {
+      for (int i = 0; i < C; i++) {
+        auto i_x = e_x.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                             Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_x_g = e_x_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                     Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        auto i_mid = e_mid.slice(Eigen::array<int, 4>({{m, i, 0, 0}}),
+                                 Eigen::array<int, 4>({{1, 1, H, W}}));
+
+        i_x_g = i_mid.pow(-beta) * i_out_g;
+        for (int c = start; c <= end; c++) {
+          int ch = i + c;
+          if (ch < 0 || ch >= C) {
+            continue;
+          }
+
+          auto c_out = e_out.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_mid = e_mid.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                   Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          auto c_out_g = e_out_g.slice(Eigen::array<int, 4>({{m, ch, 0, 0}}),
+                                       Eigen::array<int, 4>({{1, 1, H, W}}));
+
+          i_x_g += ratio * c_out_g * c_out * i_x / c_mid;
+        }
+      }
+    }
+  }
+};
+template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
+template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
+
+class LRNOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LRNOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of LRNOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MidOut"),
+                   "MidOut(Out) of LRNOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'s rank of LRNOp should be 4.");
+
+    ctx->SetOutputDim("Out", x_dim);
+    ctx->SetOutputDim("MidOut", x_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename T>
+class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LRNOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of LRN operator. "
+             "It must be a 4D tenor with NCHW format.");
+    AddOutput("Out",
+              "(Tensor) The output of LRN operator, which is also the 4D "
+              "tensor with NCHW format.");
+    AddOutput("MidOut",
+              "(Tensor) Middle result of LRN operator. It's computed in "
+              "forward process and also used in backward process.");
+
+    AddAttr<int>("n",
+                 "(int default 5) "
+                 "n is the \"adjacent\" kernel that maps "
+                 "at the same spatial position.")
+        .SetDefault(5)
+        .GreaterThan(0);
+
+    AddAttr<T>("k",
+               "(float, default 2.0) "
+               "k is the bias.")
+        .SetDefault(2.0)
+        .GreaterThan(0.0);
+
+    AddAttr<T>("alpha",
+               "(float, default 0.0001) "
+               "alpha is the scale number.")
+        .SetDefault(0.0001)
+        .GreaterThan(0.0);
+
+    AddAttr<T>("beta",
+               "(float, default 0.75) "
+               "beta is the power number.")
+        .SetDefault(0.75)
+        .GreaterThan(0.0);
+
+    AddComment(R"DOC(
+Local Response Normalization Operator.
+
+This operator comes from the paper
+"ImageNet Classification with Deep Convolutional Neural Networks".
+
+The original formula is:
+
+$$
+Output(i, x, y) = Input(i, x, y) / \left(
+k + \alpha \sum\limits^{\min(C, i + n/2)}_{j = \max(0, i - n/2)}
+(Input(j, x, y))^2
+\right)^{\beta}
+$$
+
+Function implementation:
+
+Inputs and outputs are in NCHW format, where input.shape.ndims() equals 4
+and dimensions 0 ~ 3 represent batch size, feature maps, rows,
+and columns, respectively.
+
+Input and Output in the formula above are for each map (i) of one image, and
+Input(i, x, y), Output(i, x, y) each represent one element of that map.
+
+C is the number of feature maps of one image. n is a hyper-parameter
+configured when the operator is initialized. The sum in the denominator
+runs over the same spatial position in the n neighboring maps.
+
+)DOC");
+  }
+};
+
+class LRNOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+
+    auto x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker<float>, lrn_grad, ops::LRNOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    lrn, ops::LRNKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    lrn_grad, ops::LRNGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu
new file mode 100644
index 0000000000..eb9d66a73d
--- /dev/null
+++ b/paddle/operators/lrn_op.cu
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lrn_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C,
+                                   int H, int W, int size, T k, T alpha) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+
+    in += offset;
+    mid += offset;
+    const int step = H * W;
+    const int pre_pad = (size - 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
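+    // Sliding-window running sum over the channel axis: add the entering
+    // channel, subtract the leaving one, and emit k + alpha * accum once
+    // the window is complete.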
+    T accum = 0;
+    int index = 0;
+    while (index < C + post_pad) {
+      if (index < C) {
+        T val = in[index * step];
+        accum += val * val;
+      }
+      if (index >= size) {
+        T val = in[(index - size) * step];
+        accum -= val * val;
+      }
+      if (index >= post_pad) {
+        mid[(index - post_pad) * step] = k + accum * alpha;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid,
+                                T negative_beta, T* out) {
+  const int index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index < input_size) {
+    out[index] = in[index] * pow(mid[index], negative_beta);
+  }
+}
+
+template <typename T>
+void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs,
+                    T* outputs, T* mid, int N, int C, int H, int W, int n, T k,
+                    T alpha, T beta) {
+  int img_size = N * H * W;
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  KeCMRNormFillScale<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      img_size, inputs, mid, C, H, W, n, k, alpha);
+
+  int input_size = N * H * W * C;
+  grid_size = (input_size + block_size - 1) / block_size;
+  KeCMRNormOutput<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      input_size, inputs, mid, -beta, outputs);
+}
+
+template <typename T>
+struct LRNFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta) {
+    CrossMapNormal<T>(
+        ctx, input.data<T>(), out->mutable_data<T>(ctx.GetPlace()),
+        mid->mutable_data<T>(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta);
+  }
+};
+
+template struct LRNFunctor<platform::CUDADeviceContext, float>;
+template struct LRNFunctor<platform::CUDADeviceContext, double>;
+
+template <typename T>
+__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out,
+                              const T* mid, T* x_g, const T* out_g, int C,
+                              int H, int W, int size, T negative_beta,
+                              T ratio) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < img_size) {
+    const int w = idx % W;
+    const int h = (idx / W) % H;
+    const int n = idx / W / H;
+    const int offset = (n * C * H + h) * W + w;
+    x += offset;
+    out += offset;
+    mid += offset;
+    out_g += offset;
+    x_g += offset;
+
+    const int step = H * W;
+    const int pre_pad = size - (size + 1) / 2;
+    const int post_pad = size - pre_pad - 1;
+
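+    // The same sliding-window accumulation as in the forward kernel, here
+    // applied to out_g * out / mid.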
+    int index = 0;
+    T accum = 0;
+    // TODO(gongwb): optimize this with thread shared array.
+    while (index < C + post_pad) {
+      if (index < C) {
+        x_g[index * step] = 0.0;
+        accum += out_g[index * step] * out[index * step] / mid[index * step];
+      }
+      if (index >= size) {
+        accum -= out_g[(index - size) * step] * out[(index - size) * step] /
+                 mid[(index - size) * step];
+      }
+      if (index >= post_pad) {
+        x_g[(index - post_pad) * step] +=
+            out_g[(index - post_pad) * step] *
+                pow(mid[(index - post_pad) * step], negative_beta) -
+            ratio * x[(index - post_pad) * step] * accum;
+      }
+      ++index;
+    }
+  }
+}
+
+template <typename T>
+void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x,
+                        const T* out, const T* mid, T* x_g, const T* out_g,
+                        int N, int C, int H, int W, int n, T alpha, T beta) {
+  int img_size = N * H * W;
+
+  const int block_size = 1024;
+  int grid_size = (img_size + block_size - 1) / block_size;
+
+  auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+  KeCMRNormDiff<T><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta,
+      2.0f * alpha * beta);
+}
+
+template <typename T>
+struct LRNGradFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta) {
+    CrossMapNormalGrad<T>(ctx, x.data<T>(), out.data<T>(), mid.data<T>(),
+                          x_g->mutable_data<T>(ctx.GetPlace()), out_g.data<T>(),
+                          N, C, H, W, n, alpha, beta);
+  }
+};
+
+template struct LRNGradFunctor<platform::CUDADeviceContext, float>;
+template struct LRNGradFunctor<platform::CUDADeviceContext, double>;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lrn, ops::LRNKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    lrn_grad, ops::LRNGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h
new file mode 100644
index 0000000000..ef3a2883a8
--- /dev/null
+++ b/paddle/operators/lrn_op.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename place, typename T>
+struct LRNFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& input, framework::Tensor* out,
+                  framework::Tensor* mid, int N, int C, int H, int W, int n,
+                  T k, T alpha, T beta);
+};
+
+template <typename DeviceContext, typename T>
+class LRNKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+
+  // f(x) = x * ( k + alpha * SUM((x)^2) )^(-beta)
+  // x represents inputs
+  // f(x) represents outputs
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // input
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    auto x_dims = x.dims();
+
+    // NCHW
+    int N = x_dims[0];
+    int C = x_dims[1];
+    int H = x_dims[2];
+    int W = x_dims[3];
+
+    Tensor* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    // MidOut saves the intermediate result for the backward pass
+    Tensor* mid = ctx.Output<Tensor>("MidOut");
+    mid->mutable_data<T>(ctx.GetPlace());
+
+    int n = ctx.Attr<int>("n");
+    T alpha = ctx.Attr<float>("alpha");
+    T beta = ctx.Attr<float>("beta");
+    T k = ctx.Attr<float>("k");
+
+    PADDLE_ENFORCE(n > 0, "n should be greater than 0");
+    PADDLE_ENFORCE(alpha >= 0.0, "alpha should be >= 0.0");
+    PADDLE_ENFORCE(beta >= 0.0, "beta should be >= 0.0");
+    PADDLE_ENFORCE(k >= 0.0, "k should be >= 0.0");
+
+    LRNFunctor<DeviceContext, T> f;
+    f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta);
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct LRNGradFunctor {
+  void operator()(const framework::ExecutionContext& ctx,
+                  const framework::Tensor& x, const framework::Tensor& out,
+                  const framework::Tensor& mid, framework::Tensor* x_g,
+                  const framework::Tensor& out_g, int N, int C, int H, int W,
+                  int n, T alpha, T beta);
+};
+
+/**
+ * \brief Backward calculation for cross-map normalization.
+ *
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * MidOut ^ (-beta)
+ *   + sum_{j = lower}^{upper}
+ *     (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue
+ *
+ * The data format of inputs/outputs is the same as in the forward
+ * interface: NCHW.
+ *
+ * The upper and lower bounds of the sum, and the logic of the sum itself,
+ * are the same as in the forward pass.
+ */
+template <typename DeviceContext, typename T>
+class LRNGradKernel : public framework::OpKernel<T> {
+ public:
+  using Tensor = framework::Tensor;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const Tensor& x = *ctx.Input<Tensor>("X");
+    const Tensor& out = *ctx.Input<Tensor>("Out");
+    const Tensor& out_g = *ctx.Input<Tensor>(framework::GradVarName("Out"));
+    const Tensor& mid = *ctx.Input<Tensor>("MidOut");
+
+    auto x_g = ctx.Output<Tensor>(framework::GradVarName("X"));
+    x_g->mutable_data<T>(ctx.GetPlace());
+
+    auto x_dims = x.dims();
+    int N = x_dims[0];
+    int C = x_dims[1];
+    int H = x_dims[2];
+    int W = x_dims[3];
+
+    int n = ctx.Attr<int>("n");
+    T alpha = ctx.Attr<T>("alpha");
+    T beta = ctx.Attr<T>("beta");
+
+    LRNGradFunctor<DeviceContext, T> f;
+    f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
new file mode 100644
index 0000000000..afb095a04e
--- /dev/null
+++ b/paddle/operators/lstm_op.cc
@@ -0,0 +1,281 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(Hidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                   "Output(Cell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(BatchGate) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                   "Output(BatchGate) of LSTM should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2, "Input(X)'s rank must be 2.");
+
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(Cell) and Input(Hidden) of LSTM should not "
+                     "be null at the same time.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+    }
+
+    int frame_size = in_dims[1] / 4;
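+    // Input packs the pre-activations of the four gates per time step, so
+    // the hidden size equals in_dims[1] / 4.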
+    auto w_dims = ctx->GetInputDim("Weight");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The rank of Input(Weight) should be 2.");
+    PADDLE_ENFORCE_EQ(w_dims[0], frame_size,
+                      "The first dimension of Input(Weight) "
+                      "should be %d.",
+                      frame_size);
+    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Weight) "
+                      "should be 4 * %d.",
+                      frame_size);
+
+    auto b_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+    PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                      "The first dimension of Input(Bias) should be 1.");
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
+      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "7 * %d if enable peepholes connection",
+                        frame_size);
+    } else {
+      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "4 * %d if disable peepholes connection",
+                        frame_size);
+    }
+
+    framework::DDim out_dims({in_dims[0], frame_size});
+    ctx->SetOutputDim("Hidden", out_dims);
+    ctx->SetOutputDim("Cell", out_dims);
+    ctx->SetOutputDim("BatchGate", in_dims);
+    ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->ShareLoD("Input", "Hidden");
+    ctx->ShareLoD("Input", "Cell");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LSTMOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) the first input is a LodTensor, which support "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) the initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size and D is the hidden size.")
+        .AsDispensable();
+    AddInput("C0",
+             "(Tensor, optional) the initial cell state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size. `H0` and `C0` can be NULL but only at the same time.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(Tensor) the learnable hidden-hidden weights."
+             " - The shape is (D x 4D), where D is the hidden size. "
+             " - Weight = {W_ch, W_ih, W_fh, W_oh}");
+    AddInput("Bias",
+             "(Tensor) the learnable weights, which contains two parts: "
+             "input-hidden bias weight and peephole connections weight if "
+             "setting `use_peepholes` True. "
+             "1. `use_peepholes = False` "
+             " - The shape is (1 x 4D). "
+             " - Bias = {b_c, b_i, b_f, b_o}."
+             "2. `use_peepholes = True` "
+             " - The shape is (1 x 7D). "
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+    AddOutput("Hidden",
+              "(LoDTensor) the hidden state of LSTM operator. "
+              "The shape is (T x D), and lod is the same with the `Input`.");
+    AddOutput("Cell",
+              "(LoDTensor) the cell state of LSTM operator. "
+              "The shape is (T x D), and lod is the same with the `Input`.");
+    AddOutput("BatchGate",
+              "(LoDTensor) This LoDTensor contains input gate, forget gate "
+              "and output gate after the nonlinear computation. This "
+              "LoDTensor has the same shape as the reorganized input, which "
+              "is also be called batch input. The LoD size is 2. The first "
+              "LoD is the batch offsets and the second LoD contains the "
+              "indexes, which denote the position of reorganized sequence "
+              "in the raw input.")
+        .AsIntermediate();
+    AddOutput("BatchCellPreAct",
+              "(LoDTensor) This LoDTensor is obtained in the forward and used "
+              "in the backward.")
+        .AsIntermediate();
+    AddAttr<bool>("use_peepholes",
+                  "(bool, defalut: True) "
+                  "whether to enable diagonal/peephole connections.")
+        .SetDefault(true);
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed LSTM.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default: sigmoid)"
+        "The activation for input gate, forget gate and output "
+        "gate, `sigmoid` by default.")
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
+                         "(string, default: tanh)"
+                         "The activation for cell output, `tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
+                         "(string, default: tanh)"
+                         "The activation for candidate hidden state, "
+                         "`tanh` by default.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddComment(R"DOC(
+Long-Short Term Memory (LSTM) Operator.
+
+The default implementation uses the diagonal/peephole connection
+(https://arxiv.org/pdf/1402.1128.pdf); the formula is as follows:
+
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i) \\
+
+f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f) \\
+
+\tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c) \\
+
+o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o) \\
+
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+
+h_t = o_t \odot act_h(c_t)
+$$
+
+where the W terms denote weight matrices (e.g. $W_{ix}$ is the matrix
+of weights from the input to the input gate), and $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to represent these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the non-linear activation, such as the logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$.
+
+The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions and `tanh` is usually
+used for them. $\tilde{c_t}$ is also called candidate hidden state,
+which is computed based on the current input and the previous hidden state.
+
+Set `use_peepholes` False to disable peephole connection. The formula
+is omitted here, please refer to the paper
+http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+
+Note that the $W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}$
+operations on the input $x_{t}$ are NOT included in this operator.
+Users can apply a fully-connected operator before the LSTM operator.
+
+)DOC");
+  }
+};
+
+class LSTMGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(Hidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cell"),
+                   "Input(Cell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(BatchGate) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
+                   "Input(BatchGate) of LSTM should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
+REGISTER_OP_CPU_KERNEL(
+    lstm, ops::LSTMKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_grad, ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc
new file mode 100644
index 0000000000..cfcc1fc92a
--- /dev/null
+++ b/paddle/operators/lstm_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lstm, ops::LSTMKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstm_grad, ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
new file mode 100644
index 0000000000..c57ee414dc
--- /dev/null
+++ b/paddle/operators/lstm_op.h
@@ -0,0 +1,372 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class LSTMKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(ctx.GetPlace());
+    auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+
+    auto in_dims = input->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    framework::DDim dims({in_dims[0], frame_size});
+
+    if (bias) {
+      Tensor b = *bias;
+      b.Resize({bias->numel(), 1});
+      Tensor gate_bias = b.Slice(0, 4 * frame_size);
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
+    }
+
+    math::LstmMetaValue<T> lstm_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      // the code style in LstmMetaValue will be updated later.
+
+      lstm_value.check_ig = bias_data + 4 * frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
+    } else {
+      lstm_value.check_ig = nullptr;
+      lstm_value.check_fg = nullptr;
+      lstm_value.check_og = nullptr;
+    }
+    lstm_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (cell_t0) {
+      // Since batch computing for LSTM reorders the input sequences
+      // according to their length, the initial cell state also needs
+      // to be reordered.
+      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+                                         &ordered_c0, true);
+      lstm_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use local variables for the intermediate batch results.
+    LoDTensor batch_hidden, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_hidden.mutable_data<T>(dims, ctx.GetPlace());
+    batch_cell.mutable_data<T>(dims, ctx.GetPlace());
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
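+    // Step through the batched time steps; within one step all still-active
+    // sequences form a single matrix, so the recurrent part reduces to one
+    // GEMM per step: gate_t += pre_hidden_t * weight.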
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor out_t = batch_hidden.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, pre_hidden_t, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // If n == 0 and there is no initial hidden state, that is to say
+        // H0 is all zeros, the calculation W_h * H0 is skipped.
+        // If n == 0 and an initial hidden state exists, calculate W_h * H0.
+
+        // Since batch computing for LSTM reorders the input sequences
+        // according to their length, the initial hidden state also needs
+        // to be reordered.
+        Tensor ordered_h0;
+        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
+      }
+
+      lstm_value.gate_value = gate_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          device_ctx, lstm_value, frame_size, cur_batch_size, gate_act,
+          cell_act, cand_act);
+      lstm_value.prev_state_value = lstm_value.state_value;
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden.set_lod(batch_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(device_ctx, batch_hidden, *hidden_out);
+
+    batch_cell.set_lod(batch_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(device_ctx, batch_cell, *cell_out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LSTMGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* hidden_out = ctx.Input<LoDTensor>("Hidden");
+    auto* cell_out = ctx.Input<LoDTensor>("Cell");
+
+    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
+    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+
+    auto* hidden_g = ctx.Input<LoDTensor>(framework::GradVarName("Hidden"));
+
+    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto* h0 = ctx.Input<Tensor>("H0");
+    auto* c0 = ctx.Input<Tensor>("C0");
+
+    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (weight_g) {
+      weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, weight_g, static_cast<T>(0.0));
+    }
+
+    // ordered_h0/c0 is the reordered hidden/cell initialization.
+    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+    // initialization.
+    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (c0) {
+      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+                                         true);
+    }
+    if (c0 && c0_g) {
+      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+    }
+
+    auto in_dims = input->dims();
+    auto out_dims = hidden_g->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
+
+    math::LstmMetaValue<T> lstm_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      lstm_value.check_ig = bias_data + 4 * frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
+    } else {
+      lstm_value.check_ig = nullptr;
+      lstm_value.check_fg = nullptr;
+      lstm_value.check_og = nullptr;
+    }
+
+    math::LstmMetaGrad<T> lstm_grad;
+
+    if (bias && bias_g) {
+      bias_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, bias_g, static_cast<T>(0.0));
+    }
+    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_g_data = bias_g->data<T>();
+      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
+      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
+    } else {
+      lstm_grad.check_ig_grad = nullptr;
+      lstm_grad.check_fg_grad = nullptr;
+      lstm_grad.check_og_grad = nullptr;
+    }
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+
+    auto ToBatch = [&batch_gate, &to_batch](
+        const DeviceContext& ctx, const framework::LoDTensor& src,
+        const framework::DDim& dims, framework::LoDTensor& dst) {
+      dst.mutable_data<T>(dims, ctx.GetPlace());
+      dst.set_lod(batch_gate->lod());
+      to_batch(ctx, src, dst, false);
+    };
+
+    LoDTensor batch_hidden, batch_hidden_g, batch_cell;
+    ToBatch(device_ctx, *hidden_out, out_dims, batch_hidden);
+    ToBatch(device_ctx, *hidden_g, out_dims, batch_hidden_g);
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);
+
+    LoDTensor batch_cell_g, batch_gate_g;
+    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    // TODO(qingqing) support the case output cell has gradient.
+    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
+    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
+    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
+    batch_gate_g.set_lod(batch_gate->lod());
+
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+
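+    // Walk the batched time steps in reverse: compute the gate gradients
+    // for each step, then propagate them to the previous hidden state
+    // through weight^T.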
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate = batch_gate->Slice(bstart, bend);
+      Tensor cell = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
+      lstm_value.gate_value = gate.data<T>();
+      lstm_value.state_value = cell.data<T>();
+      lstm_value.state_active_value = cell_pre_act.data<T>();
+
+      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
+      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
+      lstm_grad.state_grad = cell_g.data<T>();
+      lstm_grad.gate_grad = gate_g.data<T>();
+      lstm_grad.output_grad = out_g.data<T>();
+
+      if (n > 0) {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
+        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
+        lstm_value.prev_state_value = cell_pre.data<T>();
+        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
+      } else {
+        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
+      }
+
+      int cur_batch_size = bend - bstart;
+      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
+          device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size,
+          gate_act, cell_act, cand_act);
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
+                                       static_cast<T>(1.0), &pre_hidden_g,
+                                       static_cast<T>(1.0));
+        if (weight_g) {
+          /* backward weight */
+          auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end);
+          math::matmul<DeviceContext, T>(device_ctx, pre_hidden, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+      } else {
+        if (h0 && weight_g) {
+          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
+                                             &ordered_h0, true);
+          math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+        if (h0 && h0_g) {
+          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
+                                         true, static_cast<T>(1.0),
+                                         &ordered_h0_g, static_cast<T>(0.0));
+        }
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    if (in_g) {
+      /* backward data */
+      in_g->mutable_data<T>(ctx.GetPlace());
+      to_seq(device_ctx, batch_gate_g, *in_g);
+    }
+    if (bias && bias_g) {
+      /* backward bias */
+      Tensor b_g = *bias_g;
+      b_g.Resize({bias_g->numel(), 1});
+      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+      math::ColwiseSum<DeviceContext, T> col_sum;
+      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
+    }
+
+    if (h0 && h0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
+                                         false);
+    }
+    if (c0 && c0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
+                                         false);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
new file mode 100644
index 0000000000..c2d2c43982
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstm_unit_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LstmUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("C_prev"),
+                   "Input(C_prev) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("C"),
+                   "Output(C) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("H"),
+                   "Output(H) of LSTM should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto c_prev_dims = ctx->GetInputDim("C_prev");
+
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], c_prev_dims[0],
+                      "Batch size of inputs and states must be equal");
+    PADDLE_ENFORCE_EQ(x_dims[1], c_prev_dims[1] * 4,
+                      "Dimension of FC should be equal to prev state * 4");
+
+    int b_size = c_prev_dims[0];  // batch size
+    int s_dim = c_prev_dims[1];   // state dim
+    ctx->SetOutputDim("C", {b_size, s_dim});
+    ctx->SetOutputDim("H", {b_size, s_dim});
+  }
+};
+
+class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LstmUnitOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure"
+             "that linear tranformation has already been applied to `X`. "
+             "Linear tranformation can be applied by adding a `fc` layer");
+    AddInput(
+        "C_prev",
+        "The cell state tensor of last time-step in the Lstm Unit operator.");
+    AddOutput("C", "The cell tensor of Lstm Unit operator.");
+    AddOutput("H", "The hidden state tensor of Lstm Unit operator.");
+    AddAttr<float>("forget_bias",
+                   "(float, default 0.0) "
+                   "The forget bias of Lstm Unit.")
+        .SetDefault(0.0);
+    AddComment(R"DOC(
+Lstm Unit Operator
+
+Equation:
+
+$$
+i, f, o, j = split(X) \\
+C = C_{prev} * sigm(f + forget\_bias) + sigm(i) * tanh(j) \\
+H = C * sigm(o)
+$$
+
+)DOC");
+  }
+};
+
+class LstmUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
+                   "Input(C@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),
+                   "Input(H@GRAD) should not be null");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("C_prev"),
+                      ctx->GetInputDim("C_prev"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
+            ops::LstmUnitGradOp);
+REGISTER_OP_CPU_KERNEL(lstm_unit,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
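
For readers mapping the DOC equations to the kernels: the sketch below is illustrative only, not part of the patch (all names are hypothetical). It computes one batch row of the lstm_unit forward pass in plain C++, with `X` laid out as the four D-wide gate chunks i, f, o, j that the CPU and CUDA kernels index.

```cpp
// Illustrative sketch of the lstm_unit forward equations, one batch row.
#include <cmath>
#include <cstdio>

static float sigm(float x) { return 1.f / (1.f + std::exp(-x)); }

// X holds the pre-computed linear transformation, laid out as
// [i_0..i_{D-1} | f_0..f_{D-1} | o_0..o_{D-1} | j_0..j_{D-1}].
void lstm_unit_ref(const float* X, const float* C_prev, float* C, float* H,
                   int D, float forget_bias) {
  for (int d = 0; d < D; ++d) {
    float i = sigm(X[d]);
    float f = sigm(X[D + d] + forget_bias);
    float o = sigm(X[2 * D + d]);
    float j = std::tanh(X[3 * D + d]);
    C[d] = f * C_prev[d] + i * j;  // C = C_prev * sigm(f + bias) + sigm(i) * tanh(j)
    H[d] = o * std::tanh(C[d]);    // H = tanh(C) * sigm(o)
  }
}

int main() {
  const int D = 2;
  float X[4 * D] = {0.1f, -0.2f, 0.3f, 0.4f, -0.5f, 0.6f, 0.7f, -0.8f};
  float C_prev[D] = {0.5f, -0.5f}, C[D], H[D];
  lstm_unit_ref(X, C_prev, C, H, D, /*forget_bias=*/0.f);
  std::printf("C = (%f, %f), H = (%f, %f)\n", C[0], C[1], H[0], H[1]);
  return 0;
}
```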
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
new file mode 100644
index 0000000000..5ee5ddd280
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.cu
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/* Acknowledgement: the following code is strongly inspired by
+https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op_gpu.cu
+*/
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename Dtype>
+__device__ Dtype cuda_sigmoid(const Dtype x) {
+  return Dtype(1) / (Dtype(1) + exp(-x));
+}
+
+template <typename Dtype>
+__device__ Dtype cuda_tanh(const Dtype x) {
+  return (Dtype(1) - exp(-2. * x)) / (Dtype(1) + exp(-2. * x));
+}
+
+template <typename T>
+__global__ void LSTMUnitKernel(const int nthreads, const int dim,
+                               const T* C_prev, const T* X, T* C, T* H,
+                               const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+
+    const T* X_offset = X + 4 * dim * n;
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = f * c_prev + i * g;
+    C[index] = c;
+    const T tanh_c = cuda_tanh(c);
+    H[index] = o * tanh_c;
+  }
+}
+
+template <typename T>
+__global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
+                                       const T* C_prev, const T* X, const T* C,
+                                       const T* H, const T* C_diff,
+                                       const T* H_diff, T* C_prev_diff,
+                                       T* X_diff, const T forget_bias) {
+  CUDA_1D_KERNEL_LOOP(index, nthreads) {
+    const int n = index / dim;
+    const int d = index % dim;
+    const T* X_offset = X + 4 * dim * n;
+    T* c_prev_diff = C_prev_diff + index;
+    T* X_diff_offset = X_diff + 4 * dim * n;
+    T* i_diff = X_diff_offset + d;
+    T* f_diff = X_diff_offset + 1 * dim + d;
+    T* o_diff = X_diff_offset + 2 * dim + d;
+    T* g_diff = X_diff_offset + 3 * dim + d;
+
+    const T i = cuda_sigmoid(X_offset[d]);
+    const T f = cuda_sigmoid(X_offset[1 * dim + d] + forget_bias);
+    const T o = cuda_sigmoid(X_offset[2 * dim + d]);
+    const T g = cuda_tanh(X_offset[3 * dim + d]);
+    const T c_prev = C_prev[index];
+    const T c = C[index];
+    const T tanh_c = cuda_tanh(c);
+    const T c_term_diff =
+        C_diff[index] + H_diff[index] * o * (1 - tanh_c * tanh_c);
+    *c_prev_diff = c_term_diff * f;
+    *i_diff = c_term_diff * g * i * (1 - i);
+    *f_diff = c_term_diff * c_prev * f * (1 - f);
+    *o_diff = H_diff[index] * tanh_c * o * (1 - o);
+    *g_diff = c_term_diff * i * (1 - g * g);
+  }
+}
+
+template <typename T>
+class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int block = 512;
+    int n = b_size * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, forget_bias);
+  }
+};
+
+template <typename T>
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
+
+    int block = 512;
+    int n = N * D;
+    int grid = (n + block - 1) / block;
+
+    LSTMUnitGradientKernel<T><<<grid, block>>>(n, D, C_prev, X, C, H, C_diff,
+                                               H_diff, C_prev_diff, X_diff,
+                                               forget_bias);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
+                        ops::LstmUnitOpCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
+                        ops::LstmUnitGradOpCUDAKernel<double>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
new file mode 100644
index 0000000000..fa8d141bcb
--- /dev/null
+++ b/paddle/operators/lstm_unit_op.h
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+/* Acknowledgement: the following code is strongly inspired by
+https://github.com/caffe2/caffe2/blob/master/caffe2/operators/lstm_unit_op.h
+*/
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+template <typename T>
+inline T sigmoid(T x) {
+  return 1. / (1. + exp(-x));
+}
+
+template <typename T>
+inline T tanh(T x) {
+  return 2. * sigmoid(2. * x) - 1.;
+}
+
+template <typename DeviceContext, typename T>
+class LstmUnitKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto* x_tensor = ctx.Input<framework::Tensor>("X");
+    auto* c_prev_tensor = ctx.Input<framework::Tensor>("C_prev");
+    auto* c_tensor = ctx.Output<framework::Tensor>("C");
+    auto* h_tensor = ctx.Output<framework::Tensor>("H");
+
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
+
+    int b_size = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    T* C = c_tensor->mutable_data<T>(ctx.GetPlace());
+    T* H = h_tensor->mutable_data<T>(ctx.GetPlace());
+
+    const T* X = x_tensor->data<T>();
+    const T* C_prev = c_prev_tensor->data<T>();
+
+    for (int n = 0; n < b_size; ++n) {
+      for (int d = 0; d < D; ++d) {
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = f * c_prev + i * g;
+        C[d] = c;
+        const T tanh_c = tanh(c);
+        H[d] = o * tanh_c;
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LstmUnitGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+
+    auto x_tensor = ctx.Input<Tensor>("X");
+    auto c_prev_tensor = ctx.Input<Tensor>("C_prev");
+    auto c_tensor = ctx.Input<Tensor>("C");
+    auto h_tensor = ctx.Input<Tensor>("H");
+
+    auto hdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("H"));
+    auto cdiff_tensor = ctx.Input<Tensor>(framework::GradVarName("C"));
+
+    auto xdiff_tensor = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto c_prev_diff_tensor =
+        ctx.Output<Tensor>(framework::GradVarName("C_prev"));
+
+    auto* X = x_tensor->data<T>();
+    auto* C_prev = c_prev_tensor->data<T>();
+    auto* C = c_tensor->data<T>();
+    auto* H = h_tensor->data<T>();
+
+    auto* H_diff = hdiff_tensor->data<T>();
+    auto* C_diff = cdiff_tensor->data<T>();
+
+    auto* C_prev_diff = c_prev_diff_tensor->mutable_data<T>(ctx.GetPlace());
+    auto* X_diff = xdiff_tensor->mutable_data<T>(ctx.GetPlace());
+
+    int N = c_tensor->dims()[0];
+    int D = c_tensor->dims()[1];
+
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
+
+    for (int n = 0; n < N; ++n) {
+      for (int d = 0; d < D; ++d) {
+        T* c_prev_diff = C_prev_diff + d;
+        T* i_diff = X_diff + d;
+        T* f_diff = X_diff + 1 * D + d;
+        T* o_diff = X_diff + 2 * D + d;
+        T* g_diff = X_diff + 3 * D + d;
+
+        const T i = sigmoid(X[d]);
+        const T f = sigmoid(X[1 * D + d] + forget_bias);
+        const T o = sigmoid(X[2 * D + d]);
+        const T g = tanh(X[3 * D + d]);
+        const T c_prev = C_prev[d];
+        const T c = C[d];
+        const T tanh_c = tanh(c);
+        const T c_term_diff = C_diff[d] + H_diff[d] * o * (1 - tanh_c * tanh_c);
+        *c_prev_diff = c_term_diff * f;
+        *i_diff = c_term_diff * g * i * (1 - i);
+        *f_diff = c_term_diff * c_prev * f * (1 - f);
+        *o_diff = H_diff[d] * tanh_c * o * (1 - o);
+        *g_diff = c_term_diff * i * (1 - g * g);
+      }
+      C_prev += D;
+      X += 4 * D;
+      C += D;
+      H += D;
+      C_diff += D;
+      H_diff += D;
+      X_diff += 4 * D;
+      C_prev_diff += D;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
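
The backward pass above reuses `c_term_diff = C_diff + H_diff * o * (1 - tanh^2 c)` for the i, f and g gate gradients. A quick way to convince yourself the formulas are right is a finite-difference check; the sketch below (illustrative only, not part of the patch) compares the analytic `i_diff` against a central difference for a single element with `H_diff = 1` and `C_diff = 0`.

```cpp
// Finite-difference check of the i-gate gradient, D = 1.
#include <cmath>
#include <cstdio>

static double sigm(double x) { return 1. / (1. + std::exp(-x)); }

// Forward pass: returns h for pre-activation gates x = {i, f, o, g}.
static double forward(const double x[4], double c_prev, double fb) {
  double c = sigm(x[1] + fb) * c_prev + sigm(x[0]) * std::tanh(x[3]);
  return sigm(x[2]) * std::tanh(c);
}

int main() {
  double x[4] = {0.2, -0.1, 0.4, 0.3}, c_prev = 0.5, fb = 0.0;
  // Analytic gradient dH/dx_i, as in the grad kernel with H_diff = 1, C_diff = 0:
  double i = sigm(x[0]), f = sigm(x[1] + fb), o = sigm(x[2]),
         g = std::tanh(x[3]);
  double c = f * c_prev + i * g, tc = std::tanh(c);
  double c_term = o * (1 - tc * tc);  // C_diff + H_diff * o * (1 - tanh^2 c)
  double di = c_term * g * i * (1 - i);
  // Numeric gradient by central differences:
  const double eps = 1e-6;
  double xp[4] = {x[0] + eps, x[1], x[2], x[3]};
  double xm[4] = {x[0] - eps, x[1], x[2], x[3]};
  double di_num = (forward(xp, c_prev, fb) - forward(xm, c_prev, fb)) / (2 * eps);
  std::printf("analytic di=%.8f  numeric di=%.8f\n", di, di_num);
  return 0;
}
```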
diff --git a/paddle/operators/lstmp_op.cc b/paddle/operators/lstmp_op.cc
new file mode 100644
index 0000000000..c96b30ba35
--- /dev/null
+++ b/paddle/operators/lstmp_op.cc
@@ -0,0 +1,331 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMPOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP operator should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Projection"),
+                   "Output(Projection) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                   "Output(Cell) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(BatchGate) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchCellPreAct"),
+                   "Output(BatchCellPreAct) of LSTMP operator should not be "
+                   "null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchHidden"),
+                   "Output(BatchHidden) of LSTMP operator should not be null.");
+
+    auto in_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(in_dims.size(), 2,
+                      "Input(Input)'s rank of LSTMP operator must be 2.");
+
+    int frame_size = in_dims[1] / 4;
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto proj_dims = ctx->GetInputDim("ProjWeight");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The rank of Input(Weight) should be 2.");
+    PADDLE_ENFORCE_EQ(w_dims[0], proj_dims[1],
+                      "The first dimension of Input(Weight) "
+                      "should be %d.",
+                      proj_dims[1]);
+    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Weight) "
+                      "should be 4 * %d.",
+                      frame_size);
+
+    PADDLE_ENFORCE_EQ(proj_dims.size(), 2,
+                      "The rank of Input(ProjWeight) should be 2.");
+    PADDLE_ENFORCE_EQ(proj_dims[0], frame_size,
+                      "The first dimension of Input(ProjWeight) "
+                      "should be %d.",
+                      frame_size);
+
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(C0) of LSTMP operator should not be null after "
+                     "Input(H0) provided.");
+      auto h_dims = ctx->GetInputDim("H0");
+      auto c_dims = ctx->GetInputDim("C0");
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+      ctx->SetOutputDim("OrderedP0", {h_dims[0], proj_dims[1]});
+    }
+
+    auto b_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+    PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                      "The first dimension of Input(Bias) should be 1.");
+
+    if (ctx->Attrs().Get<bool>("use_peepholes")) {
+      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "7 * %d if enable peepholes connection",
+                        frame_size);
+    } else {
+      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "4 * %d if disable peepholes connection",
+                        frame_size);
+    }
+
+    framework::DDim out_dims({in_dims[0], frame_size});
+    framework::DDim proj_out_dims({in_dims[0], proj_dims[1]});
+    ctx->SetOutputDim("Projection", proj_out_dims);
+    ctx->SetOutputDim("Cell", out_dims);
+    ctx->SetOutputDim("BatchGate", in_dims);
+    ctx->SetOutputDim("BatchCellPreAct", out_dims);
+    ctx->SetOutputDim("BatchHidden", out_dims);
+    ctx->ShareLoD("Input", "Projection");
+    ctx->ShareLoD("Input", "Cell");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+class LSTMPOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LSTMPOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) the input for sequence data, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTensor is a matrix with shape (T X 4D), where T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) the initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size and D is the hidden size.")
+        .AsDispensable();
+    AddInput("C0",
+             "(Tensor, optional) the initial cell state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size. `C0` should not be null if `H0` provided.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(Tensor) the learnable hidden-hidden weights."
+             " - The shape is (P x 4D), where P is the projection layer size "
+             "and  D is the hidden size."
+             " - Weight = {W_cr, W_ir, W_fr, W_or}");
+    AddInput("ProjWeight",
+             "(Tensor) the learnable weight of the projection layer."
+             " - The shape is (D x P), where P is the recurrent projection "
+             "layer size and  D is the hidden size."
+             " - ProjWeight = {W_rh}");
+    AddInput("Bias",
+             "(Tensor) the learnable biases, which contains two parts: "
+             "input-hidden biases and peephole connections weights if "
+             "setting `use_peepholes` to `True`. "
+             "1. `use_peepholes = False` "
+             " - The shape is (1 x 4D). "
+             " - Bias = {b_c, b_i, b_f, b_o}."
+             "2. `use_peepholes = True` "
+             " - The shape is (1 x 7D). "
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+    AddOutput("Projection",
+              "(LoDTensor) the projection of the hidden state of LSTMP "
+              "operator. The shape is (T x P), and LoD is the same with the "
+              "`Input`.");
+    AddOutput("Cell",
+              "(LoDTensor) the cell state of LSTMP operator. "
+              "The shape is (T x D), and lod is the same with the `Input`.");
+    AddOutput("BatchGate",
+              "(LoDTensor) This LoDTensor contains input gate, forget gate "
+              "and output gate after the activations. This LoDTensor has the "
+              "same shape as the reorganized input, which is also be called "
+              "batch input. The LoD size is 2. The first-level LoD is the "
+              "batch offsets and the second contains the indices, which "
+              "denotes the position of reorganized sequence in the raw input.")
+        .AsIntermediate();
+    AddOutput("BatchCellPreAct",
+              "(LoDTensor) the pre-activation cell state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
+        .AsIntermediate();
+    AddOutput("BatchHidden",
+              "(LoDTensor) the hidden state reorganized in batch. "
+              "This LoDTensor is obtained in the forward and used in the "
+              "backward.")
+        .AsIntermediate();
+    AddOutput("OrderedP0",
+              "(Tensor) the projection of the initial hidden state "
+              "H0. This is a tensor with shape (N x P), where N is the "
+              "batch size and P is the hidden size.")
+        .AsIntermediate();
+    AddAttr<bool>("use_peepholes",
+                  "(bool, defalut: True) "
+                  "whether to enable diagonal/peephole connections.")
+        .SetDefault(true);
+    AddAttr<bool>("is_reverse",
+                  "(bool, defalut: False) "
+                  "whether to compute reversed LSTMP.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "gate_activation",
+        "(string, default: sigmoid)"
+        "The activation for input gate, forget gate and output "
+        "gate, `sigmoid` by default.")
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("cell_activation",
+                         "(string, default: tanh)"
+                         "The activation for cell output, `tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("candidate_activation",
+                         "(string, default: tanh)"
+                         "The activation for candidate hidden state, "
+                         "`tanh` by default.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddAttr<std::string>("proj_activation",
+                         "(string, default: tanh)"
+                         "The activation for projection output, "
+                         "`tanh` by defalut.")
+        .SetDefault("tanh")
+        .InEnum({"sigmoid", "tanh", "relu", "identity"});
+    AddComment(R"DOC(
+Long Short-Term Memory with recurrent Projection layer (LSTMP) Operator.
+
+LSTMP has a separate projection layer after the LSTM layer, projecting the 
+original hidden state to a lower-dimensional one. The projection is proposed 
+to reduce the total number of parameters and, furthermore, the computational 
+complexity of the LSTM, especially when the number of output units is 
+relatively large (https://research.google.com/pubs/archive/43905.pdf). 
+
+The formula is as follows:
+
+$$
+i_t = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i) \\
+
+f_t = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f) \\
+
+\tilde{c_t} = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c) \\
+
+o_t = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o) \\
+
+c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
+
+h_t = o_t \odot act_h(c_t) \\
+
+r_t = \overline{act_h}(W_{rh}h_t)
+$$
+
+where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
+of weights from the input to the input gate), and $W_{ic}, W_{fc}, W_{oc}$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to represent these diagonal weight matrices. The b terms
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
+is the activation, such as the logistic sigmoid function, and
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
+and cell activation vectors, respectively, all of which have the same size as
+the cell output activation vector $h$. Here $h$ is usually called the hidden 
+state and $r$ denotes its recurrent projection. $\tilde{c_t}$ is also 
+called the candidate hidden state, whose computation is based on the current 
+input and the previous hidden state.
+
+Here $\odot$ denotes the element-wise product of vectors. $act_g$ and $act_h$
+are the cell input and cell output activation functions, and `tanh` is usually
+used for them. $\overline{act_h}$ is the activation function for the 
+projection output, usually `identity` or the same as $act_h$.
+
+Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
+operations on the input $x_{t}$ are NOT included in this operator.
+Users can choose to apply a fully-connected operator before the LSTMP operator.
+
+)DOC");
+  }
+};
+
+class LSTMPGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Projection"),
+                   "Input(Projection) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Cell"),
+                   "Input(Cell) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ProjWeight"),
+                   "Input(ProjWeight) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTMP operator should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasInput("BatchGate"),
+                   "Input(BatchGate) of LSTMP operator should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("BatchCellPreAct"),
+                   "Input(BatchGate) of LSTMP operator should not be null.");
+
+    auto SetOutGradDim = [&ctx](const std::string& name) {
+      auto g_name = framework::GradVarName(name);
+      if (ctx->HasOutput(g_name))
+        ctx->SetOutputDim(g_name, ctx->GetInputDim(name));
+    };
+
+    SetOutGradDim("Input");
+    SetOutGradDim("Weight");
+    SetOutGradDim("ProjWeight");
+    SetOutGradDim("Bias");
+    SetOutGradDim("H0");
+    SetOutGradDim("C0");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstmp, ops::LSTMPOp, ops::LSTMPOpMaker, lstmp_grad,
+            ops::LSTMPGradOp);
+REGISTER_OP_CPU_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    lstmp_grad, ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CPUDeviceContext, double>);
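
To make the DOC formulas concrete: one LSTMP step feeds the projected state r_{t-1}, not the full hidden state h_{t-1}, back into the gates, and then projects h_t down to r_t through `ProjWeight`. The sketch below is illustrative only: it uses Eigen in place of the `math::matmul` calls, and the i/f/o/c~ chunk ordering is assumed purely for illustration (the real kernels delegate the gate layout to `math::LstmUnitFunctor`).

```cpp
// One hypothetical LSTMP step at the level of the DOC equations.
#include <Eigen/Dense>
#include <cmath>
#include <iostream>

int main() {
  const int D = 4, P = 2;  // hidden size and projection size (hypothetical)
  Eigen::VectorXd c = Eigen::VectorXd::Zero(D);  // cell state c_{t-1}
  Eigen::VectorXd r = Eigen::VectorXd::Zero(P);  // projected state r_{t-1}
  Eigen::MatrixXd W = Eigen::MatrixXd::Random(P, 4 * D);     // Weight, (P x 4D)
  Eigen::MatrixXd W_rh = Eigen::MatrixXd::Random(D, P);      // ProjWeight, (D x P)
  Eigen::VectorXd x_lin = Eigen::VectorXd::Random(4 * D);    // input after `fc`

  // Gate pre-activations: the input's linear part plus W^T * r_{t-1}.
  Eigen::VectorXd gates = x_lin + W.transpose() * r;
  auto sigm = [](double v) { return 1. / (1. + std::exp(-v)); };
  Eigen::VectorXd h(D);
  for (int d = 0; d < D; ++d) {
    double i = sigm(gates[d]);
    double f = sigm(gates[D + d]);
    double o = sigm(gates[2 * D + d]);
    double cand = std::tanh(gates[3 * D + d]);
    c[d] = f * c[d] + i * cand;  // c_t = f (.) c_{t-1} + i (.) c~_t
    h[d] = o * std::tanh(c[d]);  // h_t = o (.) act_h(c_t)
  }
  r = W_rh.transpose() * h;  // r_t = act'(W_rh * h_t), identity act' here
  std::cout << "r_t = " << r.transpose() << std::endl;
  return 0;
}
```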
diff --git a/paddle/operators/lstmp_op.cu b/paddle/operators/lstmp_op.cu
new file mode 100644
index 0000000000..7fcbcfecc8
--- /dev/null
+++ b/paddle/operators/lstmp_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstmp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lstmp, ops::LSTMPKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    lstmp_grad,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LSTMPGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/lstmp_op.h b/paddle/operators/lstmp_op.h
new file mode 100644
index 0000000000..ee82d5c10a
--- /dev/null
+++ b/paddle/operators/lstmp_op.h
@@ -0,0 +1,491 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/activation_op.h"
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+inline void ReorderInitState(const DeviceContext& ctx,
+                             const framework::Tensor& src, const size_t* index,
+                             framework::Tensor* dst, bool indexed_src) {
+  math::CopyMatrixRowsFunctor<DeviceContext, T> row_shuffle;
+  dst->mutable_data<T>(src.dims(), ctx.GetPlace());
+  row_shuffle(ctx, src, index, *dst, indexed_src);
+}
+
+template <typename DeviceContext, typename T>
+class LSTMPKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const math::detail::ActivationType act_type, const Device& d,
+                  X x, Y y) const {
+    if (act_type == math::detail::ActivationType::kIdentity)
+      y.device(d) = x;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* hidden_t0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Output<Tensor>("OrderedP0");
+    auto* cell_t0 = ctx.Input<Tensor>("C0");
+
+    auto* batch_gate = ctx.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(ctx.GetPlace());
+    auto* proj_out = ctx.Output<LoDTensor>("Projection");
+    proj_out->mutable_data<T>(ctx.GetPlace());
+    auto* cell_out = ctx.Output<LoDTensor>("Cell");
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    bool is_reverse = ctx.Attr<bool>("is_reverse");
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    to_batch(device_ctx, *input, *batch_gate, true, is_reverse);
+
+    auto in_dims = input->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    framework::DDim dims({in_dims[0], frame_size});
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+
+    if (bias) {
+      Tensor b = *bias;
+      b.Resize({bias->numel(), 1});
+      Tensor gate_bias = b.Slice(0, 4 * frame_size);
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(device_ctx, *batch_gate, gate_bias, batch_gate);
+    }
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      // the code style in LstmpMetaValue will be updated later.
+
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+    lstmp_value.prev_state_value = nullptr;
+    Tensor ordered_c0;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (cell_t0) {
+      // Since the batch computing for LSTMP reorders the input sequences
+      // according to their length, the initial cell state also needs
+      // to be reordered.
+      ReorderInitState<DeviceContext, T>(device_ctx, *cell_t0, order,
+                                         &ordered_c0, true);
+      lstmp_value.prev_state_value = ordered_c0.data<T>();
+    }
+
+    // Use local variables for the intermediate batch-major tensors here.
+    LoDTensor batch_proj, batch_cell;
+    auto* batch_cell_pre_act = ctx.Output<LoDTensor>("BatchCellPreAct");
+    batch_cell_pre_act->mutable_data<T>(dims, ctx.GetPlace());
+    auto* batch_hidden = ctx.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(dims, ctx.GetPlace());    // T x D
+    batch_proj.mutable_data<T>(proj_dims, ctx.GetPlace());  // T x P
+    batch_cell.mutable_data<T>(dims, ctx.GetPlace());       // T x D
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+      Tensor proj_t = batch_proj.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act->Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_t = batch_proj.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, pre_proj_t, false, *weight,
+                                       false, static_cast<T>(1.0), &gate_t,
+                                       static_cast<T>(1.0));
+      } else if (hidden_t0) {
+        // If n == 0 and there is no initialized hidden state, that is to say
+        // H0 is all zeros, the calculation of W_h * H0 will be skipped.
+        // If n == 0 and there is an initialized hidden state, calculate W_h * H0.
+
+        // Since the batch computing for LSTMP reorders the input sequences
+        // according to their length, the initial hidden state also needs
+        // to be reordered.
+
+        Tensor ordered_h0;
+        ordered_proj0->mutable_data<T>(ctx.GetPlace());
+        ReorderInitState<DeviceContext, T>(device_ctx, *hidden_t0, order,
+                                           &ordered_h0, true);
+        math::matmul<DeviceContext, T>(device_ctx, ordered_h0, false,
+                                       *proj_weight, false, static_cast<T>(1.0),
+                                       ordered_proj0, static_cast<T>(0.0));
+        if (proj_act != math::detail::ActivationType::kIdentity) {
+          auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+          ActCompute(proj_act, place, proj0_dev, proj0_dev);
+        }
+        math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, false,
+                                       *weight, false, static_cast<T>(1.0),
+                                       &gate_t, static_cast<T>(1.0));
+      }
+
+      lstmp_value.gate_value = gate_t.data<T>();
+      lstmp_value.output_value = hidden_t.data<T>();
+      lstmp_value.state_value = cell_t.data<T>();
+      lstmp_value.state_active_value = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, frame_size, cur_batch_size, gate_act,
+          cell_act, cand_act);
+      lstmp_value.prev_state_value = lstmp_value.state_value;
+      math::matmul<DeviceContext, T>(device_ctx, hidden_t, false, *proj_weight,
+                                     false, static_cast<T>(1.0), &proj_t,
+                                     static_cast<T>(0.0));
+      if (proj_act != math::detail::ActivationType::kIdentity) {
+        auto proj_t_dev = EigenMatrix<T>::From(proj_t);
+        ActCompute(proj_act, place, proj_t_dev, proj_t_dev);
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_proj.set_lod(batch_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(device_ctx, batch_proj, *proj_out);
+
+    batch_cell.set_lod(batch_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(device_ctx, batch_cell, *cell_out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class LSTMPGradKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const math::detail::ActivationType act_type,
+                      const Device& d, X x, Y y, DX dx, DY dy) const {
+    // x is dummy and won't be used even in Relu(use y instead)
+    if (act_type == math::detail::ActivationType::kIdentity)
+      dx.device(d) = dy;
+    else if (act_type == math::detail::ActivationType::kSigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kTanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == math::detail::ActivationType::kReLU)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<LoDTensor>("Input");
+    auto* weight = ctx.Input<Tensor>("Weight");
+    auto* proj_weight = ctx.Input<Tensor>("ProjWeight");
+    auto* bias = ctx.Input<Tensor>("Bias");
+
+    auto* proj_out = ctx.Input<LoDTensor>("Projection");
+    auto* cell_out = ctx.Input<LoDTensor>("Cell");
+
+    auto* batch_gate = ctx.Input<LoDTensor>("BatchGate");
+    auto* batch_cell_pre_act = ctx.Input<LoDTensor>("BatchCellPreAct");
+    auto* batch_hidden = ctx.Input<LoDTensor>("BatchHidden");
+
+    auto* projection_g =
+        ctx.Input<LoDTensor>(framework::GradVarName("Projection"));
+
+    auto* in_g = ctx.Output<LoDTensor>(framework::GradVarName("Input"));
+    auto* weight_g = ctx.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* proj_weight_g =
+        ctx.Output<Tensor>(framework::GradVarName("ProjWeight"));
+    auto* bias_g = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    auto* h0 = ctx.Input<Tensor>("H0");
+    auto* ordered_proj0 = ctx.Input<Tensor>("OrderedP0");
+    auto* c0 = ctx.Input<Tensor>("C0");
+
+    auto* h0_g = ctx.Output<Tensor>(framework::GradVarName("H0"));
+    auto* c0_g = ctx.Output<Tensor>(framework::GradVarName("C0"));
+
+    auto& device_ctx = ctx.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (weight_g) {
+      weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, weight_g, static_cast<T>(0.0));
+    }
+    if (proj_weight_g) {
+      proj_weight_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, proj_weight_g, static_cast<T>(0.0));
+    }
+
+    // ordered_h0/c0 is the reordered hidden/cell initialization.
+    // ordered_h0_g/c0_g is the reordered gradient of hidden/cell
+    // initialization.
+    Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g;
+    const size_t* order = batch_gate->lod()[2].data();
+    if (c0) {
+      ReorderInitState<DeviceContext, T>(device_ctx, *c0, order, &ordered_c0,
+                                         true);
+    }
+    if (c0 && c0_g) {
+      ordered_c0_g.mutable_data<T>(c0_g->dims(), ctx.GetPlace());
+    }
+
+    auto in_dims = input->dims();
+    auto out_dims = cell_out->dims();
+    framework::DDim proj_dims({in_dims[0], proj_weight->dims()[1]});
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    PADDLE_ENFORCE_EQ(frame_size, out_dims[1]);
+
+    math::LstmMetaValue<T> lstmp_value;
+    if (bias && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_data = const_cast<T*>(bias->data<T>());
+      lstmp_value.check_ig = bias_data + 4 * frame_size;
+      lstmp_value.check_fg = lstmp_value.check_ig + frame_size;
+      lstmp_value.check_og = lstmp_value.check_fg + frame_size;
+    } else {
+      lstmp_value.check_ig = nullptr;
+      lstmp_value.check_fg = nullptr;
+      lstmp_value.check_og = nullptr;
+    }
+
+    math::LstmMetaGrad<T> lstmp_grad;
+
+    if (bias && bias_g) {
+      bias_g->mutable_data<T>(ctx.GetPlace());
+      zero(device_ctx, bias_g, static_cast<T>(0.0));
+    }
+    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
+      T* bias_g_data = bias_g->data<T>();
+      lstmp_grad.check_ig_grad = bias_g_data + 4 * frame_size;
+      lstmp_grad.check_fg_grad = lstmp_grad.check_ig_grad + frame_size;
+      lstmp_grad.check_og_grad = lstmp_grad.check_fg_grad + frame_size;
+    } else {
+      lstmp_grad.check_ig_grad = nullptr;
+      lstmp_grad.check_fg_grad = nullptr;
+      lstmp_grad.check_og_grad = nullptr;
+    }
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+
+    auto ToBatch = [&batch_gate, &to_batch](
+        const DeviceContext& ctx, const framework::LoDTensor& src,
+        const framework::DDim& dims, framework::LoDTensor& dst) {
+      dst.mutable_data<T>(dims, ctx.GetPlace());
+      dst.set_lod(batch_gate->lod());
+      to_batch(ctx, src, dst, false);
+    };
+
+    LoDTensor batch_hidden_g, batch_proj, batch_proj_g, batch_cell;
+    batch_hidden_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    ToBatch(device_ctx, *proj_out, proj_dims, batch_proj);        // T x P
+    ToBatch(device_ctx, *projection_g, proj_dims, batch_proj_g);  // T x P
+    ToBatch(device_ctx, *cell_out, out_dims, batch_cell);         // T x D
+
+    LoDTensor batch_cell_g, batch_gate_g;
+    batch_cell_g.mutable_data<T>(out_dims, ctx.GetPlace());
+    // TODO(qingqing): support the case where the output cell has a gradient.
+    // to_batch(device_ctx, *cell_g, batch_cell_g, false);
+    zero(device_ctx, &batch_cell_g, static_cast<T>(0.0));
+    batch_gate_g.mutable_data<T>(batch_gate->dims(), ctx.GetPlace());
+    batch_gate_g.set_lod(batch_gate->lod());
+
+    auto gate_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("gate_activation"));
+    auto cell_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("cell_activation"));
+    auto cand_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("candidate_activation"));
+    auto proj_act = math::detail::GetActivationType(
+        ctx.Attr<std::string>("proj_activation"));
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor cur_proj = batch_proj.Slice(bstart, bend);
+      Tensor proj_g = batch_proj_g.Slice(bstart, bend);
+      if (proj_act != math::detail::ActivationType::kIdentity) {
+        auto cur_proj_dev = EigenMatrix<T>::From(cur_proj);
+        auto proj_g_dev = EigenMatrix<T>::From(proj_g);
+        ActGradCompute(proj_act, place, cur_proj_dev, cur_proj_dev, proj_g_dev,
+                       proj_g_dev);
+      }
+      /* hidden state backward */
+      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
+      math::matmul<DeviceContext, T>(device_ctx, proj_g, false, *proj_weight,
+                                     true, static_cast<T>(1.0), &out_g,
+                                     static_cast<T>(0.0));
+      /* projection weight backward */
+      if (proj_weight_g) {
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        math::matmul<DeviceContext, T>(device_ctx, hidden_t, true, proj_g,
+                                       false, static_cast<T>(1.0),
+                                       proj_weight_g, static_cast<T>(1.0));
+      }
+
+      Tensor gate = batch_gate->Slice(bstart, bend);
+      Tensor cell = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
+      lstmp_value.gate_value = gate.data<T>();
+      lstmp_value.state_value = cell.data<T>();
+      lstmp_value.state_active_value = cell_pre_act.data<T>();
+
+      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
+      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
+      lstmp_grad.state_grad = cell_g.data<T>();
+      lstmp_grad.gate_grad = gate_g.data<T>();
+      lstmp_grad.output_grad = out_g.data<T>();
+
+      if (n > 0) {
+        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
+        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
+        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
+        lstmp_value.prev_state_value = cell_pre.data<T>();
+        lstmp_grad.prev_state_grad = cell_pre_g.data<T>();
+      } else {
+        lstmp_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
+        lstmp_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
+      }
+
+      int cur_batch_size = bend - bstart;
+      math::LstmUnitGradFunctor<DeviceContext, T>::compute(
+          device_ctx, lstmp_value, lstmp_grad, frame_size, cur_batch_size,
+          gate_act, cell_act, cand_act);
+
+      if (n > 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_proj_g = batch_proj_g.Slice(pre_h_start, pre_h_end);
+        math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight, true,
+                                       static_cast<T>(1.0), &pre_proj_g,
+                                       static_cast<T>(1.0));
+        if (weight_g) {
+          /* weight backward */
+          auto pre_proj = batch_proj.Slice(pre_h_start, pre_h_end);
+          math::matmul<DeviceContext, T>(device_ctx, pre_proj, true, gate_g,
+                                         false, static_cast<T>(1.0), weight_g,
+                                         static_cast<T>(1.0));
+        }
+      } else {
+        if (h0 && weight_g) {
+          ReorderInitState<DeviceContext, T>(device_ctx, *h0, order,
+                                             &ordered_h0, true);
+          if (weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, *ordered_proj0, true,
+                                           gate_g, false, static_cast<T>(1.0),
+                                           weight_g, static_cast<T>(1.0));
+          }
+        }
+        if (h0 && (h0_g || proj_weight_g)) {
+          ordered_h0_g.mutable_data<T>(h0_g->dims(), ctx.GetPlace());
+          Tensor proj0_g;
+          proj0_g.Resize({in_dims[0], proj_weight->dims()[1]});
+          proj0_g.mutable_data<T>(ctx.GetPlace());
+          math::matmul<DeviceContext, T>(device_ctx, gate_g, false, *weight,
+                                         true, static_cast<T>(1.0), &proj0_g,
+                                         static_cast<T>(0.0));
+          if (proj_act != math::detail::ActivationType::kIdentity) {
+            auto proj0_dev = EigenMatrix<T>::From(*ordered_proj0);
+            auto proj0_g_dev = EigenMatrix<T>::From(proj0_g);
+            ActGradCompute(cell_act, place, proj0_dev, proj0_dev, proj0_g_dev,
+                           proj0_g_dev);
+          }
+          if (h0_g) {
+            math::matmul<DeviceContext, T>(
+                device_ctx, proj0_g, false, *proj_weight, true,
+                static_cast<T>(1.0), &ordered_h0_g, static_cast<T>(0.0));
+          }
+          if (proj_weight_g) {
+            math::matmul<DeviceContext, T>(device_ctx, ordered_h0, true,
+                                           proj0_g, false, static_cast<T>(1.0),
+                                           proj_weight_g, static_cast<T>(1.0));
+          }
+        }
+      }
+    }
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    if (in_g) {
+      /* backward data */
+      in_g->mutable_data<T>(ctx.GetPlace());
+      to_seq(device_ctx, batch_gate_g, *in_g);
+    }
+    if (bias && bias_g) {
+      /* backward bias */
+      Tensor b_g = *bias_g;
+      b_g.Resize({bias_g->numel(), 1});
+      Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size);
+      math::ColwiseSum<DeviceContext, T> col_sum;
+      col_sum(device_ctx, batch_gate_g, &gate_bias_g);
+    }
+
+    if (h0 && h0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_h0_g, order, h0_g,
+                                         false);
+    }
+    if (c0 && c0_g) {
+      ReorderInitState<DeviceContext, T>(device_ctx, ordered_c0_g, order, c0_g,
+                                         false);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
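
Both kernels above lean on `ReorderInitState`, which shuffles the rows of H0/C0 into the length-sorted order produced by `LoDTensor2BatchFunctor` (`indexed_src = true`) and scatters gradient rows back into the original order (`indexed_src = false`). A plain-CPU sketch of that gather/scatter pair, illustrative only with hypothetical data:

```cpp
// Conceptual gather/scatter behind ReorderInitState.
#include <cstddef>
#include <cstdio>

void reorder_rows(const float* src, const size_t* order, float* dst, int rows,
                  int width, bool indexed_src) {
  for (int r = 0; r < rows; ++r)
    for (int c = 0; c < width; ++c) {
      if (indexed_src)
        dst[r * width + c] = src[order[r] * width + c];  // gather into order
      else
        dst[order[r] * width + c] = src[r * width + c];  // scatter back
    }
}

int main() {
  const int rows = 3, width = 2;
  float h0[rows * width] = {1, 1, 2, 2, 3, 3};
  size_t order[rows] = {2, 0, 1};  // longest sequence first, hypothetically
  float ordered[rows * width], restored[rows * width];
  reorder_rows(h0, order, ordered, rows, width, true);    // forward reorder
  reorder_rows(ordered, order, restored, rows, width, false);  // inverse
  std::printf("restored[0]=%g (expect 1)\n", restored[0]);
  return 0;
}
```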
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
new file mode 100644
index 0000000000..e0df307774
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/margin_rank_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MarginRankLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto x1_dims = ctx->GetInputDim("X1");
+    auto x2_dims = ctx->GetInputDim("X2");
+    PADDLE_ENFORCE(
+        (label_dims == x1_dims) && (x1_dims == x2_dims) &&
+            (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensor with shape [batch_size x 1].");
+    ctx->SetOutputDim("Activated", label_dims);
+    ctx->SetOutputDim("Out", label_dims);
+  }
+};
+
+template <typename T>
+class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MarginRankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X1",
+             "(2-D tensor with shape [batch_size x 1]) The score for "
+             "one item X1 to be ranked, from pairwise ranking model.");
+    AddInput("X2",
+             "(2-D tensor with shape [batch_size x 1]) The score for "
+             "another item X2 to be ranked, from pairwise ranking model.");
+    AddInput("Label",
+             "(2-D tensor with shape [batch_size x 1]) "
+             "The label indicating X1 ranked higher than X2 or not, "
+             "can only be +1 or -1.");
+    AddOutput("Activated",
+              "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
+              "to indicate whether each element of Output(Out) is activated.")
+        .AsIntermediate();
+    AddOutput("Out",
+              "(2-D tensor with shape [batch_size x 1]) "
+              "The output loss of MarginRankLoss operator.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
+    AddComment(R"DOC(
+MarginRankLoss Operator.
+
+This operator measures the loss given a pair of training samples
+{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1` 
+indicates that `X1` is ranked higher than `X2`, and `Label = -1` otherwise. 
+The loss is calculated as:
+
+$loss(X1, X2, Label) = \max(0, -Label * (X1 - X2) + margin)$
+
+The attribute `margin` here helps make the predictions more robust.
+Denote the item ranked higher as the positive sample and the other as the 
+negative sample. If the scores of the two samples satisfy 
+
+$positive\_sample - negative\_sample < margin$
+
+the pair of samples will contribute to the final loss, which will backpropagate 
+and train the ranking model to enlarge the difference between the two scores.
+
+For batch input with size `batch_size`, `X1`, `X2` and `Label`
+all have the same shape [batch_size x 1].
+
+)DOC");
+  }
+};
+
+class MarginRankLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Activated"),
+                   "Intermediate(Activated) shouldn't be null.");
+    auto dims = ctx->GetInputDim("Label");
+    ctx->SetOutputDim(framework::GradVarName("X1"), dims);
+    ctx->SetOutputDim(framework::GradVarName("X2"), dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
+            ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
+            ops::MarginRankLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    margin_rank_loss,
+    ops::MarginRankLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    margin_rank_loss_grad,
+    ops::MarginRankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
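For reference, a minimal standalone sketch of the math these kernels implement, on plain arrays rather than Paddle tensors (the values are made up for illustration and nothing here is Paddle API):

```cpp
// Sketch of the MarginRankLoss forward and backward math:
// out = max(0, -label * (x1 - x2) + margin), with the "Activated" mask
// reused as the subgradient indicator in the backward pass.
#include <algorithm>
#include <cstdio>

int main() {
  const int batch_size = 3;
  const float margin = 0.1f;
  float x1[batch_size] = {0.9f, 0.2f, 0.5f};
  float x2[batch_size] = {0.5f, 0.4f, 0.5f};
  float label[batch_size] = {1.f, 1.f, -1.f};  // +1: X1 ranked higher than X2

  for (int i = 0; i < batch_size; ++i) {
    float out = std::max(0.f, -label[i] * (x1[i] - x2[i]) + margin);
    float act = out > 0.f ? 1.f : 0.f;  // the "Activated" mask
    // Backward pass with upstream gradient d_out = 1:
    float d_x1 = -1.f * act * label[i];
    float d_x2 = 1.f * act * label[i];
    std::printf("i=%d loss=%.3f d_x1=%.1f d_x2=%.1f\n", i, out, d_x1, d_x2);
  }
  return 0;
}
```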
diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu
new file mode 100644
index 0000000000..798c3ed182
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/margin_rank_loss_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    margin_rank_loss,
+    ops::MarginRankLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    margin_rank_loss_grad,
+    ops::MarginRankLossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h
new file mode 100644
index 0000000000..7438e881e1
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct ReLU {
+  HOSTDEVICE T operator()(const T& val) const {
+    return val > 0 ? val : static_cast<T>(0);
+  }
+};
+
+template <typename T>
+struct Heaviside {
+  HOSTDEVICE T operator()(const T& val) const {
+    return static_cast<T>(val > 0 ? 1 : 0);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MarginRankLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
+    auto* act_t = ctx.Output<framework::Tensor>("Activated");
+
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+    auto* x1_t = ctx.Input<framework::Tensor>("X1");
+    auto* x2_t = ctx.Input<framework::Tensor>("X2");
+
+    out_t->mutable_data<T>(ctx.GetPlace());
+    act_t->mutable_data<T>(ctx.GetPlace());
+
+    auto margin = static_cast<T>(ctx.Attr<T>("margin"));
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto act = framework::EigenVector<T>::Flatten(*act_t);
+
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto x1 = framework::EigenVector<T>::Flatten(*x1_t);
+    auto x2 = framework::EigenVector<T>::Flatten(*x2_t);
+
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU<T>());
+    act.device(dev) = out.unaryExpr(Heaviside<T>());
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MarginRankLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_x1_t =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X1"));
+    auto* d_x2_t =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X2"));
+
+    auto* act_t = ctx.Input<framework::Tensor>("Activated");
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto act = framework::EigenVector<T>::Flatten(*act_t);
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    // compute d_x1
+    if (d_x1_t) {
+      d_x1_t->mutable_data<T>(ctx.GetPlace());
+      auto d_x1 = framework::EigenVector<T>::Flatten(*d_x1_t);
+      d_x1.device(dev) = -d_out * act * label;
+    }
+    // compute d_x2
+    if (d_x2_t) {
+      d_x2_t->mutable_data<T>(ctx.GetPlace());
+      auto d_x2 = framework::EigenVector<T>::Flatten(*d_x2_t);
+      d_x2.device(dev) = d_out * act * label;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
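A quick finite-difference sanity check of the analytic gradient used above (d loss / d x1 = -label wherever the loss is active, zero elsewhere); this is a standalone sketch, not part of the operator:

```cpp
// Verifies numerically that d/dx1 max(0, -label*(x1-x2) + margin)
// equals -label * 1{loss > 0}, matching MarginRankLossGradKernel.
#include <algorithm>
#include <cmath>
#include <cstdio>

static float loss(float x1, float x2, float label, float margin) {
  return std::max(0.f, -label * (x1 - x2) + margin);
}

int main() {
  const float x1 = 0.2f, x2 = 0.4f, label = 1.f, margin = 0.1f, eps = 1e-3f;
  float numeric = (loss(x1 + eps, x2, label, margin) -
                   loss(x1 - eps, x2, label, margin)) / (2 * eps);
  float act = loss(x1, x2, label, margin) > 0.f ? 1.f : 0.f;
  float analytic = -label * act;
  std::printf("numeric=%.4f analytic=%.4f\n", numeric, analytic);
  return 0;
}
```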
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
new file mode 100644
index 0000000000..28c5aec199
--- /dev/null
+++ b/paddle/operators/math/CMakeLists.txt
@@ -0,0 +1,45 @@
+add_subdirectory(detail)
+
+if(WITH_GPU)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
+    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
+    nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
+    nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
+    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
+    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
+    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
+    nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
+    nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
+    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
+    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
+    nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
+else()
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
+    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
+    cc_library(softmax SRCS softmax.cc DEPS device_context)
+    cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
+    cc_library(pooling SRCS pooling.cc DEPS device_context)
+    cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
+    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
+    cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
+    cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
+    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
+    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
+    cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
+endif()
+
+cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
+cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
+cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc
new file mode 100644
index 0000000000..980dd90df8
--- /dev/null
+++ b/paddle/operators/math/context_project.cc
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/context_project.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class ContextProjectFunctor<platform::CPUDeviceContext, float>;
+template class ContextProjectFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu
new file mode 100644
index 0000000000..934e3df645
--- /dev/null
+++ b/paddle/operators/math/context_project.cu
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/math/context_project.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class ContextProjectFunctor<platform::CUDADeviceContext, float>;
+template class ContextProjectFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h
new file mode 100644
index 0000000000..218de9fb95
--- /dev/null
+++ b/paddle/operators/math/context_project.h
@@ -0,0 +1,306 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+/*
+ * \brief Context projection concatenates features from adjacent time-steps
+ * in a sequence. The i-th row of the output is the concatenation of
+ * context_length rows of the input, taken consecutively starting from the
+ * (i + shift_start)-th row.
+ * ContextProjectGradFunctor is the inverse process of ContextProjectFunctor.
+ *
+ * \param in            Input data.
+ * \param Shape         The shape of Input data:
+ *                        [mini-batch, input_hidden_size].
+ *
+ * \param padding_data  Padding data.
+ * \param Shape         The shape of Padding data:
+ *                        [up_pad + down_pad, input_hidden_size].
+ *
+ * \param col           Col data.
+ * \param Shape         The shape of Col data:
+ *                        [mini-batch, context_length * input_hidden_size].
+ *
+ * For a mini-batch of 2 variable-length sentences, containing 3 and 1
+ * time-steps respectively:
+ *
+ * Assume the input (X) is a [4, M, N] float LoDTensor with
+ * X->lod()[0] = [0, 3, 4].
+ * For simplicity, we further assume M=1 and N=2.
+ *
+ * X = [[a1, a2;
+ *       b1, b2;
+ *       c1, c2]
+ *      [d1, d2]]
+ *
+ * That is, the input (X) has 4 words, and each word is represented by a
+ * 2-dimensional vector.
+ *
+ * - Case1:
+ *   If context_start is -1, context_length is 3, and padding_trainable is
+ *   false, zeros are used for padding instead of learned weights, and the
+ *   output (Out) is:
+ *
+ *   Out = [[0,  0,  a1, a2, b1, b2;
+ *           a1, a2, b1, b2, c1, c2;
+ *           b1, b2, c1, c2, 0,  0 ]
+ *          [0,  0,  d1, d2, 0,  0 ]]
+ *
+ * - Case2:
+ *   If context_start is -1, context_length is 3, and padding_trainable is
+ *   true, learned weights are used for padding, and the output (Out) is:
+ *
+ *   Out = [[w1, w2, a1, a2, b1, b2;
+ *           a1, a2, b1, b2, c1, c2;
+ *           b1, b2, c1, c2, w3, w4]
+ *          [w1, w2, d1, d2, w3, w4]]
+ *
+ */
+
+template <typename DeviceContext, typename T>
+class ContextProjectFunctor {
+ public:
+  void operator()(const DeviceContext& context, const LoDTensor& in,
+                  const Tensor& padding_data, bool padding_trainable,
+                  const int context_start, const int context_length,
+                  const int context_stride, const int up_pad,
+                  const int down_pad, Tensor* col) {
+    auto lod_level_0 = in.lod()[0];
+
+    math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> im2col_ocf;
+
+    std::vector<int> dilation({1, 1});
+    std::vector<int> padding({up_pad, 0, down_pad, 0});
+    std::vector<int> stride({context_stride, 1});
+
+    int input_row_begin, input_row_end;
+    int sequence_height, sequence_width;
+    sequence_width = in.dims()[1];
+
+    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+      input_row_begin = (context_start > 0)
+                            ? static_cast<int>(lod_level_0[i]) + context_start
+                            : static_cast<int>(lod_level_0[i]);
+      input_row_end = static_cast<int>(lod_level_0[i + 1]);
+
+      Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                static_cast<int>(lod_level_0[i + 1]));
+
+      sequence_height = static_cast<int>(out_t.dims()[0]);
+
+      if (input_row_begin < input_row_end) {
+        Tensor in_t = in.Slice(input_row_begin, input_row_end);
+
+        std::vector<int64_t> output_shape(
+            {sequence_height, 1, 1, context_length,
+             sequence_width});  // output_height, output_width,
+        // input_channels, filter_height, filter_width
+        out_t.Resize(framework::make_ddim(output_shape));
+
+        std::vector<int64_t> input_shape(
+            {1, input_row_end - input_row_begin,
+             sequence_width});  // input_channels, input_height, input_width
+        in_t.Resize(framework::make_ddim(input_shape));
+        im2col_ocf(context, in_t, dilation, stride, padding, &out_t);
+        out_t.Resize({sequence_height, context_length * sequence_width});
+      }
+    }
+    if (padding_trainable) {
+      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                  static_cast<int>(lod_level_0[i + 1]));
+
+        sequence_height = static_cast<int>(out_t.dims()[0]);
+
+        // add up trainable data
+        out_t.Resize({sequence_height * context_length, sequence_width});
+
+        if (up_pad > 0) {  // add up pad
+          int padding_rows = std::min(
+              up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
+
+          for (int k = 0; k < padding_rows; ++k) {
+            int padding_size =
+                k + context_length < up_pad ? context_length : up_pad - k;
+            Tensor out_t_sub = out_t.Slice(k * context_length,
+                                           k * context_length + padding_size);
+            Tensor w_sub = padding_data.Slice(k, k + padding_size);
+            framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
+          }
+        }
+        if (down_pad > 0) {  // add down pad
+          int down_pad_begin_row =
+              std::max(0,
+                       (sequence_height - context_start - context_length) + 1) +
+              1;
+          int padding_begin = std::max(0, context_start - sequence_height);
+          int padding_size =
+              sequence_height - context_start >= context_length
+                  ? 1
+                  : context_length - (sequence_height - context_start);
+          if (context_start >= sequence_height) padding_size = context_length;
+          int padding_idx = padding_begin;
+          for (int t = 0; t + down_pad_begin_row <= sequence_height;
+               ++t, ++padding_size) {
+            if (context_start >= sequence_height) padding_size = context_length;
+            if (padding_size > context_length) {
+              padding_size = context_length;
+              padding_idx++;
+            }
+            if (padding_begin > 0 || sequence_height == context_start)
+              padding_idx = padding_begin + t;
+
+            Tensor out_t_sub = out_t.Slice(
+                (down_pad_begin_row + t) * context_length - padding_size,
+                (down_pad_begin_row + t) * context_length);
+            Tensor w_sub = padding_data.Slice(
+                up_pad + padding_idx, up_pad + padding_idx + padding_size);
+            framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
+          }
+        }
+        out_t.Resize({sequence_height, context_length * sequence_width});
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ContextProjectGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const LoDTensor& in,
+                  bool padding_trainable, const int context_start,
+                  const int context_length, const int context_stride,
+                  const int up_pad, const int down_pad, bool pad_grad,
+                  bool input_grad, Tensor* padding_data, Tensor* col) {
+    auto lod_level_0 = in.lod()[0];
+
+    math::Col2ImFunctor<math::ColFormat::kOCF, DeviceContext, T> col2im_ocf;
+
+    std::vector<int> dilation({1, 1});
+    std::vector<int> padding({up_pad, 0, down_pad, 0});
+    std::vector<int> stride({context_stride, 1});
+
+    int input_row_begin, input_row_end;
+    int sequence_height, sequence_width;
+    sequence_width = in.dims()[1];
+
+    if (input_grad) {
+      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+        input_row_begin = (context_start > 0)
+                              ? static_cast<int>(lod_level_0[i]) + context_start
+                              : static_cast<int>(lod_level_0[i]);
+        input_row_end = static_cast<int>(lod_level_0[i + 1]);
+
+        Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                  static_cast<int>(lod_level_0[i + 1]));
+
+        sequence_height = static_cast<int>(out_t.dims()[0]);
+
+        if (input_row_begin < input_row_end) {
+          Tensor in_t = in.Slice(input_row_begin, input_row_end);
+
+          std::vector<int64_t> output_shape(
+              {sequence_height, 1, 1, context_length,
+               sequence_width});  // output_height, output_width,
+          // input_channels, filter_height, filter_width
+          out_t.Resize(framework::make_ddim(output_shape));
+
+          std::vector<int64_t> input_shape(
+              {1, input_row_end - input_row_begin,
+               sequence_width});  // input_channels, input_height, input_width
+          in_t.Resize(framework::make_ddim(input_shape));
+
+          col2im_ocf(context, out_t, dilation, stride, padding, &in_t);
+          out_t.Resize({sequence_height, context_length * sequence_width});
+        }
+      }
+    }
+    if (pad_grad) {
+      if (padding_trainable) {
+        for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+          Tensor out_t = col->Slice(static_cast<int>(lod_level_0[i]),
+                                    static_cast<int>(lod_level_0[i + 1]));
+
+          sequence_height = static_cast<int>(out_t.dims()[0]);
+          out_t.Resize({sequence_height * context_length, sequence_width});
+
+          if (up_pad > 0) {
+            int padding_rows = std::min(
+                up_pad, static_cast<int>(lod_level_0[i + 1] - lod_level_0[i]));
+
+            for (int k = 0; k < padding_rows; ++k) {
+              int padding_size =
+                  k + context_length < up_pad ? context_length : up_pad - k;
+              Tensor out_t_sub = out_t.Slice(k * context_length,
+                                             k * context_length + padding_size);
+              Tensor w_sub = padding_data->Slice(k, k + padding_size);
+              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
+                                     out_t_sub.data<T>(), w_sub.data<T>());
+            }
+          }
+          if (down_pad > 0) {
+            int down_pad_begin_row =
+                std::max(
+                    0, (sequence_height - context_start - context_length) + 1) +
+                1;
+            int padding_begin = std::max(0, context_start - sequence_height);
+            int padding_size =
+                sequence_height - context_start >= context_length
+                    ? 1
+                    : context_length - (sequence_height - context_start);
+            if (context_start >= sequence_height) padding_size = context_length;
+            int padding_idx = padding_begin;
+            for (int t = 0; t + down_pad_begin_row <= sequence_height;
+                 ++t, ++padding_size) {
+              if (context_start >= sequence_height)
+                padding_size = context_length;
+              if (padding_size > context_length) {
+                padding_size = context_length;
+                padding_idx++;
+              }
+              if (padding_begin > 0 || sequence_height == context_start)
+                padding_idx = padding_begin + t;
+
+              Tensor out_t_sub = out_t.Slice(
+                  (down_pad_begin_row + t) * context_length - padding_size,
+                  (down_pad_begin_row + t) * context_length);
+              Tensor w_sub = padding_data->Slice(
+                  up_pad + padding_idx, up_pad + padding_idx + padding_size);
+              axpy<DeviceContext, T>(context, w_sub.numel(), static_cast<T>(1),
+                                     out_t_sub.data<T>(), w_sub.data<T>());
+            }
+          }
+          out_t.Resize({sequence_height, context_length * sequence_width});
+        }
+      }
+    }
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
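The Case1 output from the comment above can be reproduced with a few lines of plain C++, which may help when reasoning about the im2col-based implementation; this is an illustrative sketch only (the real functor works on LoDTensor slices):

```cpp
// Reproduces Case1 of the context_project.h comment: zero padding,
// context_start = -1, context_length = 3, word dimension N = 2,
// one sequence [a; b; c]. Out-of-range rows are padded with zeros.
#include <cstdio>
#include <vector>

int main() {
  const int context_start = -1, context_length = 3, N = 2;
  std::vector<std::vector<float>> X = {{1, 2}, {3, 4}, {5, 6}};  // a, b, c
  const int steps = static_cast<int>(X.size());

  for (int i = 0; i < steps; ++i) {
    std::printf("row %d:", i);
    for (int c = 0; c < context_length; ++c) {
      int t = i + context_start + c;
      for (int n = 0; n < N; ++n) {
        float v = (t >= 0 && t < steps) ? X[t][n] : 0.f;  // zero padding
        std::printf(" %g", v);
      }
    }
    std::printf("\n");  // row 0 prints "0 0 1 2 3 4", i.e. [0, 0, a1, a2, b1, b2]
  }
  return 0;
}
```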
diff --git a/paddle/operators/math/cos_sim_functor.cc b/paddle/operators/math/cos_sim_functor.cc
new file mode 100644
index 0000000000..6af9f0fcd9
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.cc
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    for (size_t row_id = 0; row_id < rows; ++row_id) {
+      auto xy_norm_prod = x_norm[row_id] * y_norm[0];
+      auto dz_data = dz[row_id];
+      auto z_data = z[row_id];
+      auto* x_data = x + cols * row_id;
+      auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+
+      auto y_norm_square = y_norm[0] * y_norm[0];
+      auto reciprocal_y_norm_square = 1 / y_norm_square;
+      for (size_t i = 0; i < cols; ++i) {
+        dy[i] += dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                            z_data * y[i] * reciprocal_y_norm_square);
+      }
+    }
+  }
+};
+
+template struct CosSimDyFunctor<platform::CPUDeviceContext, float>;
+template struct CosSimDyFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.cu b/paddle/operators/math/cos_sim_functor.cu
new file mode 100644
index 0000000000..6eb0a4ea4c
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cos_sim_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+__global__ void CosSimDyKernel(const T* x_norm, const T* y_norm, const T* x,
+                               const T* y, const T* z, const T* dz,
+                               const size_t rows, const size_t cols, T* dy) {
+  int grid_size = blockDim.x * gridDim.x;
+  T y_norm_data = y_norm[0];
+  for (int row_id = blockIdx.x * blockDim.x + threadIdx.x; row_id < rows;
+       row_id += grid_size) {
+    T xy_norm_prod = x_norm[row_id] * y_norm_data;
+    T dz_data = dz[row_id];
+    T z_data = z[row_id];
+    const T* x_data = x + cols * row_id;
+    T reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+
+    T y_norm_square = y_norm_data * y_norm_data;
+    T reciprocal_y_norm_square = 1 / y_norm_square;
+    for (size_t i = 0; i < cols; ++i) {
+      T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod -
+                             z_data * y[i] * reciprocal_y_norm_square);
+      platform::CudaAtomicAdd(dy + i, dy_data);
+    }
+  }
+}
+
+template <typename T>
+struct CosSimDyFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& ctx, const T* x_norm,
+                  const T* y_norm, const T* x, const T* y, const T* z,
+                  const T* dz, const size_t rows, const size_t cols,
+                  T* dy) const {
+    const int block_size = 512;
+    dim3 threads(block_size, 1);
+    // The kernel strides over rows using blockIdx.x and gridDim.x, so the
+    // block count must go in the x-dimension; placing it in y would launch
+    // duplicate blocks that repeat the same atomic updates on dy.
+    dim3 grid((rows + block_size - 1) / block_size, 1);
+    CosSimDyKernel<T><<<grid, threads, 0, ctx.stream()>>>(
+        x_norm, y_norm, x, y, z, dz, rows, cols, dy);
+  }
+};
+
+template struct CosSimDyFunctor<platform::CUDADeviceContext, float>;
+template struct CosSimDyFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cos_sim_functor.h b/paddle/operators/math/cos_sim_functor.h
new file mode 100644
index 0000000000..aae8ab5b7a
--- /dev/null
+++ b/paddle/operators/math/cos_sim_functor.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <stdlib.h>
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, bool same_row>
+struct CosSimFunctor {
+  CosSimFunctor(const T* x, const T* y, T* x_norm, T* y_norm, T* z, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    auto* x = x_ + cols_ * row_id;
+    T xx = 0, xy = 0, yy = 0;
+    if (same_row) {
+      auto* y = y_ + cols_ * row_id;
+      T tep_x, tep_y;
+      for (size_t i = 0; i < cols_; ++i) {
+        tep_x = x[i];
+        tep_y = y[i];
+        xx += tep_x * tep_x;
+        yy += tep_y * tep_y;
+        xy += tep_x * tep_y;
+      }
+      xx = sqrt(xx);
+      yy = sqrt(yy);
+      y_norm_[row_id] = yy;
+      x_norm_[row_id] = xx;
+      z_[row_id] = xy / (xx * yy);
+    } else {  // This could be written in a better way.
+      T tep_x, tep_y;
+      for (size_t i = 0; i < cols_; ++i) {
+        tep_x = x[i];
+        tep_y = y_[i];
+        xx += tep_x * tep_x;
+        yy += tep_y * tep_y;
+        xy += tep_x * tep_y;
+      }
+      xx = sqrt(xx);
+      yy = sqrt(yy);
+      if (row_id == 0) y_norm_[0] = yy;
+      x_norm_[row_id] = xx;
+      z_[row_id] = xy / (xx * yy);
+    }
+  }
+
+  T* x_norm_;
+  T* y_norm_;
+  const T* x_;
+  const T* y_;
+  T* z_;
+  const size_t cols_;
+};
+
+template <typename T>
+struct CosSimGradFunctor {
+  CosSimGradFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                    const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
+    auto xy_norm_prod = x_norm_[row_id] * y_norm_[row_id];
+    auto dz = dz_[row_id];
+    auto z = z_[row_id];
+
+    auto* dx = dx_ + cols_ * row_id;
+    auto* x = x_ + cols_ * row_id;
+    auto* y = y_ + cols_ * row_id;
+
+    auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+    auto reciprocal_x_norm_square = 1 / x_norm_square;
+    for (size_t i = 0; i < cols_; ++i) {
+      dx[i] = dz * (y[i] * reciprocal_xy_norm_prod -
+                    z * x[i] * reciprocal_x_norm_square);
+    }
+  }
+
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+template <typename T>
+struct CosSimDxFunctor {
+  CosSimDxFunctor(const T* x_norm, const T* y_norm, const T* x, const T* y,
+                  const T* z, const T* dz, T* dx, int cols)
+      : x_norm_(x_norm),
+        y_norm_(y_norm),
+        x_(x),
+        y_(y),
+        z_(z),
+        dz_(dz),
+        dx_(dx),
+        cols_(static_cast<size_t>(cols)) {}
+
+  inline HOSTDEVICE void operator()(size_t row_id) const {
+    auto xy_norm_prod = x_norm_[row_id] * y_norm_[0];
+    auto dz = dz_[row_id];
+    auto z = z_[row_id];
+    auto* x = x_ + cols_ * row_id;
+    auto reciprocal_xy_norm_prod = 1 / xy_norm_prod;
+    auto x_norm_square = x_norm_[row_id] * x_norm_[row_id];
+    auto* dx = dx_ + cols_ * row_id;
+    auto reciprocal_x_norm_square = 1 / x_norm_square;
+
+    for (size_t i = 0; i < cols_; ++i) {
+      dx[i] = dz * (y_[i] * reciprocal_xy_norm_prod -
+                    z * x[i] * reciprocal_x_norm_square);
+    }
+  }
+  const T* x_norm_;
+  const T* y_norm_;
+  const T* x_;
+  const T* y_;
+  const T* z_;
+  const T* dz_;
+  T* dx_;
+  const size_t cols_;
+};
+
+template <typename DeviceContext, typename T>
+struct CosSimDyFunctor {
+  void operator()(const DeviceContext& ctx, const T* x_norm, const T* y_norm,
+                  const T* x, const T* y, const T* z, const T* dz,
+                  const size_t rows, const size_t cols, T* dy) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
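For reference, the scalar math behind these functors, checked on a single row with plain arrays; the variable names mirror the functor members, but nothing here is Paddle API:

```cpp
// Cosine similarity and its gradient w.r.t. x, matching CosSimGradFunctor:
// z = x.y / (|x||y|), dx[i] = dz * (y[i]/(|x||y|) - z*x[i]/|x|^2).
#include <cmath>
#include <cstdio>

int main() {
  const int cols = 3;
  double x[cols] = {1, 2, 3}, y[cols] = {4, 5, 6};
  double xx = 0, yy = 0, xy = 0;
  for (int i = 0; i < cols; ++i) {
    xx += x[i] * x[i];
    yy += y[i] * y[i];
    xy += x[i] * y[i];
  }
  double x_norm = std::sqrt(xx), y_norm = std::sqrt(yy);
  double z = xy / (x_norm * y_norm);

  double dz = 1.0;  // upstream gradient
  for (int i = 0; i < cols; ++i) {
    double dx = dz * (y[i] / (x_norm * y_norm) - z * x[i] / (x_norm * x_norm));
    std::printf("z=%.4f dx[%d]=%.4f\n", z, i, dx);
  }
  return 0;
}
```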
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc
new file mode 100644
index 0000000000..d9cb016fb4
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cross_entropy.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
+                  const framework::Tensor* prob,
+                  const framework::Tensor* labels, const bool softLabel) {
+    const int batch_size = prob->dims()[0];
+    if (softLabel) {
+      auto in = EigenMatrix<T>::From(*prob);
+      auto lbl = EigenMatrix<T>::From(*labels);
+      auto loss = EigenMatrix<T>::From(*out);
+
+      loss.device(*ctx.eigen_device()) =
+          -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
+                .sum(Eigen::DSizes<int, 1>(1))
+                .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
+    } else {
+      const int class_num = prob->dims()[1];
+      const T* prob_data = prob->data<T>();
+      T* loss_data = out->data<T>();
+
+      const int64_t* label_data = labels->data<int64_t>();
+      for (int i = 0; i < batch_size; ++i) {
+        int index = i * class_num + label_data[i];
+        loss_data[i] = -math::TolerableValue<T>()(std::log(prob_data[index]));
+      }
+    }
+  }
+};
+
+template class CrossEntropyFunctor<platform::CPUDeviceContext, float>;
+template class CrossEntropyFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu
new file mode 100644
index 0000000000..16c9e7b28e
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.cu
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/cross_entropy.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+namespace {
+template <typename T>
+__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
+                                   const int N, const int D) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
+       i += blockDim.x * gridDim.x) {
+    PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
+    Y[i] = -math::TolerableValue<T>()(log(X[i * D + label[i]]));
+  }
+}
+
+template <typename T>
+__device__ __forceinline__ T sum_single_warp(T val) {
+  val += __shfl_down(val, 16);
+  val += __shfl_down(val, 8);
+  val += __shfl_down(val, 4);
+  val += __shfl_down(val, 2);
+  val += __shfl_down(val, 1);
+  return val;
+}
+
+// CUDA does not support dynamically sized shared-memory arrays in templates
+// https://stackoverflow.com/questions/20497209
+template <typename T>
+struct SharedMemory {
+  // Ensure that we won't compile any un-specialized types
+  __device__ T* GetPointer() { return NULL; }
+};
+
+template <>
+struct SharedMemory<float> {
+  __device__ float* GetPointer() {
+    extern __shared__ float s_float[];
+    return s_float;
+  }
+};
+
+template <>
+struct SharedMemory<double> {
+  __device__ double* GetPointer() {
+    extern __shared__ double s_double[];
+    return s_double;
+  }
+};
+
+template <typename T>
+__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
+                                       const int class_num) {
+  int tid = threadIdx.x;
+  SharedMemory<T> d_sum_shared;
+  T* d_sum = d_sum_shared.GetPointer();
+  d_sum[tid] = 0;
+
+  int cur_idx = tid;
+  int next_idx = blockIdx.x * class_num + tid;
+  while (cur_idx < class_num) {
+    d_sum[tid] +=
+        math::TolerableValue<T>()(std::log(X[next_idx])) * label[next_idx];
+    next_idx += blockDim.x;
+    cur_idx += blockDim.x;
+  }
+  __syncthreads();
+
+  for (unsigned int stride = blockDim.x >> 1; stride >= 32; stride >>= 1) {
+    if (tid < stride) d_sum[tid] += d_sum[tid + stride];
+    __syncthreads();
+  }
+
+  T val = d_sum[tid];
+  val = sum_single_warp<T>(val);
+  if (tid == 0) Y[blockIdx.x] = -val;
+}
+}  // namespace
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& ctx,
+                  framework::Tensor* out, const framework::Tensor* prob,
+                  const framework::Tensor* labels, bool softLabel) {
+    const T* prob_data = prob->data<T>();
+    T* loss_data = out->mutable_data<T>(ctx.GetPlace());
+
+    int batch_size = prob->dims()[0];
+    int class_num = prob->dims()[1];
+
+    if (softLabel) {
+      const T* label_data = labels->data<T>();
+      int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
+
+      SoftCrossEntropyKernel<T><<<
+          batch_size, block, block * sizeof(T),
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+          loss_data, prob_data, label_data, class_num);
+    } else {
+      const int64_t* label_data = labels->data<int64_t>();
+      int block = 512;
+      int grid = (batch_size + block - 1) / block;
+      CrossEntropyKernel<T><<<grid, block, 0, ctx.stream()>>>(
+          loss_data, prob_data, label_data, batch_size, class_num);
+    }
+  }
+};
+
+template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
+template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
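The soft-label launch above rounds the block size down to a power of two (capped at 512) so the shared-memory tree reduction halves cleanly; a small host-side sketch of that choice:

```cpp
// Sketch of the block-size selection for SoftCrossEntropyKernel: the
// largest power of two <= class_num, capped at 512, so that the
// shared-memory reduction (stride >>= 1) stays power-of-two aligned.
#include <cmath>
#include <cstdio>

int main() {
  const int tests[] = {3, 10, 100, 600, 4096};
  for (int class_num : tests) {
    int block = class_num > 512
                    ? 512
                    : static_cast<int>(
                          std::pow(2, static_cast<int>(std::log2(class_num))));
    std::printf("class_num=%4d -> block=%d\n", class_num, block);
  }
  return 0;
}
```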
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
new file mode 100644
index 0000000000..b3b6d767a8
--- /dev/null
+++ b/paddle/operators/math/cross_entropy.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct TolerableValue {
+  HOSTDEVICE T operator()(const T& x) const {
+    PADDLE_ASSERT(std::is_floating_point<T>::value);
+    const T kApproInf = 1e20;
+
+    if (x == INFINITY) return kApproInf;
+    if (x == -INFINITY) return -kApproInf;
+    return x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CrossEntropyFunctor {
+ public:
+  void operator()(const DeviceContext& context, framework::Tensor* out,
+                  const framework::Tensor* prob,
+                  const framework::Tensor* labels, const bool softLabel);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
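A minimal sketch of the two label modes the functor supports, on one row of probabilities (values are made up for illustration):

```cpp
// Hard label: loss = -log(p[label]).
// Soft label: loss = -sum_j label[j] * log(p[j]).
#include <cmath>
#include <cstdio>

int main() {
  const int class_num = 3;
  double prob[class_num] = {0.2, 0.7, 0.1};

  int hard_label = 1;
  double hard_loss = -std::log(prob[hard_label]);

  double soft_label[class_num] = {0.1, 0.8, 0.1};
  double soft_loss = 0;
  for (int j = 0; j < class_num; ++j) {
    soft_loss -= soft_label[j] * std::log(prob[j]);
  }

  std::printf("hard=%.4f soft=%.4f\n", hard_loss, soft_loss);
  return 0;
}
```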
diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/operators/math/detail/CMakeLists.txt
new file mode 100644
index 0000000000..0df1c060f9
--- /dev/null
+++ b/paddle/operators/math/detail/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(activation_functions SRCS avx_functions.cc)
diff --git a/paddle/operators/math/detail/activation_functions.h b/paddle/operators/math/detail/activation_functions.h
new file mode 100644
index 0000000000..585a012343
--- /dev/null
+++ b/paddle/operators/math/detail/activation_functions.h
@@ -0,0 +1,191 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <string>
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/hostdevice.h"
+
+#ifdef __AVX__
+#include <immintrin.h>
+#endif
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
+enum ActivationType {
+  kSigmoid,
+  kReLU,
+  kTanh,
+  kIdentity,
+};
+
+inline ActivationType GetActivationType(const std::string &type) {
+  if (type == "sigmoid") {
+    return ActivationType::kSigmoid;
+  } else if (type == "relu") {
+    return ActivationType::kReLU;
+  } else if (type == "tanh") {
+    return ActivationType::kTanh;
+  } else if (type == "identity" || type == "") {
+    return ActivationType::kIdentity;
+  }
+  PADDLE_THROW("Unsupported activation type %s.", type);
+}
+
+namespace forward {
+
+template <typename T>
+DEVICE T Identity(const T a) {
+  return a;
+}
+
+template <typename T>
+DEVICE T Relu(const T a) {
+  return a > static_cast<T>(0.0) ? a : static_cast<T>(0.0);
+}
+
+template <typename T>
+DEVICE T Sigmoid(const T a) {
+  const T min = SIGMOID_THRESHOLD_MIN;
+  const T max = SIGMOID_THRESHOLD_MAX;
+  T tmp = (a < min) ? min : ((a > max) ? max : a);
+  return static_cast<T>(1.0) / (static_cast<T>(1.0) + exp(-tmp));
+}
+
+template <typename T>
+DEVICE T Tanh(const T a) {
+  T tmp = -2.0 * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+  return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+}  // namespace forward
+
+namespace backward {
+
+template <typename T>
+DEVICE T Identity(const T a, const T b) {
+  return a;
+}
+
+template <typename T>
+DEVICE T Relu(const T a, const T b) {
+  return a * (b > 0.0 ? 1.0 : 0.0);
+}
+
+template <typename T>
+DEVICE T Sigmoid(const T a, const T b) {
+  return a * b * (1.0 - b);
+}
+
+template <typename T>
+DEVICE T Tanh(const T a, const T b) {
+  return a * (1.0 - b * b);
+}
+
+}  // namespace backward
+
+template <typename T>
+struct Active {
+  typedef T (*Act)(T);
+  typedef T (*ActGrad)(T, T);
+};
+
+static DEVICE Active<float>::Act kActFloat[] = {
+    &forward::Sigmoid<float>, &forward::Relu<float>, &forward::Tanh<float>,
+    &forward::Identity<float>};
+
+static DEVICE Active<float>::ActGrad kActGradFloat[] = {
+    &backward::Sigmoid<float>, &backward::Relu<float>, &backward::Tanh<float>,
+    &backward::Identity<float>};
+
+static DEVICE Active<double>::Act kActDouble[] = {
+    &forward::Sigmoid<double>, &forward::Relu<double>, &forward::Tanh<double>,
+    &forward::Identity<double>};
+
+static DEVICE Active<double>::ActGrad kActGradDouble[] = {
+    &backward::Sigmoid<double>, &backward::Relu<double>,
+    &backward::Tanh<double>, &backward::Identity<double>};
+
+namespace forward {
+inline DEVICE float activation(float a, int index) {
+  return kActFloat[index](a);
+}
+
+inline DEVICE double activation(double a, int index) {
+  return kActDouble[index](a);
+}
+
+}  // namespace forward
+
+namespace backward {
+inline DEVICE float activation(float a, float b, int index) {
+  return kActGradFloat[index](a, b);
+}
+
+inline DEVICE double activation(double a, double b, int index) {
+  return kActGradDouble[index](a, b);
+}
+}  // namespace backward
+
+#ifdef __AVX__
+namespace forward {
+namespace avx {
+__m256 Relu(const __m256 a);
+__m256 Sigmoid(const __m256 a);
+__m256 Tanh(const __m256 a);
+__m256 Identity(const __m256 a);
+}  // namespace avx
+}  // namespace forward
+
+namespace backward {
+namespace avx {
+__m256 Relu(const __m256 a, const __m256 b);
+__m256 Sigmoid(const __m256 a, const __m256 b);
+__m256 Tanh(const __m256 a, const __m256 b);
+__m256 Identity(const __m256 a, const __m256 b);
+}  // namespace avx
+}  // namespace backward
+
+static Active<__m256>::Act kActAvx[] = {
+    &forward::avx::Sigmoid, &forward::avx::Relu, &forward::avx::Tanh,
+    &forward::avx::Identity};
+
+static Active<__m256>::ActGrad kActGradAvx[] = {
+    &backward::avx::Sigmoid, &backward::avx::Relu, &backward::avx::Tanh,
+    &backward::avx::Identity};
+
+namespace forward {
+inline __m256 activation(__m256 a, int index) { return kActAvx[index](a); }
+}  // namespace forward
+
+namespace backward {
+inline __m256 activation(__m256 a, __m256 b, int index) {
+  return kActGradAvx[index](a, b);
+}
+}  // namespace backward
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
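The backward functions above encode the usual derivative identities (e.g. sigmoid'(x) = s(1-s) with s the forward output, tanh'(x) = 1 - t*t); a standalone numeric check that mirrors only the scalar definitions:

```cpp
// Checks backward::Sigmoid(a, b) = a*b*(1-b) and backward::Tanh(a, b) =
// a*(1-b*b) against central finite differences of the forward functions.
#include <cmath>
#include <cstdio>

int main() {
  double x = 0.3, a = 1.0, eps = 1e-6;

  double s = 1.0 / (1.0 + std::exp(-x));
  double analytic = a * s * (1.0 - s);  // backward::Sigmoid(a, s)
  double numeric = (1.0 / (1.0 + std::exp(-(x + eps))) -
                    1.0 / (1.0 + std::exp(-(x - eps)))) / (2 * eps);
  std::printf("sigmoid: analytic=%.6f numeric=%.6f\n", analytic, numeric);

  double t = std::tanh(x);
  std::printf("tanh: analytic=%.6f numeric=%.6f\n", a * (1 - t * t),
              (std::tanh(x + eps) - std::tanh(x - eps)) / (2 * eps));
  return 0;
}
```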
diff --git a/paddle/operators/math/detail/avx_functions.cc b/paddle/operators/math/detail/avx_functions.cc
new file mode 100644
index 0000000000..921364788c
--- /dev/null
+++ b/paddle/operators/math/detail/avx_functions.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef __AVX__
+
+#include <immintrin.h>
+#include "paddle/operators/math/detail/activation_functions.h"
+// TODO(qingqing) refine this dependence
+#include "paddle/cuda/src/avx_mathfun.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+__m256 Exp(__m256 a) { return exp256_ps(a); }
+
+namespace forward {
+namespace avx {
+__m256 Relu(const __m256 a) {
+  __m256 tmp = _mm256_set1_ps(0.0f);
+  return _mm256_max_ps(a, tmp);
+}
+
+__m256 Sigmoid(const __m256 a) {
+  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  __m256 tmp = _mm256_max_ps(a, min);
+  tmp = _mm256_min_ps(tmp, max);
+  tmp = _mm256_sub_ps(_mm256_set1_ps(0.0f), tmp);
+  tmp = Exp(tmp);
+  tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp);
+  tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp);
+  return tmp;
+}
+
+__m256 Tanh(const __m256 a) {
+  __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
+  __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+  tmp = _mm256_min_ps(tmp, max);
+  tmp = Exp(tmp);
+  return _mm256_sub_ps(_mm256_div_ps(_mm256_set1_ps(2.0f),
+                                     _mm256_add_ps(_mm256_set1_ps(1.0f), tmp)),
+                       _mm256_set1_ps(1.0f));
+}
+
+__m256 Identity(const __m256 a) { return a; }
+
+}  // namespace avx
+}  // namespace forward
+
+namespace backward {
+namespace avx {
+__m256 Relu(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(
+      a, _mm256_and_ps(_mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS),
+                       _mm256_set1_ps(1.0f)));
+}
+
+__m256 Sigmoid(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(_mm256_mul_ps(a, b),
+                       _mm256_sub_ps(_mm256_set1_ps(1.0f), b));
+}
+
+__m256 Tanh(const __m256 a, const __m256 b) {
+  return _mm256_mul_ps(
+      a, _mm256_sub_ps(_mm256_set1_ps(1.0f), _mm256_mul_ps(b, b)));
+}
+
+__m256 Identity(const __m256 a, const __m256 b) { return a; }
+}  // namespace avx
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
+
+#endif
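A minimal usage sketch of the 8-wide ReLU primitive behind forward::avx::Relu; it calls the intrinsics directly and assumes an AVX-capable CPU and -mavx at compile time:

```cpp
// Applies ReLU to 8 floats at once with _mm256_max_ps, the same primitive
// used by forward::avx::Relu above. Illustrative only.
#include <immintrin.h>
#include <cstdio>

int main() {
  alignas(32) float in[8] = {-2, -1, -0.5f, 0, 0.5f, 1, 2, 3};
  alignas(32) float out[8];
  __m256 a = _mm256_load_ps(in);
  __m256 r = _mm256_max_ps(a, _mm256_set1_ps(0.0f));  // elementwise max(x, 0)
  _mm256_store_ps(out, r);
  for (float v : out) std::printf("%g ", v);  // prints 0 0 0 0 0.5 1 2 3
  std::printf("\n");
  return 0;
}
```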
diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h
new file mode 100644
index 0000000000..a61b232f42
--- /dev/null
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -0,0 +1,426 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+template <class OpResetOutput, typename T>
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                       T *gate_value, T *reset_output_value,
+                                       T *prev_output_value, int frame_size,
+                                       ActivationType active_gate) {
+  T r_value_update_gate;
+  T r_value_reset_gate;
+  T r_value_reset_output;
+  T r_prev_out = 0;
+  T *update_gate = gate_value;
+  T *reset_gate = gate_value + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
+    }
+
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
+
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    reset_output_value[i] = r_value_reset_output;
+  }
+}
+
+template <class OpFinalOutput, typename T>
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
+                                       T *gate_value, T *prev_output_value,
+                                       T *output_value, int frame_size,
+                                       ActivationType active_node) {
+  T r_value_update_gate;
+  T r_value_frame_state;
+  T r_prev_out = 0;
+  T r_output;
+  T *update_gate = gate_value;
+  T *frame_state = gate_value + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = prev_output_value[i];
+    }
+
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
+
+    frame_state[i] = r_value_frame_state;
+    output_value[i] = r_output;
+  }
+}
+
+template <class OpResetOutput, typename T>
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
+                                     T *gate_value, T *reset_output_value,
+                                     T *prev_output_value, int frame_size,
+                                     ActivationType active_gate) {
+#ifdef __AVX__
+  __m256 r_value_update_gate;
+  __m256 r_value_reset_gate;
+  __m256 r_value_reset_output;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_reset_gate = reset_gate[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
+    }
+
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                    r_value_reset_output, active_gate);
+
+    update_gate[i] = r_value_update_gate;
+    reset_gate[i] = r_value_reset_gate;
+    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
+  }
+#endif
+}
+
+template <class OpFinalOutput, typename T>
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
+                                     T *gate_value, T *prev_output_value,
+                                     T *output_value, int frame_size,
+                                     ActivationType active_node) {
+#ifdef __AVX__
+  __m256 r_value_update_gate;
+  __m256 r_value_frame_state;
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
+  __m256 r_output;
+  __m256 *update_gate = (__m256 *)gate_value;
+  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_update_gate = update_gate[i];
+    r_value_frame_state = frame_state[i];
+    if (prev_output_value) {
+      r_prev_out = ((__m256 *)prev_output_value)[i];
+    }
+
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                    r_output, active_node);
+
+    frame_state[i] = r_value_frame_state;
+    ((__m256 *)output_value)[i] = r_output;
+  }
+#endif
+}
+
+template <class OpResetOutput, typename T>
+inline void forward_reset_output(OpResetOutput op_reset_output,
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_reset_output(
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
+    } else {
+      hl_naive_gru_forward_reset_output(
+          op_reset_output, value.gate_value, value.reset_output_value,
+          value.prev_out_value, frame_size, active_gate);
+    }
+
+    value.gate_value += frame_size * 3;
+    value.reset_output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+  }
+}
+
+template <class OpFinalOutput, typename T>
+inline void forward_final_output(OpFinalOutput op_final_output,
+                                 GRUMetaValue<T> value, int frame_size,
+                                 int batch_size, ActivationType active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
+                                      value.prev_out_value, value.output_value,
+                                      frame_size, active_node);
+    } else {
+      hl_naive_gru_forward_final_output(
+          op_final_output, value.gate_value, value.prev_out_value,
+          value.output_value, frame_size, active_node);
+    }
+
+    value.gate_value += frame_size * 3;
+    value.output_value += frame_size;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+  }
+}
+
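+// Backward pass, state-gradient step: reads the forward update-gate and
+// frame-state activations together with the incoming output gradient, and
+// produces the gate gradients (accumulating into prev_out_grad when a
+// previous time step exists).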
+template <class OpStateGrad, typename T>
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *output_grad,
+                                      int frame_size,
+                                      ActivationType active_node) {
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_frame_state_value;
+  T r_frame_state_grad;
+  T r_out_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *frame_state_value = gate_value + frame_size * 2;
+  T *frame_state_grad = gate_grad + frame_size * 2;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = output_grad[i];
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
+    }
+
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                      T *gate_grad, T *prev_out_value,
+                                      T *prev_out_grad, T *reset_output_grad,
+                                      int frame_size,
+                                      ActivationType active_gate) {
+  T r_update_gate_value;
+  T r_update_gate_grad;
+  T r_reset_gate_value;
+  T r_reset_gate_grad;
+  T r_reset_output_grad = 0;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T *update_gate_value = gate_value;
+  T *update_gate_grad = gate_grad;
+  T *reset_gate_value = gate_value + frame_size;
+  T *reset_gate_grad = gate_grad + frame_size;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = reset_output_grad[i];
+    }
+    if (prev_out_value) {
+      r_prev_out_value = prev_out_value[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = prev_out_grad[i];
+    }
+
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      prev_out_grad[i] = r_prev_out_grad;
+    }
+  }
+}
+
+template <class OpStateGrad, typename T>
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *output_grad,
+                                    int frame_size,
+                                    ActivationType active_node) {
+#ifdef __AVX__
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_frame_state_value;
+  __m256 r_frame_state_grad;
+  __m256 r_out_grad;
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
+  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_frame_state_value = frame_state_value[i];
+    r_out_grad = ((__m256 *)output_grad)[i];
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+    }
+
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                  r_out_grad, active_node);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    frame_state_grad[i] = r_frame_state_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+    }
+  }
+#endif
+}
+
+template <class OpResetGrad, typename T>
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
+                                    T *gate_grad, T *prev_out_value,
+                                    T *prev_out_grad, T *reset_output_grad,
+                                    int frame_size,
+                                    ActivationType active_gate) {
+#ifdef __AVX__
+  __m256 r_update_gate_value;
+  __m256 r_update_gate_grad;
+  __m256 r_reset_gate_value;
+  __m256 r_reset_gate_grad;
+  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
+  __m256 *update_gate_value = (__m256 *)gate_value;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
+  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
+  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_update_gate_value = update_gate_value[i];
+    r_update_gate_grad = update_gate_grad[i];
+    r_reset_gate_value = reset_gate_value[i];
+
+    if (prev_out_value && prev_out_grad) {
+      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
+    }
+    if (prev_out_value) {
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
+    }
+    if (prev_out_grad) {
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
+    }
+
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                  r_reset_output_grad, active_gate);
+
+    update_gate_grad[i] = r_update_gate_grad;
+    reset_gate_grad[i] = r_reset_gate_grad;
+    if (prev_out_grad) {
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
+    }
+  }
+#endif
+}
+
+template <class OpStateGrad, typename T>
+inline void backward_state_grad(OpStateGrad op_state_grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
+                                int frame_size, int batch_size,
+                                ActivationType active_node) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_state_grad(
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+    } else {
+      hl_naive_gru_backward_state_grad(
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
+    }
+
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+
+    grad.gate_grad += frame_size * 3;
+    grad.output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
+    }
+  }
+}
+
+template <class OpResetGrad, typename T>
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
+                                GRUMetaValue<T> value, GRUMetaGrad<T> grad,
+                                int frame_size, int batch_size,
+                                ActivationType active_gate) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
+      hl_avx_gru_backward_reset_grad(
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
+    } else {
+      hl_naive_gru_backward_reset_grad(
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
+    }
+
+    value.gate_value += frame_size * 3;
+    if (value.prev_out_value) {
+      value.prev_out_value += frame_size;
+    }
+
+    grad.gate_grad += frame_size * 3;
+    grad.reset_output_grad += frame_size;
+    if (grad.prev_out_grad) {
+      grad.prev_out_grad += frame_size;
+    }
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h
new file mode 100644
index 0000000000..1783d46096
--- /dev/null
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -0,0 +1,201 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
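+// One thread per frame element along x; when is_batch is true, y indexes the
+// batch and the data pointers are offset to the corresponding batch row.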
+template <class OpResetOutput, bool is_batch, typename T>
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
+                                        T *gate_value, T *reset_output_value,
+                                        T *prev_output_value, int frame_size,
+                                        int batch_size,
+                                        ActivationType active_gate) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    reset_output_value += batch_idx * frame_size;
+  }
+
+  T r_prev_out = 0;
+  T r_value_reset_output;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
+  }
+
+  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
+                  r_value_reset_output, active_gate);
+
+  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
+  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
+  reset_output_value[frame_idx] = r_value_reset_output;
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpFinalOutput, bool is_batch, typename T>
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
+                                        T *gate_value, T *prev_output_value,
+                                        T *output_value, int frame_size,
+                                        int batch_size,
+                                        ActivationType active_node) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    output_value += batch_idx * frame_size;
+  }
+
+  T r_output;
+  T r_prev_out = 0;
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
+  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
+
+  if (prev_output_value) {
+    if (is_batch) prev_output_value += batch_idx * frame_size;
+    r_prev_out = prev_output_value[frame_idx];
+  }
+
+  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
+                  r_output, active_node);
+
+  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
+  output_value[frame_idx] = r_output;
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpStateGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *output_grad,
+                                       int frame_size, int batch_size,
+                                       ActivationType active_node) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    output_grad += batch_idx * frame_size;
+  }
+
+  T r_update_gate_grad;
+  T r_frame_state_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
+  T r_out_grad = output_grad[frame_idx];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_grad = prev_out_grad[frame_idx];
+  }
+
+  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
+                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
+                r_out_grad, active_node);
+
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
+  }
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class OpResetGrad, bool is_batch, typename T>
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
+                                       T *gate_grad, T *prev_out_value,
+                                       T *prev_out_grad, T *reset_output_grad,
+                                       int frame_size, int batch_size,
+                                       ActivationType active_gate) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    gate_value += batch_idx * 3 * frame_size;
+    gate_grad += batch_idx * 3 * frame_size;
+    reset_output_grad += batch_idx * frame_size;
+  }
+
+  T r_reset_gate_grad;
+  T r_prev_out_value = 0;
+  T r_prev_out_grad = 0;
+  T r_reset_output_grad = 0;
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
+  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
+  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
+
+  if (prev_out_value && prev_out_grad) {
+    if (is_batch) prev_out_value += batch_idx * frame_size;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
+    r_prev_out_value = prev_out_value[frame_idx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
+    r_reset_output_grad = reset_output_grad[frame_idx];
+  }
+
+  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
+                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
+                r_reset_output_grad, active_gate);
+
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
+  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
+  if (prev_out_grad) {
+    prev_out_grad[frame_idx] = r_prev_out_grad;
+  }
+}
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h
new file mode 100644
index 0000000000..4d8245cb5d
--- /dev/null
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+// TODO(guosheng): refine code style in gru_kernel
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
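+// Reset-output step of the GRU cell:
+//   u = act_gate(u),  r = act_gate(r),  reset_output = prev_out * r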
+template <typename T>
+class gru_resetOutput {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
+                             T &prev_out, T &value_reset_output,
+                             ActivationType act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = prev_out * value_reset_gate;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_reset_gate, __m256 &prev_out,
+                             __m256 &value_reset_output,
+                             ActivationType act_gate) {
+    value_update_gate = activation(value_update_gate, act_gate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
+  }
+#endif
+#endif
+};
+
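+// Final-output step of the GRU cell:
+//   c~ = act_input(c~),  h = prev_out - u * prev_out + u * c~
+// i.e. the usual convex combination (1 - u) * prev_out + u * c~.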
+template <typename T>
+class gru_finalOutput {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
+                             T &prev_out, T &value_output,
+                             ActivationType act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = prev_out - (value_update_gate * prev_out) +
+                   (value_update_gate * value_frame_state);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &value_frame_state, __m256 &prev_out,
+                             __m256 &value_output, ActivationType act_input) {
+    value_frame_state = activation(value_frame_state, act_input);
+    value_output = _mm256_add_ps(
+        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+        _mm256_mul_ps(value_update_gate, value_frame_state));
+  }
+#endif
+#endif
+};
+}  // namespace forward
+
+namespace backward {
+
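+// Gradients of the final-output step h = (1 - u) * prev + u * c~:
+//   du     = dh * (c~ - prev)
+//   dprev += dh * (1 - u)
+//   dc~    = dh * u, pushed through the activation derivative (the
+//            two-argument activation() overload applies it at the stored
+//            forward value).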
+template <typename T>
+class gru_stateGrad {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_frame_state, T &grad_frame_state,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_output, ActivationType act_input) {
+    grad_update_gate = (grad_output * value_frame_state);
+    grad_update_gate -= (grad_output * value_prev_out);
+    grad_prev_out -= (grad_output * value_update_gate);
+    grad_prev_out += grad_output;
+    grad_frame_state = activation(grad_output * value_update_gate,
+                                  value_frame_state, act_input);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate,
+                             __m256 &value_frame_state,
+                             __m256 &grad_frame_state, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_output,
+                             ActivationType act_input) {
+    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
+    grad_update_gate = _mm256_sub_ps(
+        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
+    grad_prev_out = _mm256_add_ps(
+        _mm256_sub_ps(grad_prev_out,
+                      _mm256_mul_ps(grad_output, value_update_gate)),
+        grad_output);
+    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+                                  value_frame_state, act_input);
+  }
+#endif
+#endif
+};
+
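+// Gradients of the reset-output step reset_output = prev * r:
+//   dr = d(reset_output) * prev,  dprev += d(reset_output) * r,
+// with both gate gradients then pushed through the gate activation
+// derivative.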
+template <typename T>
+class gru_resetGrad {
+ public:
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
+                             T &value_reset_gate, T &grad_reset_gate,
+                             T &value_prev_out, T &grad_prev_out,
+                             T &grad_reset_output, ActivationType act_gate) {
+    grad_reset_gate = (grad_reset_output * value_prev_out);
+    grad_prev_out += (grad_reset_output * value_reset_gate);
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+  }
+#ifndef __NVCC__
+#ifndef __AVX__
+  static const bool avx = false;
+#else
+  static const bool avx = true;
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
+                             __m256 &grad_update_gate, __m256 &value_reset_gate,
+                             __m256 &grad_reset_gate, __m256 &value_prev_out,
+                             __m256 &grad_prev_out, __m256 &grad_reset_output,
+                             ActivationType act_gate) {
+    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
+    grad_prev_out = _mm256_add_ps(
+        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
+    grad_update_gate =
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h
new file mode 100644
index 0000000000..42888fcdb0
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -0,0 +1,312 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
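+// Scalar LSTM forward for one sequence. gate_value packs the four gates
+// contiguously ([input | input-gate | forget-gate | output-gate], each
+// frame_size wide); check_ig/check_fg/check_og are the optional peephole
+// weights and are treated as 0 when absent.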
+template <class T, class Op>
+void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
+                                     int frame_size, ActivationType active_node,
+                                     ActivationType active_gate,
+                                     ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    value.state_value[i] = r_state;
+    value.state_active_value[i] = r_state_atv;
+    value.output_value[i] = r_out;
+  }
+}
+
+template <class T, class Op>
+void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
+                                      LstmMetaGrad<T> grad, int frame_size,
+                                      ActivationType active_node,
+                                      ActivationType active_gate,
+                                      ActivationType active_state) {
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI;
+  T r_checkF;
+  T r_checkO;
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
+
+  T *value_in = value.gate_value;
+  T *value_ig = value.gate_value + frame_size;
+  T *value_fg = value.gate_value + frame_size * 2;
+  T *value_og = value.gate_value + frame_size * 3;
+  T *grad_in = grad.gate_grad;
+  T *grad_ig = grad.gate_grad + frame_size;
+  T *grad_fg = grad.gate_grad + frame_size * 2;
+  T *grad_og = grad.gate_grad + frame_size * 3;
+
+  for (int i = 0; i < frame_size; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
+    r_state = value.state_value[i];
+    r_state_atv = value.state_active_value[i];
+    r_output_grad = grad.output_grad[i];
+    r_state_grad = grad.state_grad[i];
+    if (value.prev_state_value) {
+      r_prev_state = value.prev_state_value[i];
+    }
+
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    grad.state_grad[i] = r_state_grad;
+
+    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
+  }
+}
+
+template <class T, class Op>
+void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
+                                   int frame_size, ActivationType active_node,
+                                   ActivationType active_gate,
+                                   ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_state;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_state_atv;
+  __m256 r_out;
+
+  __m256 *value_in = (__m256 *)value.gate_value;
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = ((__m256 *)value.check_ig)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
+    }
+
+    if (value.prev_state_value) {
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+    }
+
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
+
+    value_in[i] = r_value_in;
+    value_ig[i] = r_value_ig;
+    value_fg[i] = r_value_fg;
+    value_og[i] = r_value_og;
+    ((__m256 *)value.state_value)[i] = r_state;
+    ((__m256 *)value.state_active_value)[i] = r_state_atv;
+    ((__m256 *)value.output_value)[i] = r_out;
+  }
+#endif
+}
+
+template <class T, class Op>
+void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
+                                    LstmMetaGrad<T> grad, int frame_size,
+                                    ActivationType active_node,
+                                    ActivationType active_gate,
+                                    ActivationType active_state) {
+#ifdef __AVX__
+  __m256 r_value_in;
+  __m256 r_value_ig;
+  __m256 r_value_fg;
+  __m256 r_value_og;
+  __m256 r_grad_in;
+  __m256 r_grad_ig;
+  __m256 r_grad_fg;
+  __m256 r_grad_og;
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state_grad;
+  __m256 r_state_grad;
+  __m256 r_state;
+  __m256 r_state_atv;
+  __m256 r_output_grad;
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
+  __m256 r_checkIGrad;
+  __m256 r_checkFGrad;
+  __m256 r_checkOGrad;
+
+  __m256 *value_in = (__m256 *)value.gate_value;
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
+  __m256 *grad_in = (__m256 *)grad.gate_grad;
+  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
+  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
+  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
+
+  for (int i = 0; i < frame_size / 8; i++) {
+    r_value_in = value_in[i];
+    r_value_ig = value_ig[i];
+    r_value_fg = value_fg[i];
+    r_value_og = value_og[i];
+    if (value.check_ig) {
+      r_checkI = ((__m256 *)value.check_ig)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
+    }
+    r_state = ((__m256 *)value.state_value)[i];
+    r_state_atv = ((__m256 *)value.state_active_value)[i];
+    r_output_grad = ((__m256 *)grad.output_grad)[i];
+    r_state_grad = ((__m256 *)grad.state_grad)[i];
+    if (value.prev_state_value) {
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
+    }
+
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
+
+    grad_in[i] = r_grad_in;
+    grad_ig[i] = r_grad_ig;
+    grad_fg[i] = r_grad_fg;
+    grad_og[i] = r_grad_og;
+    ((__m256 *)grad.state_grad)[i] = r_state_grad;
+
+    if (grad.prev_state_grad)
+      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
+      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
+    }
+    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
+  }
+#endif
+}
+
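+// CPU dispatch: take the AVX path only when the functor provides one, the
+// frame width is a multiple of 8, and T is float; otherwise fall back to the
+// scalar loop. A minimal call sketch (hypothetical enumerator names, assuming
+// a fully populated LstmMetaValue<float> value):
+//   cpu_lstm_forward(forward::lstm<float>(), value, frame_size,
+//                    /*active_node=*/ActivationType::kTanh,
+//                    /*active_gate=*/ActivationType::kSigmoid,
+//                    /*active_state=*/ActivationType::kTanh);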
+template <class T, class Op>
+void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
+                      ActivationType active_node, ActivationType active_gate,
+                      ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
+                                     active_gate, active_state);
+  } else {
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
+                                       active_gate, active_state);
+  }
+}
+
+template <class T, class Op>
+void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                       int frame_size, ActivationType active_node,
+                       ActivationType active_gate,
+                       ActivationType active_state) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
+                                      active_gate, active_state);
+  } else {
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
+                                        active_node, active_gate, active_state);
+  }
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h
new file mode 100644
index 0000000000..e31e657e8b
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -0,0 +1,255 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class T, class Op, bool is_batch>
+__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
+                              int batch_size, ActivationType active_node,
+                              ActivationType active_gate,
+                              ActivationType active_state) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    value.gate_value += batch_idx * frame_size * 4;
+    value.output_value += batch_idx * frame_size;
+    value.state_value += batch_idx * frame_size;
+    value.state_active_value += batch_idx * frame_size;
+  }
+
+  T r_state;
+  T r_prev_state = 0;
+  T r_state_atv;
+  T r_out;
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
+
+  r_value_in = value.gate_value[frame_idx];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
+
+  if (value.prev_state_value) {
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
+    r_prev_state = value.prev_state_value[frame_idx];
+  }
+
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
+     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
+     active_state);
+
+  value.gate_value[frame_idx] = r_value_in;
+  value.gate_value[frame_idx + frame_size] = r_value_ig;
+  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
+  value.gate_value[frame_idx + frame_size * 3] = r_value_og;
+
+  value.state_value[frame_idx] = r_state;
+  value.state_active_value[frame_idx] = r_state_atv;
+  value.output_value[frame_idx] = r_out;
+}
+
+/*
+ * threads(frame_per_block, batch_per_block)
+ * grid(frame_blocks, batch_blocks)
+ */
+template <class T, class Op, bool is_batch>
+__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
+                               LstmMetaGrad<T> grad, int frame_size,
+                               int batch_size, ActivationType active_node,
+                               ActivationType active_gate,
+                               ActivationType active_state) {
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frame_idx >= frame_size) return;
+
+  int batch_idx = 0;
+  if (is_batch) {
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batch_idx >= batch_size) return;
+    value.gate_value += batch_idx * frame_size * 4;
+    value.state_value += batch_idx * frame_size;
+    value.state_active_value += batch_idx * frame_size;
+    grad.gate_grad += batch_idx * frame_size * 4;
+    grad.state_grad += batch_idx * frame_size;
+    grad.output_grad += batch_idx * frame_size;
+  }
+
+  T r_value_in;
+  T r_value_ig;
+  T r_value_fg;
+  T r_value_og;
+  T r_grad_in;
+  T r_grad_ig;
+  T r_grad_fg;
+  T r_grad_og;
+  T r_prev_state = 0;
+  T r_prev_state_grad;
+  T r_state;
+  T r_state_grad;
+  T r_state_atv;
+  T r_output_grad;
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
+
+  T r_checkIGrad;
+  T r_checkFGrad;
+  T r_checkOGrad;
+
+  r_value_in = value.gate_value[frame_idx];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
+  r_state = value.state_value[frame_idx];
+  r_state_atv = value.state_active_value[frame_idx];
+  r_output_grad = grad.output_grad[frame_idx];
+  r_state_grad = grad.state_grad[frame_idx];
+
+  if (value.prev_state_value) {
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
+    r_prev_state = value.prev_state_value[frame_idx];
+  }
+
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
+     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
+     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
+     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+     active_state);
+
+  grad.gate_grad[frame_idx] = r_grad_in;
+  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
+  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
+  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
+  grad.state_grad[frame_idx] = r_state_grad;
+  if (grad.prev_state_grad) {
+    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
+    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
+  }
+
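+  // The peephole-weight gradients are shared across the whole batch, so the
+  // batched kernel accumulates them with atomic adds; the single-sequence
+  // path below can use plain +=.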
+  if (is_batch) {
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad)
+        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
+                                        r_checkIGrad);
+      if (grad.check_fg_grad)
+        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
+                                        r_checkFGrad);
+    }
+    if (grad.check_og_grad)
+      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
+                                      r_checkOGrad);
+  } else {
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
+    }
+    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
+  }
+}
+
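+// Launch configuration: a single sequence (batch_size == 1) uses 1-D blocks
+// of up to 1024 threads across the frame; batched input uses 2-D tiles
+// (frames on x, batch on y) so the kernels can offset their pointers per
+// batch row.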
+template <class T, class Op>
+void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      ActivationType active_node, ActivationType active_gate,
+                      ActivationType active_state) {
+  dim3 threads;
+  dim3 grid;
+  if (batch_size == 1) {
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
+    threads = dim3(frame_per_block, 1);
+    grid = dim3(frame_blocks, 1);
+  } else {
+    /* frame_per_block = 32, batch_per_block = 32 */
+    threads = dim3(32, 32);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+  }
+
+  auto stream =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+  if (batch_size == 1) {
+    KeLstmForward<T, Op,
+                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, frame_size, batch_size, active_node, active_gate,
+        active_state);
+  } else {
+    KeLstmForward<T, Op,
+                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+        op, value, frame_size, batch_size, active_node, active_gate,
+        active_state);
+  }
+}
+
+template <class T, class Op>
+void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
+                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                       int frame_size, int batch_size,
+                       ActivationType active_node, ActivationType active_gate,
+                       ActivationType active_state) {
+  dim3 threads;
+  dim3 grid;
+  if (batch_size == 1) {
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
+    threads = dim3(frame_per_block, 1);
+    grid = dim3(frame_blocks, 1);
+  } else {
+    /* frame_per_block = 32, batch_per_block = 16 */
+    threads = dim3(32, 16);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
+  }
+
+  auto stream =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+  if (batch_size == 1) {
+    KeLstmBackward<T, Op,
+                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
+        active_state);
+  } else {
+    KeLstmBackward<T, Op,
+                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
+        active_state);
+  }
+}
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h
new file mode 100644
index 0000000000..fed8f9c4ca
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_kernel.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
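+// LSTM cell with optional peephole connections (checkI/checkF/checkO):
+//   i = act_gate(i + c_prev * checkI),  f = act_gate(f + c_prev * checkF)
+//   c = act_node(in) * i + c_prev * f
+//   o = act_gate(o + c * checkO),      h = o * act_state(c)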
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
+                             T &prev_state, T &state, T &state_atv, T &output,
+                             T &checkI, T &checkF, T &checkO,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    value_in = activation(value_in, active_node);
+    value_ig = activation(value_ig + prev_state * checkI, active_gate);
+    value_fg = activation(value_fg + prev_state * checkF, active_gate);
+    state = value_in * value_ig + prev_state * value_fg;
+    value_og = activation(value_og + state * checkO, active_gate);
+    state_atv = activation(state, active_state);
+    output = value_og * state_atv;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+
+  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
+                             __m256 &value_fg, __m256 &value_og,
+                             __m256 &prev_state, __m256 &state,
+                             __m256 &state_atv, __m256 &output, __m256 &checkI,
+                             __m256 &checkF, __m256 &checkO,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    value_in = activation(value_in, active_node);
+    value_ig =
+        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
+                   active_gate);
+    value_fg =
+        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
+                   active_gate);
+    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
+                          _mm256_mul_ps(prev_state, value_fg));
+    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
+                          active_gate);
+    state_atv = activation(state, active_state);
+    output = _mm256_mul_ps(value_og, state_atv);
+  }
+#endif
+#endif
+};
+
+}  // namespace forward
+
+namespace backward {
+
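+// Backward of the LSTM cell above: the gate gradients are produced by
+// chaining output_grad through the cell equations; the multi-argument
+// activation() overloads are assumed to multiply the incoming gradient by
+// the activation derivative at the stored forward value.
+// checkIGrad/checkFGrad/checkOGrad receive the peephole-weight gradients.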
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
+                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
+                             T &prev_state, T &prev_state_grad, T &state,
+                             T &state_grad, T &state_atv, T &output_grad,
+                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
+                             T &checkFGrad, T &checkOGrad,
+                             ActivationType active_node,
+                             ActivationType active_gate,
+                             ActivationType active_state) {
+    grad_og = activation(output_grad * state_atv, value_og, active_gate);
+    state_grad += activation(output_grad * value_og, state_atv, active_state) +
+                  grad_og * checkO;
+    grad_in = activation(state_grad * value_ig, value_in, active_node);
+    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
+    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
+    prev_state_grad =
+        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
+    checkIGrad = grad_ig * prev_state;
+    checkFGrad = grad_fg * prev_state;
+    checkOGrad = grad_og * state;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // If not compiled with AVX instructions, disable AVX by default
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  HOSTDEVICE void operator()(
+      __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
+      __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
+      __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
+      __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
+      __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
+      __m256 &checkFGrad, __m256 &checkOGrad, ActivationType active_node,
+      ActivationType active_gate, ActivationType active_state) {
+    grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
+                         active_gate);
+    state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
+                                          state_atv, active_state),
+                               state_grad);
+    state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
+    grad_in =
+        activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
+    grad_ig =
+        activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
+    grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
+                         active_gate);
+    prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
+                                    _mm256_mul_ps(grad_fg, checkF));
+    prev_state_grad =
+        _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
+    checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
+    checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
+    checkOGrad = _mm256_mul_ps(grad_og, state);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detection_util.h b/paddle/operators/math/detection_util.h
new file mode 100644
index 0000000000..e3a3ef2bad
--- /dev/null
+++ b/paddle/operators/math/detection_util.h
@@ -0,0 +1,300 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <cmath>
+#include <map>
+#include <utility>
+#include <vector>
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+struct BBox {
+  BBox(T x_min, T y_min, T x_max, T y_max)
+      : x_min(x_min),
+        y_min(y_min),
+        x_max(x_max),
+        y_max(y_max),
+        is_difficult(false) {}
+
+  BBox() {}
+
+  T get_width() const { return x_max - x_min; }
+
+  T get_height() const { return y_max - y_min; }
+
+  T get_center_x() const { return (x_min + x_max) / 2; }
+
+  T get_center_y() const { return (y_min + y_max) / 2; }
+
+  T get_area() const { return get_width() * get_height(); }
+
+  // coordinates of the bounding box
+  T x_min;
+  T y_min;
+  T x_max;
+  T y_max;
+  // whether this is a difficult object (e.g. one with heavy occlusion)
+  bool is_difficult;
+};
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec);
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec);
+template <typename T>
+BBox<T> DecodeBBoxWithVar(BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data);
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2);
+template <typename T>
+bool SortScorePairDescend(const std::pair<T, BBox<T>>& pair1,
+                          const std::pair<T, BBox<T>>& pair2);
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2);
+
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices);
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices);
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox);
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data);
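+// Layout note: prior_data stores 8 values per prior box -- 4 corner
+// coordinates followed by 4 variances -- hence the i * 8 strides in the two
+// functions below.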
+template <typename T>
+void GetBBoxFromPriorData(const T* prior_data, const size_t num_bboxes,
+                          std::vector<BBox<T>>& bbox_vec) {
+  size_t out_offset = bbox_vec.size();
+  bbox_vec.resize(bbox_vec.size() + num_bboxes);
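+  // Each prior box occupies 8 consecutive values: 4 corner coordinates
+  // followed by 4 variances (read by GetBBoxVarFromPriorData).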
+  for (size_t i = 0; i < num_bboxes; ++i) {
+    BBox<T> bbox;
+    bbox.x_min = *(prior_data + i * 8);
+    bbox.y_min = *(prior_data + i * 8 + 1);
+    bbox.x_max = *(prior_data + i * 8 + 2);
+    bbox.y_max = *(prior_data + i * 8 + 3);
+    bbox_vec[out_offset + i] = bbox;
+  }
+}
+template <typename T>
+void GetBBoxVarFromPriorData(const T* prior_data, const size_t num,
+                             std::vector<std::vector<T>>& var_vec) {
+  size_t out_offset = var_vec.size();
+  var_vec.resize(var_vec.size() + num);
+  for (size_t i = 0; i < num; ++i) {
+    std::vector<T> var;
+    var.push_back(*(prior_data + i * 8 + 4));
+    var.push_back(*(prior_data + i * 8 + 5));
+    var.push_back(*(prior_data + i * 8 + 6));
+    var.push_back(*(prior_data + i * 8 + 7));
+    var_vec[out_offset + i] = var;
+  }
+}
+template <typename T>
+BBox<T> DecodeBBoxWithVar(const BBox<T>& prior_bbox,
+                          const std::vector<T>& prior_bbox_var,
+                          const std::vector<T>& loc_pred_data) {
+  T prior_bbox_width = prior_bbox.get_width();
+  T prior_bbox_height = prior_bbox.get_height();
+  T prior_bbox_center_x = prior_bbox.get_center_x();
+  T prior_bbox_center_y = prior_bbox.get_center_y();
+
+  T decoded_bbox_center_x =
+      prior_bbox_var[0] * loc_pred_data[0] * prior_bbox_width +
+      prior_bbox_center_x;
+  T decoded_bbox_center_y =
+      prior_bbox_var[1] * loc_pred_data[1] * prior_bbox_height +
+      prior_bbox_center_y;
+  T decoded_bbox_width =
+      std::exp(prior_bbox_var[2] * loc_pred_data[2]) * prior_bbox_width;
+  T decoded_bbox_height =
+      std::exp(prior_bbox_var[3] * loc_pred_data[3]) * prior_bbox_height;
+
+  BBox<T> decoded_bbox;
+  decoded_bbox.x_min = decoded_bbox_center_x - decoded_bbox_width / 2;
+  decoded_bbox.y_min = decoded_bbox_center_y - decoded_bbox_height / 2;
+  decoded_bbox.x_max = decoded_bbox_center_x + decoded_bbox_width / 2;
+  decoded_bbox.y_max = decoded_bbox_center_y + decoded_bbox_height / 2;
+
+  return decoded_bbox;
+}
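+// Illustrative usage (not part of the library): with all-zero location
+// offsets the decoded box reproduces the prior box.
+//
+//   BBox<float> prior(0.1f, 0.1f, 0.5f, 0.5f);
+//   std::vector<float> var = {0.1f, 0.1f, 0.2f, 0.2f};
+//   std::vector<float> loc = {0.f, 0.f, 0.f, 0.f};
+//   BBox<float> decoded = DecodeBBoxWithVar(prior, var, loc);  // == prior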
+template <typename T1, typename T2>
+bool SortScorePairDescend(const std::pair<T1, T2>& pair1,
+                          const std::pair<T1, T2>& pair2) {
+  return pair1.first > pair2.first;
+}
+template <typename T>
+T jaccard_overlap(const BBox<T>& bbox1, const BBox<T>& bbox2) {
+  if (bbox2.x_min > bbox1.x_max || bbox2.x_max < bbox1.x_min ||
+      bbox2.y_min > bbox1.y_max || bbox2.y_max < bbox1.y_min) {
+    return 0.0;
+  } else {
+    T inter_x_min = std::max(bbox1.x_min, bbox2.x_min);
+    T inter_y_min = std::max(bbox1.y_min, bbox2.y_min);
+    T inter_x_max = std::min(bbox1.x_max, bbox2.x_max);
+    T inter_y_max = std::min(bbox1.y_max, bbox2.y_max);
+
+    T inter_width = inter_x_max - inter_x_min;
+    T inter_height = inter_y_max - inter_y_min;
+    T inter_area = inter_width * inter_height;
+
+    T bbox_area1 = bbox1.get_area();
+    T bbox_area2 = bbox2.get_area();
+
+    return inter_area / (bbox_area1 + bbox_area2 - inter_area);
+  }
+}
+
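+// Greedy per-class non-maximum suppression: repeatedly keep the
+// highest-scoring remaining candidate and discard any candidate whose
+// Jaccard overlap with an already kept box exceeds nms_threshold.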
+template <typename T>
+void ApplyNmsFast(const std::vector<BBox<T>>& bboxes, const T* conf_score_data,
+                  size_t class_idx, size_t top_k, T conf_threshold,
+                  T nms_threshold, size_t num_priors, size_t num_classes,
+                  std::vector<size_t>* indices) {
+  std::vector<std::pair<T, size_t>> scores;
+  for (size_t i = 0; i < num_priors; ++i) {
+    size_t conf_offset = i * num_classes + class_idx;
+    if (conf_score_data[conf_offset] > conf_threshold)
+      scores.push_back(std::make_pair(conf_score_data[conf_offset], i));
+  }
+  std::stable_sort(scores.begin(), scores.end(),
+                   SortScorePairDescend<T, size_t>);
+  if (top_k > 0 && top_k < scores.size()) scores.resize(top_k);
+  while (scores.size() > 0) {
+    const size_t idx = scores.front().second;
+    bool keep = true;
+    for (size_t i = 0; i < indices->size(); ++i) {
+      if (keep) {
+        const size_t saved_idx = (*indices)[i];
+        T overlap = jaccard_overlap<T>(bboxes[idx], bboxes[saved_idx]);
+        keep = overlap <= nms_threshold;
+      } else {
+        break;
+      }
+    }
+    if (keep) indices->push_back(idx);
+    scores.erase(scores.begin());
+  }
+}
+template <typename T>
+int GetDetectionIndices(
+    const T* conf_data, const size_t num_priors, const size_t num_classes,
+    const size_t background_label_id, const size_t batch_size,
+    const T conf_threshold, const size_t nms_top_k, const T nms_threshold,
+    const size_t top_k,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes,
+    std::vector<std::map<size_t, std::vector<size_t>>>* all_detection_indices) {
+  int total_keep_num = 0;
+  for (size_t n = 0; n < batch_size; ++n) {
+    const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
+    size_t num_detected = 0;
+    std::map<size_t, std::vector<size_t>> indices;
+    size_t conf_offset = n * num_priors * num_classes;
+    for (size_t c = 0; c < num_classes; ++c) {
+      if (c == background_label_id) continue;
+      ApplyNmsFast<T>(decoded_bboxes, conf_data + conf_offset, c, nms_top_k,
+                      conf_threshold, nms_threshold, num_priors, num_classes,
+                      &(indices[c]));
+      num_detected += indices[c].size();
+    }
+    if (top_k > 0 && num_detected > top_k) {
+      std::vector<std::pair<T, std::pair<size_t, size_t>>> score_index_pairs;
+      for (size_t c = 0; c < num_classes; ++c) {
+        const std::vector<size_t>& label_indices = indices[c];
+        for (size_t i = 0; i < label_indices.size(); ++i) {
+          size_t idx = label_indices[i];
+          score_index_pairs.push_back(
+              std::make_pair((conf_data + conf_offset)[idx * num_classes + c],
+                             std::make_pair(c, idx)));
+        }
+      }
+      std::sort(score_index_pairs.begin(), score_index_pairs.end(),
+                SortScorePairDescend<T, std::pair<size_t, size_t>>);
+      score_index_pairs.resize(top_k);
+      std::map<size_t, std::vector<size_t>> new_indices;
+      for (size_t i = 0; i < score_index_pairs.size(); ++i) {
+        size_t label = score_index_pairs[i].second.first;
+        size_t idx = score_index_pairs[i].second.second;
+        new_indices[label].push_back(idx);
+      }
+      all_detection_indices->push_back(new_indices);
+      total_keep_num += top_k;
+    } else {
+      all_detection_indices->push_back(indices);
+      total_keep_num += num_detected;
+    }
+  }
+  return total_keep_num;
+}
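+// Clip a box to the unit square; coordinates are assumed to be normalized
+// to [0, 1].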
+template <typename T>
+BBox<T> ClipBBox(const BBox<T>& bbox) {
+  T one = static_cast<T>(1.0);
+  T zero = static_cast<T>(0.0);
+  BBox<T> clipped_bbox;
+  clipped_bbox.x_min = std::max(std::min(bbox.x_min, one), zero);
+  clipped_bbox.y_min = std::max(std::min(bbox.y_min, one), zero);
+  clipped_bbox.x_max = std::max(std::min(bbox.x_max, one), zero);
+  clipped_bbox.y_max = std::max(std::min(bbox.y_max, one), zero);
+  return clipped_bbox;
+}
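+// Each kept detection writes 7 consecutive values to out_data:
+// [batch_idx, label, confidence, x_min, y_min, x_max, y_max].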
+template <typename T>
+void GetDetectionOutput(
+    const T* conf_data, const size_t num_kept, const size_t num_priors,
+    const size_t num_classes, const size_t batch_size,
+    const std::vector<std::map<size_t, std::vector<size_t>>>& all_indices,
+    const std::vector<std::vector<BBox<T>>>& all_decoded_bboxes, T* out_data) {
+  size_t count = 0;
+  for (size_t n = 0; n < batch_size; ++n) {
+    for (std::map<size_t, std::vector<size_t>>::const_iterator it =
+             all_indices[n].begin();
+         it != all_indices[n].end(); ++it) {
+      size_t label = it->first;
+      const std::vector<size_t>& indices = it->second;
+      const std::vector<BBox<T>>& decoded_bboxes = all_decoded_bboxes[n];
+      for (size_t i = 0; i < indices.size(); ++i) {
+        size_t idx = indices[i];
+        size_t conf_offset = n * num_priors * num_classes + idx * num_classes;
+        out_data[count * 7] = n;
+        out_data[count * 7 + 1] = label;
+        out_data[count * 7 + 2] = (conf_data + conf_offset)[label];
+        BBox<T> clipped_bbox = ClipBBox<T>(decoded_bboxes[idx]);
+        out_data[count * 7 + 3] = clipped_bbox.x_min;
+        out_data[count * 7 + 4] = clipped_bbox.y_min;
+        out_data[count * 7 + 5] = clipped_bbox.x_max;
+        out_data[count * 7 + 6] = clipped_bbox.y_max;
+        ++count;
+      }
+    }
+  }
+}
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc
new file mode 100644
index 0000000000..101ab85962
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext &context,
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
+#ifndef __NVCC__
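+    // Project the previous hidden state onto the two gate blocks:
+    // gate_value[:, :frame_size * 2] += prev_out_value * gate_weight
+    // (skipped at the first time step, when there is no previous output).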
+    if (value.prev_out_value) {
+      math::gemm<platform::CPUDeviceContext, T>(
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
+    }
+
+    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
+                                 frame_size, batch_size, active_gate);
+
+    if (value.prev_out_value) {
+      math::gemm<platform::CPUDeviceContext, T>(
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
+    }
+
+    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
+                                 frame_size, batch_size, active_node);
+#endif
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext &context,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
+#ifndef __NVCC__
+    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
+                                grad, frame_size, batch_size, active_node);
+
+    if (value.prev_out_value && grad.prev_out_grad) {
+      math::gemm<platform::CPUDeviceContext, T>(
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
+
+      if (grad.state_weight_grad) {
+        math::gemm<platform::CPUDeviceContext, T>(
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
+      }
+    }
+
+    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
+                                grad, frame_size, batch_size, active_gate);
+
+    if (grad.prev_out_grad && value.prev_out_value) {
+      math::gemm<platform::CPUDeviceContext, T>(
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
+
+      if (grad.gate_weight_grad) {
+        math::gemm<platform::CPUDeviceContext, T>(
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
+      }
+    }
+#endif
+  }
+};
+
+template struct GRUUnitFunctor<platform::CPUDeviceContext, float>;
+template struct GRUUnitFunctor<platform::CPUDeviceContext, double>;
+template struct GRUUnitGradFunctor<platform::CPUDeviceContext, float>;
+template struct GRUUnitGradFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu
new file mode 100644
index 0000000000..d5a0e630ea
--- /dev/null
+++ b/paddle/operators/math/gru_compute.cu
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/gru_gpu_kernel.h"
+#include "paddle/operators/math/detail/gru_kernel.h"
+#include "paddle/operators/math/gru_compute.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext &context,
+                      GRUMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
+    auto stream = context.stream();
+    dim3 threads;
+    dim3 grid;
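+    // For a single sample, parallelize over frames with 1-D blocks of up to
+    // 1024 threads; otherwise tile the (frame, batch) plane with 32 x 32
+    // thread blocks.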
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+    }
+
+    if (value.prev_out_value) {
+      math::gemm<platform::CUDADeviceContext, T>(
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
+          1, value.gate_value, frame_size * 3);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* is_batch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
+    } else {
+      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
+                                      /* is_batch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
+          value.reset_output_value, value.prev_out_value, frame_size,
+          batch_size, active_gate);
+    }
+
+    if (value.prev_out_value) {
+      math::gemm<platform::CUDADeviceContext, T>(
+          context, false, false, batch_size, frame_size, frame_size, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
+          1, value.gate_value + frame_size * 2, frame_size * 3);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* is_batch= */ false,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
+          active_node);
+    } else {
+      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
+                                      /* is_batch= */ true,
+                                      T><<<grid, threads, 0, stream>>>(
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
+          active_node);
+    }
+  }
+};
+
+template <typename T>
+struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext &context,
+                      GRUMetaValue<T> value, GRUMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate) {
+    auto stream = context.stream();
+    dim3 threads;
+    dim3 grid;
+    if (batch_size == 1) {
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
+      threads = dim3(frame_per_block, 1);
+      grid = dim3(frame_blocks, 1);
+    } else {
+      threads = dim3(32, 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
+    } else {
+      detail::KeGruBackwardStateGrad<
+          detail::backward::gru_stateGrad<T>,
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.output_grad, frame_size, batch_size, active_node);
+    }
+
+    if (value.prev_out_value && grad.prev_out_grad) {
+      math::gemm<platform::CUDADeviceContext, T>(
+          context, false, true, batch_size, frame_size, frame_size, 1,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
+          frame_size, 0, grad.reset_output_grad, frame_size);
+
+      if (grad.state_weight_grad) {
+        math::gemm<platform::CUDADeviceContext, T>(
+            context, true, false, frame_size, frame_size, batch_size, 1,
+            value.reset_output_value, frame_size,
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
+      }
+    }
+
+    if (batch_size == 1) {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
+    } else {
+      detail::KeGruBackwardResetGrad<
+          detail::backward::gru_resetGrad<T>,
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
+    }
+
+    if (grad.prev_out_grad && value.prev_out_value) {
+      math::gemm<platform::CUDADeviceContext, T>(
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
+          grad.prev_out_grad, frame_size);
+
+      if (grad.gate_weight_grad) {
+        math::gemm<platform::CUDADeviceContext, T>(
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
+            grad.gate_weight_grad, frame_size * 2);
+      }
+    }
+  }
+};
+
+template struct GRUUnitFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitFunctor<platform::CUDADeviceContext, double>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, float>;
+template struct GRUUnitGradFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h
new file mode 100644
index 0000000000..bf69147b50
--- /dev/null
+++ b/paddle/operators/math/gru_compute.h
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
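+// Buffers for one GRU step. gate_value holds frame_size * 3 values per
+// sample: two gate blocks followed by the candidate activation at offset
+// frame_size * 2. prev_out_value may be null at the first time step.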
+template <typename T>
+struct GRUMetaValue {
+  T *gate_weight;
+  T *state_weight;
+  T *gate_value;
+  T *reset_output_value;
+  T *output_value;
+  T *prev_out_value;
+};
+
+template <typename T>
+struct GRUMetaGrad {
+  T *gate_weight_grad;
+  T *state_weight_grad;
+  T *gate_grad;
+  T *reset_output_grad;
+  T *output_grad;
+  T *prev_out_grad;
+};
+
+template <typename DeviceContext, typename T>
+struct GRUUnitFunctor {
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
+};
+
+template <typename DeviceContext, typename T>
+struct GRUUnitGradFunctor {
+  static void compute(const DeviceContext &context, GRUMetaValue<T> value,
+                      GRUMetaGrad<T> grad, int frame_size, int batch_size,
+                      const detail::ActivationType active_node,
+                      const detail::ActivationType active_gate);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
new file mode 100644
index 0000000000..c2633b2e16
--- /dev/null
+++ b/paddle/operators/math/im2col.cc
@@ -0,0 +1,313 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/im2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col->dims().size() == 5);
+
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[1];
+    int filter_width = col->dims()[2];
+    int col_height = col->dims()[3];
+    int col_width = col->dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       ((dilation[0] * (filter_height - 1) + 1))) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       ((dilation[1] * (filter_width - 1) + 1))) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+
+    int channels_col = im_channels * filter_height * filter_width;
+
+    const T* im_data = im.data<T>();
+    T* col_data = col->data<T>();
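+    // Each c indexes one (channel, filter_row, filter_col) triple of the
+    // column buffer; taps that fall outside the image are written as zero.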
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int c_im = c / (filter_width * filter_height);
+      for (int h = 0; h < col_height; ++h) {
+        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
+          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+          int col_idx = (c * col_height + h) * col_width + w;
+          int im_idx = (im_row_idx + c_im * im_height) * im_width + im_col_idx;
+
+          col_data[col_idx] = (im_row_idx < 0 || im_row_idx >= im_height ||
+                               im_col_idx < 0 || im_col_idx >= im_width)
+                                  ? static_cast<T>(0)
+                                  : im_data[im_idx];
+        }
+      }
+    }
+  }
+};
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
+    int filter_height = col.dims()[1];
+    int filter_width = col.dims()[2];
+    int col_height = col.dims()[3];
+    int col_width = col.dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       ((dilation[0] * (filter_height - 1) + 1))) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       ((dilation[1] * (filter_width - 1) + 1))) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+
+    int channels_col = im_channels * filter_height * filter_width;
+
+    T* im_data = im->data<T>();
+    const T* col_data = col.data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int c_im = c / (filter_width * filter_height);
+      for (int h = 0; h < col_height; ++h) {
+        int im_row_idx = h * stride[0] - padding[0] + h_offset * dilation[0];
+        for (int w = 0; w < col_width; ++w) {
+          int im_col_idx = w * stride[1] - padding[1] + w_offset * dilation[1];
+          if ((im_row_idx) >= 0 && (im_row_idx) < im_height &&
+              (im_col_idx) >= 0 && (im_col_idx) < im_width) {
+            im_data[(im_row_idx + c_im * im_height) * im_width + im_col_idx] +=
+                col_data[(c * col_height + h) * col_width + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUDeviceContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUDeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUDeviceContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CPUDeviceContext, double>;
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col->dims().size() == 5);
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[3];
+    int filter_width = col->dims()[4];
+    int col_height = col->dims()[0];
+    int col_width = col->dims()[1];
+
+    PADDLE_ENFORCE_EQ(
+        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
+        col_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
+        col_width,
+        "col_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
+    const T* im_data = im.data<T>();
+    T* col_data = col->data<T>();
+
+    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+        for (int channel = 0; channel < im_channels; ++channel) {
+          for (int filter_row_idx = 0; filter_row_idx < filter_height;
+               ++filter_row_idx) {
+            int im_row_offset =
+                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
+              int im_col_offset =
+                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
+              int col_offset =
+                  ((((col_row_idx)*col_width + col_col_idx) * im_channels +
+                    channel) *
+                       filter_height +
+                   filter_row_idx) *
+                      filter_width +
+                  filter_col_idx;
+
+              int im_offset = (channel * im_height + im_row_offset) * im_width +
+                              im_col_offset;
+              col_data[col_offset] =
+                  (im_row_offset < 0 || im_row_offset >= im_height ||
+                   im_col_offset < 0 || im_col_offset >= im_width)
+                      ? static_cast<T>(0)
+                      : im_data[im_offset];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
+    int filter_height = col.dims()[3];
+    int filter_width = col.dims()[4];
+    int col_height = col.dims()[0];
+    int col_width = col.dims()[1];
+
+    PADDLE_ENFORCE_EQ(
+        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
+        col_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
+        col_width,
+        "col_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
+    T* im_data = im->data<T>();
+    const T* col_data = col.data<T>();
+
+    for (int col_row_idx = 0; col_row_idx < col_height; ++col_row_idx) {
+      for (int col_col_idx = 0; col_col_idx < col_width; ++col_col_idx) {
+        for (int channel = 0; channel < im_channels; ++channel) {
+          for (int filter_row_idx = 0; filter_row_idx < filter_height;
+               ++filter_row_idx) {
+            int im_row_offset =
+                col_row_idx * stride[0] + filter_row_idx - padding[0];
+            for (int filter_col_idx = 0; filter_col_idx < filter_width;
+                 ++filter_col_idx) {
+              int im_col_offset =
+                  col_col_idx * stride[1] + filter_col_idx - padding[1];
+
+              int col_offset =
+                  (((col_row_idx * col_width + col_col_idx) * im_channels +
+                    channel) *
+                       filter_height +
+                   filter_row_idx) *
+                      filter_width +
+                  filter_col_idx;
+
+              if (im_row_offset >= 0 && im_row_offset < im_height &&
+                  im_col_offset >= 0 && im_col_offset < im_width) {
+                int im_offset =
+                    (channel * im_height + im_row_offset) * im_width +
+                    im_col_offset;
+                im_data[im_offset] += col_data[col_offset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUDeviceContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUDeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUDeviceContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
new file mode 100644
index 0000000000..a88e837b03
--- /dev/null
+++ b/paddle/operators/math/im2col.cu
@@ -0,0 +1,424 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/im2col.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+__global__ void im2col(const T* data_im, int num_outs, int im_height,
+                       int im_width, int dilation_h, int dilation_w,
+                       int filter_height, int filter_width, int stride_height,
+                       int stride_width, int padding_height, int padding_width,
+                       int col_height, int col_width, T* data_col) {
+  const int index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
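+  // One thread per (channel, h_out, w_out) column position; each thread
+  // writes the filter_height * filter_width taps for that position.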
+  if (index < num_outs) {
+    int w_out = index % col_width;
+    int h_out = (index / col_width) % col_height;
+    int channel_in = index / col_width / col_height;
+    int channel_out = channel_in * filter_height * filter_width;
+    int h_in = h_out * stride_height - padding_height;
+    int w_in = w_out * stride_width - padding_width;
+
+    data_col += (channel_out * col_height + h_out) * col_width + w_out;
+    data_im += (channel_in * im_height + h_in) * im_width + w_in;
+    for (int i = 0; i < filter_height; ++i) {
+      for (int j = 0; j < filter_width; ++j) {
+        int rIdx = h_in + i * dilation_h;
+        int cIdx = w_in + j * dilation_w;
+        *data_col =
+            (rIdx >= im_height || rIdx < 0 || cIdx >= im_width || cIdx < 0)
+                ? 0
+                : data_im[i * dilation_h * im_width + j * dilation_w];
+        data_col += col_height * col_width;
+      }
+    }
+  }
+}
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col->dims().size() == 5);
+
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[1];
+    int filter_width = col->dims()[2];
+    int col_height = col->dims()[3];
+    int col_width = col->dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    int num_outputs = im_channels * col_height * col_width;
+    int blocks = (num_outputs + 1024 - 1) / 1024;
+    int block_x = 512;
+    int block_y = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(block_x, block_y);
+    im2col<T><<<grid, threads, 0, context.stream()>>>(
+        im.data<T>(), num_outputs, im_height, im_width, dilation[0],
+        dilation[1], filter_height, filter_width, stride[0], stride[1],
+        padding[0], padding[1], col_height, col_width, col->data<T>());
+  }
+};
+
+template <class T>
+__global__ void col2im(int n, const T* data_col, int im_height, int im_width,
+                       int dilation_h, int dilation_w, int filter_height,
+                       int filter_width, int stride_height, int stride_width,
+                       int padding_height, int padding_width, int col_height,
+                       int col_width, T* data_im) {
+  const int index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
+  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
+
+  if (index < n) {
+    T val = 0;
+    int w = index % im_width + padding_width;
+    int h = (index / im_width) % im_height + padding_height;
+    int c = index / (im_width * im_height);
+
+    // compute the start and end of the output
+    int w_col_start =
+        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
+    int w_col_end = min(w / stride_width + 1, col_width);
+    int h_col_start =
+        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
+    int h_col_end = min(h / stride_height + 1, col_height);
+
+    for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+      for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+        int h_off = (h - h_col * stride_height);
+        int w_off = (w - w_col * stride_width);
+        if (h_off % dilation_h == 0 && w_off % dilation_w == 0) {
+          h_off /= dilation_h;
+          w_off /= dilation_w;
+          int data_col_index =
+              (((c * filter_height + h_off) * filter_width + w_off) *
+                   col_height +
+               h_col) *
+                  col_width +
+              w_col;
+
+          val += data_col[data_col_index];
+        }
+      }
+    }
+    data_im[index] = val;
+  }
+}
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [input_channels, filter_height, filter_width, output_height, output_width]
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                    platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
+    int filter_height = col.dims()[1];
+    int filter_width = col.dims()[2];
+    int col_height = col.dims()[3];
+    int col_width = col.dims()[4];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    size_t num_kernels = im_channels * im_height * im_width;
+
+    size_t blocks = (num_kernels + 1024 - 1) / 1024;
+    size_t block_x = 512;
+    size_t block_y = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(block_x, block_y);
+
+    // To avoid atomic operations, we launch one thread per input (im)
+    // element; each thread accumulates all col contributions that map to it.
+    col2im<T><<<grid, threads, 0, context.stream()>>>(
+        num_kernels, col.data<T>(), im_height, im_width, dilation[0],
+        dilation[1], filter_height, filter_width, stride[0], stride[1],
+        padding[0], padding[1], col_height, col_width, im->data<T>());
+  }
+};
+
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CUDADeviceContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CUDADeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CUDADeviceContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             platform::CUDADeviceContext, double>;
+
+template <class T>
+__global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
+                          int im_width, int filter_height, int filter_width,
+                          int stride_height, int stride_width,
+                          int padding_height, int padding_width, int col_height,
+                          int col_width, T* col_data) {
+  int swid = blockIdx.x;
+  int shid = blockIdx.y;
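+  // Each block handles one sliding-window position (swid, shid); the
+  // threads sweep the filter window and the input channels.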
+  for (int channelid = threadIdx.z; channelid < im_channels;
+       channelid += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
+        int width_offset = idx + swid * stride_width - padding_width;
+        int height_offset = idy + shid * stride_height - padding_height;
+        int im_offset = width_offset + height_offset * im_width +
+                        channelid * im_height * im_width;
+
+        int col_offset = idx + idy * filter_width +
+                         channelid * filter_height * filter_width +
+                         (shid * col_width + swid) *
+                             (im_channels * filter_height * filter_width);
+
+        col_data[col_offset] =
+            (height_offset >= im_height || height_offset < 0 ||
+             width_offset >= im_width || width_offset < 0)
+                ? T(0)
+                : im_data[im_offset];
+      }
+    }
+  }
+}
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ */
+template <class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& im, const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col) {
+    PADDLE_ENFORCE(im.dims().size() == 3);
+    PADDLE_ENFORCE(col->dims().size() == 5);
+    int im_channels = im.dims()[0];
+    int im_height = im.dims()[1];
+    int im_width = im.dims()[2];
+    int filter_height = col->dims()[3];
+    int filter_width = col->dims()[4];
+    int col_height = col->dims()[0];
+    int col_width = col->dims()[1];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    int block_dim_x = 0;
+    int block_dim_y = 0;
+    if (filter_height <= 4 && filter_width <= 4) {
+      block_dim_x = 4;
+      block_dim_y = 4;
+    } else if (filter_height <= 8 && filter_width <= 8) {
+      block_dim_x = 8;
+      block_dim_y = 8;
+    } else if (filter_height <= 16 && filter_width <= 16) {
+      block_dim_x = 16;
+      block_dim_y = 16;
+    } else {
+      block_dim_x = 32;
+      block_dim_y = 32;
+    }
+
+    int block_dim_z = 1024 / block_dim_x / block_dim_y;
+    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
+    dim3 grid(col_width, col_height);
+    im2colOCF<T><<<grid, threads, 0, context.stream()>>>(
+        im.data<T>(), im_channels, im_height, im_width, filter_height,
+        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
+        col_width, col->data<T>());
+  }
+};
+
+template <class T>
+__global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
+                          int im_width, int filter_height, int filter_width,
+                          int stride_height, int stride_width,
+                          int padding_height, int padding_width, int col_height,
+                          int col_width, T* im_data) {
+  int swid = blockIdx.x;
+  int shid = blockIdx.y;
+  for (int channelid = threadIdx.z; channelid < im_channels;
+       channelid += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filter_height; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filter_width; idx += blockDim.x) {
+        int width_offset = idx + swid * stride_width - padding_width;
+        int height_offset = idy + shid * stride_height - padding_height;
+        int im_offset = width_offset + height_offset * im_width +
+                        channelid * im_height * im_width;
+
+        int col_offset = idx + idy * filter_width +
+                         channelid * filter_height * filter_width +
+                         (shid * col_width + swid) *
+                             (im_channels * filter_height * filter_width);
+
+        if (height_offset >= 0 && height_offset < im_height &&
+            width_offset >= 0 && width_offset < im_width) {
+          paddle::platform::CudaAtomicAdd(im_data + im_offset,
+                                          col_data[col_offset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * im = [input_channels, input_height, input_width]
+ * col =
+ *   [output_height, output_width, input_channels, filter_height, filter_width]
+ */
+template <class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                    platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im) {
+    PADDLE_ENFORCE(im->dims().size() == 3);
+    PADDLE_ENFORCE(col.dims().size() == 5);
+    int im_channels = im->dims()[0];
+    int im_height = im->dims()[1];
+    int im_width = im->dims()[2];
+    int filter_height = col.dims()[3];
+    int filter_width = col.dims()[4];
+    int col_height = col.dims()[0];
+    int col_width = col.dims()[1];
+
+    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
+                       (dilation[0] * (filter_height - 1) + 1)) /
+                              stride[0] +
+                          1,
+                      col_height,
+                      "Output_height and padding(padding_up, padding_down) are "
+                      "inconsistent.");
+    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
+                       (dilation[1] * (filter_width - 1) + 1)) /
+                              stride[1] +
+                          1,
+                      col_width,
+                      "col_width and padding(padding_left, padding_right) are "
+                      "inconsistent.");
+
+    int block_dim_x = 0;
+    int block_dim_y = 0;
+    if (filter_height <= 4 && filter_width <= 4) {
+      block_dim_x = 4;
+      block_dim_y = 4;
+    } else if (filter_height <= 8 && filter_width <= 8) {
+      block_dim_x = 8;
+      block_dim_y = 8;
+    } else if (filter_height <= 16 && filter_width <= 16) {
+      block_dim_x = 16;
+      block_dim_y = 16;
+    } else {
+      block_dim_x = 32;
+      block_dim_y = 32;
+    }
+
+    int block_dim_z = 1024 / block_dim_x / block_dim_y;
+    dim3 threads(block_dim_x, block_dim_y, std::min(block_dim_z, im_channels));
+    dim3 grid(col_width, col_height);
+    col2imOCF<T><<<grid, threads, 0, context.stream()>>>(
+        col.data<T>(), im_channels, im_height, im_width, filter_height,
+        filter_width, stride[0], stride[1], padding[0], padding[1], col_height,
+        col_width, im->data<T>());
+  }
+};
+
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CUDADeviceContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CUDADeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CUDADeviceContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h
new file mode 100644
index 0000000000..38f2c9fe0a
--- /dev/null
+++ b/paddle/operators/math/im2col.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/* The storage format of the colData in the Im2ColFunctor and Col2ImFunctor. */
+enum class ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts three-dimensional image data (CHW) into five-dimensional
+ *        colData in the Im2ColFunctor calculation; the Col2ImFunctor
+ *        calculation reverses the transformation.
+ *
+ * \param imData    Image data.
+ * \param imShape   The shape of imData,
+ *                  [input_channels, input_height, input_width].
+ * \param colData   Column data.
+ * \param colShape  The shape of colData.
+ * \param dilations Dilation data, 2-dimensional:
+ *                  [dilation_height, dilation_width].
+ * \param strides   Stride data, 2-dimensional:
+ *                  [stride_height, stride_width].
+ * \param paddings  Padding data, 4-dimensional:
+ *                  [up_pad, left_pad, down_pad, right_pad].
+ *
+ * If the template argument Format is kCFO, the shape of colData is:
+ * [input_channels, filter_height, filter_width, output_height, output_width]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of the convolution matrix is [height, width], where height
+ * equals input_channels * filter_height * filter_width and width equals
+ * output_height * output_width.
+ *
+ * Reshape:
+ *     shape of colData           shape of convolution matrix
+ *     [input_channels,
+ *      filter_height,
+ *      filter_width,      ======>      [height, width]
+ *      output_height,
+ *      output_width]
+ *
+ * If the template argument Format is kOCF, the shape of colData is:
+ * [output_height, output_width, input_channels, filter_height, filter_width]
+ * It is therefore easy to reshape colData into a sequence matrix for RNN
+ * calculation. The shape of the sequence matrix is [seq_length, step_size],
+ * where seq_length equals output_height * output_width and step_size equals
+ * input_channels * filter_height * filter_width.
+ *
+ * Reshape:
+ *     shape of colData             shape of sequence matrix
+ *     [output_height,
+ *      output_width,
+ *      input_channels,    ======>    [seqLength, stepSize]
+ *      filter_height,
+ *      filter_width]
+ *
+ * \note The caller must ensure that imShape.inputChannels equals
+ *       colShape.inputChannels.
+ */
+template <ColFormat Format, typename DeviceContext, typename T>
+class Im2ColFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& im,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* col);
+};
+
+template <ColFormat Format, typename DeviceContext, typename T>
+class Col2ImFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
+                  const std::vector<int>& dilation,
+                  const std::vector<int>& stride,
+                  const std::vector<int>& padding, framework::Tensor* im);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
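The kCFO layout described in the header is convenient precisely because the five-dimensional colData is already row-major, so viewing it as an [input_channels * filter_height * filter_width, output_height * output_width] matrix requires no data movement. A small sketch of the index arithmetic (function names are illustrative, not Paddle API):

// Flat row-major offset of element (c, fh, fw, oh, ow) in a
// [C, FH, FW, OH, OW] colData buffer.
inline int ColOffsetCFO(int c, int fh, int fw, int oh, int ow,
                        int FH, int FW, int OH, int OW) {
  return (((c * FH + fh) * FW + fw) * OH + oh) * OW + ow;
}

// Row index in the [C*FH*FW, OH*OW] convolution-matrix view:
inline int ConvRow(int c, int fh, int fw, int FH, int FW) {
  return (c * FH + fh) * FW + fw;
}

// Column index in the view:
inline int ConvCol(int oh, int ow, int OW) { return oh * OW + ow; }

// Invariant: ColOffsetCFO(...) == ConvRow(...) * (OH * OW) + ConvCol(...),
// so the 5-D buffer *is* the 2-D matrix, byte for byte.

A GEMM of an [output_channels, input_channels * filter_height * filter_width] filter matrix against that view then yields the [output_channels, output_height * output_width] feature map directly.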
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
new file mode 100644
index 0000000000..1ba24325ff
--- /dev/null
+++ b/paddle/operators/math/im2col_test.cc
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/im2col.h"
+#include <gtest/gtest.h>
+
+template <typename DeviceContext, typename Place>
+void testIm2col() {
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor input;
+  paddle::framework::Tensor output_cfo;
+  paddle::framework::Tensor output_ocf;
+  paddle::framework::Tensor output_tmp;
+
+  /**
+   * input = [0, 1, 2,
+   *          3, 4, 5]
+   *
+   * output_cfo = [0, 1
+   *               1, 2
+   *               3, 4
+   *               4, 5]
+   *
+   * output_ocf = [0, 1, 3, 4
+   *               1, 2, 4, 5]
+   *
+   * col2im_cfo = [0, 2, 2,
+   *               3, 8, 5]
+   *
+   * col2im_ocf = [0, 2, 2,
+   *               3, 8, 5]
+   */
+  int input_height = 2;
+  int input_width = 3;
+  int filter_size = 2;
+  std::vector<int> stride({1, 1});  // stride_y, stride_x
+  std::vector<int> padding(
+      {0, 0, 0, 0});                  // up_pad, left_pad, down_pad, right_pad
+  std::vector<int> dilation({1, 1});  // dilation_y, dilation_x
+  int output_height =
+      (input_height - filter_size + padding[0] + padding[2]) / stride[0] + 1;
+  int output_width =
+      (input_width - filter_size + padding[1] + padding[3]) / stride[1] + 1;
+  float* input_ptr = input_tmp.mutable_data<float>(
+      {1, input_height, input_width}, paddle::platform::CPUPlace());
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input_ptr, arr, 6 * sizeof(float));
+
+  auto* place = new Place();
+  DeviceContext* context = new DeviceContext(*place);
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    Copy(input_tmp, *place, *context, &input);
+  }
+  output_cfo.mutable_data<float>(
+      {1, filter_size, filter_size, output_height, output_width}, *place);
+  output_ocf.mutable_data<float>(
+      {output_height, output_width, 1, filter_size, filter_size}, *place);
+
+  // Im2Col
+  paddle::operators::math::Im2ColFunctor<
+      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
+      im2col;
+  paddle::operators::math::Im2ColFunctor<
+      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
+      im2col_ocf;
+
+  im2col(*context, input, dilation, stride, padding, &output_cfo);
+  im2col_ocf(*context, input, dilation, stride, padding, &output_ocf);
+
+  float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
+  float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
+
+  float* out_cfo_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_cfo_ptr = output_cfo.data<float>();
+  } else {
+    Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
+    out_cfo_ptr = output_tmp.data<float>();
+  }
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
+  }
+
+  float* out_ocf_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_ocf_ptr = output_ocf.data<float>();
+  } else {
+    Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
+    out_ocf_ptr = output_tmp.data<float>();
+  }
+
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
+  }
+
+  // Col2Im: kCFO
+  paddle::operators::math::Col2ImFunctor<
+      paddle::operators::math::ColFormat::kCFO, DeviceContext, float>
+      col2im;
+  paddle::operators::math::Col2ImFunctor<
+      paddle::operators::math::ColFormat::kOCF, DeviceContext, float>
+      col2im_ocf;
+  float col2im_data[] = {0, 2, 2, 3, 8, 5};
+
+  memset(input_ptr, 0, 6 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    Copy(input_tmp, *place, *context, &input);
+  }
+
+  col2im(*context, output_cfo, dilation, stride, padding, &input);
+
+  float* in_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    in_ptr = input_tmp.data<float>();
+  }
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(in_ptr[i], col2im_data[i]);
+  }
+
+  // Col2Im: kOCF
+  memset(input_ptr, 0, 6 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    Copy(input_tmp, *place, *context, &input);
+  }
+
+  col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    in_ptr = input_tmp.data<float>();
+  }
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(in_ptr[i], col2im_data[i]);
+  }
+
+  delete place;
+  delete context;
+}
+
+TEST(math, im2col) {
+  testIm2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
+#ifdef PADDLE_WITH_CUDA
+  testIm2col<paddle::platform::CUDADeviceContext,
+             paddle::platform::CUDAPlace>();
+#endif
+}
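The col2im expectations in the test above can be checked by hand: each column entry is scattered back to the pixel it was read from, so pixels covered by two sliding windows accumulate twice, which is where the 2s and the 8 in col2im_data come from. A tiny host-side reference, illustrative only and hard-coded to the test's 2 x 3 image and 2 x 2 filter:

#include <cassert>
#include <vector>

// Scatter-add each kCFO column entry back to its source pixel in the
// 2 x 3 image (2 x 2 filter, stride 1, no padding), mirroring col2im.
std::vector<float> Col2ImReference(const std::vector<float>& col) {
  std::vector<float> im(6, 0.0f);
  for (int fy = 0; fy < 2; ++fy)      // filter row
    for (int fx = 0; fx < 2; ++fx)    // filter column
      for (int ox = 0; ox < 2; ++ox)  // output column (the output row is 0)
        im[fy * 3 + (ox + fx)] += col[(fy * 2 + fx) * 2 + ox];
  return im;
}

int main() {
  // im2col output of [0,1,2; 3,4,5] from the test above.
  std::vector<float> got = Col2ImReference({0, 1, 1, 2, 3, 4, 4, 5});
  std::vector<float> want = {0, 2, 2, 3, 8, 5};
  assert(got == want);  // center pixels overlap two windows: 1+1 and 4+4
  return 0;
}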
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc
new file mode 100644
index 0000000000..d453102ece
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
+#include "paddle/operators/math/detail/lstm_kernel.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+struct LstmUnitFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext& context,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
+                               cand_act, gate_act, cell_act);
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+    }
+  }
+};
+
+template <class T>
+struct LstmUnitGradFunctor<platform::CPUDeviceContext, T> {
+  static void compute(const platform::CPUDeviceContext& context,
+                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    for (int b = 0; b < batch_size; b++) {
+      detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
+                                frame_size, cand_act, gate_act, cell_act);
+
+      value.gate_value += frame_size * 4;
+      value.state_value += frame_size;
+      value.state_active_value += frame_size;
+      value.output_value += frame_size;
+      if (value.prev_state_value) {
+        value.prev_state_value += frame_size;
+      }
+
+      grad.gate_grad += frame_size * 4;
+      grad.state_grad += frame_size;
+      grad.state_active_grad += frame_size;
+      grad.output_grad += frame_size;
+      if (grad.prev_state_grad) {
+        grad.prev_state_grad += frame_size;
+      }
+    }
+  }
+};
+
+template class LstmUnitFunctor<platform::CPUDeviceContext, float>;
+template class LstmUnitFunctor<platform::CPUDeviceContext, double>;
+template class LstmUnitGradFunctor<platform::CPUDeviceContext, float>;
+template class LstmUnitGradFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
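The per-batch pointer bumps in the CPU functors above encode the buffer layout: gate_value packs the four LSTM gates contiguously for each batch row (hence the step of 4 * frame_size), while the state and output buffers advance one frame at a time. A layout sketch; the helper and the exact gate order shown are assumptions for illustration, not Paddle API:

// Layout sketch (gate order is an assumption):
//
//   gate_value, batch row b:
//     [ input(0..F-1) | forget(0..F-1) | candidate(0..F-1) | output(0..F-1) ]
//
// so advancing by 4 * frame_size moves to the next batch row.
template <class T>
T* GateOfBatch(T* gate_value, int frame_size, int batch, int which_gate) {
  // which_gate: 0..3, one of the four packed gate blocks of this row.
  return gate_value + batch * 4 * frame_size + which_gate * frame_size;
}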
diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu
new file mode 100644
index 0000000000..82065d699f
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.cu
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/lstm_gpu_kernel.h"
+#include "paddle/operators/math/detail/lstm_kernel.h"
+#include "paddle/operators/math/lstm_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+struct LstmUnitFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext& context,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
+                                frame_size, batch_size, cand_act, gate_act,
+                                cell_act);
+  }
+};
+
+template <class T>
+struct LstmUnitGradFunctor<platform::CUDADeviceContext, T> {
+  static void compute(const platform::CUDADeviceContext& context,
+                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType& gate_act,
+                      const detail::ActivationType& cell_act,
+                      const detail::ActivationType& cand_act) {
+    detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
+                              frame_size, batch_size, cand_act, gate_act,
+                              cell_act);
+  }
+};
+
+template class LstmUnitFunctor<platform::CUDADeviceContext, float>;
+template class LstmUnitFunctor<platform::CUDADeviceContext, double>;
+template class LstmUnitGradFunctor<platform::CUDADeviceContext, float>;
+template class LstmUnitGradFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
new file mode 100644
index 0000000000..e1ad6b64d2
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/detail/activation_functions.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+struct LstmMetaValue {
+  T *gate_value;
+  T *prev_state_value;
+  T *state_value;
+  T *state_active_value;
+  T *output_value;
+  T *check_ig;
+  T *check_fg;
+  T *check_og;
+};
+
+template <class T>
+struct LstmMetaGrad {
+  T *gate_grad;
+  T *prev_state_grad;
+  T *state_grad;
+  T *state_active_grad;
+  T *output_grad;
+  T *check_ig_grad;
+  T *check_fg_grad;
+  T *check_og_grad;
+};
+
+template <typename DeviceContext, typename T>
+class LstmUnitFunctor {
+ public:
+  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
+                      int frame_size, int batch_size,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+
+template <typename DeviceContext, typename T>
+class LstmUnitGradFunctor {
+ public:
+  static void compute(const DeviceContext &context, LstmMetaValue<T> value,
+                      LstmMetaGrad<T> grad, int frame_size, int batch_size,
+                      const detail::ActivationType &gate_act,
+                      const detail::ActivationType &cell_act,
+                      const detail::ActivationType &cand_act);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
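Because these interfaces take raw pointers, a caller wires the LstmMetaValue fields into its own buffers before calling compute. A hedged wiring sketch; the buffer shapes are illustrative, and treating null peephole pointers as "no peepholes" is an assumption here, by analogy with the null check on prev_state_value in the CPU kernel:

#include <vector>
#include "paddle/operators/math/lstm_compute.h"

// Hypothetical helper that points an LstmMetaValue at caller-owned buffers.
paddle::operators::math::LstmMetaValue<float> MakeLstmValue(
    std::vector<float>* gates, std::vector<float>* state,
    std::vector<float>* state_act, std::vector<float>* output) {
  paddle::operators::math::LstmMetaValue<float> value;
  value.gate_value = gates->data();       // batch_size * 4 * frame_size
  value.prev_state_value = nullptr;       // no previous state at t = 0
  value.state_value = state->data();      // batch_size * frame_size
  value.state_active_value = state_act->data();
  value.output_value = output->data();
  value.check_ig = value.check_fg = value.check_og =
      nullptr;  // no peepholes (assumed legal; see note above)
  return value;
}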
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
new file mode 100644
index 0000000000..dcf4b85e1a
--- /dev/null
+++ b/paddle/operators/math/math_function.cc
@@ -0,0 +1,336 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/operators/math/math_function_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <>
+void gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+              beta, C, ldc);
+}
+
+template <>
+void gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
+              beta, C, ldc);
+}
+
+template <>
+void gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K, const float alpha,
+    const float* A, const int lda, const float* B, const int ldb,
+    const float beta, float* C, const int ldc) {
+  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
+template <>
+void gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const int lda, const double* B,
+    const int ldb, const double beta, double* C, const int ldc) {
+  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
+              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
+              lda, B, ldb, beta, C, ldc);
+}
+
+template <>
+void matmul<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
+  auto dim_a = matrix_a.dims();
+  auto dim_b = matrix_b.dims();
+  auto dim_out = matrix_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul must be matrices");
+
+  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+                     platform::is_cpu_place(matrix_b.place()) &&
+                     platform::is_cpu_place(matrix_out->place()),
+                 "Matrices must all be in CPUPlace");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+  gemm<platform::CPUDeviceContext, float>(
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
+}
+
+template <>
+void matmul<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
+  auto dim_a = matrix_a.dims();
+  auto dim_b = matrix_b.dims();
+  auto dim_out = matrix_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul must be matrices");
+
+  PADDLE_ENFORCE(platform::is_cpu_place(matrix_a.place()) &&
+                     platform::is_cpu_place(matrix_b.place()) &&
+                     platform::is_cpu_place(matrix_out->place()),
+                 "Matrices must all be in CPUPlace");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+  gemm<platform::CPUDeviceContext, double>(
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
+}
+
+#ifdef PADDLE_WITH_MKLML
+// Use MKL's cblas_{s,d}gemm_batch if available: run one group of size
+// batchCount.
+template <>
+void batched_gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  auto a_array = std::vector<const float*>(batchCount);
+  auto b_array = std::vector<const float*>(batchCount);
+  auto c_array = std::vector<float*>(batchCount);
+  for (int k = 0; k < batchCount; ++k) {
+    a_array[k] = &A[k * strideA];
+    b_array[k] = &B[k * strideB];
+    c_array[k] = &C[k * M * N];
+  }
+  cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
+                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
+                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
+}
+
+template <>
+void batched_gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  auto a_array = std::vector<const double*>(batchCount);
+  auto b_array = std::vector<const double*>(batchCount);
+  auto c_array = std::vector<double*>(batchCount);
+  for (int k = 0; k < batchCount; ++k) {
+    a_array[k] = &A[k * strideA];
+    b_array[k] = &B[k * strideB];
+    c_array[k] = &C[k * M * N];
+  }
+  cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
+                    a_array.data(), &lda, b_array.data(), &ldb, &beta,
+                    c_array.data(), &ldc, 1 /* group_count */, &batchCount);
+}
+#else
+// Below is a naive but correct serial implementation that simply loops over
+// the batch dimension. It is the fallback used when the batched gemm
+// functions of Intel MKL are not available. In the future, this computation
+// should be parallelized.
+template <>
+void batched_gemm<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  for (int k = 0; k < batchCount; ++k) {
+    const float* Ak = &A[k * strideA];
+    const float* Bk = &B[k * strideB];
+    float* Ck = &C[k * M * N];
+    gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
+                                            alpha, Ak, Bk, beta, Ck);
+  }
+}
+
+template <>
+void batched_gemm<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  for (int k = 0; k < batchCount; ++k) {
+    const double* Ak = &A[k * strideA];
+    const double* Bk = &B[k * strideB];
+    double* Ck = &C[k * M * N];
+    gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
+                                             alpha, Ak, Bk, beta, Ck);
+  }
+}
+#endif
+
+template <>
+void gemv<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
+    const int N, const float alpha, const float* A, const float* B,
+    const float beta, float* C) {
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
+}
+
+template <>
+void gemv<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const bool trans_a, const int M,
+    const int N, const double alpha, const double* A, const double* B,
+    const double beta, double* C) {
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1);
+}
+
+template <>
+void axpy<platform::CPUDeviceContext, float>(
+    const platform::CPUDeviceContext& context, const int n, const float alpha,
+    const float* x, float* y) {
+  cblas_saxpy(n, alpha, x, 1, y, 1);
+}
+
+template <>
+void axpy<platform::CPUDeviceContext, double>(
+    const platform::CPUDeviceContext& context, const int n, const double alpha,
+    const double* x, double* y) {
+  cblas_daxpy(n, alpha, x, 1, y, 1);
+}
+
+template struct SetConstant<platform::CPUDeviceContext, float>;
+template struct SetConstant<platform::CPUDeviceContext, double>;
+template struct SetConstant<platform::CPUDeviceContext, int>;
+template struct SetConstant<platform::CPUDeviceContext, int64_t>;
+template struct SetConstant<platform::CPUDeviceContext, bool>;
+
+#define DEFINE_CPU_TRANS(RANK)                                          \
+  template struct Transpose<platform::CPUDeviceContext, float, RANK>;   \
+  template struct Transpose<platform::CPUDeviceContext, double, RANK>;  \
+  template struct Transpose<platform::CPUDeviceContext, int, RANK>;     \
+  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
+  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;
+
+DEFINE_CPU_TRANS(1);
+DEFINE_CPU_TRANS(2);
+DEFINE_CPU_TRANS(3);
+DEFINE_CPU_TRANS(4);
+DEFINE_CPU_TRANS(5);
+DEFINE_CPU_TRANS(6);
+
+struct TensorSetConstantCPU {
+  TensorSetConstantCPU(framework::Tensor* tensor, float value)
+      : tensor_(tensor), value_(value) {}
+  template <typename T>
+  void operator()() const {
+    auto cpu = platform::CPUPlace();
+    auto* begin = tensor_->mutable_data<T>(cpu);
+    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
+  }
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::CPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantCPU(tensor, value));
+}
+
+struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
+  TensorSetConstantWithPlace(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename Place>
+  void operator()(Place place) const {
+    set_constant_with_place<Place>(context_, tensor_, value_);
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value) {
+  TensorSetConstantWithPlace func(context, tensor, value);
+#ifdef PADDLE_WITH_CUDA
+  tensor->place().apply_visitor(func);
+#else
+  func(platform::CPUPlace());
+#endif
+}
+
+template <typename T>
+struct RowwiseAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+
+    auto in = framework::EigenMatrix<T>::From(input);
+    auto vec = framework::EigenVector<T>::Flatten(vector);
+    auto out = framework::EigenMatrix<T>::From(*output);
+
+    for (int64_t i = 0; i < in_dims[0]; ++i) {
+      out.chip(i, 0) = in.chip(i, 0) + vec;
+    }
+  }
+};
+
+template struct RowwiseAdd<platform::CPUDeviceContext, float>;
+template struct RowwiseAdd<platform::CPUDeviceContext, double>;
+
+template struct ColwiseSum<platform::CPUDeviceContext, float>;
+template struct ColwiseSum<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
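One consequence of the batched_gemm contract above is worth noting: the output stride is fixed at M * N, but strideA and strideB are caller-supplied, so passing a stride of zero broadcasts one operand across the whole batch; both the MKL pointer-table path and the serial fallback then reuse the same block for every k. A hedged usage sketch (the hypothetical helper assumes a context and buffers set up as elsewhere in this file):

// Hypothetical helper: C_k = A_k * B for every k, broadcasting one B.
void BatchedMulSharedB(const paddle::platform::CPUDeviceContext& context,
                       int M, int N, int K, const float* A, const float* B,
                       float* C, int batchCount) {
  paddle::operators::math::batched_gemm<paddle::platform::CPUDeviceContext,
                                        float>(
      context, CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, A, B, 0.0f, C,
      batchCount, /*strideA=*/M * K, /*strideB=*/0);
}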
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
new file mode 100644
index 0000000000..d47a7f818d
--- /dev/null
+++ b/paddle/operators/math/math_function.cu
@@ -0,0 +1,330 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/framework/data_type.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/math_function_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <>
+void gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+
+  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, N));
+}
+
+template <>
+void gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, N));
+}
+
+template <>
+void gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K, const float alpha,
+    const float* A, const int lda, const float* B, const int ldb,
+    const float beta, float* C, const int ldc) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, ldc));
+}
+
+template <>
+void gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const bool transA,
+    const bool transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const int lda, const double* B,
+    const int ldb, const double beta, double* C, const int ldc) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
+  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A,
+      lda, &beta, C, ldc));
+}
+
+template <>
+void matmul<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, float alpha,
+    framework::Tensor* matrix_out, float beta) {
+  auto dim_a = matrix_a.dims();
+  auto dim_b = matrix_b.dims();
+  auto dim_out = matrix_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul must be matrices");
+
+  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
+                     platform::is_gpu_place(matrix_b.place()) &&
+                     platform::is_gpu_place(matrix_out->place()),
+                 "Matrices must all be in CUDAPlace");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+  gemm<platform::CUDADeviceContext, float>(
+      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+      matrix_b.data<float>(), beta, matrix_out->data<float>());
+}
+
+template <>
+void matmul<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context,
+    const framework::Tensor& matrix_a, bool trans_a,
+    const framework::Tensor& matrix_b, bool trans_b, double alpha,
+    framework::Tensor* matrix_out, double beta) {
+  auto dim_a = matrix_a.dims();
+  auto dim_b = matrix_b.dims();
+  auto dim_out = matrix_out->dims();
+  PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
+                 "The input and output of matmul must be matrices");
+
+  PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
+                     platform::is_gpu_place(matrix_b.place()) &&
+                     platform::is_gpu_place(matrix_out->place()),
+                 "Matrices must all be in CUDAPlace");
+
+  int M = dim_out[0];
+  int N = dim_out[1];
+  int K = (trans_a == false) ? dim_a[1] : dim_a[0];
+
+  CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
+  CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
+
+  gemm<platform::CUDADeviceContext, double>(
+      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+      matrix_b.data<double>(), beta, matrix_out->data<double>());
+}
+
+template <>
+void batched_gemm<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int strideC = M * N;
+
+  PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
+      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+}
+
+template <>
+void batched_gemm<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  // Note that cuBLAS uses Fortran (column-major) order, so the argument
+  // order differs from the CBLAS convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
+  int ldc = N;
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int strideC = M * N;
+
+  PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
+      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
+      strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount));
+}
+
+template <>
+void gemv<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
+    const int N, const float alpha, const float* A, const float* B,
+    const float beta, float* C) {
+  cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+  PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(),
+                                                cuTransA, N, M, &alpha, A, N, B,
+                                                1, &beta, C, 1));
+}
+
+template <>
+void gemv<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const bool trans_a, const int M,
+    const int N, const double alpha, const double* A, const double* B,
+    const double beta, double* C) {
+  cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N;
+  PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(),
+                                                cuTransA, N, M, &alpha, A, N, B,
+                                                1, &beta, C, 1));
+}
+
+template <>
+void axpy<platform::CUDADeviceContext, float>(
+    const platform::CUDADeviceContext& context, const int n, const float alpha,
+    const float* x, float* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n,
+                                                &alpha, x, 1, y, 1));
+}
+
+template <>
+void axpy<platform::CUDADeviceContext, double>(
+    const platform::CUDADeviceContext& context, const int n, const double alpha,
+    const double* x, double* y) {
+  PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n,
+                                                &alpha, x, 1, y, 1));
+}
+
+template struct SetConstant<platform::CUDADeviceContext, float>;
+template struct SetConstant<platform::CUDADeviceContext, double>;
+template struct SetConstant<platform::CUDADeviceContext, int>;
+template struct SetConstant<platform::CUDADeviceContext, int64_t>;
+template struct SetConstant<platform::CUDADeviceContext, bool>;
+
+#define DEFINE_GPU_TRANS(RANK)                                         \
+  template struct Transpose<platform::CUDADeviceContext, float, RANK>; \
+  template struct Transpose<platform::CUDADeviceContext, double, RANK>;
+
+DEFINE_GPU_TRANS(1);
+DEFINE_GPU_TRANS(2);
+DEFINE_GPU_TRANS(3);
+DEFINE_GPU_TRANS(4);
+DEFINE_GPU_TRANS(5);
+DEFINE_GPU_TRANS(6);
+
+struct TensorSetConstantGPU {
+  TensorSetConstantGPU(const platform::DeviceContext& context,
+                       framework::Tensor* tensor, float value)
+      : context_(context), tensor_(tensor), value_(value) {}
+
+  template <typename T>
+  void operator()() const {
+    SetConstant<platform::CUDADeviceContext, T> functor;
+    functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
+            tensor_, static_cast<T>(value_));
+  }
+
+  const platform::DeviceContext& context_;
+  framework::Tensor* tensor_;
+  float value_;
+};
+
+template <>
+void set_constant_with_place<platform::CUDAPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  framework::VisitDataType(framework::ToDataType(tensor->type()),
+                           TensorSetConstantGPU(context, tensor, value));
+}
+
+template <typename T>
+__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
+                                 int num) {
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+       i += blockDim.x * gridDim.x) {
+    // Exact integer div/mod; the reciprocal-multiply shortcut
+    // (h = i * (1.0 / width)) can round to the wrong row for large i.
+    int h = i / width;
+    int w = i - h * width;
+    c[i] = a[i] + b[w];
+  }
+}
+
+template <typename T>
+struct RowwiseAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& vector, framework::Tensor* output) {
+    auto in_dims = input.dims();
+    auto size = input.numel() / in_dims[0];
+    PADDLE_ENFORCE_EQ(vector.numel(), size);
+    PADDLE_ENFORCE_EQ(output->dims(), in_dims);
+    int blocks = 512;
+    int grids = (input.numel() + blocks - 1) / blocks;
+    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
+        input.data<T>(), vector.data<T>(), output->data<T>(),
+        static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
+  }
+};
+
+template struct RowwiseAdd<platform::CUDADeviceContext, float>;
+template struct RowwiseAdd<platform::CUDADeviceContext, double>;
+template struct ColwiseSum<platform::CUDADeviceContext, float>;
+// template struct ColwiseSum<platform::CUDADeviceContext, double>;
+// The ColwiseSum<platform::CUDADeviceContext, double> instantiation fails in
+// debug mode, and only in this case, so it is reimplemented below as a gemv
+// against a vector of ones.
+template <>
+void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
+    const platform::CUDADeviceContext& context, const framework::Tensor& input,
+    framework::Tensor* vector) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(vector->numel(), size);
+  framework::Tensor one;
+  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
+  SetConstant<platform::CUDADeviceContext, double> set;
+  set(context, &one, static_cast<double>(1.0));
+  gemv<platform::CUDADeviceContext, double>(
+      context, true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]),
+      1.0, input.data<double>(), one.data<double>(), 0.0,
+      vector->data<double>());
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
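Every cuBLAS call above passes B before A and swaps M with N. That is a pure layout identity: a row-major M x N matrix occupies exactly the bytes of the column-major N x M matrix C^T, so asking column-major cuBLAS for C^T = op(B)^T * op(A)^T writes the row-major C in place with no explicit transpose. A self-contained CPU sketch (illustrative only) verifying the identity:

#include <cassert>
#include <vector>

// Row-major reference: C[M x N] = A[M x K] * B[K x N].
void GemmRowMajor(int M, int N, int K, const float* A, const float* B,
                  float* C) {
  for (int i = 0; i < M; ++i)
    for (int j = 0; j < N; ++j) {
      float s = 0.0f;
      for (int p = 0; p < K; ++p) s += A[i * K + p] * B[p * N + j];
      C[i * N + j] = s;
    }
}

// Column-major product with the operands swapped: computes
// C^T[N x M] = B^T[N x K] * A^T[K x M], writing into the same buffer
// layout a row-major C would use.
void GemmColMajorSwapped(int M, int N, int K, const float* A, const float* B,
                         float* C) {
  for (int j = 0; j < M; ++j)      // column j of C^T
    for (int i = 0; i < N; ++i) {  // row i of C^T
      float s = 0.0f;
      // B^T(i, p) is B[p * N + i]; A^T(p, j) is A[j * K + p].
      for (int p = 0; p < K; ++p) s += B[p * N + i] * A[j * K + p];
      C[j * N + i] = s;  // column-major C^T(i, j) == row-major C(j, i)
    }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A(M * K), B(K * N), C1(M * N), C2(M * N);
  for (int i = 0; i < M * K; ++i) A[i] = static_cast<float>(i);
  for (int i = 0; i < K * N; ++i) B[i] = static_cast<float>(i + 1);
  GemmRowMajor(M, N, K, A.data(), B.data(), C1.data());
  GemmColMajorSwapped(M, N, K, A.data(), B.data(), C2.data());
  for (int i = 0; i < M * N; ++i) assert(C1[i] == C2[i]);
  return 0;
}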
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
new file mode 100644
index 0000000000..8cc03c2ba0
--- /dev/null
+++ b/paddle/operators/math/math_function.h
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_MKLML
+#include <mkl_cblas.h>
+#include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
+#endif
+
+#ifdef PADDLE_USE_ATLAS
+extern "C" {
+#include <cblas.h>
+#include <clapack.h>
+}
+#endif
+
+#ifdef PADDLE_USE_OPENBLAS
+#include <cblas.h>
+#include <lapacke.h>
+#endif
+
+#ifndef LAPACK_FOUND
+extern "C" {
+#include <cblas.h>
+int LAPACKE_sgetrf(int matrix_layout, int m, int n, float* a, int lda,
+                   int* ipiv);
+int LAPACKE_dgetrf(int matrix_layout, int m, int n, double* a, int lda,
+                   int* ipiv);
+int LAPACKE_sgetri(int matrix_layout, int n, float* a, int lda,
+                   const int* ipiv);
+int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
+                   const int* ipiv);
+}
+#endif
+
+#include <cmath>
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Supports contiguous memory only for now.
+// If transA = N and transB = N, then
+// matrix A: M * K, matrix B: K * N, matrix C: M * N.
+// For more detailed info, please refer to
+// http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
+template <typename DeviceContext, typename T>
+void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
+          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+          const T alpha, const T* A, const T* B, const T beta, T* C);
+
+// gemm wrapper with leading-dimension arguments for matrices that are not
+// contiguous in memory
+template <typename DeviceContext, typename T>
+void gemm(const DeviceContext& context, const bool transA, const bool transB,
+          const int M, const int N, const int K, const T alpha, const T* A,
+          const int lda, const T* B, const int ldb, const T beta, T* C,
+          const int ldc);
+
+// matrix multiply for contiguous memory
+template <typename DeviceContext, typename T>
+void matmul(const DeviceContext& context, const framework::Tensor& matrix_a,
+            bool trans_a, const framework::Tensor& matrix_b, bool trans_b,
+            T alpha, framework::Tensor* matrix_out, T beta);
+
+// Batched gemm
+template <typename DeviceContext, typename T>
+void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
+                  const CBLAS_TRANSPOSE transB, const int M, const int N,
+                  const int K, const T alpha, const T* A, const T* B,
+                  const T beta, T* C, const int batchCount, const int strideA,
+                  const int strideB);
+
+template <typename DeviceContext, typename T>
+void gemv(const DeviceContext& context, const bool trans_a, const int M,
+          const int N, const T alpha, const T* A, const T* B, const T beta,
+          T* C);
+
+template <typename DeviceContext, typename T>
+void axpy(const DeviceContext& context, const int n, const T alpha, const T* x,
+          T* y);
+
+template <typename DeviceContext, typename T, int Rank>
+struct Transpose {
+  void operator()(const DeviceContext& context, const framework::Tensor& in,
+                  framework::Tensor* out, const std::vector<int>& axis);
+};
+
+template <typename DeviceContext, typename T>
+struct SetConstant {
+  void operator()(const DeviceContext& context, framework::Tensor* tensor,
+                  T num);
+};
+
+template <typename Place>
+void set_constant_with_place(const platform::DeviceContext& context,
+                             framework::Tensor* tensor, float value);
+
+void set_constant(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, float value);
+
+template <typename DeviceContext, typename T>
+struct RowwiseAdd {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& vec, framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename T>
+struct ColwiseSum {
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* vec);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h
new file mode 100644
index 0000000000..de591626df
--- /dev/null
+++ b/paddle/operators/math/math_function_impl.h
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/data_type.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename DeviceContext, typename T>
+void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
+                                               framework::Tensor* tensor,
+                                               T num) {
+  auto t = framework::EigenVector<T>::Flatten(*tensor);
+  t.device(*context.eigen_device()) = t.constant(static_cast<T>(num));
+}
+
+template <typename DeviceContext, typename T, int Rank>
+void Transpose<DeviceContext, T, Rank>::operator()(
+    const DeviceContext& context, const framework::Tensor& in,
+    framework::Tensor* out, const std::vector<int>& axis) {
+  Eigen::array<int, Rank> permute;
+  for (int i = 0; i < Rank; i++) {
+    permute[i] = axis[i];
+  }
+  auto in_dim = in.dims();
+  auto out_dim = out->dims();
+
+  auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
+  auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
+  auto* dev = context.eigen_device();
+  eigen_out.device(*dev) = eigen_in.shuffle(permute);
+}
+
+template <typename DeviceContext, typename T>
+void ColwiseSum<DeviceContext, T>::operator()(const DeviceContext& context,
+                                              const framework::Tensor& input,
+                                              framework::Tensor* out) {
+  auto in_dims = input.dims();
+  auto size = input.numel() / in_dims[0];
+  PADDLE_ENFORCE_EQ(out->numel(), size);
+
+  auto in = framework::EigenMatrix<T>::From(input);
+  auto vec = framework::EigenVector<T>::Flatten(*out);
+
+  vec.device(*context.eigen_device()) = in.sum(Eigen::array<int, 1>({{0}}));
+}
+
+// Specialization for CPU: Eigen implements a general reduction, which has a
+// large overhead on CPU, while a column-wise sum can be implemented directly
+// with a simple loop.
+template <typename T>
+class ColwiseSum<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* out) {
+    auto& in_dims = input.dims();
+    auto height = in_dims[0];
+    auto size = in_dims[1];
+    PADDLE_ENFORCE_EQ(out->numel(), size);
+
+    T* out_buf = out->mutable_data<T>(out->place());
+    const T* in_buf = input.data<T>();
+
+    for (size_t i = 0; i < static_cast<size_t>(height); ++i) {
+      for (size_t j = 0; j < static_cast<size_t>(size); ++j) {
+        if (i == 0) {
+          out_buf[j] = in_buf[i * size + j];
+        } else {
+          out_buf[j] += in_buf[i * size + j];
+        }
+      }
+    }
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
new file mode 100644
index 0000000000..c9f322b92e
--- /dev/null
+++ b/paddle/operators/math/math_function_test.cc
@@ -0,0 +1,167 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/operators/math/math_function.h"
+#include "gtest/gtest.h"
+
+TEST(math_function, gemm_notrans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
+      context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
+
+TEST(math_function, gemm_trans_cblas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemm<paddle::platform::CPUDeviceContext, float>(
+      context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1,
+      input3_ptr + 1, 4);
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
+
+TEST(math_function, zero) {
+  paddle::framework::Tensor tensor;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* t = tensor.mutable_data<float>({2, 2}, *cpu_place);
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUDeviceContext,
+                                       float>
+      functor;
+  functor(context, &tensor, 0);
+  EXPECT_EQ(t[0], 0);
+  EXPECT_EQ(t[1], 0);
+  EXPECT_EQ(t[2], 0);
+  EXPECT_EQ(t[3], 0);
+
+  functor(context, &tensor, 1);
+
+  EXPECT_EQ(t[0], 1);
+  EXPECT_EQ(t[1], 1);
+  EXPECT_EQ(t[2], 1);
+  EXPECT_EQ(t[3], 1);
+}
+
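+// GemvTest checks gemv against a naive reference: c = alpha * op(A) * b +
+// beta * c. With alpha = 1 and beta = 0, as below, data_c must equal
+// op(A) * b element-wise.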
+template <typename T>
+void GemvTest(int m, int n, bool trans) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor vec_b;
+  paddle::framework::Tensor vec_c;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  int b_num = trans ? m : n;
+  int c_num = trans ? n : m;
+
+  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
+  T* data_b = vec_b.mutable_data<T>({b_num}, *cpu_place);
+  T* data_c = vec_c.mutable_data<T>({c_num}, *cpu_place);
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    data_a[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < vec_b.numel(); ++i) {
+    data_b[i] = static_cast<T>(i);
+  }
+
+  paddle::platform::CPUDeviceContext context(*cpu_place);
+  paddle::operators::math::gemv<paddle::platform::CPUDeviceContext, T>(
+      context, trans, static_cast<int>(m), static_cast<int>(n), 1., data_a,
+      data_b, 0., data_c);
+
+  if (!trans) {
+    for (int i = 0; i < m; ++i) {
+      T sum = 0.0;
+      for (int j = 0; j < n; ++j) {
+        sum += data_a[i * n + j] * data_b[j];
+      }
+      ASSERT_FLOAT_EQ(data_c[i], sum);
+    }
+  } else {
+    for (int i = 0; i < n; ++i) {
+      T sum = 0.0;
+      for (int j = 0; j < m; ++j) {
+        sum += data_a[j * n + i] * data_b[j];
+      }
+      ASSERT_FLOAT_EQ(data_c[i], sum);
+    }
+  }
+}
+
+TEST(math_function, gemv) {
+  GemvTest<float>(3, 13, false);
+  GemvTest<double>(4, 5, false);
+  GemvTest<float>(12, 7, true);
+  GemvTest<double>(7, 9, true);
+}
+
+TEST(math_function, set_constant) {
+  paddle::framework::Tensor t;
+  t.Resize({10, 10});
+  t.mutable_data<int>(paddle::platform::CPUPlace());
+  auto* ctx = new paddle::platform::CPUDeviceContext();
+  paddle::operators::math::set_constant(*ctx, &t, 10);
+  for (int64_t i = 0; i < t.numel(); ++i) {
+    PADDLE_ENFORCE_EQ(10, t.data<int>()[i]);
+  }
+  delete ctx;
+}
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
new file mode 100644
index 0000000000..6f16d66792
--- /dev/null
+++ b/paddle/operators/math/math_function_test.cu
@@ -0,0 +1,255 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+
+TEST(math_function, notrans_mul_trans) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr, 6 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
+
+  out_gpu.mutable_data<float>({2, 2}, *gpu_place);
+
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
+
+  paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
+
+  float* out_ptr = out.data<float>();
+  context.Wait();
+  EXPECT_EQ(out_ptr[0], 5);
+  EXPECT_EQ(out_ptr[1], 14);
+  EXPECT_EQ(out_ptr[2], 14);
+  EXPECT_EQ(out_ptr[3], 50);
+  delete gpu_place;
+}
+
+TEST(math_function, trans_mul_notrans) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr, 6 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
+
+  out_gpu.mutable_data<float>({3, 3}, *gpu_place);
+
+  paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
+      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
+
+  paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
+
+  float* out_ptr = out.data<float>();
+  context.Wait();
+  EXPECT_EQ(out_ptr[0], 9);
+  EXPECT_EQ(out_ptr[1], 12);
+  EXPECT_EQ(out_ptr[2], 15);
+  EXPECT_EQ(out_ptr[3], 12);
+  EXPECT_EQ(out_ptr[4], 17);
+  EXPECT_EQ(out_ptr[5], 22);
+  EXPECT_EQ(out_ptr[6], 15);
+  EXPECT_EQ(out_ptr[7], 22);
+  EXPECT_EQ(out_ptr[8], 29);
+  delete gpu_place;
+}
+
+TEST(math_function, gemm_notrans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
+      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+
+  paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
+
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  context.Wait();
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
+
+TEST(math_function, gemm_trans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+
+  paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
+  paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
+  paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(*gpu_place);
+
+  paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
+      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+
+  paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
+  context.Wait();
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+  delete gpu_place;
+}
+
+template <typename T>
+void GemvTest(int m, int n, bool trans) {
+  paddle::framework::Tensor mat_a;
+  paddle::framework::Tensor vec_b;
+  paddle::framework::Tensor vec_c;
+  auto* cpu_place = new paddle::platform::CPUPlace();
+
+  T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
+  T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
+  T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
+
+  auto* gpu_place = new paddle::platform::CUDAPlace(0);
+  paddle::framework::Tensor g_mat_a;
+  paddle::framework::Tensor g_vec_b;
+  paddle::framework::Tensor g_vec_c;
+  T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), *gpu_place);
+  T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), *gpu_place);
+  T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), *gpu_place);
+
+  for (int i = 0; i < mat_a.numel(); ++i) {
+    data_a[i] = static_cast<T>(i);
+  }
+  for (int i = 0; i < vec_b.numel(); ++i) {
+    data_b[i] = static_cast<T>(i);
+  }
+
+  paddle::platform::CUDADeviceContext context(*gpu_place);
+  paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a);
+  paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b);
+
+  paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
+      context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
+      g_data_b, 0., g_data_c);
+
+  paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context,
+                          &vec_c);
+
+  if (!trans) {
+    for (int i = 0; i < m; ++i) {
+      T sum = 0.0;
+      for (int j = 0; j < n; ++j) {
+        sum += data_a[i * n + j] * data_b[j];
+      }
+      ASSERT_FLOAT_EQ(data_c[i], sum);
+    }
+  } else {
+    for (int i = 0; i < n; ++i) {
+      T sum = 0.0;
+      for (int j = 0; j < m; ++j) {
+        sum += data_a[j * n + i] * data_b[j];
+      }
+      ASSERT_FLOAT_EQ(data_c[i], sum);
+    }
+  }
+}
+
+TEST(math_function, gemv) {
+  GemvTest<float>(3, 13, false);
+  GemvTest<double>(3, 13, false);
+  GemvTest<float>(3, 13, true);
+  GemvTest<double>(3, 13, true);
+}
diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h
new file mode 100644
index 0000000000..ae7f1fe9be
--- /dev/null
+++ b/paddle/operators/math/matmul.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Implements the logic of numpy matmul:
+// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
+//
+// but also allows a and b to be transposed.
+//
+// Both a and b can be 1-dimensional or higher. Tensors of rank greater than 3
+// are supported only when a and b have the same rank and matching batch
+// dimensions.
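+//
+// Shape handling, sketched on hypothetical shapes (the rules implemented
+// below):
+//   a: [K]       -> treated as [1, K] (or [K, 1] when trans_a is true)
+//   a: [M, K]    -> an M x K matrix
+//   a: [B, M, K] -> a batch of B matrices with strideA = M * K
+// e.g. matmul(a: [B, M, K], b: [B, K, N]) yields out: [B, M, N] via
+// batched_gemm.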
+template <typename DeviceContext, typename T>
+class MatMulFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& a,
+                  bool trans_a, const framework::Tensor& b, bool trans_b,
+                  T alpha, framework::Tensor* out, T beta) {
+    auto dim_a = a.dims();
+    auto dim_b = b.dims();
+
+    PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
+                   "Tensors must all be in the same place.");
+    PADDLE_ENFORCE_GE(dim_a.size(), 1,
+                      "Input tensor a must be at least 1-dimensional.");
+    PADDLE_ENFORCE_GE(dim_b.size(), 1,
+                      "Input tensor b must be at least 1-dimensional.");
+
+    std::vector<int64_t> out_dim;
+    int64_t batch_count = 1;
+    if (dim_a.size() > 3) {
+      PADDLE_ENFORCE(dim_b.size() == dim_a.size(),
+                     "The dimensions of X and Y must be the same, and both of "
+                     "them should be %d-dimensional.",
+                     dim_b.size());
+      // The first (rank - 2) dimensions are folded into batch_count; the
+      // last two dimensions are used for the matrix multiplication.
+      for (int j = 0; j < dim_a.size() - 2; ++j) {
+        PADDLE_ENFORCE_EQ(dim_b[j], dim_a[j],
+                          "The %d-th dimension of X and Y must be the same.",
+                          j);
+        out_dim.push_back(dim_a[j]);
+        batch_count *= dim_a[j];
+      }
+    }
+
+    int M = 0, N = 0, kA = 0, kB = 0, batchCountA = 0, batchCountB = 0,
+        strideA = 0, strideB = 0;
+
+    switch (dim_a.size()) {
+      case 1:
+        // similar to np.matmul:
+        // prepend dimension 1 (no transpose) or append dimension 1 (transpose)
+        M = trans_a ? dim_a[0] : 1;
+        kA = trans_a ? 1 : dim_a[0];
+        break;
+      case 2:
+        M = trans_a ? dim_a[1] : dim_a[0];
+        kA = trans_a ? dim_a[0] : dim_a[1];
+        break;
+      case 3:
+        batchCountA = dim_a[0];
+        M = trans_a ? dim_a[2] : dim_a[1];
+        kA = trans_a ? dim_a[1] : dim_a[2];
+        strideA = M * kA;
+        break;
+      default:
+        batchCountA = batch_count;
+        size_t mat_s = dim_a.size() - 2;
+        M = trans_a ? dim_a[mat_s + 1] : dim_a[mat_s];
+        kA = trans_a ? dim_a[mat_s] : dim_a[mat_s + 1];
+        strideA = M * kA;
+    }
+
+    switch (dim_b.size()) {
+      case 1:
+        // similar to np.matmul:
+        // append dimension 1 (no transpose) or prepend dimension 1 (transpose)
+        kB = trans_b ? 1 : dim_b[0];
+        N = trans_b ? dim_b[0] : 1;
+        break;
+      case 2:
+        kB = trans_b ? dim_b[1] : dim_b[0];
+        N = trans_b ? dim_b[0] : dim_b[1];
+        break;
+      case 3:
+        batchCountB = dim_b[0];
+        kB = trans_b ? dim_b[2] : dim_b[1];
+        N = trans_b ? dim_b[1] : dim_b[2];
+        strideB = kB * N;
+        break;
+      default:
+        batchCountB = batch_count;
+        size_t mat_s = dim_b.size() - 2;
+        kB = trans_b ? dim_b[mat_s + 1] : dim_b[mat_s];
+        N = trans_b ? dim_b[mat_s] : dim_b[mat_s + 1];
+        strideB = kB * N;
+    }
+
+    PADDLE_ENFORCE_EQ(
+        kA, kB,
+        "First matrix's width must be equal with second matrix's height.");
+    if (batchCountA && batchCountB) {
+      PADDLE_ENFORCE_EQ(
+          batchCountA, batchCountB,
+          "When input tensors a and b are both batched, they must have the "
+          "same batch dimension.");
+    }
+    int batchCount = std::max(batchCountA, batchCountB);
+
+    CBLAS_TRANSPOSE transA = trans_a ? CblasTrans : CblasNoTrans;
+    CBLAS_TRANSPOSE transB = trans_b ? CblasTrans : CblasNoTrans;
+
+    if (!batchCount) {
+      // regular matrix multiplication
+      gemm<DeviceContext, T>(context, transA, transB, M, N, kA, alpha,
+                             a.data<T>(), b.data<T>(), beta, out->data<T>());
+    } else {
+      // batched matrix multiplication
+      batched_gemm<DeviceContext, T>(
+          context, transA, transB, M, N, kA, alpha, a.data<T>(), b.data<T>(),
+          beta, out->data<T>(), batchCount, strideA, strideB);
+    }
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc
new file mode 100644
index 0000000000..fea86675f7
--- /dev/null
+++ b/paddle/operators/math/maxouting.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// All tensors are in NCHW format, and groups must be greater than 1.
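+//
+// A hypothetical example of the mapping implemented below: with groups = 2,
+// an input of shape [N, 2 * C, H, W] yields an output of shape [N, C, H, W],
+// where output(n, c, h, w) = max over g in [0, groups) of
+// input(n, c * groups + g, h, w).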
+template <typename T>
+class MaxOutFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    int fea_size = input_height * input_width;
+    // c_size means the output size of each sample
+    int c_size = fea_size * output_channels;
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      int new_bindex = c_size * i;
+      for (int c = 0; c < output_channels; ++c) {
+        int new_cindex = fea_size * c;
+        for (int f = 0; f < fea_size; ++f) {
+          T ele = static_cast<T>(-FLT_MAX);
+          for (int ph = 0; ph < groups; ++ph) {
+            T x = input_data[(new_bindex + new_cindex) * groups +
+                             ph * fea_size + f];
+            ele = ele > x ? ele : x;
+          }
+          output_data[(new_bindex + new_cindex + f)] = ele;
+        }
+      }
+    }
+  }
+};
+
+template <class T>
+class MaxOutGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    int fea_size = input_height * input_width;
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; ++i) {
+      int blen = fea_size * output_channels * i;
+      for (int c = 0; c < output_channels; ++c) {
+        int clen = fea_size * c;
+        for (int f = 0; f < fea_size; ++f) {
+          int input_idx0 = (blen + clen) * groups + f;
+          bool continue_match = true;
+          int output_idx = blen + clen + f;
+          for (int g = 0; g < groups && continue_match; ++g) {
+            int input_idx = input_idx0 + fea_size * g;
+            if (input_data[input_idx] == output_data[output_idx]) {
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+              continue_match = false;
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
+template class MaxOutFunctor<platform::CPUDeviceContext, float>;
+template class MaxOutFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu
new file mode 100644
index 0000000000..6056ad251c
--- /dev/null
+++ b/paddle/operators/math/maxouting.cu
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/maxouting.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+__global__ void KernelMaxOut(const int nthreads, const T* input_data,
+                             const int channels, const int input_height,
+                             const int input_width, int groups,
+                             T* output_data) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
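+  // Grid-stride loop: each thread handles every offset-th output element, so
+  // the kernel stays correct for any grid size relative to nthreads.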
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    T ele = static_cast<T>(-FLT_MAX);
+    for (int g = 0; g < groups; ++g) {
+      T x = input_data[data_idx + g * feat_len];
+      ele = ele > x ? ele : x;
+    }
+    output_data[i] = ele;
+  }
+}
+template <typename T>
+__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data,
+                                 const T* output_data, const T* output_grad,
+                                 T* input_grad, const int channels,
+                                 const int input_height, const int input_width,
+                                 int groups) {
+  const int size = input_height * input_width * channels / groups;
+  const int feat_len = input_height * input_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int batch_idx = i / size;
+    int batch_offset = i % size;
+    int channel_idx = batch_offset / feat_len;
+    int feat_idx = batch_offset % feat_len;
+    int data_idx =
+        (batch_idx * size + channel_idx * feat_len) * groups + feat_idx;
+    int max_index = -1;
+    for (int g = 0; g < groups; ++g) {
+      if (input_data[data_idx + g * feat_len] == output_data[i]) {
+        max_index = data_idx + g * feat_len;
+        break;
+      }
+    }
+    if (max_index != -1) {
+      // Route the gradient of output element i to the input that won the max.
+      // Note: index the gradient with the loop variable i, not the initial
+      // thread index, since this is a grid-stride loop.
+      input_grad[max_index] += output_grad[i];
+    }
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class MaxOutFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* output,
+                  int groups) {
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int nthreads = output->numel();
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxOut<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width, groups,
+        output_data);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class MaxOutGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups) {
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int nthreads = output.numel();
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxoutGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_height, input_width, groups);
+  }
+};
+
+template class MaxOutGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxOutGradFunctor<platform::CUDADeviceContext, double>;
+
+template class MaxOutFunctor<platform::CUDADeviceContext, float>;
+template class MaxOutFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h
new file mode 100644
index 0000000000..68f4743db0
--- /dev/null
+++ b/paddle/operators/math/maxouting.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#ifndef FLT_MAX
+#define FLT_MAX __FLT_MAX__  // fallback when <cfloat> is not included
+#endif
+
+template <typename DeviceContext, typename T>
+class MaxOutFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* output, int groups);
+};
+
+template <typename DeviceContext, class T>
+class MaxOutGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  framework::Tensor* input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, int groups);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
new file mode 100644
index 0000000000..150de6fd59
--- /dev/null
+++ b/paddle/operators/math/pooling.cc
@@ -0,0 +1,760 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings each contain two elements, which represent
+ * height and width, respectively.
+ */
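+//
+// A small worked example on hypothetical sizes: with input_height =
+// input_width = 4, ksize = {2, 2}, strides = {2, 2}, and paddings = {0, 0},
+// every output element pools a full 2 x 2 window, so output_height =
+// output_width = 2 and pool_size = 4 for each window.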
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            T ele = pool_process.initial();
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_process.compute(ele, input_data[h * input_width + w]);
+              }
+            }
+            int pool_size = (hend - hstart) * (wend - wstart);
+            pool_process.finalize(ele, (static_cast<T>(pool_size)));
+            output_data[ph * output_width + pw] = ele;
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings each contain two elements, which represent
+ * height and width, respectively.
+ */
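+//
+// The per-window scale = 1.0 / pool_size is handed to compute(); the
+// average-pooling gradient uses it to spread the output gradient uniformly
+// over the window, while the max-pooling gradient can ignore it.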
+template <typename PoolProcess, class T>
+class Pool2dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            float scale = 1.0 / pool_size;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                pool_grad_process.compute(
+                    input_data[h * input_width + w],
+                    output_data[ph * output_width + pw],
+                    output_grad_data[ph * output_width + pw],
+                    input_grad_data[h * input_width + w],
+                    static_cast<T>(scale));
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings each contain two elements, which represent
+ * height and width, respectively.
+ */
+template <class T>
+class MaxPool2dGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            bool stop = false;
+            for (int h = hstart; h < hend && !stop; ++h) {
+              for (int w = wstart; w < wend && !stop; ++w) {
+                int input_idx = h * input_width + w;
+                int output_idx = ph * output_width + pw;
+                if (input_data[input_idx] == output_data[output_idx]) {
+                  input_grad_data[input_idx] += output_grad_data[output_idx];
+                  stop = true;
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxPool2dGradFunctor<platform::CPUDeviceContext, double>;
+
+template class Pool2dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings each contain three elements, which represent
+ * depth, height, and width, respectively.
+ */
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T ele = pool_process.initial();
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    pool_process.compute(
+                        ele,
+                        input_data[(d * input_height + h) * input_width + w]);
+                  }
+                }
+              }
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              pool_process.finalize(ele, static_cast<T>(pool_size));
+              output_data[output_idx] = ele;
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings each contain three elements, which represent
+ * depth, height, and width, respectively.
+ */
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CPUDeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              float scale = 1.0 / pool_size;
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+                    pool_grad_process.compute(
+                        input_data[input_idx], output_data[output_idx],
+                        output_grad_data[output_idx],
+                        input_grad_data[input_idx], static_cast<T>(scale));
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings each contain three elements, which represent
+ * depth, height, and width, respectively.
+ */
+template <class T>
+class MaxPool3dGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+              bool stop = false;
+              for (int d = dstart; d < dend && !stop; ++d) {
+                for (int h = hstart; h < hend && !stop; ++h) {
+                  for (int w = wstart; w < wend && !stop; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    int output_idx =
+                        (pd * output_height + ph) * output_width + pw;
+
+                    if (input_data[input_idx] == output_data[output_idx]) {
+                      input_grad_data[input_idx] +=
+                          output_grad_data[output_idx];
+                      stop = true;
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxPool3dGradFunctor<platform::CPUDeviceContext, double>;
+
+template class Pool3dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool3dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CPUDeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CPUDeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings each contain two elements, which represent
+ * height and width, respectively.
+ */
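+//
+// mask records, for every output element, the flattened spatial index
+// (h * input_width + w) of the input element that produced the maximum;
+// the gradient functor below uses it to scatter gradients back without
+// recomputing the argmax.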
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+
+            T1 ele = static_cast<T1>(-FLT_MAX);
+            int index = -1;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                if (ele < input_data[h * input_width + w]) {
+                  ele = input_data[h * input_width + w];
+                  index = h * input_width + w;
+                }
+              }
+            }
+            output_data[ph * output_width + pw] = ele;
+            mask_data[ph * output_width + pw] = index;
+          }
+        }
+        // offset
+        input_data += input_stride;
+        output_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings each contain two elements, which represent
+ * height and width, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
+    const int output_channels = output_grad.dims()[1];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          for (int pw = 0; pw < output_width; ++pw) {
+            const int output_idx = ph * output_width + pw;
+            const int input_idx = static_cast<int>(mask_data[output_idx]);
+            input_grad_data[input_idx] += output_grad_data[output_idx];
+          }
+        }
+        // offset
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, float,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, float,
+                                             int>;
+template class MaxPool2dWithIndexFunctor<platform::CPUDeviceContext, double,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUDeviceContext, double,
+                                             int>;
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings each contain three elements, which represent
+ * depth, height, and width, respectively.
+ */
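+//
+// As in the 2-D case, mask stores the flattened spatial index of the maximal
+// input element, here (d * input_height + h) * input_width + w.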
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+
+              int output_idx = (pd * output_height + ph) * output_width + pw;
+              T1 ele = static_cast<T1>(-FLT_MAX);
+              int index = -1;
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    if (ele < input_data[input_idx]) {
+                      index = input_idx;
+                      ele = input_data[input_idx];
+                    }
+                  }
+                }
+              }
+              output_data[output_idx] = ele;
+              mask_data[output_idx] = index;
+            }
+          }
+        }
+        // advance to the next channel slice
+        input_data += input_stride;
+        output_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
+    const int output_channels = output_grad.dims()[1];
+    const int output_depth = output_grad.dims()[2];
+    const int output_height = output_grad.dims()[3];
+    const int output_width = output_grad.dims()[4];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          for (int ph = 0; ph < output_height; ++ph) {
+            for (int pw = 0; pw < output_width; ++pw) {
+              const int output_idx =
+                  (pd * output_height + ph) * output_width + pw;
+              const int input_idx = static_cast<int>(mask_data[output_idx]);
+              input_grad_data[input_idx] += output_grad_data[output_idx];
+            }
+          }
+        }
+        // advance to the next channel slice
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, float,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, float,
+                                             int>;
+template class MaxPool3dWithIndexFunctor<platform::CPUDeviceContext, double,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUDeviceContext, double,
+                                             int>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
new file mode 100644
index 0000000000..0243cf8316
--- /dev/null
+++ b/paddle/operators/math/pooling.cu
@@ -0,0 +1,1041 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             const int channels, const int input_height,
+                             const int input_width, const int output_height,
+                             const int output_width, const int ksize_height,
+                             const int ksize_width, const int stride_height,
+                             const int stride_width, const int padding_height,
+                             const int padding_width, PoolProcess pool_process,
+                             T* output_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
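+    // One thread per output element; the flat index decomposes as, e.g.,
+    // output_width=3, output_height=2, channels=4: index 29 -> pw=2, ph=1,
+    // c=0, batch_idx=1.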
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, input_data[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);
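+    // pool_size is the clipped window area; AvgPool's finalize divides by
+    // it, and KernelPool2DGrad below uses the matching 1/pool_size scale.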
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    PoolProcess pool_process, T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
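+    // A window at output row ph covers this input element iff
+    //   ph * stride_height <= offsetH < ph * stride_height + ksize_height
+    // (padding is already folded into offsetH); inverting this inequality
+    // gives the [phstart, phend) and [pwstart, pwend) ranges below.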
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, output_data[output_sub_idx],
+                             output_grad[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_height,
+    const int input_width, const int output_height, const int output_width,
+    const int ksize_height, const int ksize_width, const int stride_height,
+    const int stride_width, const int padding_height, const int padding_width,
+    T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    input_grad += (batch_idx * channels + c) * input_height * input_width;
+
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == input_data[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+
+    if (maxIndex != -1) {
+      // Overlapping pooling windows may pick the same input element as their
+      // maximum, so the scatter has to be atomic.
+      platform::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings are two-element vectors, giving the height
+ * and width dimensions, respectively.
+ */
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
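+    // 1024 threads per block, enough blocks to cover every output element;
+    // the kernel's grid-stride loop absorbs any rounding slack.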
+
+    KernelPool2D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, pool_process, output_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings are two-element vectors, giving the height
+ * and width dimensions, respectively.
+ */
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings are two-element vectors, giving the height
+ * and width dimensions, respectively.
+ */
+template <typename T>
+class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
+
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(const int nthreads, const T* input_data,
+                             const int channels, const int input_depth,
+                             const int input_height, const int input_width,
+                             const int output_depth, const int output_height,
+                             const int output_width, const int ksize_depth,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_depth, const int stride_height,
+                             const int stride_width, const int padding_depth,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process, T* output_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, input_data[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, PoolProcess pool_process,
+    T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    T input = input_data[index];
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    output_data += output_idx;
+    output_grad += output_idx;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, output_data[output_sub_idx],
+                               output_grad[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, const int channels, const int input_depth,
+    const int input_height, const int input_width, const int output_depth,
+    const int output_height, const int output_width, const int ksize_depth,
+    const int ksize_height, const int ksize_width, const int stride_depth,
+    const int stride_height, const int stride_width, const int padding_depth,
+    const int padding_height, const int padding_width, T* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    input_grad +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == input_data[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // Overlapping pooling windows may pick the same input element as their
+      // maximum, so the scatter has to be atomic.
+      platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3D<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, pool_process,
+        output_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CUDADeviceContext, PoolProcess, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, pool_process, input_grad_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <class T>
+class MaxPool3dGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_depth, input_height, input_width, output_depth, output_height,
+        output_width, ksize_depth, ksize_height, ksize_width, stride_depth,
+        stride_height, stride_width, padding_depth, padding_height,
+        padding_width, input_grad_data);
+  }
+};
+
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxPool3dGradFunctor<platform::CUDADeviceContext, double>;
+
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<float>, float>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<float>,
+                                 float>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::AvgPool<double>, double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::AvgPoolGrad<double>,
+                                 double>;
+
+template <typename T1, typename T2>
+__global__ void KernelMaxPool2dWithIdx(
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, T1* output_data, T2* mask_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    input_data += (batch_idx * channels + c) * input_height * input_width;
+    T1 ele = -FLT_MAX;
+    int max_index = -1;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_index = h * input_width + w;
+        if (ele < input_data[input_index]) {
+          max_index = input_index;
+          ele = input_data[input_index];
+        }
+      }
+    }
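+    // The mask records the argmax as a flat offset (h * input_width + w)
+    // within the current (batch, channel) slice; the backward kernel
+    // compares positions against this same encoding.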
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+template <typename T1, typename T2>
+__global__ void KernelMaxPool2DWithIdxGrad(
+    const int nthreads, const T1* output_grad, const T2* mask_data,
+    const int channels, const int input_height, const int input_width,
+    const int output_height, const int output_width, const int ksize_height,
+    const int ksize_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, T1* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int c_offset = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
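+    // One thread per input element: gather the gradient of every output
+    // window whose mask points at this position, so no atomics are needed.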
+
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T1 gradient = 0;
+    int input_current_featuremap_idx = h_offset * input_width + w_offset;
+    int output_idx =
+        (batch_idx * channels + c_offset) * output_height * output_width;
+
+    mask_data += output_idx;
+    output_grad += output_idx;
+    for (int ph = ph_start; ph < ph_end; ++ph) {
+      for (int pw = pw_start; pw < pw_end; ++pw) {
+        if (mask_data[ph * output_width + pw] == input_current_featuremap_idx)
+          gradient += output_grad[ph * output_width + pw];
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings are two-element vectors, giving the height
+ * and width dimensions, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2dWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_height, input_width,
+        output_height, output_width, ksize_height, ksize_width, stride_height,
+        stride_width, padding_height, padding_width, output_data, mask_data);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, and paddings are two-element vectors, giving the height
+ * and width dimensions, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_height = input_grad->dims()[2];
+    const int input_width = input_grad->dims()[3];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T2* mask_data = mask.data<T2>();
+    const T1* output_grad_data = output_grad.data<T1>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_height,
+        input_width, output_height, output_width, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, float,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext, float,
+                                             int>;
+template class MaxPool2dWithIndexFunctor<platform::CUDADeviceContext, double,
+                                         int>;
+template class MaxPool2dWithIndexGradFunctor<platform::CUDADeviceContext,
+                                             double, int>;
+
+template <typename T1, typename T2>
+__global__ void KernelMaxPool3DWithIdx(
+    const int nthreads, const T1* input_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    T1* output_data, T2* mask_data) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+
+    T1 ele = -FLT_MAX;
+    int max_index = -1;
+    input_data +=
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          if (ele < input_data[(d * input_height + h) * input_width + w]) {
+            max_index = (d * input_height + h) * input_width + w;
+            ele = input_data[max_index];
+          }
+        }
+      }
+    }
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+template <typename T1, typename T2>
+__global__ void KernelMaxPool3DWithIdxGrad(
+    const int nthreads, const T1* output_grad, const T2* mask,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width, T1* input_grad) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int d_offset = (index / input_width / input_height) % input_depth;
+    int c_offset =
+        (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    int pd_start =
+        (d_offset + padding_depth < ksize_depth)
+            ? 0
+            : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int pd_end =
+        min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T1 gradient = 0;
+    int input_current_feature_map_idx =
+        (d_offset * input_height + h_offset) * input_width + w_offset;
+    int output_idx = (batch_idx * channels + c_offset) * output_depth *
+                     output_height * output_width;
+    mask += output_idx;
+    output_grad += output_idx;
+
+    for (int pd = pd_start; pd < pd_end; ++pd) {
+      for (int ph = ph_start; ph < ph_end; ++ph) {
+        for (int pw = pw_start; pw < pw_end; ++pw) {
+          if (mask[(pd * output_height + ph) * output_width + pw] ==
+              input_current_feature_map_idx)
+            gradient +=
+                output_grad[(pd * output_height + ph) * output_width + pw];
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output, framework::Tensor* mask) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output->dims()[1];
+    const int output_depth = output->dims()[2];
+    const int output_height = output->dims()[3];
+    const int output_width = output->dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T1* input_data = input.data<T1>();
+    T1* output_data = output->mutable_data<T1>(context.GetPlace());
+    T2* mask_data = mask->mutable_data<T2>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_depth * output_height *
+                   output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DWithIdx<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, input_channels, input_depth, input_height,
+        input_width, output_depth, output_height, output_width, ksize_depth,
+        ksize_height, ksize_width, stride_depth, stride_height, stride_width,
+        padding_depth, padding_height, padding_width, output_data, mask_data);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, and paddings are three-element vectors, giving the depth,
+ * height, and width dimensions, respectively.
+ */
+template <typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, T1, T2> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input_grad->dims()[0];
+    const int input_channels = input_grad->dims()[1];
+    const int input_depth = input_grad->dims()[2];
+    const int input_height = input_grad->dims()[3];
+    const int input_width = input_grad->dims()[4];
+    const int output_depth = output_grad.dims()[2];
+    const int output_height = output_grad.dims()[3];
+    const int output_width = output_grad.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T1* output_grad_data = output_grad.data<T1>();
+    const T2* mask_data = mask.data<T2>();
+    T1* input_grad_data = input_grad->mutable_data<T1>(context.GetPlace());
+
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool3DWithIdxGrad<T1, T2><<<grid, threads, 0, context.stream()>>>(
+        nthreads, output_grad_data, mask_data, input_channels, input_depth,
+        input_height, input_width, output_depth, output_height, output_width,
+        ksize_depth, ksize_height, ksize_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+
+template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, float,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext, float,
+                                             int>;
+template class MaxPool3dWithIndexFunctor<platform::CUDADeviceContext, double,
+                                         int>;
+template class MaxPool3dWithIndexGradFunctor<platform::CUDADeviceContext,
+                                             double, int>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
new file mode 100644
index 0000000000..2759f06cb6
--- /dev/null
+++ b/paddle/operators/math/pooling.h
@@ -0,0 +1,192 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX \
+  __FLT_MAX__  // TODO: this definition may belong in another (shared)
+               // header; where exactly is still an open question.
+
+/*
+ * \brief Extract the elementwise operations that pooling is built from.
+ *        Both MaxPool and AvgPool need "initial", "compute", and "finalize"
+ * operations.
+ *        MaxPool initializes its temporary variable to the most negative
+ * value so that the maximum of the pooling window can be found.
+ *        AvgPool initializes its temporary variable to zero, accumulates all
+ * values in the pooling window, and finally takes the average.
+ *        MaxPoolGrad and AvgPoolGrad are the corresponding gradient
+ * operations.
+ */
+template <class T>
+class MaxPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
+  DEVICE inline void compute(T& y, const T& x) { y = y > x ? y : x; }
+  DEVICE inline void finalize(T& y, const T& pool_field) {}
+};
+
+template <class T>
+class AvgPool {
+ public:
+  DEVICE inline T initial() { return static_cast<T>(0); }
+  DEVICE inline void compute(T& y, const T& x) { y += x; }
+  DEVICE inline void finalize(T& y, const T& pool_field) { y /= pool_field; }
+};
+
+template <class T>
+class MaxPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += dy * (x == y);
+  }
+};
+
+template <class T>
+class AvgPoolGrad {
+ public:
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx += (scale * dy);
+  }
+};
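+
+// As an illustration of the three-phase protocol above: averaging a 2x2
+// window holding {1, 2, 3, 4} starts from y = initial() = 0, compute(y, x)
+// over the window accumulates y = 10, and finalize(y, 4) divides by the
+// pool size, giving 2.5.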
+
+/*
+ * \brief Compute pooling results and the corresponding gradients.
+ *
+ * In pool2d, all tensors are in NCHW format, where N is the batch size, C is
+ * the number of channels, and H and W are the height and width of the
+ * feature map.
+ * In pool3d, all tensors are in NCDHW format, where D, H, and W are the
+ * depth, height, and width of the feature map.
+ *
+ * In max pooling, a pooling window may contain more than one element equal
+ * to the maximum; in that case only the first such element should receive
+ * the gradient. For example, if a window contains {5, 5, 3}, the whole
+ * upstream gradient goes to the first 5 in scan order, whereas the generic
+ * MaxPoolGrad equality test would credit both maxima. This differs from
+ * average pooling, so the max-pool backward pass has dedicated functors:
+ * MaxPool2dGradFunctor and MaxPool3dGradFunctor.
+ */
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute,
+                  framework::Tensor* output);
+};
+
+template <typename DeviceContext, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute, framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+/*
+ * \brief Computing max pooling results together with the indices of the
+ * maximum elements, and calculating the corresponding gradients.
+ * The max element indices are required by unpooling (up-sampling) operators.
+ * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
+ * NCDHW format.
+ */
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPool2dWithIndexFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask);
+};
+
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPool2dWithIndexGradFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPool3dWithIndexFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, framework::Tensor* output,
+                  framework::Tensor* mask);
+};
+
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPool3dWithIndexGradFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
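+
+// Usage sketch (illustrative; `ctx`, `input` and `output` are assumed to be
+// prepared by the caller): a 2x2, stride-2 max pool on the CPU.
+//
+//   std::vector<int> ksize{2, 2}, strides{2, 2}, paddings{0, 0};
+//   Pool2dFunctor<platform::CPUDeviceContext, MaxPool<float>, float> pool;
+//   pool(ctx, input, ksize, strides, paddings, MaxPool<float>(), &output);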
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sampler.cc b/paddle/operators/math/sampler.cc
new file mode 100644
index 0000000000..4f1cbfe31a
--- /dev/null
+++ b/paddle/operators/math/sampler.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "sampler.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+Sampler::~Sampler() {}
+
+UniformSampler::UniformSampler(int64 range)
+    : Sampler(range), inv_range_(1.0 / range) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range - 1);
+}
+
+UniformSampler::UniformSampler(int64 range, unsigned int seed)
+    : Sampler(range, seed), inv_range_(1.0 / range) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_int_distribution<>>(0, range - 1);
+}
+
+int64 UniformSampler::Sample() const { return (*dist_)(*random_engine_); }
+
+float UniformSampler::Probability(int64 value) const { return inv_range_; }
+
+LogUniformSampler::LogUniformSampler(int64 range)
+    : Sampler(range), log_range_(log(range + 1)) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
+}
+
+LogUniformSampler::LogUniformSampler(int64 range, unsigned int seed)
+    : Sampler(range, seed), log_range_(log(range + 1)) {
+  random_engine_ = std::make_shared<std::mt19937_64>(seed_);
+  dist_ = std::make_shared<std::uniform_real_distribution<>>(0, 1);
+}
+
+int64 LogUniformSampler::Sample() const {
+  // Draw from the log-uniform distribution via inverse transform sampling:
+  // the CDF is F(x) = log(x + 1) / log(range + 1), so for u ~ U(0, 1) the
+  // sample is x = exp(u * log(range + 1)) - 1.
+  // More details:
+  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler/
+  const int64 value =
+      static_cast<int64>(exp((*dist_)(*random_engine_) * log_range_)) - 1;
+  // Mathematically, value should be <= range_, but might not be due to some
+  // floating point roundoff, so we mod by range_.
+  return value % range_;
+}
+
+float LogUniformSampler::Probability(int64 value) const {
+  // The density is f(x) = 1 / ((x + 1) * log_range_); the probability of
+  // `value` is the integral of f(x) from value to value + 1, which evaluates
+  // to log((value + 2) / (value + 1)) / log_range_.
+  // More details:
+  // https://wanghaoshuang.github.io/2017/11/Log-uniform-distribution-sampler
+  return (log((value + 2.0) / (value + 1.0))) / log_range_;
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sampler.h b/paddle/operators/math/sampler.h
new file mode 100644
index 0000000000..8f82089e7b
--- /dev/null
+++ b/paddle/operators/math/sampler.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cstdint>
+#include <memory>
+#include <random>
+
+#include "paddle/platform/enforce.h"
+
+typedef int64_t int64;
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// TODO(wanghaoshuang): Support for GPU
+
+/**
+ * Sample integers from [0, range).
+ */
+class Sampler {
+ public:
+  explicit Sampler(int64 range) : range_(range) {
+    PADDLE_ENFORCE_GT(range, 0);
+    std::random_device r;
+    seed_ = r();
+  }
+  explicit Sampler(int64 range, unsigned int seed)
+      : range_(range), seed_(seed) {
+    PADDLE_ENFORCE_GT(range, 0);
+  }
+  virtual ~Sampler();
+  // Sample a single value
+  virtual int64 Sample() const = 0;
+  // The probability that a single call to Sample() returns the given value.
+  virtual float Probability(int64 value) const = 0;
+
+  int64 range() const { return range_; }
+
+ protected:
+  const int64 range_;
+  unsigned int seed_;
+};
+
+/**
+ * Sample integers from [0, range).
+ * The probability mass function is:
+ * P(x) = 1 / range
+ */
+class UniformSampler : public Sampler {
+ public:
+  explicit UniformSampler(int64 range);
+
+  explicit UniformSampler(int64 range, unsigned int seed);
+
+  ~UniformSampler() override {}
+
+  int64 Sample() const override;
+
+  float Probability(int64 value) const override;
+
+ private:
+  const float inv_range_;
+  std::shared_ptr<std::mt19937_64> random_engine_;
+  std::shared_ptr<std::uniform_int_distribution<>> dist_;
+};
+
+/**
+ * Sample integers from [0, range).
+ * The probability mass function is:
+ * P(x) = (1/ln(range+1)) * ln(1 + 1/(x + 1))
+ */
+class LogUniformSampler : public Sampler {
+ public:
+  explicit LogUniformSampler(int64 range);
+
+  explicit LogUniformSampler(int64 range, unsigned int seed);
+
+  ~LogUniformSampler() override {}
+
+  int64 Sample() const override;
+
+  float Probability(int64 value) const override;
+
+ private:
+  const float log_range_;
+  std::shared_ptr<std::mt19937_64> random_engine_;
+  std::shared_ptr<std::uniform_real_distribution<>> dist_;
+};
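+
+// Usage sketch (illustrative values): drawing candidate ids, e.g. for a
+// sampled-softmax or NCE loss.
+//
+//   LogUniformSampler sampler(/*range=*/10000, /*seed=*/42);
+//   int64 id = sampler.Sample();        // id in [0, 10000)
+//   float p = sampler.Probability(id);  // P(id) under the log-uniform law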
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
new file mode 100644
index 0000000000..8a1ebb58c2
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -0,0 +1,298 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <set>
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+struct SelectedRowsAdd<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    output->set_height(in1_height);
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = input2.rows();
+    std::vector<int64_t> out_rows;
+    out_rows.reserve(in1_rows.size() + in2_rows.size());
+
+    // concat rows
+    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
+    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
+    output->set_rows(out_rows);
+
+    auto* out_value = output->mutable_value();
+    auto& in1_value = input1.value();
+    auto& in2_value = input2.value();
+
+    auto in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
+    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    auto in2_place = input2.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+    auto out_place = context.GetPlace();
+    PADDLE_ENFORCE(platform::is_cpu_place(out_place));
+
+    auto* out_data = out_value->data<T>();
+    auto* in1_data = in1_value.data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(out_place), out_data,
+                 boost::get<platform::CPUPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T));
+
+    auto* in2_data = in2_value.data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(out_place),
+                 out_data + in1_value.numel(),
+                 boost::get<platform::CPUPlace>(in2_place), in2_data,
+                 in2_value.numel() * sizeof(T));
+  }
+};
+
+template struct SelectedRowsAdd<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAdd<platform::CPUDeviceContext, double>;
+
+template <typename T>
+struct SelectedRowsAddTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+
+    SetConstant<platform::CPUDeviceContext, T> functor;
+    functor(context, output, 0.0);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = output->data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        out_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+
+    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
+    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
+  }
+};
+
+template struct SelectedRowsAddTensor<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddTensor<platform::CPUDeviceContext, double>;
+
+template <typename T>
+struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  framework::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    auto in2_place = input2->place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(in2_place),
+                 in2_data + input2_offset,
+                 boost::get<platform::CPUPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T));
+  }
+};
+
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, double>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, int>;
+template struct SelectedRowsAddTo<platform::CPUDeviceContext, int64_t>;
+
+template <typename T>
+struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+      }
+    }
+  }
+};
+
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
+template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
+
+// This is a separate namespace for functors that manipulate SelectedRows
+// data, e.g. merging duplicated rows or adding two SelectedRows together.
+//
+// Another group of functors is called "scatter updates": they use a
+// SelectedRows to update a dense tensor with different operations, such as
+// add or mul.
+namespace scatter {
+
+size_t FindPos(const std::vector<int64_t>& rows, int64_t value) {
+  return std::find(rows.begin(), rows.end(), value) - rows.begin();
+}
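+// Example (illustrative): FindPos({0, 4, 7}, 4) == 1. If `value` is absent,
+// the returned position equals rows.size(); the search is linear.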
+
+template <typename T>
+struct MergeAdd<platform::CPUDeviceContext, T> {
+  framework::SelectedRows operator()(const platform::CPUDeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    framework::SelectedRows out;
+    auto input_rows = input.rows();
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto input_width = input.value().dims()[1];
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::CPUDeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+
+    for (size_t i = 0; i < input_rows.size(); i++) {
+      size_t out_i = FindPos(merge_rows, input_rows[i]);
+      for (int64_t j = 0; j < input_width; j++) {
+        out_data[out_i * input_width + j] += input_data[i * input_width + j];
+      }
+    }
+    return out;
+  }
+};
+
+template struct MergeAdd<platform::CPUDeviceContext, float>;
+template struct MergeAdd<platform::CPUDeviceContext, double>;
+template struct MergeAdd<platform::CPUDeviceContext, int>;
+template struct MergeAdd<platform::CPUDeviceContext, int64_t>;
+
+template <typename T>
+struct UpdateToTensor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* input2_data = input2->data<T>();
+
+    // FIXME(typhoonzero): use macro fix the below messy code.
+    switch (op) {
+      case ScatterOps::ASSIGN:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::ADD:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] +=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUB:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] -=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::SUBBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] -
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+      case ScatterOps::MUL:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] *=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIV:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] /=
+            in1_data[i * in1_row_numel + j];
+        break;
+      case ScatterOps::DIVBY:
+        INLINE_FOR2(in1_rows.size(), in1_row_numel)
+        input2_data[in1_rows[i] * in1_row_numel + j] =
+            in1_data[i * in1_row_numel + j] /
+            input2_data[in1_rows[i] * in1_row_numel + j];
+        break;
+    }
+  }
+};
+
+}  // namespace scatter
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
new file mode 100644
index 0000000000..0ee456f9bc
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -0,0 +1,380 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <set>
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    output->set_height(in1_height);
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = input2.rows();
+    std::vector<int64_t> out_rows;
+    out_rows.reserve(in1_rows.size() + in2_rows.size());
+
+    // concat rows
+    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
+    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
+    output->set_rows(out_rows);
+
+    auto* out_value = output->mutable_value();
+    auto& in1_value = input1.value();
+    auto& in2_value = input2.value();
+
+    auto in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
+    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+
+    auto* out_data = out_value->data<T>();
+    auto* in1_data = in1_value.data<T>();
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
+    auto in2_place = input2.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
+    auto out_place = context.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(out_place));
+
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place), out_data,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T), context.stream());
+
+    auto* in2_data = in2_value.data<T>();
+    memory::Copy(boost::get<platform::CUDAPlace>(out_place),
+                 out_data + in1_value.numel(),
+                 boost::get<platform::CUDAPlace>(in2_place), in2_data,
+                 in2_value.numel() * sizeof(T), context.stream());
+  }
+};
+
+template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
+
+namespace {
+template <typename T, int block_size>
+__global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
+                                            const int64_t* rows, T* tensor_out,
+                                            int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since the row indices of a SelectedRows may contain duplicates, a
+    // plain `tensor_out[index] += selected_rows[index];` could race between
+    // blocks; use an atomic add to avoid concurrent-write errors.
+    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = input2.data<T>();
+    auto* out_data = output->data<T>();
+
+    SetConstant<platform::CUDADeviceContext, T> functor;
+    functor(context, output, 0.0);
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid(1, in1_rows.size());
+    SelectedRowsAddTensorKernel<
+        T, block_size><<<grid, threads, 0, context.stream()>>>(
+        in1_data, in1_rows.data(), out_data, in1_row_numel);
+
+    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
+    out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen;
+  }
+};
+
+template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
+
+template <typename T>
+struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset,
+                  framework::SelectedRows* input2) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2->height());
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = *(input2->mutable_rows());
+
+    auto& in1_value = input1.value();
+    auto* in2_value = input2->mutable_value();
+
+    // concat rows
+    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
+    auto in2_place = input2->place();
+    PADDLE_ENFORCE(platform::is_gpu_place(in2_place));
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = in2_value->data<T>();
+    memory::Copy(boost::get<platform::CUDAPlace>(in2_place),
+                 in2_data + input2_offset,
+                 boost::get<platform::CUDAPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T), context.stream());
+  }
+};
+
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
+template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
+
+namespace {
+template <typename T, int block_size>
+__global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
+                                              const int64_t* rows,
+                                              T* tensor_out,
+                                              int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since the row indices of a SelectedRows may contain duplicates, use an
+    // atomic operation to avoid concurrent-write errors.
+    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* in2_data = input2->data<T>();
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid(1, in1_rows.size());
+    SelectedRowsAddToTensorKernel<
+        T, block_size><<<grid, threads, 0, context.stream()>>>(
+        in1_data, in1_rows.data(), in2_data, in1_row_numel);
+  }
+};
+
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
+template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
+
+namespace scatter {
+
+template <typename T, int block_size>
+__global__ void MergeAddKernel(const T* input, const int64_t* input_rows,
+                               T* out, const int64_t* out_rows,
+                               size_t out_rows_size, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+  __shared__ size_t out_idx;
+
+  if (tid == 0) {
+    for (size_t i = 0; i < out_rows_size; i++) {
+      if (input_rows[ty] == out_rows[i]) {
+        out_idx = i;
+      }
+    }
+  }
+
+  __syncthreads();
+
+  input += ty * row_numel;
+  out += out_idx * row_numel;
+  for (int index = tid; index < row_numel; index += block_size) {
+    paddle::platform::CudaAtomicAdd(out + index, input[index]);
+  }
+}
+
+template <typename T>
+struct MergeAdd<platform::CUDADeviceContext, T> {
+  framework::SelectedRows operator()(const platform::CUDADeviceContext& context,
+                                     const framework::SelectedRows& input) {
+    framework::SelectedRows out;
+    auto input_rows = input.rows();
+    std::set<int64_t> row_set(input_rows.begin(), input_rows.end());
+    std::vector<int64_t> merge_rows(row_set.begin(), row_set.end());
+
+    auto input_width = input.value().dims()[1];
+
+    out.set_rows(merge_rows);
+    out.set_height(input.height());
+    out.mutable_value()->mutable_data<T>(
+        framework::make_ddim(
+            {static_cast<int64_t>(merge_rows.size()), input_width}),
+        context.GetPlace());
+
+    math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
+    constant_functor(context, out.mutable_value(), 0.0);
+
+    auto* out_data = out.mutable_value()->data<T>();
+    auto* input_data = input.value().data<T>();
+
+    const int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid1(1, input_rows.size());
+
+    MergeAddKernel<T, block_size><<<grid1, threads, 0, context.stream()>>>(
+        input_data, input.rows().data(), out_data, out.rows().data(),
+        out.rows().size(), input_width);
+    return out;
+  }
+};
+
+template struct MergeAdd<platform::CUDADeviceContext, float>;
+template struct MergeAdd<platform::CUDADeviceContext, double>;
+template struct MergeAdd<platform::CUDADeviceContext, int>;
+template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
+
+template <typename T, int block_size>
+__global__ void UpdateToTensorKernel(const T* selected_rows,
+                                     const int64_t* rows, const ScatterOps op,
+                                     T* tensor_out, int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+  // FIXME(typhoonzero): use macro fix the below messy code.
+  switch (op) {
+    case ScatterOps::ASSIGN:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index];
+      }
+      break;
+    case ScatterOps::ADD:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] += selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUB:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] -= selected_rows[index];
+      }
+      break;
+    case ScatterOps::SUBBY:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] - tensor_out[index];
+      }
+      break;
+    case ScatterOps::MUL:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] *= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIV:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] /= selected_rows[index];
+      }
+      break;
+    case ScatterOps::DIVBY:
+      for (int index = tid; index < row_numel; index += block_size) {
+        tensor_out[index] = selected_rows[index] / tensor_out[index];
+      }
+      break;
+  }
+}
+
+template <typename T>
+struct UpdateToTensor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const ScatterOps& op, const framework::SelectedRows& input1,
+                  framework::Tensor* input2) {
+    // NOTE: Prefer SelectedRowsAddToTensor when a plain addition suffices;
+    //       it avoids the extra MergeAdd pass performed here.
+    MergeAdd<platform::CUDADeviceContext, T> merge_func;
+    auto merged_in1 = merge_func(context, input1);
+
+    auto in1_height = merged_in1.height();
+    auto in2_dims = input2->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+
+    auto& in1_value = merged_in1.value();
+    auto& in1_rows = merged_in1.rows();
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2->numel() / in1_height);
+
+    auto* in1_data = in1_value.template data<T>();
+    auto* in2_data = input2->data<T>();
+
+    dim3 threads(platform::PADDLE_CUDA_NUM_THREADS, 1);
+    dim3 grid(1, in1_rows.size());
+    UpdateToTensorKernel<T, platform::PADDLE_CUDA_NUM_THREADS><<<
+        grid, threads, 0, context.stream()>>>(in1_data, in1_rows.data(), op,
+                                              in2_data, in1_row_numel);
+  }
+};
+}  // namespace scatter
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h
new file mode 100644
index 0000000000..09d4631905
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/device_context.h"
+
+#define INLINE_FOR2(sizei, sizej)     \
+  for (int64_t i = 0; i < sizei; i++) \
+    for (int64_t j = 0; j < sizej; j++)
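+
+// INLINE_FOR2 expands into a nested 2-D loop whose body is the single
+// statement that follows the macro, e.g.
+//   INLINE_FOR2(rows, cols) out[i * cols + j] = 0;
+// zeroes a rows x cols buffer.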
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// SelectedRows + SelectedRows simply concatenates the values and rows; the
+// real computation happens when the result is folded into a LoDTensor.
+template <typename DeviceContext, typename T>
+struct SelectedRowsAdd {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output);
+};
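+
+// Illustrative example (values assumed): adding {rows: [0, 4]} and
+// {rows: [1, 4]} yields {rows: [0, 4, 1, 4]}; duplicated row indices are
+// kept rather than folded:
+//
+//   SelectedRowsAdd<platform::CPUDeviceContext, float> add;
+//   add(ctx, in1, in2, &out);  // out.rows() == concat(in1.rows, in2.rows)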
+
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddTensor {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output);
+};
+
+// input2 = input1 + input2
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddTo {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const int64_t input2_offset, framework::SelectedRows* input2);
+};
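+
+// Note (derived from the CPU and CUDA implementations): input2's value
+// tensor must already be large enough to hold both blocks; input2_offset is
+// the element offset at which input1's values are copied in.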
+
+// input2 = input1 + input2
+template <typename DeviceContext, typename T>
+struct SelectedRowsAddToTensor {
+  void operator()(const DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+namespace scatter {
+// Functors for manipulating SelectedRows data.
+template <typename DeviceContext, typename T>
+struct MergeAdd {
+  // Unary functor: merges the input SelectedRows by summing the values of
+  // duplicated rows.
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input);
+};
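+
+// Sketch (illustrative): an input with rows [4, 0, 4] merges to rows [0, 4],
+// with the two row-4 value slices summed element-wise:
+//
+//   scatter::MergeAdd<platform::CPUDeviceContext, float> merge;
+//   framework::SelectedRows merged = merge(ctx, input);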
+
+template <typename DeviceContext, typename T>
+struct Add {
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 + e_in2;
+    return out;
+  }
+};
+
+template <typename DeviceContext, typename T>
+struct Mul {
+  // multiply two SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const framework::SelectedRows& input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    auto e_in2 = framework::EigenVector<T>::Flatten(input2.value());
+    e_out.device(*context.eigen_device()) = e_in1 * e_in2;
+    return out;
+  }
+  // multiply scalar to SelectedRows
+  framework::SelectedRows operator()(const DeviceContext& context,
+                                     const framework::SelectedRows& input1,
+                                     const T input2) {
+    framework::SelectedRows out;
+    out.set_rows(input1.rows());
+    out.set_height(input1.height());
+    out.mutable_value()->mutable_data<T>(input1.value().dims(),
+                                         context.GetPlace());
+    auto e_out = framework::EigenVector<T>::Flatten(*(out.mutable_value()));
+    auto e_in1 = framework::EigenVector<T>::Flatten(input1.value());
+    e_out.device(*context.eigen_device()) = input2 * e_in1;
+    return out;
+  }
+};
+
+enum class ScatterOps { ASSIGN, ADD, SUB, SUBBY, MUL, DIV, DIVBY };
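+// The *BY variants swap the operand order: SUBBY computes
+// tensor = selected_rows - tensor (while SUB computes tensor -=
+// selected_rows); DIVBY is analogous to DIV.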
+
+// Update a dense tensor in place from a SelectedRows with the given
+// ScatterOps operation, e.g. DIVBY computes tensor = selected_rows / tensor.
+template <typename DeviceContext, typename T>
+struct UpdateToTensor {
+  void operator()(const DeviceContext& context, const ScatterOps& op,
+                  const framework::SelectedRows& input1,
+                  framework::Tensor* input2);
+};
+
+}  // namespace scatter
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc
new file mode 100644
index 0000000000..8c74cab0a1
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor_test.cc
@@ -0,0 +1,194 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+
+TEST(selected_rows_functor, cpu_add) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CPUPlace cpu_place;
+  CPUDeviceContext ctx(cpu_place);
+  SetConstant<CPUDeviceContext, float> functor;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  std::vector<int64_t> rows1{0, 4, 7};
+  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), cpu_place);
+  functor(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), cpu_place);
+  functor(ctx, in2_value, 2.0);
+
+  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  auto* out_value = output->mutable_value();
+
+  // simply concatenate the two SelectedRows
+  out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+
+  SelectedRowsAdd<CPUDeviceContext, float> add_functor;
+  add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
+
+  auto out_height = output->height();
+  EXPECT_EQ(out_height, height);
+
+  auto& out_rows = output->rows();
+
+  // input1 rows
+  EXPECT_EQ(out_rows[0], 0);
+  EXPECT_EQ(out_rows[1], 4);
+  EXPECT_EQ(out_rows[2], 7);
+  // input2 rows
+  EXPECT_EQ(out_rows[3], 0);
+  EXPECT_EQ(out_rows[4], 5);
+  EXPECT_EQ(out_rows[5], 7);
+  EXPECT_EQ(out_rows[6], 9);
+
+  auto* out_data = output->value().data<float>();
+  // input1 value
+  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
+  // input2 value
+  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
+
+  std::unique_ptr<Tensor> tensor1{new Tensor()};
+  tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  functor(ctx, tensor1.get(), 3.0);
+
+  std::unique_ptr<Tensor> tensor2{new Tensor()};
+  tensor2->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+
+  SelectedRowsAddTensor<CPUDeviceContext, float> add_tensor_functor;
+  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
+
+  auto* tensor2_data = tensor2->data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor2_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(tensor2_data[1 * row_numel + 1], 3.0);
+  // row4 : 1.0 + 3.0
+  EXPECT_EQ(tensor2_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(tensor2_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(tensor2_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor2_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(tensor2_data[9 * row_numel + 6], 5.0);
+}
+
+TEST(selected_rows_functor, cpu_add_to) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CPUPlace cpu_place;
+  CPUDeviceContext ctx(cpu_place);
+  SetConstant<CPUDeviceContext, float> functor;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  std::vector<int64_t> rows1{0, 4, 7};
+  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), cpu_place);
+  functor(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), cpu_place);
+  functor(ctx, in2_value, 2.0);
+
+  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  output->set_height(height);
+  auto* out_value = output->mutable_value();
+
+  // simply concatenate the two SelectedRows
+  out_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+
+  SelectedRowsAddTo<CPUDeviceContext, float> add_to_functor;
+  add_to_functor(ctx, *selected_rows1, 0, output.get());
+  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
+
+  auto out_height = output->height();
+  EXPECT_EQ(out_height, height);
+
+  auto& out_rows = output->rows();
+
+  // input1 rows
+  EXPECT_EQ(out_rows[0], 0);
+  EXPECT_EQ(out_rows[1], 4);
+  EXPECT_EQ(out_rows[2], 7);
+  // input2 rows
+  EXPECT_EQ(out_rows[3], 0);
+  EXPECT_EQ(out_rows[4], 5);
+  EXPECT_EQ(out_rows[5], 7);
+  EXPECT_EQ(out_rows[6], 9);
+
+  auto* out_data = output->value().data<float>();
+  // input1 value
+  EXPECT_EQ(out_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(out_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(out_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(out_data[2 * row_numel + 6], 1.0);
+  // input2 value
+  EXPECT_EQ(out_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(out_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(out_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(out_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(out_data[6 * row_numel + 9], 2.0);
+
+  std::unique_ptr<Tensor> tensor1{new Tensor()};
+  tensor1->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  functor(ctx, tensor1.get(), 3.0);
+
+  SelectedRowsAddToTensor<CPUDeviceContext, float> add_to_tensor_functor;
+  add_to_tensor_functor(ctx, *output, tensor1.get());
+
+  auto* tensor1_data = tensor1->data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(tensor1_data[1 * row_numel + 1], 3.0);
+  // row4 : 1.0 + 3.0
+  EXPECT_EQ(tensor1_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(tensor1_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(tensor1_data[9 * row_numel + 6], 5.0);
+}
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
new file mode 100644
index 0000000000..38808e1301
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+TEST(selected_rows_functor, gpu_add) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CUDAPlace gpu_place(0);
+  CPUPlace cpu_place;
+  CUDADeviceContext ctx(gpu_place);
+  SetConstant<CUDADeviceContext, float> functor;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  std::vector<int64_t> rows1{0, 4, 7};
+  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+  functor(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+  functor(ctx, in2_value, 2.0);
+
+  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  auto* out_value = output->mutable_value();
+
+  // simply concatenate the two SelectedRows
+  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+
+  SelectedRowsAdd<CUDADeviceContext, float> add_functor;
+  add_functor(ctx, *selected_rows1, *selected_rows2, output.get());
+
+  auto out_height = output->height();
+  EXPECT_EQ(out_height, height);
+
+  auto& out_rows = output->rows();
+
+  // input1 rows
+  EXPECT_EQ(out_rows[0], 0);
+  EXPECT_EQ(out_rows[1], 4);
+  EXPECT_EQ(out_rows[2], 7);
+  // input2 rows
+  EXPECT_EQ(out_rows[3], 0);
+  EXPECT_EQ(out_rows[4], 5);
+  EXPECT_EQ(out_rows[5], 7);
+  EXPECT_EQ(out_rows[6], 9);
+
+  Tensor out_cpu;
+  Copy(*out_value, cpu_place, ctx, &out_cpu);
+  ctx.Wait();
+
+  auto* out_cpu_data = out_cpu.data<float>();
+  // input1 value
+  EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0);
+  // input2 value
+  EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
+
+  std::unique_ptr<Tensor> tensor1{new Tensor()};
+  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  functor(ctx, tensor1.get(), 3.0);
+
+  std::unique_ptr<Tensor> tensor2{new Tensor()};
+  tensor2->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+
+  SelectedRowsAddTensor<CUDADeviceContext, float> add_tensor_functor;
+  add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
+
+  Tensor tensor2_cpu;
+  Copy(*tensor2, cpu_place, ctx, &tensor2_cpu);
+  ctx.Wait();
+
+  auto* tensor2_cpu_data = tensor2_cpu.data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor2_cpu_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(tensor2_cpu_data[1 * row_numel + 1], 3.0);
+  // row4 : 1.0 + 3.0
+  EXPECT_EQ(tensor2_cpu_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(tensor2_cpu_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(tensor2_cpu_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor2_cpu_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(tensor2_cpu_data[9 * row_numel + 6], 5.0);
+}
+
+TEST(selected_rows_functor, gpu_add_to) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CUDAPlace gpu_place(0);
+  CPUPlace cpu_place;
+  CUDADeviceContext ctx(gpu_place);
+  SetConstant<CUDADeviceContext, float> functor;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  std::vector<int64_t> rows1{0, 4, 7};
+  std::unique_ptr<SelectedRows> selected_rows1{new SelectedRows(rows1, height)};
+  auto* in1_value = selected_rows1->mutable_value();
+  in1_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), gpu_place);
+  functor(ctx, in1_value, 1.0);
+
+  std::vector<int64_t> rows2{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> selected_rows2{new SelectedRows(rows2, height)};
+  auto* in2_value = selected_rows2->mutable_value();
+  in2_value->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows2.size()), row_numel}), gpu_place);
+  functor(ctx, in2_value, 2.0);
+
+  std::unique_ptr<SelectedRows> output{new SelectedRows()};
+  output->set_height(height);
+  auto* out_value = output->mutable_value();
+
+  // simply concatenate the two SelectedRows
+  out_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+
+  SelectedRowsAddTo<CUDADeviceContext, float> add_to_functor;
+  add_to_functor(ctx, *selected_rows1, 0, output.get());
+  add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get());
+
+  auto out_height = output->height();
+  EXPECT_EQ(out_height, height);
+
+  auto& out_rows = output->rows();
+
+  // input1 rows
+  EXPECT_EQ(out_rows[0], 0);
+  EXPECT_EQ(out_rows[1], 4);
+  EXPECT_EQ(out_rows[2], 7);
+  // input2 rows
+  EXPECT_EQ(out_rows[3], 0);
+  EXPECT_EQ(out_rows[4], 5);
+  EXPECT_EQ(out_rows[5], 7);
+  EXPECT_EQ(out_rows[6], 9);
+
+  Tensor out_cpu;
+  Copy(*out_value, cpu_place, ctx, &out_cpu);
+  ctx.Wait();
+
+  auto* out_cpu_data = out_cpu.data<float>();
+  // input1 value
+  EXPECT_EQ(out_cpu_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(out_cpu_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(out_cpu_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(out_cpu_data[2 * row_numel + 6], 1.0);
+  // input2 value
+  EXPECT_EQ(out_cpu_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(out_cpu_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(out_cpu_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(out_cpu_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(out_cpu_data[6 * row_numel + 9], 2.0);
+
+  std::unique_ptr<Tensor> tensor1{new Tensor()};
+  tensor1->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  functor(ctx, tensor1.get(), 3.0);
+
+  SelectedRowsAddToTensor<CUDADeviceContext, float> add_to_tensor_functor;
+  add_to_tensor_functor(ctx, *output, tensor1.get());
+
+  Tensor tensor1_cpu;
+  Copy(*tensor1, cpu_place, ctx, &tensor1_cpu);
+  ctx.Wait();
+
+  auto* tensor1_cpu_data = tensor1_cpu.data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_cpu_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(tensor1_cpu_data[1 * row_numel + 1], 3.0);
+  // row4 : 1.0 + 3.0
+  EXPECT_EQ(tensor1_cpu_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(tensor1_cpu_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(tensor1_cpu_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(tensor1_cpu_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(tensor1_cpu_data[9 * row_numel + 6], 5.0);
+}
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
new file mode 100644
index 0000000000..e459a42ca2
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence2batch.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class CopyMatrixRowsFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
+    auto src_dims = src.dims();
+    auto dst_dims = dst.dims();
+    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
+                      "The src must be a matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
+                      "The dst must be a matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
+                      "The widths of src and dst must be the same.");
+    auto height = dst_dims[0];
+    auto width = dst_dims[1];
+    auto* src_data = src.data<T>();
+    auto* dst_data = dst.data<T>();
+    for (int i = 0; i < height; ++i) {
+      if (is_src_index) {
+        memcpy(dst_data + i * width, src_data + index[i] * width,
+               width * sizeof(T));
+      } else {
+        memcpy(dst_data + index[i] * width, src_data + i * width,
+               width * sizeof(T));
+      }
+    }
+  }
+};
+
+template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, float>;
+template class CopyMatrixRowsFunctor<platform::CPUDeviceContext, double>;
+
+template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, float>;
+template class LoDTensor2BatchFunctor<platform::CPUDeviceContext, double>;
+template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, float>;
+template class Batch2LoDTensorFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
new file mode 100644
index 0000000000..452ae89510
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.cu
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
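+// Thread mapping used by the launch below (an explanatory note, not part of
+// the interface): each (blockIdx.x, threadIdx.y) pair owns one row id and
+// strides over rows by BlockDimY * GridDimX, while threadIdx.x strides over
+// a row's elements by BlockDimX. With grid(8, 1) and threads(128, 8), up to
+// 64 rows are copied concurrently.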
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
+                                     int64_t height, int64_t width,
+                                     bool is_src_index) {
+  int idx = threadIdx.x;
+  int idy = threadIdx.y;
+  int id = blockIdx.x + idy * GridDimX;
+  while (id < height) {
+    int src_idx = is_src_index ? index[id] : id;
+    int dst_idx = is_src_index ? id : index[id];
+    const T* src_data = src + src_idx * width;
+    T* dst_data = dst + dst_idx * width;
+    for (int i = idx; i < width; i += BlockDimX) {
+      dst_data[i] = src_data[i];
+    }
+    id += BlockDimY * GridDimX;
+  }
+}
+
+template <typename T>
+class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& src, const size_t* index,
+                  framework::Tensor& dst, bool is_src_index) {
+    auto src_dims = src.dims();
+    auto dst_dims = dst.dims();
+    PADDLE_ENFORCE_EQ(src_dims.size(), 2,
+                      "The src must be a matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
+                      "The dst must be a matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
+                      "The widths of src and dst must be the same.");
+    auto height = dst_dims[0];
+    auto width = dst_dims[1];
+    auto* src_data = src.data<T>();
+    auto* dst_data = dst.data<T>();
+
+    dim3 threads(128, 8);
+    dim3 grid(8, 1);
+    auto stream = context.stream();
+    CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
+        src_data, dst_data, index, height, width, is_src_index);
+  }
+};
+
+template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, float>;
+template class CopyMatrixRowsFunctor<platform::CUDADeviceContext, double>;
+
+template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, float>;
+template class LoDTensor2BatchFunctor<platform::CUDADeviceContext, double>;
+template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, float>;
+template class Batch2LoDTensorFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
new file mode 100644
index 0000000000..a5c43a2c7d
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.h
@@ -0,0 +1,169 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class CopyMatrixRowsFunctor {
+ public:
+  // If is_src_index is true,
+  // copy the indexed rows of input src to the output dst.
+  // If is_src_index is false,
+  // copy the input src to the indexed rows of output dst.
+  // The indexed rows are based on the input index.
+  void operator()(const DeviceContext& context, const framework::Tensor& src,
+                  const size_t* index, framework::Tensor& dst,
+                  bool is_src_index);
+};
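+
+// A minimal usage sketch (hypothetical tensors and context; both src and dst
+// must be 2-D with the same width, and dst must be pre-allocated):
+//
+//   std::vector<size_t> index = {2, 0, 1};
+//   CopyMatrixRowsFunctor<platform::CPUDeviceContext, float> copy_rows;
+//   // dst row i <- src row index[i]
+//   copy_rows(ctx, src, index.data(), dst, true /*is_src_index*/);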
+
+template <typename DeviceContext, typename T>
+class LoDTensor2BatchFunctor {
+  // Calculate the length of each sequence and
+  // sort sequence index by the length.
+  // example:  sequences = {s0, s1, s2}
+  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
+  //
+  struct SeqInfo {
+    SeqInfo(int start, int length, int seq_idx)
+        : start(start), length(length), seq_idx(seq_idx) {}
+    int start;
+    int length;
+    int seq_idx;
+  };
+
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::LoDTensor& lod_tensor,
+                  framework::LoDTensor& batch, bool is_cal_batch_lod,
+                  bool is_reverse = false) const {
+    if (!is_cal_batch_lod) {
+      auto lods = batch.lod();
+      PADDLE_ENFORCE_GT(lods.size(), 2UL);
+      PADDLE_ENFORCE_EQ(lods[1].size(),
+                        static_cast<size_t>(lod_tensor.dims()[0]));
+      CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
+      to_batch(context, lod_tensor, lods[1].data(), batch, true);
+      return;
+    }
+
+    auto lods = lod_tensor.lod();
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL,
+                      "Only one-level sequences are supported now.");
+    auto lod = lods[0];
+
+    std::vector<SeqInfo> seq_info;
+    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
+      int length = lod[seq_id + 1] - lod[seq_id];
+      seq_info.emplace_back(lod[seq_id], length, seq_id);
+    }
+
+    std::sort(seq_info.begin(), seq_info.end(),
+              [](SeqInfo a, SeqInfo b) { return a.length > b.length; });
+
+    // Calculate the start position of each batch.
+    // example:  sequences = {s0, s1, s2}
+    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+    //           num_batch = 5,
+    //           batchIndex = {b0, b1, b2, b3, b4}
+    //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
+    //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
+    //              batch_start_positions[0] = 0
+    //              batch_start_positions[1] = len(b0)
+    //              batch_start_positions[2] = len(b0) + len(b1)
+    //              batch_start_positions[3] = len(b0) + len(b1) + len(b2)
+    //              ...
+    //           seq2batch_idx[12] = {4, 0, 9,
+    //                                5, 1, 10,
+    //                                6, 2, 11,
+    //                                7, 3,
+    //                                8}
+    //           seq_order = {1, 0, 2}, the sort order.
+    //               where 1 is the second sequence,
+    //                     0 is the first sequence,
+    //                     2 is the third sequence.
+    // The num_batch is the batch size after rearranging the input LoDTensor;
+    // it also equals the maximum length of the input sequences.
+
+    paddle::framework::LoD batch_lods;
+    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
+
+    // batch_lods[0] is the start positions for batch LoDTensor
+    int num_batch = seq_info[0].length;
+    batch_lods[0].resize(static_cast<size_t>(num_batch + 1));
+    // batch_lods[1] is the raw index in the input LoDTensor
+    batch_lods[1].resize(static_cast<size_t>(lod_tensor.dims()[0]));
+    // batch_lods[2] is the sort order for the input LoDTensor.
+    batch_lods[2].resize(seq_info.size());
+
+    size_t* batch_starts = batch_lods[0].data();
+    size_t* seq2batch_idx = batch_lods[1].data();
+    batch_starts[0] = 0;
+    for (int n = 0; n < num_batch; n++) {
+      auto batch_id = static_cast<int>(batch_starts[n]);
+      for (size_t i = 0; i < seq_info.size(); ++i) {
+        int seq_len = seq_info[i].length;
+        int start = seq_info[i].start;
+        if (n < seq_len) {
+          seq2batch_idx[batch_id] =
+              is_reverse ? start + seq_len - 1 - n : start + n;
+          batch_id++;
+        } else {
+          break;
+        }
+      }
+      batch_starts[n + 1] = static_cast<size_t>(batch_id);
+    }
+    size_t* seq_order = batch_lods[2].data();
+    for (size_t i = 0; i < seq_info.size(); ++i) {
+      seq_order[i] = seq_info[i].seq_idx;
+    }
+    batch.set_lod(batch_lods);
+
+    CopyMatrixRowsFunctor<DeviceContext, T> to_batch;
+    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class Batch2LoDTensorFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::LoDTensor& batch,
+                  framework::LoDTensor& lod_tensor) const {
+    auto in_lod = batch.lod();
+    PADDLE_ENFORCE_GT(in_lod.size(), 2UL);
+    PADDLE_ENFORCE_EQ(in_lod[1].size(),
+                      static_cast<size_t>(lod_tensor.dims()[0]));
+    CopyMatrixRowsFunctor<DeviceContext, T> to_seq;
+    size_t* index = in_lod[1].data();
+    to_seq(context, batch, index, lod_tensor, false);
+  }
+};
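+
+// A round-trip sketch (hypothetical LoDTensors and context; `batch` receives
+// the three-level batch LoD from LoDTensor2BatchFunctor, which
+// Batch2LoDTensorFunctor then uses to restore the original order):
+//
+//   LoDTensor2BatchFunctor<platform::CPUDeviceContext, float> to_batch;
+//   to_batch(ctx, lod_tensor, batch, true /*is_cal_batch_lod*/);
+//   Batch2LoDTensorFunctor<platform::CPUDeviceContext, float> to_seq;
+//   to_seq(ctx, batch, lod_tensor);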
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_padding.cc b/paddle/operators/math/sequence_padding.cc
new file mode 100644
index 0000000000..2e69aa47eb
--- /dev/null
+++ b/paddle/operators/math/sequence_padding.cc
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_padding.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  bool norm_by_times) {
+    auto lod = seq.lod();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "The LoD of LoDTensor seq should not be null.");
+
+    const size_t level = 0;
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+
+    auto seq_dims = seq.dims();
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
+                      "The first dimension of LoDTensor seq should be "
+                      "equal to the sum of all sequences's length.");
+
+    auto padding_dims = padding.dims();
+    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
+                      "The input padding should be a 3-D Tensor of shape "
+                      "[max_sequence_length, num_sequences, sequence_width].");
+
+    const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
+    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
+                      "The first dimension of Tensor padding should be the "
+                      "maximum length of all sequences in LoDTensor seq.");
+
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
+                      "The second dimension of Tensor padding should be the "
+                      "number of sequences in LoDTensor seq.");
+
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
+                      "The third dimension of Tensor padding should be the "
+                      "width of sequence in LoDTensor seq.");
+
+    const T* seq_data = seq.data<T>();
+    T* padding_data = padding.data<T>();
+    for (int64_t i = 0; i < max_sequence_length; ++i) {
+      for (int64_t j = 0; j < num_sequences; ++j) {
+        int64_t start_pos = abs_offset_lod[level][j];
+        int64_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
+        if (i < sequence_length) {
+          // Here i < sequence_length, so sequence_length > 0 and the
+          // division below is safe.
+          T scale =
+              norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
+          for (int64_t k = 0; k < sequence_width; ++k) {
+            padding_data[(i * num_sequences + j) * sequence_width + k] =
+                seq_data[(start_pos + i) * sequence_width + k] * scale;
+          }
+        } else {
+          memset(padding_data + (i * num_sequences + j) * sequence_width, 0,
+                 sequence_width * sizeof(T));
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  bool norm_by_times) {
+    auto lod = seq.lod();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "The LoD of LoDTensor seq should not be null.");
+
+    const size_t level = 0;
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+
+    auto seq_dims = seq.dims();
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
+                      "The first dimension of LoDTensor seq should be "
+                      "equal to the sum of all sequences's length.");
+
+    auto padding_dims = padding.dims();
+    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
+                      "The input padding should be a 3-D Tensor of shape "
+                      "[max_sequnece_length, num_sequences, sequence_width].");
+
+    const int64_t max_sequence_length = MaximumSequenceLength(lod, level);
+    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
+                      "The first dimension of Tensor padding should be "
+                      "the maximum length of all sequences in LoDTensor seq.");
+
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
+                      "The second dimension of Tensor padding should be "
+                      "the number of sequences in LoDTensor seq.");
+
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
+                      "The third dimension of Tensor padding should be the "
+                      "width of sequence in LoDTensor seq.");
+
+    const T* padding_data = padding.data<T>();
+    T* seq_data = seq.data<T>();
+    for (int64_t i = 0; i < num_sequences; ++i) {
+      int64_t start_pos = abs_offset_lod[level][i];
+      int64_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
+      for (int64_t j = 0; j < sequence_length; ++j) {
+        // Here j < sequence_length, so sequence_length > 0 and the
+        // division below is safe.
+        T scale =
+            norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
+        for (int64_t k = 0; k < sequence_width; ++k) {
+          seq_data[(start_pos + j) * sequence_width + k] =
+              padding_data[(j * num_sequences + i) * sequence_width + k] *
+              scale;
+        }
+      }
+    }
+  }
+};
+
+template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
+template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_padding.cu b/paddle/operators/math/sequence_padding.cu
new file mode 100644
index 0000000000..a38df26f59
--- /dev/null
+++ b/paddle/operators/math/sequence_padding.cu
@@ -0,0 +1,211 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_padding.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
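+// The two bool template parameters pick one of four kernel instantiations at
+// compile time: NormByTimes scales each copied element by
+// 1 / sequence_length, and Padding selects the copy direction (sequence ->
+// padding when true, padding -> sequence when false). blockIdx.y selects the
+// sequence, blockIdx.x * blockDim.y + threadIdx.y selects the time step, and
+// threadIdx.x strides over sequence_width.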
+template <typename T, bool NormByTimes, bool Padding>
+__global__ void SequencePaddingKernel(T* padding, T* sequence,
+                                      const size_t* sequence_start_positions,
+                                      const size_t sequence_width,
+                                      const size_t max_sequence_length,
+                                      const size_t num_sequences) {
+  size_t padding_idx = blockIdx.y;
+  size_t start_pos = sequence_start_positions[padding_idx];
+  size_t sequence_length =
+      sequence_start_positions[padding_idx + 1] - start_pos;
+
+  size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y;
+  size_t padding_base_idx =
+      (sequence_idx * num_sequences + padding_idx) * sequence_width;
+  size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width;
+
+  if (sequence_idx < sequence_length) {
+    T scale = NormByTimes ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
+    if (Padding) {
+      /* sequence -> padding */
+      for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
+        padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i];
+      }
+    } else {
+      /* padding -> sequence */
+      for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
+        sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i];
+      }
+    }
+  } else if (sequence_idx < max_sequence_length) {
+    if (Padding) {
+      /* sequence -> padding */
+      for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
+        padding[padding_base_idx + i] = 0;
+      }
+    }
+  }
+}
+
+template <typename T>
+class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::LoDTensor& seq, framework::Tensor& padding,
+                  bool norm_by_times) {
+    auto lod = seq.lod();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "The lod of LoDTensor seq should not be null.");
+
+    const size_t level = 0;
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+
+    auto seq_dims = seq.dims();
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
+                      "The first dimension of LoDTensor seq should be "
+                      "equal to the sum of all sequences's length.");
+
+    auto padding_dims = padding.dims();
+    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
+                      "The input padding should be a 3-D Tensor of shape "
+                      "[max_sequence_length, num_sequences, sequence_width].");
+
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
+    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
+                      "The first dimension of Tensor padding should be the "
+                      "maximum length of all sequences in LoDTensor seq.");
+
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
+                      "The second dimension of Tensor padding should be the "
+                      "number of sequences in LoDTensor seq.");
+
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
+                      "The third dimension of Tensor padding should be the "
+                      "width of sequence in LoDTensor seq.");
+
+    if (!norm_by_times && num_sequences == 1UL) {
+      Copy(seq, context.GetPlace(), context, &padding);
+      padding.Resize(padding_dims);
+      return;
+    }
+
+    const int64_t kBlockSize = 512;
+
+    /* Use at least 32 threads to copy the sequence_width elements of a row,
+     * giving each thread roughly 8 elements to copy.
+     */
+    size_t block_dim_x =
+        std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
+    size_t block_dim_y = kBlockSize / block_dim_x;
+    dim3 threads(block_dim_x, block_dim_y);
+
+    size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
+    size_t grid_dim_y = num_sequences;
+    dim3 grid(grid_dim_x, grid_dim_y);
+
+    const T* seq_data = seq.data<T>();
+    T* padding_data = padding.data<T>();
+    if (norm_by_times) {
+      SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
+          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
+          sequence_width, max_sequence_length, num_sequences);
+    } else {
+      SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
+          padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
+          sequence_width, max_sequence_length, num_sequences);
+    }
+  }
+};
+
+template <typename T>
+class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  framework::LoDTensor& seq, const framework::Tensor& padding,
+                  bool norm_by_times) {
+    auto lod = seq.lod();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "The lod of LoDTensor seq should not be null.");
+
+    const size_t level = 0;
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+
+    auto seq_dims = seq.dims();
+    PADDLE_ENFORCE_EQ(seq_dims[0],
+                      static_cast<int64_t>(abs_offset_lod[level].back()),
+                      "The first dimension of LoDTensor seq should be "
+                      "equal to the sum of all sequences's length.");
+
+    auto padding_dims = padding.dims();
+    PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
+                      "The input padding should be a 3-D Tensor of shape "
+                      "[max_sequnece_length, num_sequences, sequence_width].");
+
+    int64_t max_sequence_length = MaximumSequenceLength(lod, level);
+    PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
+                      "The first dimension of Tensor padding should be "
+                      "the maximum length of all sequences in LoDTensor seq.");
+
+    const int64_t num_sequences = abs_offset_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
+                      "The second dimension of Tensor padding should be "
+                      "the number of sequences in LoDTensor seq.");
+
+    const int64_t sequence_width = seq.numel() / seq_dims[0];
+    PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
+                      "The third dimension of Tensor padding should be the "
+                      "width of sequence in LoDTensor seq.");
+
+    if (!norm_by_times && num_sequences == 1UL) {
+      Copy(padding, context.GetPlace(), context, &seq);
+      seq.Resize(seq_dims);
+      return;
+    }
+
+    const int64_t kBlockSize = 512;
+
+    /* Use at least 32 threads to copy the sequence_width elements of a row,
+     * giving each thread roughly 8 elements to copy.
+     */
+    size_t block_dim_x =
+        std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
+    size_t block_dim_y = kBlockSize / block_dim_x;
+    dim3 threads(block_dim_x, block_dim_y);
+
+    size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
+    size_t grid_dim_y = num_sequences;
+    dim3 grid(grid_dim_x, grid_dim_y);
+
+    const T* padding_data = padding.data<T>();
+    T* seq_data = seq.data<T>();
+    if (norm_by_times) {
+      SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
+          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
+          sequence_width, max_sequence_length, num_sequences);
+    } else {
+      SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
+          const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
+          sequence_width, max_sequence_length, num_sequences);
+    }
+  }
+};
+
+template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
+template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_padding.h b/paddle/operators/math/sequence_padding.h
new file mode 100644
index 0000000000..8f586c5eb4
--- /dev/null
+++ b/paddle/operators/math/sequence_padding.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
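+// Worked example: with lod = {{0, 2, 7, 10}} and level = 0, the sequence
+// lengths are 2, 5, and 3, so the maximum sequence length is 5.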
+inline static size_t MaximumSequenceLength(const framework::LoD& lod,
+                                           const size_t level) {
+  const size_t num_sequences = lod[level].size() - 1;
+  size_t max_sequence_length = 0;
+  framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+  for (size_t i = 0; i < num_sequences; ++i) {
+    max_sequence_length =
+        std::max(max_sequence_length,
+                 abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]);
+  }
+  return max_sequence_length;
+}
+
+/*
+ * \brief   Padding/Unpadding LoDTensor to/from normal Tensor of the shape
+ *          [max_sequence_length, num_sequences, sequence_width].
+ *
+ *  Padding sequence:
+ *        padding[i] = seq[lod[level][i]]
+ *  Unpadding sequence:
+ *        seq[lod[level][i]] = padding[i]
+ *
+ *  All sequences will be padded to the same length and stored in a transposed
+ * shape.
+ *  Example:
+ *    seq     (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
+ *    padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
+ *
+ * \param context       device context of this functor.
+ * \param seq           LoDTensor which is stored in sequence format, the shape
+ *                      is [total_sequence_length, sequence_width] where
+ *                      total_sequence_length is the sum of all sequences'
+ *                      lengths.
+ * \param padding       Tensor which is padded to the same length, the shape is
+ *                      [max_sequence_length, num_sequences, sequence_width].
+ * \param norm_by_times whether to divide each sequence's elements by its
+ *                      length.
+ *
+ * \note  The transposition is also done in this functor.
+ */
+template <typename DeviceContext, typename T>
+class PaddingLoDTensorFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
+                  framework::Tensor& padding, bool norm_by_times);
+};
+
+template <typename DeviceContext, typename T>
+class UnpaddingLoDTensorFunctor {
+ public:
+  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
+                  const framework::Tensor& padding, bool norm_by_times);
+};
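+
+// A minimal round-trip sketch (hypothetical tensors and context; `padding`
+// must be pre-allocated with shape [max_sequence_length, num_sequences,
+// sequence_width], as done in sequence_padding_test.cc):
+//
+//   PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>()(
+//       ctx, seq, padding, false /*norm_by_times*/);
+//   UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>()(
+//       ctx, seq_back, padding, false /*norm_by_times*/);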
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_padding_test.cc b/paddle/operators/math/sequence_padding_test.cc
new file mode 100644
index 0000000000..3e504f4a15
--- /dev/null
+++ b/paddle/operators/math/sequence_padding_test.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_padding.h"
+#include <gtest/gtest.h>
+
+template <typename DeviceContext, typename Place, typename T>
+void TestSequencePadding(const paddle::framework::LoD& lod,
+                         const size_t sequence_width) {
+  paddle::framework::LoDTensor cpu_seq;
+  paddle::framework::LoDTensor cpu_seq_back;
+  paddle::framework::LoDTensor seq;
+  paddle::framework::LoDTensor seq_back;
+  paddle::framework::Tensor padding;
+
+  const size_t level = lod.size() - 1;
+  auto seq_dims =
+      paddle::framework::make_ddim({static_cast<int64_t>(lod[level].back()),
+                                    static_cast<int64_t>(sequence_width)});
+
+  cpu_seq.set_lod(lod);
+  cpu_seq.mutable_data<T>(seq_dims, paddle::platform::CPUPlace());
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
+    cpu_seq.data<T>()[i] = static_cast<T>(i);
+  }
+
+  auto* place = new Place();
+  DeviceContext* context = new DeviceContext(*place);
+  if (paddle::platform::is_cpu_place(*place)) {
+    seq = cpu_seq;
+  } else {
+    Copy(cpu_seq, *place, *context, &seq);
+    seq.set_lod(lod);
+  }
+
+  const size_t max_sequence_length =
+      paddle::operators::math::MaximumSequenceLength(lod, level);
+  const size_t num_sequences = lod[level].size() - 1;
+  auto padding_dims =
+      paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
+                                    static_cast<int64_t>(num_sequences),
+                                    static_cast<int64_t>(sequence_width)});
+  padding.mutable_data<T>(padding_dims, *place);
+  paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
+      *context, seq, padding, false);
+
+  seq_back.set_lod(lod);
+  seq_back.mutable_data<T>(seq_dims, *place);
+  paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
+      *context, seq_back, padding, false);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    cpu_seq_back = seq_back;
+  } else {
+    Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
+    cpu_seq_back.set_lod(lod);
+  }
+
+  EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel());
+  EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims());
+  for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
+    EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
+  }
+
+  delete place;
+  delete context;
+}
+
+TEST(SequencePadding, CPU) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePadding<paddle::platform::CPUDeviceContext,
+                      paddle::platform::CPUPlace, float>(lod1, 16);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePadding<paddle::platform::CPUDeviceContext,
+                      paddle::platform::CPUPlace, float>(lod2, 128);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(SequencePadding, CUDA) {
+  paddle::framework::LoD lod1;
+  lod1.push_back(std::vector<size_t>{0, 10});
+  TestSequencePadding<paddle::platform::CUDADeviceContext,
+                      paddle::platform::CUDAPlace, float>(lod1, 16);
+
+  paddle::framework::LoD lod2;
+  lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
+  TestSequencePadding<paddle::platform::CUDADeviceContext,
+                      paddle::platform::CUDAPlace, float>(lod2, 128);
+}
+#endif
diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc
new file mode 100644
index 0000000000..8fb92b1a13
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_pooling.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), 1);
+    PADDLE_ENFORCE_GT(out_dims.size(), 1);
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t k = 0; k < dim; ++k) {
+        out_data[i * dim + k] = in_data[starts[i] * dim + k];
+        max_index[i * dim + k] = starts[i];
+      }
+      for (size_t j = starts[i] + 1; j < starts[i + 1]; ++j) {
+        for (int64_t k = 0; k < dim; ++k) {
+          if (in_data[j * dim + k] > out_data[i * dim + k]) {
+            out_data[i * dim + k] = in_data[j * dim + k];
+            max_index[i * dim + k] = j;
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto ig_dims = in_grad->dims();
+    auto idx_dims = index.dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), 1);
+    PADDLE_ENFORCE_GT(ig_dims.size(), 1);
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CPUDeviceContext, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+    for (int64_t i = 0; i < num_seq; ++i) {
+      for (int64_t j = 0; j < dim; ++j) {
+        int step_id = max_index[i * dim + j];
+        ig_data[step_id * dim + j] = og_data[i * dim + j];
+      }
+    }
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CPUDeviceContext, float>;
+template class MaxSeqPoolFunctor<platform::CPUDeviceContext, double>;
+template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, float>;
+template class MaxSeqPoolGradFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu
new file mode 100644
index 0000000000..4c9e6b375c
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.cu
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
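+// One thread block per sequence: threads stride over the feature dimension,
+// and each thread scans its sequence's time steps to record both the maximum
+// value and the step index that produced it (consumed by the gradient
+// kernel below).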
+template <typename T>
+__global__ void KeMaxSequencePool(const T* input, const size_t* starts,
+                                  T* output, int* index, int64_t num_seq,
+                                  int64_t dim) {
+  int dim_idx = threadIdx.x;
+  int seq_id = blockIdx.x;
+  if (seq_id >= num_seq) return;
+  size_t start = starts[seq_id];
+  size_t end = starts[seq_id + 1];
+
+  for (int64_t i = dim_idx; i < dim; i += blockDim.x) {
+    T max_val = static_cast<T>(-FLT_MAX);
+    int max_id = -1;
+    for (size_t step_id = start; step_id < end; step_id++) {
+      if (max_val < input[step_id * dim + i]) {
+        max_val = input[step_id * dim + i];
+        max_id = step_id;
+      }
+    }
+    output[seq_id * dim + i] = max_val;
+    index[seq_id * dim + i] = max_id;
+  }
+}
+
+template <typename T>
+class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index) {
+    auto in_dims = input.dims();
+    auto out_dims = output->dims();
+    auto idx_dims = index->dims();
+    PADDLE_ENFORCE_GT(in_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(out_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < in_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, out_dims);
+
+    auto starts = input.lod()[0];
+    const T* in_data = input.data<T>();
+    T* out_data = output->data<T>();
+    int* max_index = index->data<int>();
+
+    int64_t num_seq = out_dims[0];
+    int64_t dim = output->numel() / num_seq;
+
+    dim3 threads(256, 1);
+    dim3 grid(num_seq, 1);
+    auto stream = context.stream();
+    KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
+        in_data, starts.data(), out_data, max_index, num_seq, dim);
+  }
+};
+
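+// The gradient is sparse: every output-gradient element is scattered back to
+// the single input position recorded in max_index; all other input-gradient
+// entries keep the zero written by set_zero in the functor below.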
+template <typename T>
+__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index,
+                                      T* in_grad, int64_t num_seq,
+                                      int64_t dim) {
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  int col_idx = idx % dim;
+  if (idx < num_seq * dim) {
+    int step_id = max_index[idx];
+    in_grad[step_id * dim + col_idx] = out_grad[idx];
+  }
+}
+
+template <typename T>
+class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad) {
+    auto og_dims = out_grad.dims();
+    auto idx_dims = index.dims();
+    auto ig_dims = in_grad->dims();
+    PADDLE_ENFORCE_GT(og_dims.size(), static_cast<int64_t>(1));
+    PADDLE_ENFORCE_GT(ig_dims.size(), static_cast<int64_t>(1));
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]);
+    }
+    PADDLE_ENFORCE_EQ(idx_dims, og_dims);
+
+    const T* og_data = out_grad.data<T>();
+    const int* max_index = index.data<int>();
+    T* ig_data = in_grad->data<T>();
+
+    SetConstant<platform::CUDADeviceContext, T> set_zero;
+    set_zero(context, in_grad, static_cast<T>(0.0));
+    int64_t num_seq = og_dims[0];
+    int64_t dim = out_grad.numel() / num_seq;
+
+    unsigned int blocks = (num_seq * dim + 128 - 1) / 128;
+    dim3 threads(128, 1);
+    dim3 grid(blocks, 1);
+    auto stream = context.stream();
+    KeMaxSequencePoolGrad<T><<<grid, threads, 0, stream>>>(
+        og_data, max_index, ig_data, num_seq, dim);
+  }
+};
+
+template class MaxSeqPoolFunctor<platform::CUDADeviceContext, float>;
+template class MaxSeqPoolFunctor<platform::CUDADeviceContext, double>;
+template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, float>;
+template class MaxSeqPoolGradFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h
new file mode 100644
index 0000000000..13ffb2ebef
--- /dev/null
+++ b/paddle/operators/math/sequence_pooling.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+#define FLT_MAX __FLT_MAX__
+
+template <typename DeviceContext, typename T>
+class MaxSeqPoolFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::LoDTensor& input, framework::Tensor* output,
+                  framework::Tensor* index);
+};
+
+template <typename DeviceContext, typename T>
+class MaxSeqPoolGradFunctor {
+ public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& out_grad,
+                  const framework::Tensor& index,
+                  framework::LoDTensor* in_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_scale.cc b/paddle/operators/math/sequence_scale.cc
new file mode 100644
index 0000000000..7e439e9a2c
--- /dev/null
+++ b/paddle/operators/math/sequence_scale.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_scale.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T>
+class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  framework::LoDTensor& seq, const T* scales) {
+    const size_t level = 0;
+    auto lod = seq.lod();
+    const size_t num_seq = lod[level].size() - 1;
+    size_t seq_width = seq.dims()[1];
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+
+    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+    for (size_t i = 0; i < num_seq; ++i) {
+      for (size_t j = abs_offset_lod[level][i] * seq_width;
+           j < abs_offset_lod[level][i + 1] * seq_width; ++j) {
+        seq_data[j] *= scales[i];
+      }
+    }
+  }
+};
+
+template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_scale.cu b/paddle/operators/math/sequence_scale.cu
new file mode 100644
index 0000000000..ceaabd8e0f
--- /dev/null
+++ b/paddle/operators/math/sequence_scale.cu
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence_scale.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
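+// One thread block per sequence: block blockIdx.x covers the flattened
+// element range [lod[blockIdx.x] * seq_width, lod[blockIdx.x + 1] *
+// seq_width) with a stride of BlockSize, multiplying each element by
+// scales[blockIdx.x].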
+template <typename T, int BlockSize>
+__global__ void SequenceScaleKernel(T* seq, size_t* lod, const T* scales,
+                                    const size_t seq_width) {
+  for (int i = threadIdx.x;
+       i < (lod[blockIdx.x + 1] - lod[blockIdx.x]) * seq_width;
+       i += BlockSize) {
+    int idx = lod[blockIdx.x] * seq_width + i;
+    seq[idx] *= scales[blockIdx.x];
+  }
+}
+
+template <typename T>
+class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  framework::LoDTensor& seq, const T* scales) {
+    const size_t level = 0;
+    auto lod = seq.lod();
+    const size_t num_seq = lod[level].size() - 1;
+    const size_t seq_width = seq.numel() / seq.dims()[0];
+    framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
+    T* seq_data = seq.mutable_data<T>(context.GetPlace());
+
+    SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
+        num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
+        seq_data, abs_offset_lod[level].data(), scales, seq_width);
+  }
+};
+
+template class ScaleLoDTensorFunctor<platform::CUDADeviceContext, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence_scale.h b/paddle/operators/math/sequence_scale.h
new file mode 100644
index 0000000000..ecd9a57c3f
--- /dev/null
+++ b/paddle/operators/math/sequence_scale.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * \brief   Scale each sequence of a LoDTensor.
+ *
+ *  The i-th sequence is scaled element-wise by scales[i].
+ *  Example:
+ *    Given:
+ *      seq = (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
+ *      scales = (2, 3, 4, 5)
+ *    then:
+ *      result = (2*s0, 2*s0, 2*s0, 2*s0; 3*s1, 3*s1; 4*s2, 4*s2, 4*s2; 5*s3)
+ *
+ * \param context       Device context of this functor.
+ * \param seq           LoDTensor which is stored in sequence format, the shape
+ *                      is [total_sequence_length, sequence_width] where
+ *                      total_sequence_length is the sum of all sequences'
+ *                      lengths.
+ * \param scales        Array<T>. The i-th sequence will be scaled by scales[i].
+ */
+template <typename DeviceContext, typename T>
+class ScaleLoDTensorFunctor {
+ public:
+  void operator()(const DeviceContext& context, framework::LoDTensor& seq,
+                  const T* scales);
+};
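+
+// A minimal usage sketch (hypothetical data and context; `scales` must hold
+// one factor per sequence and be accessible from the device the context
+// runs on):
+//
+//   std::vector<float> scales = {2.f, 3.f, 4.f, 5.f};
+//   ScaleLoDTensorFunctor<platform::CPUDeviceContext, float> scale_seq;
+//   scale_seq(ctx, seq, scales.data());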
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
new file mode 100644
index 0000000000..72f10f35f4
--- /dev/null
+++ b/paddle/operators/math/softmax.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/softmax.h"
+#include "paddle/operators/math/softmax_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SoftmaxFunctor<platform::CPUDeviceContext, float>;
+template class SoftmaxFunctor<platform::CPUDeviceContext, double>;
+template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
+template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
new file mode 100644
index 0000000000..9e73f6a371
--- /dev/null
+++ b/paddle/operators/math/softmax.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/math/softmax.h"
+#include "paddle/operators/math/softmax_impl.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
+template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
+template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
new file mode 100644
index 0000000000..471f44d340
--- /dev/null
+++ b/paddle/operators/math/softmax.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename DeviceContext, typename T>
+class SoftmaxFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor* X,
+                  framework::Tensor* Y);
+};
+
+template <typename DeviceContext, typename T>
+class SoftmaxGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor* y,
+                  const framework::Tensor* y_grad, framework::Tensor* x_grad);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h
new file mode 100644
index 0000000000..82f597ff79
--- /dev/null
+++ b/paddle/operators/math/softmax_impl.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
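+// Clamps very negative shifted logits: after the max-shift below, all values
+// are <= 0, and anything under -64 contributes a vanishing exp() term, so
+// clipping keeps the exponentials away from floating-point underflow.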
+template <typename T>
+struct ValueClip {
+  HOSTDEVICE T operator()(const T& x) const {
+    const T kThreshold = -64.;
+    return x < kThreshold ? kThreshold : x;
+  }
+};
+
+template <typename DeviceContext, typename T>
+void SoftmaxFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
+                                                  const framework::Tensor* X,
+                                                  framework::Tensor* Y) {
+  auto logits = EigenMatrix<T>::From(*X);
+  auto softmax = EigenMatrix<T>::From(*Y);
+
+  const int kBatchDim = 0;
+  const int kClassDim = 1;
+
+  const int batch_size = logits.dimension(kBatchDim);
+  const int num_classes = logits.dimension(kClassDim);
+
+  Eigen::DSizes<int, 1> along_class(kClassDim);
+  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
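+  // Numerically stable softmax: subtract each row's maximum so the largest
+  // exponent is 0 before calling exp().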
+  auto shifted_logits = (logits -
+                         logits.maximum(along_class)
+                             .eval()
+                             .reshape(batch_by_one)
+                             .broadcast(one_by_class))
+                            .unaryExpr(ValueClip<T>());
+
+  softmax.device(*context.eigen_device()) = shifted_logits.exp();
+  softmax.device(*context.eigen_device()) = (softmax *
+                                             softmax.sum(along_class)
+                                                 .inverse()
+                                                 .eval()
+                                                 .reshape(batch_by_one)
+                                                 .broadcast(one_by_class));
+}
+
+template <typename DeviceContext, typename T>
+void SoftmaxGradFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor* y,
+    const framework::Tensor* y_grad, framework::Tensor* x_grad) {
+  auto softmax = EigenMatrix<T>::From(*y);
+  auto softmax_grad = EigenMatrix<T>::From(*y_grad);
+  auto logits_grad = EigenMatrix<T>::From(*x_grad);
+
+  const int kBatchDim = 0;
+  const int kClassDim = 1;
+
+  const int batch_size = softmax.dimension(kBatchDim);
+  const int num_classes = softmax.dimension(kClassDim);
+
+  Eigen::DSizes<int, 1> along_class(kClassDim);
+  Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+  Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
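+  // Softmax backward: dX = (dY - sum(Y * dY, class)) * Y, computed row-wise
+  // below as (softmax_grad - dot) * softmax.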
+  auto dot = (softmax * softmax_grad)
+                 .sum(along_class)
+                 .eval()
+                 .reshape(batch_by_one)
+                 .broadcast(one_by_class);
+  logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax;
+}
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc
new file mode 100644
index 0000000000..ecd3a647e0
--- /dev/null
+++ b/paddle/operators/math/unpooling.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
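+// Max-unpooling forward: each input value is scattered to the position
+// recorded in `indices` (the argmax offsets saved by max-pooling). All
+// tensors are in NCHW format.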
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "invalid unpooling index!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "invalid unpooling index!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUDeviceContext, double>;
+template class Unpool2dMaxFunctor<platform::CPUDeviceContext, float>;
+template class Unpool2dMaxFunctor<platform::CPUDeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu
new file mode 100644
index 0000000000..ecbde0f6a7
--- /dev/null
+++ b/paddle/operators/math/unpooling.cu
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
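+// Each thread walks the input elements in a grid-stride loop; the value in
+// indices_data is an offset within a single output feature map
+// (output_height * output_width).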
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    output_data[out_offset + out_index] = input_data[i];
+  }
+}
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    input_grad[i] = output_grad[out_offset + out_index];
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<T><<<grid, threads, 0, context.stream()>>>(
+        input.numel(), input_data, indices_data, input_height, input_width,
+        output_channels, output_data, output_height, output_width);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<T><<<grid, threads, 0, context.stream()>>>(
+        input.numel(), input_data, indices_data, input_height, input_width,
+        output_channels, output_data, output_grad_data, output_height,
+        output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, float>;
+template class Unpool2dMaxGradFunctor<platform::CUDADeviceContext, double>;
+template class Unpool2dMaxFunctor<platform::CUDADeviceContext, float>;
+template class Unpool2dMaxFunctor<platform::CUDADeviceContext, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h
new file mode 100644
index 0000000000..0f0ff1371e
--- /dev/null
+++ b/paddle/operators/math/unpooling.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
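+// Max-unpooling functors: the forward scatters input values to the argmax
+// offsets stored in `indices`; the gradient functor gathers output gradients
+// back from those offsets.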
+template <typename DeviceContext, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename DeviceContext, typename T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
new file mode 100644
index 0000000000..d574ed9234
--- /dev/null
+++ b/paddle/operators/math/vol2col.cc
@@ -0,0 +1,200 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col->dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col->dims()[1];
+    int filter_height = col->dims()[2];
+    int filter_width = col->dims()[3];
+    int output_depth = col->dims()[4];
+    int output_height = col->dims()[5];
+    int output_width = col->dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatched.");
+
+    const T* vol_data = vol.data<T>();
+    T* col_data = col->data<T>();
+
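+    // Each column channel c decomposes into (c_in, d_offset, h_offset,
+    // w_offset); positions that fall into the padding are written as zero.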
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int c_in = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
+
+            int col_idx =
+                ((c * output_depth + d) * output_height + h) * output_width + w;
+            int vol_idx =
+                ((c_in * input_depth + d_pad) * input_height + h_pad) *
+                    input_width +
+                w_pad;
+            col_data[col_idx] =
+                (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                 w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
+                    ? static_cast<T>(0)
+                    : vol_data[vol_idx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::CPUDeviceContext, T> {
+ public:
+  void operator()(const platform::CPUDeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const {
+    PADDLE_ENFORCE(vol->dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol->dims()[0];
+    int input_depth = vol->dims()[1];
+    int input_height = vol->dims()[2];
+    int input_width = vol->dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatched.");
+    T* vol_data = vol->data<T>();
+    const T* col_data = col.data<T>();
+
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int cIm = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * strides[0] - paddings[0] + d_offset * dilations[0];
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * strides[1] - paddings[1] + h_offset * dilations[1];
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * strides[2] - paddings[2] + w_offset * dilations[2];
+
+            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
+                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
+              int vol_idx =
+                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
+                      input_width +
+                  w_pad;
+
+              int col_idx =
+                  ((c * output_depth + d) * output_height + h) * output_width +
+                  w;
+              vol_data[vol_idx] += col_data[col_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
+template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
+template class Col2VolFunctor<platform::CPUDeviceContext, float>;
+template class Col2VolFunctor<platform::CPUDeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
new file mode 100644
index 0000000000..b029442fe4
--- /dev/null
+++ b/paddle/operators/math/vol2col.cu
@@ -0,0 +1,262 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+__global__ void vol2col(int num_kernels, const T* data_vol, int depth,
+                        int height, int width, int dilation_d, int dilation_h,
+                        int dilation_w, int filter_depth, int filter_height,
+                        int filter_width, int stride_depth, int stride_height,
+                        int stride_width, int padding_depth, int padding_height,
+                        int padding_width, int output_depth, int output_height,
+                        int output_width, T* data_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % output_width;
+    int h_out = (index / output_width) % output_height;
+    int d_out = (index / output_width / output_height) % output_depth;
+    int channel_in = index / output_width / output_height / output_depth;
+    int channel_out = channel_in * filter_depth * filter_height * filter_width;
+    int w_in = w_out * stride_width - padding_width;
+    int h_in = h_out * stride_height - padding_height;
+    int d_in = d_out * stride_depth - padding_depth;
+
+    data_col += ((channel_out * output_depth + d_out) * output_height + h_out) *
+                    output_width +
+                w_out;
+    data_vol += ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filter_depth; ++k) {
+      for (int i = 0; i < filter_height; ++i) {
+        for (int j = 0; j < filter_width; ++j) {
+          int d = d_in + k * dilation_d;
+          int h = h_in + i * dilation_h;
+          int w = w_in + j * dilation_w;
+          int col_idx = (k * dilation_d * height + i * dilation_h) * width +
+                        j * dilation_w;
+          *data_col = (d >= 0 && d < depth && h >= 0 && h < height && w >= 0 &&
+                       w < width)
+                          ? data_vol[col_idx]
+                          : 0;
+          data_col += output_depth * output_height * output_width;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * im = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col->dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col->dims()[1];
+    int filter_height = col->dims()[2];
+    int filter_width = col->dims()[3];
+    int output_depth = col->dims()[4];
+    int output_height = col->dims()[5];
+    int output_width = col->dims()[6];
+
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatched.");
+
+    int num_outputs =
+        input_channels * output_depth * output_height * output_width;
+
+    const int threads = 1024;
+    const int blocks = (num_outputs + threads - 1) / threads;
+    vol2col<T><<<blocks, threads, 0, context.stream()>>>(
+        num_outputs, vol.data<T>(), input_depth, input_height, input_width,
+        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
+        filter_width, strides[0], strides[1], strides[2], paddings[0],
+        paddings[1], paddings[2], output_depth, output_height, output_width,
+        col->data<T>());
+  }
+};
+
+template <class T>
+__global__ void col2vol(int num_kernels, const T* data_col, int depth,
+                        int height, int width, int dilation_d, int dilation_h,
+                        int dilation_w, int filter_depth, int filter_height,
+                        int filter_width, int stride_depth, int stride_height,
+                        int stride_width, int padding_depth, int padding_height,
+                        int padding_width, int output_depth, int output_height,
+                        int output_width, T* data_vol) {
+  const int d_filter_depth = dilation_d * (filter_depth - 1) + 1;
+  const int d_filter_height = dilation_h * (filter_height - 1) + 1;
+  const int d_filter_width = dilation_w * (filter_width - 1) + 1;
+
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    T src_val = 0;
+    int w = index % width + padding_width;
+    int h = (index / width) % height + padding_height;
+    int d = (index / width / height) % depth + padding_depth;
+    int c = index / width / height / depth;
+
+    // compute the start and end of the output
+    int w_col_start =
+        (w < d_filter_width) ? 0 : (w - d_filter_width) / stride_width + 1;
+    int w_col_end = min(w / stride_width + 1, output_width);
+    int h_col_start =
+        (h < d_filter_height) ? 0 : (h - d_filter_height) / stride_height + 1;
+    int h_col_end = min(h / stride_height + 1, output_height);
+    int d_col_start =
+        (d < d_filter_depth) ? 0 : (d - d_filter_depth) / stride_depth + 1;
+    int d_col_end = min(d / stride_depth + 1, output_depth);
+
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          int d_off = (d - d_col * stride_depth);
+          int h_off = (h - h_col * stride_height);
+          int w_off = (w - w_col * stride_width);
+          if (d_off % dilation_d == 0 && h_off % dilation_h == 0 &&
+              w_off % dilation_w == 0) {
+            d_off /= dilation_d;
+            h_off /= dilation_h;
+            w_off /= dilation_w;
+
+            int data_col_index =
+                (((((c * filter_depth + d_off) * filter_height + h_off) *
+                       filter_width +
+                   w_off)));
+            data_col_index =
+                ((data_col_index * output_depth + d_col) * output_height +
+                 h_col) *
+                    output_width +
+                w_col;
+            src_val += data_col[data_col_index];
+          }
+        }
+      }
+    }
+    data_vol[index] = src_val;
+  }
+}
+
+/*
+ * im = [input_channels, input_depth, input_height, input_width]
+ * col =
+ *   [input_channels, filter_depth, filter_height, filter_width,
+ *                    output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const {
+    PADDLE_ENFORCE(vol->dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol->dims()[0];
+    int input_depth = vol->dims()[1];
+    int input_height = vol->dims()[2];
+    int input_width = vol->dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+
+    PADDLE_ENFORCE_EQ((input_depth + 2 * paddings[0] -
+                       ((dilations[0] * (filter_depth - 1) + 1))) /
+                              strides[0] +
+                          1,
+                      output_depth,
+                      "input_depth and output_depth are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_height + 2 * paddings[1] -
+                       ((dilations[1] * (filter_height - 1) + 1))) /
+                              strides[1] +
+                          1,
+                      output_height,
+                      "input_height and output_height are "
+                      "mismatched.");
+    PADDLE_ENFORCE_EQ((input_width + 2 * paddings[2] -
+                       ((dilations[2] * (filter_width - 1) + 1))) /
+                              strides[2] +
+                          1,
+                      output_width,
+                      "input_width and output_width are "
+                      "mismatched.");
+
+    int num_kernels = input_channels * input_depth * input_height * input_width;
+
+    const int threads = 1024;
+    const int blocks = (num_kernels + threads - 1) / threads;
+
+    col2vol<T><<<blocks, threads, 0, context.stream()>>>(
+        num_kernels, col.data<T>(), input_depth, input_height, input_width,
+        dilations[0], dilations[1], dilations[2], filter_depth, filter_height,
+        filter_width, strides[0], strides[1], strides[2], paddings[0],
+        paddings[1], paddings[2], output_depth, output_height, output_width,
+        vol->data<T>());
+  }
+};
+
+template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
+template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
+template class Col2VolFunctor<platform::CUDADeviceContext, float>;
+template class Col2VolFunctor<platform::CUDADeviceContext, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
new file mode 100644
index 0000000000..dcd80370e8
--- /dev/null
+++ b/paddle/operators/math/vol2col.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * \brief Converts 4-D feature data (CDHW) into 7-D colData in the
+ *        Vol2ColFunctor calculation; the Col2VolFunctor calculation
+ *        reverses the transformation.
+ *
+ * \param volData   Vol data.
+ * \param volShape  The shape of volData,
+ *                 [input_channels, input_depth, input_height, input_width].
+ * \param colData  Column data.
+ * \param colShape The shape of colData.
+ *
+ * \param dilations  Dilation data, 3-dimensional:
+ *                   [dilation_depth, dilation_height, dilation_width].
+ *
+ * \param strides    Stride data, 3-dimensional:
+ *                   [stride_depth, stride_height, stride_width].
+ *
+ * \param paddings   Padding data, 3-dimensional:
+ *                   [d_pad, h_pad, w_pad].
+ *
+ * The shape of colData is:
+ * [input_channels, filter_depth, filter_height, filter_width, output_depth,
+ * output_height, output_width]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of the convolution matrix is [height, width], where the height
+ * equals input_channels * filter_depth * filter_height * filter_width, and
+ * the width equals output_depth * output_height * output_width.
+ *
+ * Reshape:
+ *     shape of colData           shape of convolution matrix
+ *     [input_channels,
+ *      filter_depth,
+ *      filter_height,
+ *      filter_width,      ======>      [height, width]
+ *      output_depth,
+ *      output_height,
+ *      output_width]
+ *
+ * \note The caller needs to ensure that volShape.inputChannels is equal to
+ *       colShape.inputChannels.
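+ *
+ * A worked example with hypothetical sizes: for vol = [C=3, D=4, H=8, W=8]
+ * with a 2x2x2 filter, stride 1 and no padding, colData has shape
+ * [3, 2, 2, 2, 3, 7, 7], which reshapes to a 24 x 147 convolution matrix
+ * (24 = 3*2*2*2, 147 = 3*7*7).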
+ */
+template <typename DeviceContext, typename T>
+class Vol2ColFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& vol,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* col) const;
+};
+
+template <typename DeviceContext, typename T>
+class Col2VolFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
+                  const std::vector<int>& dilations,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
+                  framework::Tensor* vol) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
new file mode 100644
index 0000000000..7a308ca814
--- /dev/null
+++ b/paddle/operators/math/vol2col_test.cc
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+template <typename DeviceContext, typename Place>
+void testVol2col() {
+  paddle::framework::Tensor input;
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor output;
+  paddle::framework::Tensor output_tmp;
+
+  auto* place = new Place();
+  DeviceContext* context = new DeviceContext(*place);
+
+  /**
+   * input = [[0, 1, 2,
+   *          3, 4, 5]
+   *          [6, 7, 8,
+   *          9, 10, 11]]
+   *
+   * output = [0, 1
+   *           1, 2
+   *           3, 4
+   *           4, 5
+   *           6, 7
+   *           7, 8
+   *           9, 10
+   *           10, 11]
+   *
+   * col2vol = [[0, 2, 2,
+   *             3, 8, 5]
+   *            [6, 14, 8,
+   *             9, 20, 11]]
+   *
+   */
+  int input_depth = 2;
+  int input_height = 2;
+  int input_width = 3;
+  int filter_size = 2;
+  std::vector<int> strides({1, 1, 1});
+  std::vector<int> paddings({0, 0, 0});
+  std::vector<int> dilations({1, 1, 1});
+  int output_depth =
+      (input_depth - filter_size + 2 * paddings[0]) / strides[0] + 1;
+  int output_height =
+      (input_height - filter_size + 2 * paddings[1]) / strides[1] + 1;
+  int output_width =
+      (input_width - filter_size + 2 * paddings[2]) / strides[2] + 1;
+
+  // Vol2Col test
+  float* input_ptr =
+      input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
+                                    paddle::platform::CPUPlace());
+  float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input_ptr, arr, 12 * sizeof(float));
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    Copy(input_tmp, *place, *context, &input);
+  }
+  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
+                              output_depth, output_height, output_width},
+                             *place);
+
+  paddle::operators::math::Vol2ColFunctor<DeviceContext, float> vol2col;
+  vol2col(*context, input, dilations, strides, paddings, &output);
+
+  float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
+  float* out_cfo_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_cfo_ptr = output.data<float>();
+  } else {
+    Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
+    out_cfo_ptr = output_tmp.data<float>();
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
+  }
+
+  // Col2Vol test
+  float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
+  memset(input_ptr, 0, 12 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    Copy(input_tmp, *place, *context, &input);
+  }
+
+  paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
+  col2vol(*context, output, dilations, strides, paddings, &input);
+
+  float* in_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    in_ptr = input_tmp.data<float>();
+  }
+
+  for (int i = 0; i < 12; ++i) {
+    EXPECT_EQ(in_ptr[i], col_2_vol[i]);
+  }
+}
+
+TEST(math, vol2col) {
+  testVol2col<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
+#ifdef PADDLE_WITH_CUDA
+  testVol2col<paddle::platform::CUDADeviceContext,
+              paddle::platform::CUDAPlace>();
+#endif  // PADDLE_WITH_CUDA
+}
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
new file mode 100644
index 0000000000..3336978c8d
--- /dev/null
+++ b/paddle/operators/matmul_op.cc
@@ -0,0 +1,244 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/matmul_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class MatMulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of MatMulOp should not be null.");
+    PADDLE_ENFORCE(context->HasInput("Y"),
+                   "Input(Y) of MatMulOp should not be null.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "Output(Out) of MatMulOp should not be null.");
+
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    bool transpose_x = context->Attrs().Get<bool>("transpose_X");
+    bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
+
+    PADDLE_ENFORCE_GE(dim_x.size(), 1,
+                      "Input tensor X must be at least 1-dimensional.");
+    PADDLE_ENFORCE_GE(dim_y.size(), 1,
+                      "Input tensor Y must be at least 1-dimensional.");
+
+    std::vector<int64_t> out_dim;
+    int64_t batch_count = 1;
+    if (dim_x.size() > 3) {
+      PADDLE_ENFORCE_EQ(
+          dim_y.size(), dim_x.size(),
+          "The dimensions of X and Y must be the same, and both of "
+          "them should be %d-dimensional.",
+          dim_x.size());
+
+      // The first (rank - 2) dimensions are accumulated into batch_count; the
+      // last two dimensions are used for the matrix multiplication.
+      for (int j = 0; j < dim_x.size() - 2; ++j) {
+        PADDLE_ENFORCE_EQ(dim_y[j], dim_x[j],
+                          "The %d-th dimension of X and Y must be the same.",
+                          j);
+        out_dim.push_back(dim_x[j]);
+        batch_count *= dim_x[j];
+      }
+    }
+
+    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
+    bool remove_initial_dim = false, remove_final_dim = false;
+
+    switch (dim_x.size()) {
+      case 1:
+        if (transpose_x) {
+          M = dim_x[0];
+          KX = 1;
+        } else {
+          M = 1;
+          KX = dim_x[0];
+          remove_initial_dim = true;
+        }
+        break;
+      case 2:
+        M = transpose_x ? dim_x[1] : dim_x[0];
+        KX = transpose_x ? dim_x[0] : dim_x[1];
+        break;
+      case 3:
+        batchCountX = dim_x[0];
+        M = transpose_x ? dim_x[2] : dim_x[1];
+        KX = transpose_x ? dim_x[1] : dim_x[2];
+        break;
+      default:
+        batchCountX = batch_count;
+        size_t mat_s = dim_x.size() - 2;
+        M = transpose_x ? dim_x[mat_s + 1] : dim_x[mat_s];
+        KX = transpose_x ? dim_x[mat_s] : dim_x[mat_s + 1];
+        break;
+    }
+
+    switch (dim_y.size()) {
+      case 1:
+        if (transpose_y) {
+          N = dim_y[0];
+          KY = 1;
+        } else {
+          N = 1;
+          KY = dim_y[0];
+          remove_final_dim = true;
+        }
+        break;
+      case 2:
+        KY = transpose_y ? dim_y[1] : dim_y[0];
+        N = transpose_y ? dim_y[0] : dim_y[1];
+        break;
+      case 3:
+        batchCountY = dim_y[0];
+        KY = transpose_y ? dim_y[2] : dim_y[1];
+        N = transpose_y ? dim_y[1] : dim_y[2];
+        break;
+      default:
+        batchCountY = batch_count;
+        size_t mat_s = dim_y.size() - 2;
+        KY = transpose_y ? dim_y[mat_s + 1] : dim_y[mat_s];
+        N = transpose_y ? dim_y[mat_s] : dim_y[mat_s + 1];
+    }
+
+    PADDLE_ENFORCE_EQ(
+        KX, KY,
+        "First matrix's width must be equal to the second matrix's height.");
+    if (batchCountX && batchCountY) {
+      PADDLE_ENFORCE_EQ(
+          batchCountX, batchCountY,
+          "When Input(X) and Input(Y) are both three-dimensional, they "
+          "must have the same batch dimension.");
+    }
+    int batchCount = std::max(batchCountX, batchCountY);
+
+    std::vector<int64_t> dim_out;
+    if (batchCount) {
+      if (dim_x.size() > 3) {
+        dim_out.insert(dim_out.begin(), out_dim.begin(), out_dim.end());
+      } else {
+        dim_out.push_back(batchCount);
+      }
+    }
+    if (!remove_initial_dim) {
+      dim_out.push_back(M);
+    }
+    if (!remove_final_dim) {
+      dim_out.push_back(N);
+    }
+    if (dim_out.size() == 0) {
+      // We don't support 0-dimensional Tensors (scalars), so instead
+      // treat the output as a Tensor of shape (1, ) in this case.
+      dim_out.push_back(1);
+    }
+    context->SetOutputDim("Out", framework::make_ddim(dim_out));
+    context->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MatMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of MatMul op");
+    AddInput("Y", "The second input of MatMul op");
+    AddOutput("Out", "The output of MatMul op");
+    AddAttr<bool>("transpose_X",
+                  R"DOC(If true, use the transpose of `X`.
+        )DOC")
+        .SetDefault(false);
+    AddAttr<bool>("transpose_Y",
+                  R"DOC(If true, use the transpose of `Y`.
+        )DOC")
+        .SetDefault(false);
+    AddComment(R"DOC(
+MatMul Operator.
+
+This operator is used to perform (batched) matrix multiplication
+over the last two dimensions of the input tensors `X` and `Y`.
+
+If a transpose flag is specified, the last two dimensions of the
+tensor are transposed. If the tensor is rank-1 of shape [D], then
+for `X` it is treated as [1, D] in nontransposed form and as [D, 1]
+in transposed form, whereas for `Y` it is the opposite: it is treated
+as [D, 1] in nontransposed form and as [1, D] in transposed form.
+
+Examples without transpose:
+- X: [K], Y: [K] => Out: [1]
+- X: [K], Y: [K, N] => Out: [N]
+- X: [B, M, K], Y: [K] => Out: [B, M]
+- X: [M, K], Y: [B, K, N] => Out: [B, M, N]
+- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
+- X: [B, ..., M, K], Y: [B, ..., K, N] => Out: [B, ..., M, N]
+
+The behavior is designed to be similar to the `numpy.matmul` function.
+The differences are:
+- When the rank of the input data is less than or equal to 3, it
+  is similar to the `numpy.matmul` function.
+- When the rank of the input is greater than 3, the rank of X and
+  Y must be equal, and the first `rank - 2` dimensions must be equal.
+- We add `transpose_X` and `transpose_Y` flags.
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input `X`.
+
+)DOC");
+  }
+};
+
+class MatMulOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = context->GetInputDim("X");
+    auto y_dims = context->GetInputDim("Y");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (context->HasOutput(x_grad_name)) {
+      context->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (context->HasOutput(y_grad_name)) {
+      context->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
+            ops::MatMulOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc
new file mode 100644
index 0000000000..d28d12164e
--- /dev/null
+++ b/paddle/operators/matmul_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/matmul_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    matmul, ops::MatMulKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    matmul_grad,
+    ops::MatMulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
new file mode 100644
index 0000000000..fe6a97465f
--- /dev/null
+++ b/paddle/operators/matmul_op.h
@@ -0,0 +1,242 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/matmul.h"
+
+namespace paddle {
+namespace operators {
+namespace matmul_detail {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+using framework::make_ddim;
+using framework::vectorize;
+
+template <typename DeviceContext, typename T>
+class MatMulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor& x = *context.Input<Tensor>("X");
+    const Tensor& y = *context.Input<Tensor>("Y");
+    Tensor* out = context.Output<Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    math::MatMulFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), x, transpose_x, y,
+        transpose_y, T(1), out, T(0));
+  }
+};
+
+template <typename T>
+inline Tensor Reshape(const Tensor& input, const DDim& dims) {
+  Tensor output;
+  output.ShareDataWith(input);
+  output.Resize(dims);
+  return output;
+}
+
+// Reshape a rank-3 tensor from P x M x N to (P * M) x N.
+// Identity op if the tensor is not of rank 3.
+template <typename T>
+Tensor CombineBatchAndM(const Tensor& input) {
+  Tensor output;
+  output.ShareDataWith(input);
+  auto in_dims = input.dims();
+  if (in_dims.size() == 3) {
+    std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
+    output.Resize(make_ddim(out_dims));
+  }
+  return output;
+}
+
+// Reshape a rank-3 tensor from P x M x N to M x (P * N).
+// (Warning: This requires transposing data and writes into new memory.)
+// Identity op if the tensor is not of rank 3.
+template <typename DeviceContext, typename T>
+Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) {
+  Tensor output;
+  auto in_dims = input.dims();
+  if (in_dims.size() == 3) {
+    output.Resize({in_dims[1], in_dims[0], in_dims[2]});
+    output.mutable_data<T>(context.GetPlace());
+    std::vector<int> axis = {1, 0, 2};
+    math::Transpose<DeviceContext, T, 3> trans;
+    trans(context, input, &output, axis);
+    std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
+    output.Resize(make_ddim(out_dims));
+  } else {
+    output.ShareDataWith(input);
+  }
+  return output;
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straightforward to check the following table for when X and Y
+// are both matrices.
+//
+// transpose_X | False    | True     | False    | True
+// transpose_Y | False    | False    | True     | True
+// -----------+----------+----------+----------+-----------
+//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
+//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
+//
+// When X is a vector of size K, we treat it instead as a matrix of shape
+// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
+// a matrix of shape (K, 1).
+//
+// When X and Y are both 3-dimensional tensors, the first dimension, i.e.
+// the batch dimension, can be ignored and the exact same formulas apply
+// as for two matrices.
+//
+// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
+// up with formulas like
+//
+//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
+//
+// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
+// to X: (P * M) x K, dOut: (P * M) x N.
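+//
+// For example, with no transposes and X: 2 x 3, Y: 3 x 4, dOut: 2 x 4,
+// the table gives dX = dOut Y^T with shape 2 x 3 and dY = X^T dOut with
+// shape 3 x 4, matching the shapes of X and Y.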
+template <typename DeviceContext, typename T>
+class MatMulGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor& x = *context.Input<Tensor>("X");
+    const Tensor& y = *context.Input<Tensor>("Y");
+    const Tensor& dout = *context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* dy = context.Output<Tensor>(framework::GradVarName("Y"));
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    std::vector<int64_t> x_dims = vectorize(x.dims());
+    std::vector<int64_t> y_dims = vectorize(y.dims());
+
+    // If X is a vector, reshape it to a matrix.
+    if (x_dims.size() == 1) {
+      x_dims.insert(x_dims.begin(), 1);
+    }
+
+    // If Y is a vector, reshape it to a matrix.
+    if (y_dims.size() == 1) {
+      y_dims.push_back(1);
+    }
+
+    int batch_count = 0;
+    // All dimensions except the last two are folded into batch_count; the
+    // last two dimensions are used for the matrix multiplication itself.
+    if (x_dims.size() > 3) {
+      batch_count = accumulate(x_dims.begin(), x_dims.end() - 2, 1,
+                               std::multiplies<int>());
+    }
+    // Infer the dimensions of dOut from X, Y, and the transpose flags.
+    int M = 0, N = 0, batchCountX = 0, batchCountY = 0;
+
+    switch (x_dims.size()) {
+      case 2:
+        M = transpose_x ? x_dims[1] : x_dims[0];
+        break;
+      case 3:
+        batchCountX = x_dims[0];
+        M = transpose_x ? x_dims[2] : x_dims[1];
+        break;
+      default:
+        batchCountX = batch_count;
+        size_t mat_s = x_dims.size() - 2;
+        M = transpose_x ? x_dims[mat_s + 1] : x_dims[mat_s];
+    }
+
+    switch (y_dims.size()) {
+      case 2:
+        N = transpose_y ? y_dims[0] : y_dims[1];
+        break;
+      case 3:
+        batchCountY = y_dims[0];
+        N = transpose_y ? y_dims[1] : y_dims[2];
+        break;
+      default:
+        batchCountY = batch_count;
+        size_t mat_s = y_dims.size() - 2;
+        N = transpose_y ? y_dims[mat_s] : y_dims[mat_s + 1];
+    }
+    if (batchCountX && batchCountY) {
+      PADDLE_ENFORCE_EQ(
+          batchCountX, batchCountY,
+          "When Input(X) and Input(Y) are both three dimensional, they "
+          "must have the same batch dimension.");
+    }
+    int batchCount = std::max(batchCountX, batchCountY);
+    std::vector<int64_t> dout_dims = {M, N};
+    if (batchCount) {
+      if (x_dims.size() > 3) {
+        dout_dims.insert(dout_dims.begin(), x_dims.begin(), x_dims.end() - 2);
+      } else {
+        dout_dims.insert(dout_dims.begin(), batchCount);
+      }
+    }
+    Tensor X = Reshape<T>(x, make_ddim(x_dims));
+    Tensor Y = Reshape<T>(y, make_ddim(y_dims));
+    Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (dx) {
+      dx->mutable_data<T>(context.GetPlace());
+      const Tensor& dOut_for_dX =
+          (x_dims.size() == 2 && y_dims.size() == 3)
+              ? CombineBatchAndN<DeviceContext, T>(dev_ctx, dOut)
+              : dOut;
+      if (x_dims.size() == 2 && y_dims.size() == 3) {
+        Y = transpose_y ? CombineBatchAndM<T>(Y)
+                        : CombineBatchAndN<DeviceContext, T>(dev_ctx, Y);
+      }
+      if (transpose_x) {
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0));
+      } else {
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0));
+      }
+    }
+
+    if (dy) {
+      dy->mutable_data<T>(context.GetPlace());
+      const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3)
+                                      ? CombineBatchAndM<T>(dOut)
+                                      : dOut;
+      if (y_dims.size() == 2 && x_dims.size() == 3) {
+        X = transpose_x ? CombineBatchAndN<DeviceContext, T>(dev_ctx, X)
+                        : CombineBatchAndM<T>(X);
+        dOut = CombineBatchAndM<T>(dOut);
+      }
+      if (transpose_y) {
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0));
+      } else {
+        math::MatMulFunctor<DeviceContext, T>()(
+            dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0));
+      }
+    }
+  }
+};
+}  // namespace matmul_detail
+
+using matmul_detail::MatMulKernel;
+using matmul_detail::MatMulGradKernel;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc
new file mode 100644
index 0000000000..019150e491
--- /dev/null
+++ b/paddle/operators/max_sequence_len_op.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+class MaxSequenceLenOp : public framework::OperatorBase {
+ public:
+  MaxSequenceLenOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto &rank_table =
+        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    int64_t *out_ptr = out->mutable_data<int64_t>({1}, platform::CPUPlace());
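+    // The rank table is sorted by sequence length in descending order, so
+    // the first item holds the maximum sequence length.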
+    *out_ptr = rank_table.items()[0].length;
+  }
+};
+
+class MaxSequenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxSequenceLenOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("RankTable", "The lod_rank_table.");
+    AddOutput("Out", "The max sequence length.");
+    AddComment(
+        R"DOC(Calculate the max sequence length through lod_rank_table.)DOC");
+  }
+};
+
+class MaxSequenceLenInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", {1});
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSequenceLenOp,
+                  paddle::operators::MaxSequenceLenOpProtoMaker,
+                  paddle::operators::MaxSequenceLenInferShape,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc
new file mode 100644
index 0000000000..3ee3226941
--- /dev/null
+++ b/paddle/operators/maxout_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *     Unless required by applicable law or agreed to in writing, software
+ *     distributed under the License is distributed on an "AS IS" BASIS,
+ *     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *     See the License for the specific language governing permissions and
+ *     limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxOutOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of maxout operator. "
+        "The format of the input tensor is NCHW, where N is the batch size, "
+        "C is the number of channels, and H and W are the height and width "
+        "of the feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of maxout operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<int>(
+        "groups",
+        R"DOC(Specifies how many groups the input tensor will be split into
+        in the channel dimension. The number of output channels is the
+        number of input channels divided by groups.
+        )DOC");
+    AddComment(R"DOC(
+MaxOut Operator.
+
+Assume the input shape is (N, Ci, H, W).
+The output shape is (N, Co, H, W).
+Then $Co = Ci / groups$ and the operator formula is as follows:
+
+$$
+y_{si+j} = \max_k x_{gsi + sk + j} \\
+g = groups \\
+s = \frac{input.size}{num\_channels} \\
+0 \le i < \frac{num\_channels}{groups} \\
+0 \le j < s \\
+0 \le k < groups
+$$
+
+Please refer to Paper:
+  - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+  - Multi-digit Number Recognition from Street View \
+    Imagery using Deep Convolutional Neural Networks: \
+    https://arxiv.org/pdf/1312.6082v4.pdf
+
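+For example, with groups = 2 and an input of shape (N, 4, H, W), the output
+has shape (N, 2, H, W); each output channel is the element-wise maximum
+over its group of 2 input channels.
+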
+)DOC");
+  }
+};
+
+class MaxOutOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MaxoutOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MaxoutOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int groups = ctx->Attrs().Get<int>("groups");
+    // groups must be larger than 1
+    PADDLE_ENFORCE_GT(groups, 1,
+                      "groups should be larger than 1 in maxout op");
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1] / groups});
+    output_shape.push_back(in_x_dims[2]);
+    output_shape.push_back(in_x_dims[3]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class MaxOutOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad,
+            ops::MaxOutOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc
new file mode 100644
index 0000000000..c4a2d676d3
--- /dev/null
+++ b/paddle/operators/maxout_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/maxout_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    maxout, ops::MaxOutKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    maxout_grad,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MaxOutGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h
new file mode 100644
index 0000000000..e8b12552b9
--- /dev/null
+++ b/paddle/operators/maxout_op.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/maxouting.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class MaxOutKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    int groups = context.template Attr<int>("groups");
+
+    math::MaxOutFunctor<DeviceContext, T> maxout_forward;
+    maxout_forward(context.template device_context<DeviceContext>(), *in_x, out,
+                   groups);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MaxOutGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    int groups = context.template Attr<int>("groups");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0.0));
+      math::MaxOutGradFunctor<DeviceContext, T> maxout_backward;
+      maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
new file mode 100644
index 0000000000..411f4d14ef
--- /dev/null
+++ b/paddle/operators/mean_op.cc
@@ -0,0 +1,84 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MeanOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MeanOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MeanOp should not be null.");
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MeanOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of mean op");
+    AddOutput("Out", "The output of mean op");
+    AddComment(R"DOC(
+Mean Operator.
+
+Out is a scalar which is the mean of all elements in X. 
+
+)DOC");
+  }
+};
+
+class MeanGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class MeanGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("mean_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
+REGISTER_OP_CPU_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
new file mode 100644
index 0000000000..212d448113
--- /dev/null
+++ b/paddle/operators/mean_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/mean_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
new file mode 100644
index 0000000000..351b345959
--- /dev/null
+++ b/paddle/operators/mean_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class MeanKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+
+    output->mutable_data<T>(context.GetPlace());
+
+    auto X = EigenVector<T>::Flatten(*input);
+    auto y = EigenScalar<T>::From(*output);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    y.device(place) = X.mean();
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MeanGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(OG->numel() == 1, "Mean gradient should be a scalar");
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
+    IG->mutable_data<T>(context.GetPlace());
+
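+    // Since Out = mean(X) = sum(X) / N, each element of X@GRAD equals
+    // Out@GRAD / N; broadcast that scalar over the N elements of X.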
+    T ig_size = static_cast<T>(IG->numel());
+    Eigen::DSizes<int, 1> bcast(ig_size);
+
+    EigenVector<T>::Flatten(*IG).device(
+        *context.template device_context<DeviceContext>().eigen_device()) =
+        (EigenVector<T>::From(*OG) / ig_size).broadcast(bcast);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc
new file mode 100644
index 0000000000..87644d316d
--- /dev/null
+++ b/paddle/operators/merge_lod_tensor_op.cc
@@ -0,0 +1,186 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using LoD = framework::LoD;
+
+class MergeLoDTensorOp : public framework::OperatorBase {
+ public:
+  MergeLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto &in_true = scope.FindVar(Input("InTrue"))->Get<framework::LoDTensor>();
+    auto &in_false =
+        scope.FindVar(Input("InFalse"))->Get<framework::LoDTensor>();
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+
+    auto &mask_dim = mask.dims();
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    int rank = in_true.dims().size();
+    platform::Place place = in_true.place();
+    std::type_index data_type = in_true.type();
+    framework::DDim in_true_dims =
+        framework::slice_ddim(in_true.dims(), 1, rank);
+
+    int64_t batch_size = in_true.dims()[0] + in_false.dims()[0];
+
+    auto in_true_dim_vec = framework::vectorize(in_true_dims);
+    in_true_dim_vec.insert(in_true_dim_vec.begin(), batch_size);
+
+    framework::DDim out_dims = framework::make_ddim(in_true_dim_vec);
+    out->Resize(out_dims);
+    out->mutable_data(place, data_type);
+
+    auto *out_lod = out->mutable_lod();
+    out_lod->clear();
+    size_t out_offset = 0;
+
+    // Build LoDTensor `out`
+
+    size_t in_true_idx = 0;
+    size_t in_false_idx = 0;
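+    // Walk the mask row by row: a 0 takes the next sequence from InFalse,
+    // a 1 takes the next sequence from InTrue; each step appends the
+    // sequence's LoD and copies its data slice into the output.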
+    for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+      const framework::LoDTensor *input = nullptr;
+      size_t *in_idx = nullptr;
+      if (static_cast<int>(mask_data[i]) == 0) {
+        input = &in_false;
+        in_idx = &in_false_idx;
+      } else {
+        input = &in_true;
+        in_idx = &in_true_idx;
+      }
+      auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+          input->lod(), *in_idx, (*in_idx) + 1, 0);
+      auto &lod_length = lod_and_offset.first;
+
+      framework::AppendLoD(out_lod, lod_length);
+
+      size_t start_offset = lod_and_offset.second.first;
+      size_t end_offset = lod_and_offset.second.second;
+
+      PADDLE_ENFORCE_GE(end_offset, start_offset);
+      size_t len = end_offset - start_offset;
+      if (len == 0) {
+        continue;
+      }
+      auto slice = out->Slice(out_offset, out_offset + len);
+      framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
+                      &slice);
+      out_offset += len;
+      (*in_idx) += 1;
+    }
+
+    for (size_t i = 0; i < level; i++) {
+      out_lod->insert(out_lod->begin(), x.lod()[i]);
+    }
+  }
+};
+
+class MergeLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MergeLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input LoDTensor, contains complete lod information to "
+             "construct the output");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddInput("InTrue", "The True branch to be merged");
+    AddInput("InFalse", "The False branch to be merged");
+    AddOutput("Out", "The merged output LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to rank.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Merge the True and False branches of a LoDTensor into a single
+        output, using a mask at a certain LoD level. X is used to obtain
+        the complete LoD information. Please refer to SplitLoDTensorOp.)DOC");
+  }
+};
+
+class MergeLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "MergeLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "MergeLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasInput("InTrue"),
+                   "MergeLoDTensorOp must has input InTrue.");
+    PADDLE_ENFORCE(context->HasInput("InFalse"),
+                   "MergeLoDTensorOp must has input InFalse.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "MergeLoDTensorOp must has output Out");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("Out", context->GetInputDim("InTrue"));
+  }
+};
+
+class MergeLoDTensorGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
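+    // The gradient of merge_lod_tensor is split_lod_tensor with the same
+    // mask: Out@GRAD is split back into InTrue@GRAD and InFalse@GRAD.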
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("split_lod_tensor");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetOutput("OutTrue", InputGrad("InTrue"));
+    grad_op->SetOutput("OutFalse", InputGrad("InFalse"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(merge_lod_tensor, ops::MergeLoDTensorOp,
+                  ops::MergeLoDTensorOpProtoMaker,
+                  ops::MergeLoDTensorInferShape, ops::MergeLoDTensorGradMaker);
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
new file mode 100644
index 0000000000..3d7742dd4b
--- /dev/null
+++ b/paddle/operators/minus_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/minus_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MinusOp : public framework::OperatorWithKernel {
+ public:
+  MinusOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of MinusOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MinusOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(
+        x_dims, y_dims,
+        "Minus operator must take two tensors with the same number of "
+        "elements");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left tensor of minus operator.");
+    AddInput("Y", "The right tensor of minus operator.");
+    AddOutput("Out", "The output tensor of minus operator.");
+
+    AddComment(R"DOC(
+Minus Operator.
+
+Equation:
+
+    $Out = X - Y$
+
+Both inputs `X` and `Y` may carry LoD (Level of Details) information, but
+the output only shares the LoD information with input `X`.
+
+)DOC");
+  }
+};
+
+class MinusGradMaker : public framework::GradOpDescMakerBase {
+ public:
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    std::vector<std::unique_ptr<framework::OpDesc>> ops;
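+    // Since Out = X - Y, dX = dOut and dY = -dOut; both are emitted as
+    // scale ops over Out@GRAD with factors 1.0 and -1.0.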
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      auto *x_g_op = new framework::OpDesc();
+      x_g_op->SetType("scale");
+      x_g_op->SetInput("X", OutputGrad("Out"));
+      x_g_op->SetOutput("Out", x_g);
+      x_g_op->SetAttr("scale", 1.0f);
+      ops.emplace_back(x_g_op);
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      auto *y_g_op = new framework::OpDesc();
+      y_g_op->SetType("scale");
+      y_g_op->SetInput("X", OutputGrad("Out"));
+      y_g_op->SetOutput("Out", y_g);
+      y_g_op->SetAttr("scale", -1.0f);
+      ops.emplace_back(y_g_op);
+    }
+
+    return ops;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    minus, ops::MinusKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu
new file mode 100644
index 0000000000..80cd9f7c16
--- /dev/null
+++ b/paddle/operators/minus_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/minus_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    minus,
+    paddle::operators::MinusKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
new file mode 100644
index 0000000000..20760b8cd5
--- /dev/null
+++ b/paddle/operators/minus_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MinusKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* left_tensor = context.Input<framework::Tensor>("X");
+    auto* right_tensor = context.Input<framework::Tensor>("Y");
+    auto* out_tensor = context.Output<framework::Tensor>("Out");
+
+    out_tensor->mutable_data<T>(context.GetPlace());
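+    // Element-wise Out = X - Y over the flattened tensors, evaluated on
+    // the kernel's device via Eigen.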
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    framework::EigenVector<T>::Flatten(*out_tensor).device(dev) =
+        framework::EigenVector<T>::Flatten(*left_tensor) -
+        framework::EigenVector<T>::Flatten(*right_tensor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
new file mode 100644
index 0000000000..f5d69071a8
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/modified_huber_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ModifiedHuberLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+
+    ctx->SetOutputDim("IntermediateVal", x_dims);
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+  }
+};
+
+class ModifiedHuberLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ModifiedHuberLossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of modified huber loss op. "
+             "X is 2-D tensor with shape [batch_size, 1].");
+    AddInput("Y",
+             "The target labels of modified huber loss op. "
+             "The shape of Y is the same as X. Values of Y must be 0 or 1.");
+    AddOutput("IntermediateVal",
+              "Variable to save intermediate result which will be reused in "
+              "backward processing.")
+        .AsIntermediate();
+    AddOutput("Out", "Classification loss for X.");
+    AddComment(R"DOC(
+Modified Huber Loss Operator.
+
+This operator is used in binary classification problems. The shapes of
+input X and target Y are both [N, 1] and so is the shape of the output loss.
+Since target Y is not differentiable, computing gradients for Y is illegal.
+The formula of modified Huber loss is:
+
+$$
+L(y, f(x)) =
+\begin{cases}
+(\max(0, 1 - yf(x)))^2, & \text{if} \  yf(x) \geq -1 \\
+-4yf(x), & \text{otherwise}
+\end{cases}
+$$
+
+Make sure the values of target label Y are in {0, 1} here. This operator will
+scale values of Y to {-1, +1} when computing losses and gradients.
+
+)DOC");
+  }
+};
+
+class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),
+                   "Intermediate value must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@Grad) must not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
+    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(
+        intermediate_dims, x_dims,
+        "The shape of X and intermediate value must be the same.");
+    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
+                      "The shape of Input(Out@Grad) and X must be the same.");
+
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp,
+            ops::ModifiedHuberLossOpMaker, modified_huber_loss_grad,
+            ops::ModifiedHuberLossGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    modified_huber_loss,
+    ops::ModifiedHuberLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad,
+                       ops::ModifiedHuberLossGradCPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
new file mode 100644
index 0000000000..3d2a5562e8
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <thrust/for_each.h>
+#include <thrust/tuple.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/modified_huber_loss_op.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+struct ModifiedHuberLossBackward {
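+  // Tuple layout (set up by the zip iterators below): <0> x_grad (written),
+  // <1> intermediate value y * f(x), <2> label y in {0, 1},
+  // <3> upstream gradient of Out.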
+  template <typename Tuple>
+  HOSTDEVICE void operator()(Tuple t) const {
+    auto inter_val = thrust::get<1>(t);
+    auto y_val = thrust::get<2>(t);
+    auto out_grad = thrust::get<3>(t);
+    if (inter_val < -1) {
+      thrust::get<0>(t) = -4 * (2 * y_val - 1) * out_grad;
+    } else if (inter_val < 1) {
+      thrust::get<0>(t) = -2 * (1 - inter_val) * (2 * y_val - 1) * out_grad;
+    } else {
+      thrust::get<0>(t) = 0;
+    }
+  }
+};
+
+template <typename T>
+class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("Y");
+    auto* in1 = context.Input<Tensor>("IntermediateVal");
+    auto* in2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+
+    if (out0) {
+      auto counts = framework::product(in1->dims());
+      auto y_ptr = thrust::device_pointer_cast(in0->data<T>());
+      auto inter_val_ptr = thrust::device_pointer_cast(in1->data<T>());
+      auto out_grad_ptr = thrust::device_pointer_cast(in2->data<T>());
+      thrust::device_ptr<T> x_grad_ptr(
+          out0->mutable_data<T>(context.GetPlace()));
+
+      auto iter_begin = thrust::make_zip_iterator(
+          thrust::make_tuple(x_grad_ptr, inter_val_ptr, y_ptr, out_grad_ptr));
+
+      auto iter_end = thrust::make_zip_iterator(
+          thrust::make_tuple(x_grad_ptr + counts, inter_val_ptr + counts,
+                             y_ptr + counts, out_grad_ptr + counts));
+
+      thrust::for_each(iter_begin, iter_end, ModifiedHuberLossBackward());
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    modified_huber_loss,
+    ops::ModifiedHuberLossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad,
+                        ops::ModifiedHuberLossGradGPUKernel<float>);
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
new file mode 100644
index 0000000000..6ce86feee5
--- /dev/null
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename T>
+struct CheckLabelValue {
+  HOSTDEVICE T operator()(const T& val) const {
+    PADDLE_ASSERT(val == static_cast<T>(0) || val == static_cast<T>(1));
+    return val;  // pass the label through unchanged
+  }
+};
+
+template <typename T>
+struct ModifiedHuberLossForward {
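+  // Piecewise loss on z = y * f(x), with labels already scaled to {-1, +1}:
+  // -4z for z < -1, (1 - z)^2 for -1 <= z < 1, and 0 otherwise.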
+  HOSTDEVICE T operator()(const T& val) const {
+    if (val < -1) {
+      return -4 * val;
+    } else if (val < 1) {
+      return (1 - val) * (1 - val);
+    } else {
+      return static_cast<T>(0);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ModifiedHuberLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* out0 = context.Output<framework::Tensor>("IntermediateVal");
+    auto* out1 = context.Output<framework::Tensor>("Out");
+
+    out0->mutable_data<T>(context.GetPlace());
+    out1->mutable_data<T>(context.GetPlace());
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto x = EigenVector<T>::Flatten(*in0);
+    auto y = EigenVector<T>::Flatten(*in1);
+    // make sure the values of Y are in {0, 1}
+    y.unaryExpr(CheckLabelValue<T>());
+    auto inter_val = EigenVector<T>::Flatten(*out0);
+    // scale y to {-1, +1} and compute x * y
+    inter_val.device(place) = x * (2 * y - static_cast<T>(1));
+    auto loss = EigenVector<T>::Flatten(*out1);
+    loss.device(place) = inter_val.unaryExpr(ModifiedHuberLossForward<T>());
+  }
+};
+
+// CPU backward kernel
+template <typename T>
+class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("Y");
+    auto* in1 = context.Input<framework::Tensor>("IntermediateVal");
+    auto* in2 = context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* out0 = context.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    if (out0) {
+      const T* y_ptr = in0->data<T>();
+      const T* inter_val_ptr = in1->data<T>();
+      const T* out_grad_ptr = in2->data<T>();
+      size_t counts = static_cast<size_t>(framework::product(in1->dims()));
+      T* x_grad_ptr = out0->mutable_data<T>(context.GetPlace());
+      for (size_t i = 0; i < counts; ++i) {
+        if (inter_val_ptr[i] < -1) {
+          x_grad_ptr[i] = -4 * (2 * y_ptr[i] - 1) * out_grad_ptr[i];
+        } else if (inter_val_ptr[i] < 1) {
+          x_grad_ptr[i] = -2 * (1 - inter_val_ptr[i]) * (2 * y_ptr[i] - 1) *
+                          out_grad_ptr[i];
+        } else {
+          x_grad_ptr[i] = 0;
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
new file mode 100644
index 0000000000..15b8b80776
--- /dev/null
+++ b/paddle/operators/momentum_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MomentumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(param) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(grad) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
+                   "Input(velocity) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of Momentum should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
+                   "Output(VelocityOut) of Momentum should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and Grad input of MomentumOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Velocity"),
+        "Param and Velocity of MomentumOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
+                      "LearningRate should be a scalar");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("VelocityOut", param_dim);
+  }
+};
+
+class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MomentumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter that has to be updated");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Velocity",
+             "(Tensor, default Tensor<float>) "
+             "Input velocity (corresponding to the parameter) "
+             "that has to be updated");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "Input learning rate");
+
+    AddOutput("ParamOut",
+              "(Tensor) This output is updated parameter. "
+              "It shared memory with Input(Param).");
+    AddOutput("VelocityOut",
+              "(Tensor) This output is updated velocity. "
+              "It shared memory with Input(Velocity).");
+
+    AddAttr<float>("mu", "(float) Momentum coefficient");
+    AddAttr<bool>("use_nesterov",
+                  "(bool, default false) "
+                  "Use Nesterov Momentum")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Momentum Optimizer.
+
+This optimizer has a flag for Nesterov Momentum.
+The update equations are as follows:
+
+$$
+velocity = mu * velocity + gradient \\
+if (use\_nesterov):   \\
+  param = param - gradient * learning\_rate + mu * velocity * learning\_rate \\
+else:   \\
+  param = param - learning\_rate * velocity \\
+$$
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
+REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
+                       ops::MomentumOpKernel<double>);
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
new file mode 100644
index 0000000000..2b9314162e
--- /dev/null
+++ b/paddle/operators/momentum_op.cu
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void MomentumKernel(const T* p, const T* g, const T* v,
+                               const T* learning_rate, const T mu,
+                               const int64_t num, bool use_nesterov, T* p_out,
+                               T* v_out) {
+  T lr = learning_rate[0];
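+  // Grid-stride loops: each thread starts at its global index and strides
+  // by the total thread count, so any launch configuration covers all
+  // `num` elements.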
+  if (use_nesterov) {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T g_val = g[i];
+      T v_new = v[i] * mu + g_val;
+      v_out[i] = v_new;
+      p_out[i] = p[i] - (g_val - v_new * mu) * lr;
+    }
+  } else {
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
+         i += blockDim.x * gridDim.x) {
+      T v_new = v[i] * mu + g[i];
+      v_out[i] = v_new;
+      p_out[i] = p[i] - lr * v_new;
+    }
+  }
+}
+
+template <typename T>
+class MomentumOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+
+    auto* p = param->data<T>();
+    auto* v = velocity->data<T>();
+    auto* g = grad->data<T>();
+    auto* lr = learning_rate->data<T>();
+
+    int block = 512;
+    int grid = (param->numel() + block - 1) / block;
+    MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+        p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel<float>,
+                        ops::MomentumOpCUDAKernel<double>);
diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h
new file mode 100644
index 0000000000..da69532ea5
--- /dev/null
+++ b/paddle/operators/momentum_op.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class MomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+
+    auto p = framework::EigenVector<T>::Flatten(*param);
+    auto v = framework::EigenVector<T>::Flatten(*velocity);
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto* lr = learning_rate->data<T>();
+
+    v_out = v * mu + g;
+    if (use_nesterov) {
+      p_out = p - (g - v_out * mu) * lr[0];
+    } else {
+      p_out = p - lr[0] * v_out;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
new file mode 100644
index 0000000000..c923e988a5
--- /dev/null
+++ b/paddle/operators/mul_op.cc
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mul_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class MulOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MulOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
+    int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
+
+    VLOG(3) << "mul operator x.shape=" << x_dims << " y.shape=" << y_dims
+            << " x_num_col_dims=" << x_num_col_dims
+            << " y_num_col_dims=" << y_num_col_dims;
+
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), x_num_col_dims,
+        "The input tensor X's rank of MulOp should be larger than "
+        "x_num_col_dims.");
+    PADDLE_ENFORCE_GT(
+        y_dims.size(), y_num_col_dims,
+        "The input tensor Y's rank of MulOp should be larger than "
+        "y_num_col_dims.");
+
+    auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
+    auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
+
+    PADDLE_ENFORCE_EQ(
+        x_mat_dims[1], y_mat_dims[0],
+        "First matrix's width must be equal with second matrix's height.");
+    std::vector<int64_t> output_dims;
+    output_dims.reserve(
+        static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
+
+    for (int i = 0; i < x_num_col_dims; ++i) {
+      output_dims.push_back(x_dims[i]);
+    }
+
+    for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
+      output_dims.push_back(y_dims[i]);
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor), The first input tensor of mul op.");
+    AddInput("Y", "(Tensor), The second input tensor of mul op.");
+    AddOutput("Out", "(Tensor), The output tensor of mul op.");
+    AddAttr<int>(
+        "x_num_col_dims",
+        R"DOC((int, default 1), The mul_op can take tensors with more than two
+              dimensions as its inputs. If the input $X$ is a tensor with more
+              than two dimensions, $X$ will be flattened into a two-dimensional
+              matrix first. The flattening rule is: the first `num_col_dims`
+              dimensions will be flattened to form the first dimension of the
+              final matrix (the height of the matrix), and the rest
+              `rank(X) - num_col_dims` dimensions are flattened to form the
+              second dimension of the final matrix (the width of the matrix).
+              As a result, the height of the flattened matrix is equal to the
+              product of $X$'s first `x_num_col_dims` dimensions' sizes, and
+              the width of the flattened matrix is equal to the product of
+              $X$'s last `rank(X) - num_col_dims` dimensions' sizes. For
+              example, suppose $X$ is a 5-dimensional tensor with the shape
+              [2, 3, 4, 5, 6] and `x_num_col_dims` = 3. Then the flattened
+              matrix will have the shape [2 x 3 x 4, 5 x 6] = [24, 30].
+        )DOC")
+        .SetDefault(1)
+        .EqualGreaterThan(1);
+    AddAttr<int>(
+        "y_num_col_dims",
+        R"DOC((int, default 1), The mul_op can take tensors with more than two,
+              dimensions as its inputs. If the input $Y$ is a tensor with more
+              than two dimensions, $Y$ will be flattened into a two-dimensional
+              matrix first. The attribute `y_num_col_dims` determines how $Y$ is
+              flattened. See comments of `x_num_col_dims` for more details.
+        )DOC")
+        .SetDefault(1)
+        .EqualGreaterThan(1);
+    AddComment(R"DOC(
+Mul Operator.
+
+This operator is used to perform matrix multiplication for input $X$ and $Y$.
+
+The equation is:
+
+$$Out = X * Y$$
+
+Both the input $X$ and $Y$ can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD information with input $X$.
+
+)DOC");
+  }
+};
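+
+// Editor's sketch of the flattening rule described in `x_num_col_dims` above
+// (assumed standalone helper mirroring framework::flatten_to_2d): for shape
+// [2, 3, 4, 5, 6] and num_col_dims = 3 it yields [24, 30].
+//
+//   inline std::pair<int64_t, int64_t> FlattenTo2D(
+//       const std::vector<int64_t>& dims, int num_col_dims) {
+//     int64_t h = 1, w = 1;
+//     for (int i = 0; i < num_col_dims; ++i) h *= dims[i];
+//     for (size_t i = num_col_dims; i < dims.size(); ++i) w *= dims[i];
+//     return {h, w};
+//   }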
+
+class MulOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    auto x_mat_dims = framework::flatten_to_2d(
+        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
+    auto y_mat_dims = framework::flatten_to_2d(
+        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker,
+                  ops::MulOpShapeInference,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(mul_grad, ops::MulOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    mul, ops::MulKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    mul_grad, ops::MulGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc
new file mode 100644
index 0000000000..43de9a7194
--- /dev/null
+++ b/paddle/operators/mul_op.cu.cc
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mul_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    mul, ops::MulKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    mul_grad, ops::MulGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
new file mode 100644
index 0000000000..1fb0569b49
--- /dev/null
+++ b/paddle/operators/mul_op.h
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/math/math_function.h"
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class MulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    const Tensor* y = context.Input<Tensor>("Y");
+    Tensor* z = context.Output<Tensor>("Out");
+    const Tensor x_matrix =
+        x->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *x, context.template Attr<int>("x_num_col_dims"))
+            : *x;
+    const Tensor y_matrix =
+        y->dims().size() > 2
+            ? framework::ReshapeToMatrix(
+                  *y, context.template Attr<int>("y_num_col_dims"))
+            : *y;
+
+    z->mutable_data<T>(context.GetPlace());
+    auto z_dim = z->dims();
+    if (z_dim.size() != 2) {
+      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
+    math::matmul<DeviceContext, T>(
+        context.template device_context<DeviceContext>(), x_matrix, false,
+        y_matrix, false, 1, z, 0);
+    if (z_dim.size() != 2) {
+      z->Resize(z_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MulGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
+    int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
+    const Tensor* x = ctx.Input<Tensor>("X");
+    const Tensor* y = ctx.Input<Tensor>("Y");
+    const Tensor x_matrix = x->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                                : *x;
+    const Tensor y_matrix = y->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                                : *y;
+    const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    Tensor dout_mat;
+    dout_mat.ShareDataWith(*dout);
+    dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
+                     framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
+
+    Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    if (dx) {
+      dx->mutable_data<T>(ctx.GetPlace());
+      Tensor dx_matrix = dx->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                             : *dx;
+
+      // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
+      math::matmul<DeviceContext, T>(dev_ctx, dout_mat, false, y_matrix, true,
+                                     1, &dx_matrix, 0);
+    }
+    if (dy) {
+      dy->mutable_data<T>(ctx.GetPlace());
+      Tensor dy_matrix = dy->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                             : *dy;
+      // dy = x' * dout. dy K x N, dout : M x N, x : M x K
+      math::matmul<DeviceContext, T>(dev_ctx, x_matrix, true, dout_mat, false,
+                                     1, &dy_matrix, 0);
+    }
+  }
+};
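+
+// Shape check for the gradients above (editor's note): with x: M x K,
+// y: K x N and out = x * y: M x N, the chain rule gives
+//   dx = dout * y^T  -> (M x N) * (N x K) = M x K,
+//   dy = x^T * dout  -> (K x M) * (M x N) = K x N,
+// which matches the transpose flags passed to math::matmul above.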
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
new file mode 100644
index 0000000000..d275fa5cbb
--- /dev/null
+++ b/paddle/operators/multiplex_op.cc
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class MultiplexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null.");
+    PADDLE_ENFORCE(!ctx->Inputs("X").empty(),
+                   "MultiInput(X) shouldn't be empty.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
+    auto ids_dim = ctx->GetInputDim("Ids");
+    PADDLE_ENFORCE(
+        ids_dim.size() == 2 && ids_dim[1] == 1,
+        "The index tensor must be a vector with size batchSize x 1.");
+
+    auto ins_dims = ctx->GetInputsDim("X");
+    auto num_ins = ins_dims.size();
+    PADDLE_ENFORCE(num_ins > 1,
+                   "multiplex operator should have more than "
+                   "one candidate input tensors.");
+
+    auto in_dim = ins_dims[0];
+    PADDLE_ENFORCE(in_dim.size() >= 2,
+                   "The rank of candidate tensors must be not less than 2.");
+    for (size_t i = 1; i < num_ins; i++) {
+      auto dim = ins_dims[i];
+      PADDLE_ENFORCE(in_dim == dim,
+                     "All the candidate tensors must have the same size.");
+    }
+    ctx->SetOutputDim("Out", in_dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
+  }
+};
+
+class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MultiplexOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ids", "The index tensor of multiplex operator.");
+    AddInput("X", "The candidate tensors of multiplex operator.")
+        .AsDuplicable();
+    AddOutput("Out", "The output tensor of multiplex operator.");
+    AddComment(R"DOC(
+Multiplex Operator.
+
+Multiplex multiple tensors according to the index provided by the index tensor.
+
+Ids: the index tensor.
+X[0 : N - 1]: the candidate tensors for output (N >= 2).
+For each index i from 0 to batchSize - 1, the i-th row of the output is the
+i-th row of the (Ids[i])-th candidate tensor.
+
+For i-th row of the output tensor:
+
+$$y[i] = x_{k}[i]$$
+
+where `y` is the output tensor, `x_{k}` is the k-th input tensor,
+and `k = Ids[i]`.
+
+)DOC");
+  }
+};
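+
+// Editor's sketch of the selection rule above over plain arrays
+// (hypothetical layout: N candidate tensors of rows x cols each):
+//
+//   for (int i = 0; i < rows; ++i) {
+//     int32_t k = ids[i];  // candidate index, 0 <= k < N
+//     std::copy(x[k] + i * cols, x[k] + (i + 1) * cols, out + i * cols);
+//   }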
+
+class MultiplexGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null.");
+    PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(),
+                   "Output(X@Grad) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<false>);
+REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
+REGISTER_OP_CPU_KERNEL(
+    multiplex,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MultiplexCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::MultiplexGradCPUKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
new file mode 100644
index 0000000000..546e6e7a24
--- /dev/null
+++ b/paddle/operators/multiplex_op.cu
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/multiplex_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class MultiplexGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto* ids = ctx.Input<Tensor>("Ids");
+    auto* out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    // copy index to cpu
+    Tensor index_t_cpu;
+    Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    auto* index = index_t_cpu.data<int32_t>();
+    auto stream = ctx.cuda_device_context().stream();
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int32_t k = index[i];
+      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
+      PADDLE_ENFORCE_LT((size_t)k, ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T), stream);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MultiplexGradGPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto ins = ctx.MultiInput<Tensor>("X");
+    auto* ids = ctx.Input<Tensor>("Ids");
+    auto d_ins = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
+    for (size_t i = 0; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(*ctx.template device_context<Place>().eigen_device()) =
+            t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    // copy index to cpu
+    Tensor index_t_cpu;
+    Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    auto* index = index_t_cpu.data<int32_t>();
+
+    auto stream = ctx.cuda_device_context().stream();
+    platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      size_t k = static_cast<size_t>(index[i]);
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T), stream);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    multiplex,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MultiplexGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    multiplex_grad,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::MultiplexGradGPUKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
new file mode 100644
index 0000000000..ef66be5556
--- /dev/null
+++ b/paddle/operators/multiplex_op.h
@@ -0,0 +1,81 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MultiplexCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto ids = ctx.Input<framework::Tensor>("Ids");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+
+    out->mutable_data<T>(ctx.GetPlace());
+
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    auto index = ids->data<int32_t>();
+    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      int32_t k = index[i];
+      PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative.");
+      PADDLE_ENFORCE_LT(static_cast<size_t>(k), ins.size(),
+                        "index exceeds the number of candidate tensors.");
+      memory::Copy(place, out->data<T>() + i * cols, place,
+                   ins[k]->data<T>() + i * cols, cols * sizeof(T));
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class MultiplexGradCPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* ids = ctx.Input<framework::Tensor>("Ids");
+    auto ins = ctx.MultiInput<framework::Tensor>("X");
+    auto d_ins =
+        ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
+    for (size_t i = 0; i < d_ins.size(); i++) {
+      if (d_ins[i]) {
+        d_ins[i]->mutable_data<T>(ctx.GetPlace());
+        auto t = framework::EigenVector<T>::Flatten(*d_ins[i]);
+        t.device(*ctx.template device_context<DeviceContext>().eigen_device()) =
+            t.constant(static_cast<T>(0));
+      }
+    }
+
+    auto rows = ins[0]->dims()[0];
+    auto cols = ins[0]->numel() / rows;
+    auto* index = ids->data<int32_t>();
+    platform::CPUPlace place = boost::get<platform::CPUPlace>(ctx.GetPlace());
+    for (auto i = 0; i < rows; i++) {
+      size_t k = static_cast<size_t>(index[i]);
+      if (d_ins[k]) {
+        memory::Copy(place, d_ins[k]->data<T>() + i * cols, place,
+                     d_out->data<T>() + i * cols, cols * sizeof(T));
+      }
+    }
+  }
+};
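+
+// Editor's note: the Flatten/constant(0) pass above zero-fills every present
+// d_ins[i] first; the row-wise copies then scatter rows of d_out back to the
+// selected candidates, so rows that were never selected keep zero gradient.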
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/nccl/CMakeLists.txt b/paddle/operators/nccl/CMakeLists.txt
new file mode 100644
index 0000000000..ce0ddd89bf
--- /dev/null
+++ b/paddle/operators/nccl/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_GPU)
+  nv_library(nccl_common SRCS nccl_gpu_common.cc DEPS device_context operator)
+endif()
diff --git a/paddle/operators/nccl/nccl_gpu_common.cc b/paddle/operators/nccl/nccl_gpu_common.cc
new file mode 100644
index 0000000000..1602a3d9b5
--- /dev/null
+++ b/paddle/operators/nccl/nccl_gpu_common.cc
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/operators/nccl/nccl_gpu_common.h b/paddle/operators/nccl/nccl_gpu_common.h
new file mode 100644
index 0000000000..5173996f20
--- /dev/null
+++ b/paddle/operators/nccl/nccl_gpu_common.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <condition_variable>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/dynload/nccl.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace platform {
+
+constexpr int kInvalidGPUId = -1;
+
+struct Communicator {
+  std::vector<ncclComm_t> comms_;
+  std::unordered_map<int, int> comm_id_map_;
+  // Default-initialize to false so the destructor is safe even if InitAll
+  // was never called.
+  bool inited_{false};
+
+  Communicator() {}
+
+  int GetCommId(int device_id) const { return comm_id_map_.at(device_id); }
+
+  void InitAll(const std::vector<int>& gpus) {
+    comms_.resize(gpus.size());
+    inited_ = false;
+    for (size_t i = 0; i < gpus.size(); ++i) {
+      comm_id_map_[gpus[i]] = i;
+    }
+    PADDLE_ENFORCE(
+        dynload::ncclCommInitAll(comms_.data(), gpus.size(), gpus.data()));
+    inited_ = true;
+  }
+
+  ~Communicator() {
+    if (inited_) {
+      for (size_t i = 0; i < comms_.size(); ++i) {
+        // FIXME(dzh) : PADDLE_ENFORCE return void
+        dynload::ncclCommDestroy(comms_[i]);
+      }
+    }
+  }
+
+  DISABLE_COPY_AND_ASSIGN(Communicator);
+};
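+
+// Typical usage (editor's sketch; assumes a CUDA build with NCCL available):
+//
+//   Communicator comm;
+//   comm.InitAll({0, 1});         // one ncclComm_t per listed GPU
+//   int idx = comm.GetCommId(1);  // index into comms_ for device 1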
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/operators/nccl_op.cc b/paddle/operators/nccl_op.cc
new file mode 100644
index 0000000000..9d51153b06
--- /dev/null
+++ b/paddle/operators/nccl_op.cc
@@ -0,0 +1,224 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+namespace paddle {
+namespace operators {
+
+// NCCLinitOp
+class NCCLInitOp : public framework::OperatorBase {
+ public:
+  NCCLInitOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    const auto &name = Output("Communicator");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
+                            "Can not find variable '%s' in the scope.", name);
+    std::vector<int> gpus = Attr<std::vector<int>>("gpus");
+    PADDLE_ENFORCE(!gpus.empty(), "Attr(gpus) should not be empty.");
+
+    if (scope.FindVar(name) == nullptr) {
+      PADDLE_THROW("Output(Communicator) is needed for ncclInit operator.");
+    }
+
+    platform::Communicator *comm =
+        scope.FindVar(name)->GetMutable<platform::Communicator>();
+    comm->InitAll(gpus);
+  }
+};
+
+class NCCLInitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLInitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Communicator",
+              "Create Communicator for communicating between gpus");
+    AddAttr<std::vector<int>>("gpus", "(vector<int>) GPU id lists");
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddComment(R"DOC(
+NCCLInit Operator.
+
+Create communicator.
+
+)DOC");
+  }
+};
+
+// AllReduceOp
+class NCCLAllReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of AllReduce op input should not be NULL");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Communicator"),
+        " Input(Communicator) of AllReduce op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Input(X) of AllReduce op input should not be NULL");
+
+    auto x_dims = ctx->GetInputsDim("X");
+
+    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction.");
+
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// ReduceOp
+class NCCLReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Communicator"),
+        " Input(Communicator) of Reduce op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Input(X) of Reduce op input should not be NULL");
+
+    std::string reduction = ctx->Attrs().Get<std::string>("reduction");
+    PADDLE_ENFORCE((reduction == "ncclSum" || reduction == "ncclProd" ||
+                    reduction == "ncclMin" || reduction == "ncclMax"),
+                   "invalid reduction.");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// BcastOp
+class NCCLBcastOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   " Input(X) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasInput("Communicator"),
+                   " Input(Communicator) of Bcast op input should not be NULL");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   " Output(Out) of Bcast op output should not be NULL");
+
+    int root = ctx->Attrs().Get<int>("root");
+    PADDLE_ENFORCE(root != platform::kInvalidGPUId, "Bcast root must be set.");
+
+    auto x_dims = ctx->GetInputsDim("X");
+    ctx->SetOutputsDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// AllreduceOp
+class NCCLAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLAllReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of AllReduce op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddOutput("Out", "The output of AllReduce op");
+    AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
+    AddComment(R"DOC(
+NCCLAllReduce Operator.
+
+AllReduce the input tensors.
+
+)DOC");
+  }
+};
+
+// ReduceOp
+class NCCLReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of Reduce op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddOutput("Out", "The output of Reduce op");
+    AddAttr<std::string>("reduction",
+                         "(string, default 'ncclSum') "
+                         "{'ncclMin', 'ncclMax', 'ncclProd', 'ncclSum'}.")
+        .SetDefault("ncclSum");
+    AddAttr<int>("root",
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
+    AddComment(R"DOC(
+NCCLReduce Operator.
+
+Reduce the tensors.
+
+)DOC");
+  }
+};
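+
+// Editor's note on the default root: when Attr("root") is left at
+// kInvalidGPUId, the CUDA kernel picks a root per input as
+//   root = std::hash<std::string>{}(input_name) % comm->comms_.size();
+// which spreads different parameters across the GPUs deterministically.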
+
+// BcastOp
+class NCCLBcastOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCCLBcastOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of BcastSend op");
+    AddInput("Communicator", "Communicator for communicating between gpus");
+    AddOutput("Out", "The output of Bcast");
+    AddAttr<int>("root",
+                 "(int, default kInvalidGPUId) "
+                 "Root gpu of the parameter. If not, "
+                 "set(platform::kInvalidGPUId). Hashed by name.")
+        .SetDefault(platform::kInvalidGPUId);
+    AddComment(R"DOC(
+NCCLBcast Operator.
+
+Bcast the tensors.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ncclInit, ops::NCCLInitOp,
+                  paddle::framework::EmptyGradOpMaker, ops::NCCLInitOpMaker);
+
+REGISTER_OP_WITHOUT_GRADIENT(ncclAllReduce, ops::NCCLAllReduceOp,
+                             ops::NCCLAllReduceOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclBcast, ops::NCCLBcastOp,
+                             ops::NCCLBcastOpMaker);
+REGISTER_OP_WITHOUT_GRADIENT(ncclReduce, ops::NCCLReduceOp,
+                             ops::NCCLReduceOpMaker);
diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc
new file mode 100644
index 0000000000..1b986a1365
--- /dev/null
+++ b/paddle/operators/nccl_op.cu.cc
@@ -0,0 +1,209 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Communicator;
+using framework::LoDTensor;
+
+template <typename Type>
+class NCCLTypeWrapper;
+
+template <>
+class NCCLTypeWrapper<float> {
+ public:
+  static const ncclDataType_t type = ncclFloat;
+};
+
+template <>
+class NCCLTypeWrapper<double> {
+ public:
+  static const ncclDataType_t type = ncclDouble;
+};
+
+template <typename T>
+class NCCLAllReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<LoDTensor>("X");
+    auto outs = ctx.MultiOutput<LoDTensor>("Out");
+
+    std::string reduction = ctx.Attr<std::string>("reduction");
+    ncclRedOp_t reduction_op_ = ncclSum;
+
+    if (reduction == "ncclMin") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "ncclMax") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "ncclSum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "ncclProd") {
+      reduction_op_ = ncclProd;
+    } else {
+      PADDLE_THROW(
+          "Invalid reduction; expected ncclSum, ncclProd, ncclMin or ncclMax.");
+    }
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = ctx.cuda_device_context().stream();
+
+    // device id
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
+
+    for (size_t i = 0; i < ins.size(); ++i) {
+      VLOG(1) << "gpu : "
+              << " invoke allreduce. send " << ins[i]->numel() << " recv "
+              << outs[i]->numel();
+
+      PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
+          ins[i]->data<T>(), outs[i]->mutable_data<T>(ctx.GetPlace()),
+          outs[i]->numel(), NCCLTypeWrapper<T>::type, reduction_op_,
+          comm->comms_[idx], stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+
+      VLOG(1) << "gpu : "
+              << " finished allreduce. send " << ins[i]->numel() << " recv "
+              << outs[i]->numel();
+    }
+  }
+};
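+
+// The string-to-ncclRedOp_t mapping above is repeated in NCCLReduceKernel
+// below; a small helper (editor's sketch, not part of this patch) would
+// de-duplicate it:
+//
+//   inline ncclRedOp_t ToRedOp(const std::string& r) {
+//     if (r == "ncclMin") return ncclMin;
+//     if (r == "ncclMax") return ncclMax;
+//     if (r == "ncclProd") return ncclProd;
+//     if (r == "ncclSum") return ncclSum;
+//     PADDLE_THROW("Invalid reduction: %s", r);
+//   }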
+
+template <typename T>
+class NCCLReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto ins = ctx.MultiInput<LoDTensor>("X");  // x0, x1, x2
+    auto outs = ctx.MultiOutput<LoDTensor>("Out");
+
+    std::string reduction = ctx.Attr<std::string>("reduction");
+    ncclRedOp_t reduction_op_ = ncclSum;
+
+    if (reduction == "ncclMin") {
+      reduction_op_ = ncclMin;
+    } else if (reduction == "ncclMax") {
+      reduction_op_ = ncclMax;
+    } else if (reduction == "ncclSum") {
+      reduction_op_ = ncclSum;
+    } else if (reduction == "ncclProd") {
+      reduction_op_ = ncclProd;
+    } else {
+      PADDLE_THROW(
+          "Invalid reduction; expected ncclSum, ncclProd, ncclMin or ncclMax.");
+    }
+
+    int root = ctx.Attr<int>("root");
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
+
+    auto ins_names = ctx.Inputs("X");
+    std::hash<std::string> hasher;
+    for (size_t i = 0; i < ins.size(); ++i) {
+      if (root == platform::kInvalidGPUId) {
+        root = hasher(ins_names[i]) % comm->comms_.size();
+      }
+      T* recvbuffer = nullptr;
+      if (root == gpu_id) {
+        recvbuffer = outs[i]->mutable_data<T>(ctx.GetPlace());
+      }
+
+      VLOG(1) << "gpu : " << gpu_id << " invoke reduce. send "
+              << ins[i]->numel() << " recv " << outs[i]->numel();
+
+      PADDLE_ENFORCE(platform::dynload::ncclReduce(
+          ins[i]->data<T>(), recvbuffer, ins[i]->numel(),
+          NCCLTypeWrapper<T>::type, reduction_op_, root, comm->comms_[idx],
+          stream));
+      PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+
+      VLOG(1) << "gpu : " << gpu_id << " finished reduce. send "
+              << ins[i]->numel() << " recv " << outs[i]->numel();
+    }
+  }
+};
+
+template <typename T>
+class NCCLBcastKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    int root = ctx.Attr<int>("root");
+
+    auto* comm = ctx.Input<Communicator>("Communicator");
+
+    auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                      ctx.device_context())
+                      .stream();
+    // device id
+    int gpu_id = boost::get<platform::CUDAPlace>(ctx.GetPlace()).GetDeviceId();
+    int idx = comm->GetCommId(gpu_id);
+
+    if (idx == root) {
+      auto ins = ctx.MultiInput<LoDTensor>("X");
+      for (size_t i = 0; i < ins.size(); ++i) {
+        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. send "
+                << ins[i]->numel();
+
+        VLOG(1) << " before ncclBcast";
+        PADDLE_ENFORCE(platform::dynload::ncclBcast(
+            (void*)ins[i]->data<T>(), ins[i]->numel(), NCCLTypeWrapper<T>::type,
+            root, comm->comms_[idx], stream));
+        VLOG(1) << " after ncclBcast";
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+
+        VLOG(1) << "gpu : " << gpu_id << " finished Bcast.";
+      }
+    } else {
+      auto outs = ctx.MultiOutput<LoDTensor>("Out");
+      for (size_t i = 0; i < outs.size(); ++i) {
+        VLOG(1) << "gpu : " << gpu_id << " invoke Bcast. recv buffer "
+                << framework::product(outs[i]->dims());
+
+        PADDLE_ENFORCE(platform::dynload::ncclBcast(
+            outs[i]->mutable_data<T>(ctx.GetPlace()), outs[i]->numel(),
+            NCCLTypeWrapper<T>::type, root, comm->comms_[idx], stream));
+        PADDLE_ENFORCE(cudaStreamSynchronize(stream));
+
+        VLOG(1) << "gpu : " << gpu_id << " finished Bcast. recv "
+                << outs[i]->numel();
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel<float>);
+REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel<float>);
+REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel<float>);
diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc
new file mode 100644
index 0000000000..072e4eb2ef
--- /dev/null
+++ b/paddle/operators/nccl_op_test.cu.cc
@@ -0,0 +1,315 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <utility>
+#include <vector>
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/init.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+USE_NO_KERNEL_OP(ncclInit);
+USE_CUDA_ONLY_OP(ncclAllReduce);
+USE_CUDA_ONLY_OP(ncclReduce);
+USE_CUDA_ONLY_OP(ncclBcast);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+
+static std::vector<int> gpu_list;
+
+// test data amount
+const f::DDim kDims = {100, 100};
+
+// nccl op common tester, init communicator.
+class NCCLTester : public ::testing::Test {
+ public:
+  void SetUp() override {
+    paddle::platform::CPUPlace cpu_place;
+    for (size_t i = 0; i < gpu_list.size(); ++i) {
+      p::CUDAPlace place(i);
+      dev_ctxs.emplace_back(new p::CUDADeviceContext(place));
+    }
+
+    NCCLInitOp();
+  }
+
+  void TearDown() override {
+    for (auto &device_context : dev_ctxs) {
+      delete device_context;
+    }
+  }
+
+  void NCCLInitOp() {
+    paddle::platform::CPUPlace cpu_place;
+    std::unique_ptr<f::OpDesc> op1(new f::OpDesc);
+
+    op1->SetType("ncclInit");
+    op1->SetOutput("Communicator", {"comm"});
+    op1->SetAttr("gpus", {gpu_list});
+
+    auto *var = g_scope.Var("comm");
+    var->GetMutable<p::Communicator>();
+
+    auto op = f::OpRegistry::CreateOp(*op1);
+    VLOG(1) << "invoke NCCLInitOp.";
+    op->Run(g_scope, cpu_place);
+    VLOG(1) << "NCCLInitOp finished.";
+  }
+
+  template <class T>
+  void PerThreadProgram(int gpu_id, const f::OpDesc &op_desc, f::Scope *scope) {
+    std::unique_lock<std::mutex> lk(mu);
+    const f::OpDesc *op1 = &op_desc;
+
+    p::CUDAPlace place(gpu_id);
+    auto &ctx = dev_ctxs.at(gpu_id);
+
+    auto *send_tensor = scope->Var("st")->GetMutable<f::LoDTensor>();
+    auto *recv_tensor = scope->Var("rt")->GetMutable<f::LoDTensor>();
+
+    if (!send_tensor->numel()) {
+      send_tensor->Resize(kDims);
+      send_tensor->mutable_data<T>(kDims, place);
+
+      std::vector<T> send_vector(f::product(kDims), gpu_id);
+      paddle::framework::CopyFromVector<T>(send_vector, *ctx, send_tensor);
+      ctx->Wait();
+      VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel();
+    }
+
+    lk.unlock();
+
+    PADDLE_ENFORCE(send_tensor->numel() == f::product(kDims),
+                   "Tensor numel not match!");
+
+    auto op = f::OpRegistry::CreateOp(*op1);
+
+    VLOG(1) << "Device : " << gpu_id << " invoke " << op_desc.Type();
+    VLOG(1) << " send_tensor : " << send_tensor->numel()
+            << " recv_tensor : " << recv_tensor->numel();
+    op->Run(*scope, place);
+    VLOG(1) << "Device : " << gpu_id << " finished " << op_desc.Type();
+  }
+
+ public:
+  std::vector<p::DeviceContext *> dev_ctxs;
+  f::Scope g_scope;
+  std::mutex mu;
+};
+
+// ncclInitOp with desc
+TEST(NCCL, ncclInitOp) {
+  std::unique_ptr<f::OpDesc> op_desc(new f::OpDesc);
+
+  op_desc->SetType("ncclInit");
+  op_desc->SetOutput("Communicator", {"x1"});
+  op_desc->SetAttr("gpus", {gpu_list});
+
+  f::Scope g_scope;
+  paddle::platform::CPUPlace cpu_place;
+
+  auto *var = g_scope.Var("x1");
+  var->GetMutable<p::Communicator>();
+
+  auto op = f::OpRegistry::CreateOp(*op_desc);
+  VLOG(1) << "invoke NCCLInitOp.";
+  op->Run(g_scope, cpu_place);
+  VLOG(1) << "NCCLInitOp finished.";
+}
+
+// ncclAllReduceOp with desc
+TEST_F(NCCLTester, ncclAllReduceOp) {
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
+  op2->SetType("ncclAllReduce");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+
+  std::vector<f::Scope *> dev_scopes;
+
+  std::vector<std::thread> ths;
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    ths[i].join();
+  }
+
+  // check results
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+  for (size_t i = 0; i < dev_scopes.size(); ++i) {
+    p::CPUPlace cpu_place;
+    p::CUDAPlace gpu_place(gpu_list[i]);
+
+    auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get<f::LoDTensor>();
+    auto *rt = recv_tensor.data<float>();
+    auto *result_tensor = dev_scopes[i]->Var("ct")->GetMutable<f::LoDTensor>();
+    result_tensor->Resize(kDims);
+    auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+    paddle::memory::Copy(
+        cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt,
+        recv_tensor.numel() * sizeof(float),
+        static_cast<p::CUDADeviceContext *>(dev_ctxs[i])->stream());
+
+    for (int64_t j = 0; j < f::product(kDims); ++j) {
+      ASSERT_NEAR(ct[j], result, 1e-5);
+    }
+  }
+}
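+
+// Editor's note: PerThreadProgram fills each device's send tensor with its
+// gpu id, so after ncclSum-allreduce every element on every device should
+// equal sum(gpu_list) = 0 + 1 + ... + (n - 1), which the ASSERT_NEAR loop
+// above verifies.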
+
+// ncclReduceOp with desc
+TEST_F(NCCLTester, ncclReduceOp) {
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
+  const int kRoot = 0;
+  op2->SetType("ncclReduce");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", kRoot);
+
+  std::vector<f::Scope *> dev_scopes;
+
+  std::vector<std::thread> ths;
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    ths[i].join();
+  }
+
+  // check results on the root gpu
+  float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0);
+
+  p::CPUPlace cpu_place;
+  p::CUDAPlace gpu_place(gpu_list[kRoot]);
+
+  auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get<f::LoDTensor>();
+  auto *rt = recv_tensor.data<float>();
+  auto *result_tensor =
+      dev_scopes[kRoot]->Var("ct")->GetMutable<f::LoDTensor>();
+  result_tensor->Resize(kDims);
+  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+  paddle::memory::Copy(
+      cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt,
+      recv_tensor.numel() * sizeof(float),
+      static_cast<p::CUDADeviceContext *>(dev_ctxs[kRoot])->stream());
+
+  for (int64_t j = 0; j < f::product(kDims); ++j) {
+    ASSERT_NEAR(ct[j], result, 1e-5);
+  }
+}
+
+// ncclBcastOp with desc
+TEST_F(NCCLTester, ncclBcastOp) {
+  std::unique_ptr<f::OpDesc> op2(new f::OpDesc);
+  const int kRoot = 0;
+  op2->SetType("ncclBcast");
+  op2->SetInput("X", {"st"});
+  op2->SetInput("Communicator", {"comm"});
+  op2->SetOutput("Out", {"rt"});
+  op2->SetAttr("root", kRoot);
+
+  std::vector<f::Scope *> dev_scopes;
+
+  std::vector<std::thread> ths;
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    dev_scopes.emplace_back(&g_scope.NewScope());
+    std::thread th(&NCCLTester::PerThreadProgram<float>, this, gpu_list[i],
+                   *op2.get(), dev_scopes[i]);
+    ths.emplace_back(std::move(th));
+  }
+
+  for (size_t i = 0; i < gpu_list.size(); ++i) {
+    ths[i].join();
+  }
+
+  const int idx = 1;
+  // check results on a non-root gpu (gpu_list[idx])
+  float result = kRoot;
+
+  p::CPUPlace cpu_place;
+  p::CUDAPlace gpu_place(gpu_list[idx]);
+
+  auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get<f::LoDTensor>();
+  auto *rt = recv_tensor.data<float>();
+  auto *result_tensor = dev_scopes[idx]->Var("ct")->GetMutable<f::LoDTensor>();
+  result_tensor->Resize(kDims);
+  auto *ct = result_tensor->mutable_data<float>(cpu_place);
+
+  paddle::memory::Copy(
+      cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt,
+      recv_tensor.numel() * sizeof(float),
+      static_cast<p::CUDADeviceContext *>(dev_ctxs[idx])->stream());
+
+  for (int64_t j = 0; j < f::product(kDims); ++j) {
+    ASSERT_NEAR(ct[j], result, 1e-5);
+  }
+}
+
+int main(int argc, char **argv) {
+  const int dev_count = p::GetCUDADeviceCount();
+  if (dev_count <= 1) {
+    LOG(WARNING)
+        << "Cannot test multi-gpu nccl, because the CUDA device count is "
+        << dev_count;
+    return 0;
+  }
+
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+    gpu_list.emplace_back(i);
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+
+  // The device contexts should be released before the scope;
+  // otherwise the CUDA driver may crash on shutdown.
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc
new file mode 100644
index 0000000000..994ddf717e
--- /dev/null
+++ b/paddle/operators/nce_op.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/nce_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class NCEOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Label"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasOutput("Cost"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));
+
+    auto x_dims = ctx->GetInputDim("Input");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
+    int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
+                        ctx->GetInputDim("Bias")[0]);
+    }
+    auto num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
+    auto num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
+    std::vector<int> custom_neg_classes =
+        ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
+    PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
+    if (custom_neg_classes.size() > 0) {
+      PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
+                        static_cast<size_t>(num_neg_samples));
+    }
+    // set dims of output(Cost)
+    std::vector<int64_t> out_dims;
+    out_dims.push_back(x_dims[0]);
+    out_dims.push_back(1);
+    ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));
+
+    // set dims of outputs(SampleLogits and SampleLabels)
+    std::vector<int64_t> sample_out_dims;
+    sample_out_dims.push_back(x_dims[0]);
+    sample_out_dims.push_back(num_neg_samples + num_true_classes);
+    ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
+    ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.GetPlace());
+  }
+};
+
+class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCEOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
+    AddInput(
+        "Label",
+        "(Tensor) A tensor of shape [batch_size, num_true_class]. "
+        "'num_true_class' is the number of target classes in each sample. "
+        "The number of target classes per sample should be the same. "
+        "If you have a variable number of target classes, "
+        "you can pad them out to a constant number by either repeating them "
+        "or by padding with an otherwise unused class.");
+    AddInput("Weight",
+             "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
+             "total number of class.");
+    AddInput(
+        "Bias",
+        "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
+        "number of classes. It is a dispensable input.")
+        .AsDispensable();
+    AddInput("SampleWeight",
+             "(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
+             "each sample. And it is a dispensable input. The default value of "
+             "sample is 1.")
+        .AsDispensable();
+    AddOutput("Cost",
+              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
+    AddOutput("SampleLogits",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "Given X is  the dot product of input tensor and sampled labels' "
+              "weights."
+              "Then 'SampleLogits' is sigmoid(X).")
+        .AsIntermediate();
+    AddOutput("SampleLabels",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "")
+        .AsIntermediate();
+    AddAttr<int>("num_total_classes",
+                 "Total number of classes in all samples.");
+    AddAttr<int>("num_neg_samples",
+                 "The number of negative classes. The default value is 10.")
+        .SetDefault(10);
+    AddAttr<std::vector<int>>("custom_neg_classes",
+                              "This attribute only be used in unitest. Classes "
+                              "in this list wiil be used as negative classes "
+                              "for every samples. Under normal conditions, "
+                              "user should avoid setting this attribute.")
+        .SetDefault({});
+    AddComment(R"DOC(
+Compute and return the noise-contrastive estimation training loss.
+See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+By default this operator uses a uniform distribution for sampling.
+)DOC");
+  }
+};
+
+class NCEOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasInput("Cost"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
+                   "The input(Cost@GRAD) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("Input");
+    auto x_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto w_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(w_grad_name)) {
+      ctx->SetOutputDim(w_grad_name, w_dims);
+    }
+
+    auto bias_grad_name = framework::GradVarName("Bias");
+    if (ctx->HasOutput(bias_grad_name)) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(nce_grad,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h
new file mode 100644
index 0000000000..86fa13a649
--- /dev/null
+++ b/paddle/operators/nce_op.h
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <math.h>
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+void PrepareSamples(const framework::ExecutionContext& context) {
+  auto label = context.Input<Tensor>("Label");
+  const int64_t* label_data = label->data<int64_t>();
+  auto label_dims = label->dims();
+  int num_total_classes = context.Attr<int>("num_total_classes");
+  // only used in unit tests
+  std::vector<int> custom_neg_classes =
+      context.Attr<std::vector<int>>("custom_neg_classes");
+  // random engine for sampling negative classes
+  std::random_device rd;
+  std::mt19937 rng(rd());
+  std::uniform_int_distribution<int> rand(0, num_total_classes - 1);
+
+  auto sample_labels = context.Output<Tensor>("SampleLabels");
+  auto sample_labels_dims = sample_labels->dims();
+  int64_t* sample_labels_data =
+      sample_labels->mutable_data<int64_t>(context.GetPlace());
+
+  int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
+  int index = 0;
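+  // Layout of SampleLabels: for each sample, the first 'num_label' entries
+  // are the true classes, followed by the sampled negative classes.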
+  for (int64_t i = 0; i < label_dims[0]; ++i) {
+    int j = 0;
+    for (; j < num_label; ++j) {
+      sample_labels_data[index++] = label_data[i * num_label + j];
+    }
+    if (custom_neg_classes.size() > 0) {
+      for (auto label : custom_neg_classes) {
+        sample_labels_data[index++] = label;
+      }
+    } else {
+      for (; j < sample_labels_dims[1]; ++j) {
+        // TODO(wanghaoshuang): support more distribution sampling
+        sample_labels_data[index++] = rand(rng);
+      }
+    }
+  }
+}
+
+template <typename DeviceContext, typename T>
+class NCEKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PrepareSamples<DeviceContext, T>(context);
+    auto sample_labels = context.Output<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_out = context.Output<Tensor>("SampleLogits");
+    T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
+    auto label = context.Input<Tensor>("Label");
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    auto out = context.Output<Tensor>("Cost");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int64_t num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    // forward bias
+    auto bias = context.Input<Tensor>("Bias");
+    if (bias != nullptr) {
+      const T* bias_data = bias->data<T>();
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = bias_data[sample_labels_data[i]];
+      }
+    } else {
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = 0;
+      }
+    }
+    // forward mul
+    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+    auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+      Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
+          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+           weight_mat.chip(sample_labels_data[i], 0))
+              .sum();
+      sample_out_data[i] += result(0);
+      sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
+    }
+    // forward cost
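+    // NCE loss with a uniform negative sampler: with o = sigmoid(logit) and
+    // b = num_neg_samples / num_total_classes, a true class contributes
+    // -log(o / (o + b)) and a sampled negative class contributes
+    // -log(b / (o + b)) to the per-sample cost.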
+    for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) {
+      int64_t j = 0;
+      out_data[i] = 0;
+      T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
+      // for true classes
+      for (; j < num_true_class; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(o / (o + b));
+        out_data[i] += w * cost;
+      }
+      // for sampled neg classes
+      for (; j < sample_labels->dims()[1]; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(b / (o + b));
+        out_data[i] += w * cost;
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class NCEGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
+    const T* d_out_data = d_out->data<T>();
+    auto label = context.Input<Tensor>("Label");
+    auto sample_out = context.Input<Tensor>("SampleLogits");
+    const T* sample_out_data = sample_out->data<T>();
+    auto sample_labels = context.Input<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    Tensor sample_grad;  // tmp tensor
+    T* sample_grad_data =
+        sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
+    // backward cost
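+    // Differentiating the forward cost w.r.t. the pre-sigmoid logit z
+    // (where o = sigmoid(z)) gives:
+    //   true classes:    d(-log(o / (o + b)))/dz = (b / (o + b)) * (o - 1)
+    //   sampled classes: d(-log(b / (o + b)))/dz = o * (1 - o) / (o + b)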
+    for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+      T o = sample_out_data[i];
+      T w = sample_weight == nullptr
+                ? 1
+                : sample_weight_data[i / sample_labels->dims()[1]];
+      sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class
+                                ? w * (b / (o + b)) * (o - 1)
+                                : w * (o * (1 - o) / (o + b));
+      sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]];
+    }
+    // get d_bias
+    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
+    if (d_bias != nullptr) {
+      T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
+      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
+      }
+    }
+    // get d_w
+    auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
+    if (d_w != nullptr) {
+      auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
+      std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
+      auto d_w_matrix = EigenMatrix<T>::From(*d_w);
+      auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_w_matrix.chip(sample_labels_data[i], 0) +=
+            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            sample_grad_data[i];
+      }
+    }
+    // get d_x
+    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
+    if (d_x != nullptr) {
+      auto* d_x_data = d_x->mutable_data<T>(context.GetPlace());
+      std::fill(d_x_data, d_x_data + d_x->numel(), 0.0);
+      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
+      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+      for (int64_t i = 0; i < sample_labels->numel(); ++i) {
+        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+            w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/net_op.cc b/paddle/operators/net_op.cc
new file mode 100644
index 0000000000..000e029840
--- /dev/null
+++ b/paddle/operators/net_op.cc
@@ -0,0 +1,103 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/net_op.h"
+#include <set>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+const char NetOp::kAll[] = "all";
+
+void NetOp::CompleteAddOp(bool calc) {
+  add_op_done_ = true;
+  if (!calc) return;
+  std::set<std::string> input_set;
+  std::set<std::string> output_set;
+  for (auto& op : ops_) {
+    for (auto& ipt : op->Inputs()) {
+      for (auto& var_name : ipt.second) {
+        // If input variable has been in output set, then it will be
+        // added into intermediate_outputs_. Otherwise, it will be
+        // added into input set.
+        if (Contains(output_set, var_name)) {
+          intermediate_outputs_.insert(var_name);
+        } else {
+          input_set.insert(var_name);
+        }
+      }
+    }
+
+    for (auto& opt : op->Outputs()) {
+      for (auto& var_name : opt.second) {
+        output_set.insert(var_name);
+      }
+    }
+  }
+  auto& inputs = inputs_[kAll];
+  inputs.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs));
+  auto& outputs = outputs_[kAll];
+  outputs.reserve(output_set.size());
+  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
+}
+
+std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
+  std::ostringstream os;
+  os << OperatorBase::DebugStringEx(scope) << std::endl;
+  for (auto& op : ops_) {
+    std::istringstream is(op->DebugStringEx(scope));
+    for (std::string line; std::getline(is, line);) {
+      os << "    " << line << std::endl;
+    }
+  }
+  return os.str();
+}
+
+bool NetOp::IsNetOp() const { return true; }
+
+std::vector<std::string> NetOp::OutputVars(bool has_intermediate) const {
+  std::vector<std::string> all;
+  for (auto& pair : this->outputs_) {
+    for (auto& var_name : pair.second) {
+      all.push_back(var_name);
+    }
+  }
+  if (has_intermediate) {
+    return all;
+  }
+  std::vector<std::string> ret_val;
+  for (auto& each : all) {
+    if (!Contains(intermediate_outputs_, each)) {
+      ret_val.push_back(each);
+    }
+  }
+  return ret_val;
+}
+
+NetOp::NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+             const framework::VariableNameMap& outputs,
+             const framework::AttributeMap& attrs)
+    : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+std::unique_ptr<framework::OperatorBase> NetOp::Clone() const {
+  PADDLE_ENFORCE(
+      add_op_done_,
+      "Must clone a sealed NetOp, invoke Net::CompleteAddOp before clone");
+  return std::unique_ptr<OperatorBase>(new NetOp(*this));
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
new file mode 100644
index 0000000000..b24042f5ef
--- /dev/null
+++ b/paddle/operators/net_op.h
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <set>
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+/**
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
+ *
+ * Network is the container and controller of a set of operators.
+ *
+ * A network object knows all Operators belonging to this network. Variables,
+ * which are inputs and outputs of these operators, are created and managed by a
+ * hierarchy of Scope objects.
+ *
+ * This is the base class of networks; all networks should implement the APIs
+ * it defines.
+ */
+class NetOp : public framework::OperatorBase {
+ public:
+  static const char kAll[];
+  NetOp()
+      : framework::OperatorBase("plain_net", framework::VariableNameMap{},
+                                framework::VariableNameMap{},
+                                framework::AttributeMap{}) {}
+
+  NetOp(const std::string& type, const framework::VariableNameMap& inputs,
+        const framework::VariableNameMap& outputs,
+        const framework::AttributeMap& attrs);
+
+  NetOp(const NetOp& o) : framework::OperatorBase(o.type_, {}, {}, o.attrs_) {
+    this->ops_.reserve(o.ops_.size());
+    std::transform(
+        o.ops_.begin(), o.ops_.end(), std::back_inserter(this->ops_),
+        [](const std::unique_ptr<framework::OperatorBase>& op) {
+          return std::unique_ptr<framework::OperatorBase>(op->Clone());
+        });
+    this->CompleteAddOp();
+  }
+
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`. If no scope is provided, the
+   * default scope will be used instead. If no OpContext is provided, the
+   * default context will be used.
+   */
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, place);
+    }
+  }
+
+  bool SupportGPU() const override {
+    for (auto& op : ops_) {
+      if (!op->SupportGPU()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  void AppendOp(const framework::OperatorBase& op) { AppendOp(op.Clone()); }
+
+  /**
+   * @brief Add an operator by ptr
+   */
+  void AppendOp(std::unique_ptr<framework::OperatorBase> op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot AppendOp when this network is sealed");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
+    ops_.push_back(std::move(op));
+  }
+
+  void InsertOp(size_t pos, std::unique_ptr<framework::OperatorBase> op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot InsertOp when this network is sealed");
+    PADDLE_ENFORCE_NOT_NULL(op, "Cannot Insert Null op");
+    PADDLE_ENFORCE_LE(pos, ops_.size(), "Out of range");
+    ops_.insert(ops_.begin() + pos, std::move(op));
+  }
+
+  void InsertOp(size_t pos, const framework::OperatorBase& op) {
+    InsertOp(pos, op.Clone());
+  }
+
+  void CompleteAddOp(bool calculate = true);
+
+  std::string DebugStringEx(
+      const framework::Scope* scope = nullptr) const override;
+
+  bool IsNetOp() const override;
+  std::vector<std::string> OutputVars(bool has_intermediate) const override;
+
+  std::unique_ptr<framework::OperatorBase> Clone() const override;
+
+  std::vector<std::unique_ptr<framework::OperatorBase>> ops_;
+
+ private:
+  bool add_op_done_{false};
+  std::set<std::string> intermediate_outputs_;
+
+  template <typename T, typename KeyType>
+  static bool Contains(T container, KeyType key) {
+    return container.find(key) != container.end();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc
new file mode 100644
index 0000000000..9358f29f62
--- /dev/null
+++ b/paddle/operators/net_op_test.cc
@@ -0,0 +1,100 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/operators/net_op.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace operators {
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
+
+static int run_cnt = 0;
+
+class TestOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  DEFINE_OP_CLONE_METHOD(TestOp);
+  void Run(const Scope& scope, const platform::Place& place) const override {
+    ++run_cnt;
+  }
+};
+
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+
+TEST(OpKernel, all) {
+  auto net = std::make_shared<NetOp>();
+  ASSERT_NE(net, nullptr);
+
+  net->AppendOp(std::unique_ptr<TestOp>(
+      new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
+                 {{"Out", {"y"}}}, framework::AttributeMap{})));
+  net->AppendOp(std::unique_ptr<TestOp>(
+      new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}},
+                 {{"Out", {"z"}}}, framework::AttributeMap{})));
+
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"},
+                               net->Inputs(NetOp::kAll));
+  AssertSameVectorWithoutOrder({"y", "z"}, net->Outputs(NetOp::kAll));
+
+  auto final_outs = net->OutputVars(false);
+
+  ASSERT_EQ(final_outs.size(), 1UL);
+  ASSERT_EQ(final_outs[0], "z");
+}
+
+TEST(NetOp, insert_op) {
+  NetOp net;
+  auto op1 = std::unique_ptr<framework::NOP>(
+      new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}},
+                         {{"Out", {"y"}}}, framework::AttributeMap{}));
+  net.AppendOp(*op1);
+  net.InsertOp(0, *op1);
+  ASSERT_EQ(2UL, net.ops_.size());
+  net.InsertOp(2, std::move(op1));
+  ASSERT_EQ(3UL, net.ops_.size());
+}
+
+TEST(NetOp, Clone) {
+  NetOp net;
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
+  net.AppendOp(std::unique_ptr<framework::NOP>(new framework::NOP{
+      "empty2", framework::VariableNameMap{}, framework::VariableNameMap{},
+      framework::AttributeMap{}}));
+  net.CompleteAddOp(true);
+  auto new_net_op = net.Clone();
+  ASSERT_NE(new_net_op, nullptr);
+  ASSERT_TRUE(new_net_op->IsNetOp());
+  auto* new_net = static_cast<NetOp*>(new_net_op.get());
+  ASSERT_EQ(2UL, new_net->ops_.size());
+  ASSERT_EQ(new_net->ops_[0]->Type(), "empty");
+  ASSERT_EQ(new_net->ops_[1]->Type(), "empty2");
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/norm_op.cc b/paddle/operators/norm_op.cc
new file mode 100644
index 0000000000..0eeafcaae0
--- /dev/null
+++ b/paddle/operators/norm_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/norm_op.h"
+namespace paddle {
+namespace operators {
+
+template <typename AttrType>
+class NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of norm operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput("Scale",
+             "(Tensor) The input tensor of norm operator. "
+             "The format of input tensor is C * 1.");
+    AddAttr<AttrType>("epsilon",
+                      "(float, default 1e-10) Constant "
+                      "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    AddOutput("Out",
+              "(Tensor) The output tensor of norm operator."
+              "N * M."
+              "M = C * H * W");
+    AddComment(R"DOC(
+       "Input shape: $(N, C, H, W)$
+        Scale shape: $(C, 1)$
+        Output shape: $(N, C, H, W)$
+        Where
+        forward
+          $$
+            [\frac {x_{1}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{2}}{\sqrt{\sum{x_{i}^{2}}}} \frac {x_{3}}{\sqrt{\sum{x_{i}^{2}}}} \cdot  \cdot  \cdot \frac {x_{n}}{\sqrt{\sum{x_{i}^{2}}}}]
+          $$
+        backward
+          $$
+            \frac{\frac{\mathrm{d}L }{\mathrm{d}y_{1}} - \frac {x_{1}\sum {\frac{\mathrm{d} L}{\mathrm{d} y_{j}}}x_{j}}{\sum x_{j}^{2}} }{\sqrt{\sum{x_{j}^{2}}}}
+          $$
+        )DOC");
+  }
+};
+
+class NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of NormOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"),
+                   "Input(Scale) of NormOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of NormOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    ctx->SetOutputDim("Out", in_x_dims);
+  }
+};
+
+class NormOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(norm, ops::NormOp, ops::NormOpMaker<float>, norm_grad,
+            ops::NormOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormKernel<paddle::platform::CPUDeviceContext, double, float>);
+REGISTER_OP_CPU_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CPUDeviceContext, double, float>);
diff --git a/paddle/operators/norm_op.cu b/paddle/operators/norm_op.cu
new file mode 100644
index 0000000000..2941c89b93
--- /dev/null
+++ b/paddle/operators/norm_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    norm, ops::NormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormKernel<paddle::platform::CUDADeviceContext, double, float>);
+REGISTER_OP_CUDA_KERNEL(
+    norm_grad, ops::NormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::NormGradKernel<paddle::platform::CUDADeviceContext, double, float>);
diff --git a/paddle/operators/norm_op.h b/paddle/operators/norm_op.h
new file mode 100644
index 0000000000..5759d6f1f0
--- /dev/null
+++ b/paddle/operators/norm_op.h
@@ -0,0 +1,175 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
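+    // Cross-channel L2 normalization: for each spatial position, the input
+    // is divided by sqrt(sum over channels of x^2 + epsilon), and each
+    // channel is then multiplied by its entry in Scale.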
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    out->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // get square
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    for (int n = 0; n < batch_size; ++n) {
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor out_batch = out->Slice(n, n + 1);
+      auto out_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              out_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp = framework::EigenVector<T, Eigen::RowMajor,
+                                        Eigen::DenseIndex>::Flatten(tmp_tensor);
+      // column sum, then sqrt and inverse
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp.device(*place) = x_square_batch_eigen.sum(dim);
+      tmp.device(*place) = (tmp + epsilon).sqrt().inverse();
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      out_batch_eigen.device(*place) =
+          in_x_batch_eigen * (tmp.broadcast(broadcast_dim_col));
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      out_batch_eigen.device(*place) =
+          out_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+template <typename DeviceContext, typename T, typename AttrType = T>
+class NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* scale = context.Input<framework::Tensor>("Scale");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto epsilon = static_cast<T>(context.Attr<AttrType>("epsilon"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    int batch_size = in_x->dims()[0];
+    int channels = in_x->dims()[1];
+    int height = in_x->dims()[2];
+    int width = in_x->dims()[3];
+    int fea_len = height * width;
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto scale_eigen =
+        framework::EigenVector<T, Eigen::RowMajor, Eigen::DenseIndex>::Flatten(
+            *scale);
+    auto x =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            *in_x, framework::make_ddim({batch_size, fea_len * channels}));
+    // get square
+    framework::Tensor x_square;
+    x_square.mutable_data<T>(in_x->dims(), context.GetPlace());
+    auto x_square_eigen =
+        framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+            x_square, framework::make_ddim({batch_size, fea_len * channels}));
+    x_square_eigen.device(*place) = x.square();
+
+    for (int n = 0; n < batch_size; ++n) {
+      framework::Tensor in_x_batch = in_x->Slice(n, n + 1);
+      auto in_x_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_x_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor in_g_batch = in_x_grad->Slice(n, n + 1);
+      auto in_g_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              in_g_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor x_square_batch = x_square.Slice(n, n + 1);
+      auto x_square_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              x_square_batch, framework::make_ddim({channels, fea_len}));
+      framework::Tensor outg_batch = out_grad->Slice(n, n + 1);
+      auto outg_batch_eigen =
+          framework::EigenMatrix<T, Eigen::RowMajor, Eigen::DenseIndex>::From(
+              outg_batch, framework::make_ddim({channels, fea_len}));
+
+      framework::Tensor tmp_tensor;
+      tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                 context.GetPlace());
+      auto tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(tmp_tensor);
+      auto dim = Eigen::array<int, 1>({{0}});
+      tmp_eigen.device(*place) = (in_x_batch_eigen * outg_batch_eigen).sum(dim);
+      framework::Tensor norm_tmp_tensor;
+      norm_tmp_tensor.mutable_data<T>(framework::make_ddim({1, fea_len}),
+                                      context.GetPlace());
+      auto norm_tmp_eigen =
+          framework::EigenVector<T, Eigen::RowMajor,
+                                 Eigen::DenseIndex>::Flatten(norm_tmp_tensor);
+      norm_tmp_eigen.device(*place) =
+          (x_square_batch_eigen.sum(dim) + epsilon).sqrt();
+      Eigen::array<int, 2> broadcast_dim_col;
+      broadcast_dim_col[1] = 1;
+      broadcast_dim_col[0] = channels;
+      in_g_batch_eigen.device(*place) =
+          in_x_batch_eigen * tmp_eigen.broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen /
+          (norm_tmp_eigen * norm_tmp_eigen).broadcast(broadcast_dim_col);
+      in_g_batch_eigen.device(*place) = outg_batch_eigen - in_g_batch_eigen;
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen / norm_tmp_eigen.broadcast(broadcast_dim_col);
+      Eigen::array<int, 2> broadcast_dim_row;
+      broadcast_dim_row[1] = fea_len;
+      broadcast_dim_row[0] = 1;
+      in_g_batch_eigen.device(*place) =
+          in_g_batch_eigen * (scale_eigen.broadcast(broadcast_dim_row));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/one_hot_op.cc b/paddle/operators/one_hot_op.cc
new file mode 100644
index 0000000000..e78b7468de
--- /dev/null
+++ b/paddle/operators/one_hot_op.cc
@@ -0,0 +1,95 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/one_hot_op.h"
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace operators {
+
+class OneHotOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of OneHotOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of OneHotOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "Rank of Input(X) should be at least 2.");
+    PADDLE_ENFORCE_GE(x_dims[x_dims.size() - 1], 1U,
+                      "Last dimension of Input(X) should be at least 1.");
+
+    int depth = ctx->Attrs().Get<int>("depth");
+
+    PADDLE_ENFORCE_GT(depth, 0, "Should provide a positive depth (%d).", depth);
+
+    framework::DDim out_dims(x_dims);
+    out_dims[out_dims.size() - 1] = depth;
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /* --> */ "Out");
+  }
+};
+
+class OneHotOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  OneHotOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, LoDTensor<int>) Input variable with rank at least 2. "
+             "The last dimension of X should be 1. Each value of X is an index "
+             "to indicate the position.");
+    AddOutput("Out",
+              "(Tensor, Tensor<float>) Output tensor with same rank as X. "
+              "The tensor consists of one-hot representations of values in X.");
+    AddAttr<int>("depth",
+                 "A positive integer to specify the length of one-hot vector.");
+    AddAttr<int>("dtype",
+                 "An integer to specify the data type of one-hot "
+                 "vector. The default value is FP32.")
+        .SetDefault(paddle::framework::proto::DataType::FP32);
+    AddComment(R"DOC(
+One Hot Operator. This operator creates the one-hot representations for input
+index values. The following example will help to explain the function of this
+operator:
+
+X is a LoDTensor:
+  X.lod = [[0, 1, 4]]
+  X.shape = [4, 1]
+  X.data = [[1], [1], [3], [0]]
+
+set depth = 4
+
+Out is a LoDTensor:
+  Out.lod = [[0, 1, 4]]
+  Out.shape = [4, 4]
+  Out.data = [[0., 1., 0., 0.],
+              [0., 1., 0., 0.],
+              [0., 0., 0., 1.],
+              [1., 0., 0., 0.]]
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(one_hot, ops::OneHotOp, ops::OneHotOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    one_hot, ops::OneHotKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::OneHotKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/one_hot_op.cu b/paddle/operators/one_hot_op.cu
new file mode 100644
index 0000000000..16f6d9433e
--- /dev/null
+++ b/paddle/operators/one_hot_op.cu
@@ -0,0 +1,80 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/one_hot_op.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/gpu_info.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename InT, typename OutT>
+__global__ void FillOutputKernel(const InT* p_in_data, OutT* p_out_data,
+                                 const int64_t numel, const int depth) {
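+  // One thread per input element: thread idx sets row idx's one-hot entry,
+  // i.e. p_out_data[idx * depth + p_in_data[idx]] = 1.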
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < numel) {
+    *(p_out_data + (idx * depth) + p_in_data[idx]) = 1.0;
+  }
+}
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpCUDAFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  const DeviceContext& ctx_;
+  int depth_;
+
+  OneHotOpCUDAFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                      int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    auto stream = ctx_.stream();
+    math::set_constant(ctx_, out_, 0.0);
+
+    FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) /
+                           PADDLE_CUDA_NUM_THREADS,
+                       PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        p_in_data, p_out_data, numel, depth_);
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpCUDAFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    one_hot, ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::OneHotCUDAKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/one_hot_op.h b/paddle/operators/one_hot_op.h
new file mode 100644
index 0000000000..12031ede2c
--- /dev/null
+++ b/paddle/operators/one_hot_op.h
@@ -0,0 +1,68 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename InT>
+struct OneHotOpFunctor {
+  const framework::LoDTensor* in_;
+  framework::LoDTensor* out_;
+  int depth_;
+  const DeviceContext& ctx_;
+
+  OneHotOpFunctor(const framework::LoDTensor* in, framework::LoDTensor* out,
+                  int depth, const DeviceContext& ctx)
+      : in_(in), out_(out), depth_(depth), ctx_(ctx) {}
+
+  template <typename OutT>
+  void operator()() const {
+    auto* p_in_data = in_->data<InT>();
+    auto numel = in_->numel();
+    auto* p_out_data = out_->mutable_data<OutT>(ctx_.GetPlace());
+    math::set_constant(ctx_, out_, 0.0);
+
+    for (int i = 0; i < numel; ++i) {
+      PADDLE_ENFORCE_GE(p_in_data[i], 0,
+                        "Illegal index value, should be at least 0.");
+      PADDLE_ENFORCE_LT(p_in_data[i], depth_,
+                        "Illegal index value, should be less than depth (%d).",
+                        depth_);
+      *(p_out_data + i * depth_ + p_in_data[i]) = 1.0;
+    }
+  }
+};
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class OneHotKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int depth = context.Attr<int>("depth");
+
+    framework::VisitDataType(
+        static_cast<framework::proto::DataType>(context.Attr<int>("dtype")),
+        OneHotOpFunctor<DeviceContext, T>(
+            in, out, depth, context.template device_context<DeviceContext>()));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/op_documentation/batch_norm_op.md b/paddle/operators/op_documentation/batch_norm_op.md
new file mode 100644
index 0000000000..d1392619c4
--- /dev/null
+++ b/paddle/operators/op_documentation/batch_norm_op.md
@@ -0,0 +1,134 @@
+# Batch Normalization
+
+## What is batch normalization
+
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, which makes the data distribution easier for the next layer's training.
+
+The principle of batch normalization can be summarized into a simple function:
+
+```
+y = ((x - E[x]) / STD[x]) * scale + bias
+```
+
+`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` are the mean and standard deviation of `x`, respectively. `scale` and `bias` are two trainable parameters. Training a batch normalization layer amounts to learning the best values of `scale` and `bias`.
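+
+As a minimal illustration (plain NumPy, outside of Paddle; the array values are made up), the function above can be computed as:
+
+```python
+import numpy as np
+
+x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])  # a batch of 3 samples with 2 features
+scale, bias = 0.5, 0.1
+
+mean = x.mean(axis=0)                # E[x], per-feature mean over the batch
+std = x.std(axis=0)                  # STD[x], per-feature standard deviation
+y = (x - mean) / std * scale + bias  # normalized, then scaled and shifted
+```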
+
+In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
+
+## Differences with normal operators
+
+`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
+
+1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead. This requires our framework to be able to inform operators of the current running type (training/inferencing), so that operators can switch their behaviors.
+
+2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batches. In each mini-batch, `estimated_mean` is updated as follows:
+
+```
+if batch_id == 0
+  estimated_mean = E[x]
+else
+  estimated_mean = estimated_mean * momentum + (1.0 - momentum) * E[x]
+```
+
+The update of `estimated_variance` is similar. `momentum` is an attribute that controls the updating speed of `estimated_mean`.
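+
+A short sketch of the update rule above (plain NumPy; `batches`, standing for any iterable of data batches, is an assumption of this sketch):
+
+```python
+import numpy as np
+
+momentum = 0.99
+estimated_mean = None
+
+for batch_id, batch in enumerate(batches):
+    batch_mean = batch.mean(axis=0)  # E[x] of the current mini-batch
+    if batch_id == 0:
+        estimated_mean = batch_mean
+    else:
+        estimated_mean = momentum * estimated_mean + (1.0 - momentum) * batch_mean
+```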
+
+## Implementation
+
+Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The input data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph shows the training computational process of `batch_norm_op`:
+
+<img src="../images/batch_norm_op_kernel.png" width="800"/>
+
+cuDNN provides APIs that cover this whole series of computations, so we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is wrapped as a layer in Python:
+
+```python
+def batch_norm_layer(net,
+                     input,
+                     output,
+                     scale,
+                     bias,
+                     use_global_est = False,
+                     epsilon = 1e-6,
+                     momentum = 0.99):
+    mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+    var_cache = scope.new_var(name = 'estimated_var', trainable = False)
+    batch_mean = scope.new_var(name = 'batch_mean')
+    batch_var = scope.new_var(name = 'batch_var')
+    batch_norm_op = Operator('batch_norm_op',
+                             x = input,
+                             estimated_mean = mean_cache,
+                             estimated_var = var_cache,
+                             scale = scale,
+                             bias = bias,
+                             y = output,
+                             batch_mean = batch_mean,
+                             batch_var = batch_var,
+                             saved_mean = mean_cache,
+                             saved_var = var_cache,
+                             is_infer = False,
+                             use_global_est = use_global_est,
+                             epsilon = epsilon,
+                             momentum = momentum)
+    net.append_op(batch_norm_op)
+    return output
+```
+
+Because the Python API has not been finalized, the code above should be regarded as pseudo code. There are a few key points to note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch become the inputs (`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decides whether `batch_norm_op` runs in training mode or inferencing mode. However, a network may contain both training and inferencing parts, and a user may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+    # ...
+    net.train()  # run in training mode
+    if pass_id % 100 == 0:
+        net.infer(test_image)    # run in inferencing mode
+    # ...
+``` 
+
+`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we should maintain two `batch_norm_op`s in the model: one whose `is_infer` is `True` (call it `infer_batch_norm_op`) and one whose `is_infer` is `False` (call it `train_batch_norm_op`). They share all parameters and variables but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it forks into two branches: one goes through `train_batch_norm_op` and the other goes through `infer_batch_norm_op`:
+
+<div align=center>
+<img src="../images/batch_norm_fork.png" width="500"/>
+</div>
+
+Just like what is shown in the graph above, the net forks before `batch_norm_op` and never merges again. All the operators after `batch_norm_op` are duplicated.
+
+When the net runs in training mode, the end of the left branch is set as the running target, so the dependency-tracking process automatically ignores the right branch. When the net runs in inferencing mode, the process is reversed.
+
+How to set a target is related to Python API design, so I will leave it here waiting for more discussions.
diff --git a/paddle/operators/op_documentation/name_convention.md b/paddle/operators/op_documentation/name_convention.md
new file mode 100644
index 0000000000..a02b356f05
--- /dev/null
+++ b/paddle/operators/op_documentation/name_convention.md
@@ -0,0 +1,65 @@
+## Operator's Parameter Name Convention
+
+To make the operator documentation clearer, we recommend that operator names obey the following conventions.
+
+### OpProtoMaker names
+
+When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Inputs/Outputs and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) and will be used in the client language to create the operator.
+
+- Input/Output.
+  - Input/Output names follow **CamelCase**, e.g. `X`, `Y`, `Matrix`, `LastAxisInMatrix`. Since Inputs/Outputs are much like Variables, we prefer meaningful English words.
+  - If an operator's Inputs/Outputs are tensors in math that do not match any meaningful words, input names should start from `X` (e.g. `X`, `Y`) and output names should start from `Out` (e.g. `Out`). This rule intends to unify operators which have few inputs/outputs.
+
+- Attribute.
+  - Attribute names follow **snake_case**, e.g. `x`, `y`, `axis`, `rowwise_matrix`. Also, attribute names prefer meaningful English words.
+
+- Comments.
+  - Input/Output/Attr comments follow the format of **(type, default value) usage**, describing which type it can be and how it will be used in the operator, e.g. the attribute `"gamma"` in Accumulator: `(float, default 1.0) Accumulation multiplier`.
+  - Operator comments follow the format of `R"DOC(your comment here)DOC"`. You should explain the input/output of the operator first. If there is a math calculation in this operator, you should write the equation in the comment, e.g. `Out = X + Y`.
+
+- Order.
+  - Follow the order of Input/Output, then Attribute, then Comments. See the example in the best practice below.
+
+### Best Practice
+
+Here we give some examples to show how these rules will be used.
+
+- The operator has one input and one output, e.g. `relu`: inputs: `X`; outputs: `Out`.
+
+- The operator has two inputs and one output, e.g. `rowwise_add`: inputs: `X`, `Y`; outputs: `Out`.
+
+- The operator contains an attribute, e.g. `cosine`: inputs: `X`; attributes: `axis`; outputs: `Out`.
+
+  We give a full example of the Accumulate operator.
+
+```c++
+class AccumulateOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  AccumulateOpMaker(OpProto *proto,
+                    OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor that has to be accumulated to the output tensor.
+    If the output size is not the same as input size,
+    the output tensor is first reshaped and initialized to zero, and only then, accumulation is done.");
+    AddOutput("Out", "(Tensor) Accumulated output tensor");
+    AddAttr<float>("gamma", "(float, default 1.0) Accumulation multiplier").SetDefault(1.0f);
+    AddComment(R"DOC(
+Accumulate Operator.
+
+This operator accumulates the input tensor to the output tensor. If the
+output tensor already has the right size, we add to it; otherwise, we first
+initialize the output tensor to all zeros, and then do accumulation. Any
+further calls to the operator, given that no one else fiddles with the output
+in the interim, will do simple accumulations.
+
+Accumulation is done as follows:
+
+Out = 1*X + gamma*Out
+
+where X is the input tensor, Out is the output tensor and gamma is the multiplier
+argument.
+
+)DOC");
+  }
+};
+```
diff --git a/paddle/operators/op_documentation/net_op_design.md b/paddle/operators/op_documentation/net_op_design.md
new file mode 100644
index 0000000000..a5f0483081
--- /dev/null
+++ b/paddle/operators/op_documentation/net_op_design.md
@@ -0,0 +1,250 @@
+# Network Design
+
+`Network` is the container and controller of a set of operators;
+a user can build a real network from a `NetDesc`, which is a protobuf message,
+and use `Network.Run()` to run all the operators in the network.
+
+A network object knows all Operators belonging to this network. Variables, 
+which are inputs and outputs of these operators, 
+are created and managed by a hierarchy of Scope objects.
+
+# API
+
+## Net
+To make the `Network` extendable, a base class is defined like this
+
+```c++
+// operator's index stored in a network.
+typedef int OpIndex;
+
+// The minimum interface a network should implement.
+class Net {
+ public:
+  // Run all the operators and return success (true) or not, with all the
+  // variables located in `scope`. `context` describes the detailed execution
+  // environment for ops. `begin` and `end` specify the range of `ops_` to run;
+  // if no positive indexes are provided, all operators in `ops_` will run.
+  virtual Error Run(Scope *scope, OpContext *context, OpIndex begin = -1,
+                    OpIndex end = -1) const = 0;
+
+  // Add an Operator according to `def`.
+  virtual OpIndex AddOp(const proto::OpDef &def) = 0;
+
+  // Add optimizer operators according to `attrs`.
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) = 0;
+
+  // Add backward operators.
+  virtual Error AddBackwardOps() = 0;
+
+  // Infer the shapes of variables required by operators in the network. The
+  // `scope` will be mutated according to the inferred shapes.
+  virtual Error InferShape(Scope *scope) = 0;
+
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+};
+```
+
+All network implementations should build networks from a protobuf message which
+describes the structure of a real network; the `Run` method should be implemented by
+all implementations to offer a universal way to run a network forward or backward.
+
+`Net::Create` is a factory method and can be implemented like
+
+```c++
+std::unique<Net> Net::Create(const NetDesc& def) {
+  switch (def.model_type()) {
+    case NN:
+      return new Network(def);
+    case Recursive:
+      return new RecursiveNet(def);
+    case Recurrent:
+      return new RecurrentNet(def);
+  }
+  return nullptr;
+}
+```
+
+A network is designed as a container of operators. To make it more extendable,
+we decouple it from the related variable resources.
+
+`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
+
+Finally, `Net` can be used as follows:
+
+```c++
+Scope default_scope;
+OpContext default_context;
+auto net = Net::Create(def);
+
+if (net) {
+  net->Run(&default_scope, &default_context);
+}
+```
+
+## `PlainNet` as a simple implementation of `Net`
+
+A very basic implementation is as follows. All it does is simply run every operator in sequence.
+
+```c++
+class PlainNet : public Net {
+ public:
+  // Create a network described by `def`. NetDesc is the definition of a network.
+  PlainNet(const NetDesc &def);
+
+  // Infer all the operators' input and output variables' shapes; will be
+  // called before every mini-batch training.
+  virtual Error InferShape(Scope *scope) override;
+
+  // Run all the operators with the `scope`; if no scope is provided, the
+  // default scope will be used instead. If no OpContext is provided, the
+  // default context will be used.
+  virtual Error Run(Scope *scope = nullptr, OpContext *context = nullptr,
+                    OpIndex begin = -1, OpIndex end = -1) const override;
+
+  virtual OpIndex AddOp(const proto::OpDef &def) override;
+
+  virtual Error AddOptimizerOps(const OptAttrs &attrs) override;
+
+  virtual Error AddBackwardOps() override;
+
+ protected:
+  // Create operators according to `def`; will be called by the constructor.
+  Error BuildNet(const NetDesc &def);
+
+  // Add an operator which is identified as `type` and has attributes described
+  // in `attrs`, the `inputs` are the keys of readonly input variables,
+  // `outputs` are keys of mutable output variables. An `OpIndex` will be
+  // returned to indicate the offset of the new operator in `ops_`.
+  OpIndex AddOp(const std::string &type, const std::vector<string> &inputs,
+                const std::vector<string> &outputs,
+                const OprAttr &attrs = OprAttr());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+```
+
+`PlainNet` owns its operators through the private member `ops_`;
+the operators are created when the network is built by `Net::Create`, and each individual operator is added by `AddOp`.
+
+
+## PlainNet Usage
+`PlainNet` can be used to define and run a network as follows
+
+```c++
+// create an empty scope located on CPU device.
+Scope scope(CPUPlace());
+
+// create and init variables described in `net_desc`.
+scope.CreateVariables(net_desc);
+scope.InitVariables(net_desc);
+
+// create a network according to `net_desc`
+auto net = Net::Create(net_desc);
+// Add more operators if needed.
+net->AddOp(add...);
+net->AddOp(fc...);
+
+net->AddBackwardOps();
+net->AddOptimizerOps();
+
+// run the network providing the `scope`.
+net->Run(&scope);
+```
+
+## `NetBuilder` as a C++ syntax wrapper
+This is a detailed description of the user-facing C++ network API, and may not be needed in the prototype development stage.
+
+The `NetBuilder` will give users a much simpler syntax, as follows, to create a network, and demonstrates how to use `Net`'s raw interfaces.
+
+```c++
+Variable* fc_out = builder.AddOp("fc", input=image, size=100, activation="Sigmoid");
+Variable* prediction = builder.AddOp("fc", input=fc_out, size=10, activation="Sigmoid");
+Variable* loss = builder.AddOp("cross_entropy", input=prediction, label=label);
+Variable* avg_loss = builder.AddOp("mean", loss);
+
+builder.BackwardFrom(avg_loss);
+builder.AddOptimization(1e-4, "adam");
+builder.Run();
+```
+
+`NetBuilder` calls `Net`'s virtual functions to change the real network structure. Here is a sample definition:
+
+```c++
+class NetBuilder final {
+ public:
+  NetBuilder(Net* net) : net_(net) {}
+
+  Variable* AddOp(const string& type, const vector<Variable>& inputs,
+                  size_t size, Activation act) {
+    // much code here.
+    // ...
+    net_->AddOp(def);
+    need_rebuild_net_ = true;
+    net_->InferShape();
+    // ...
+  }
+
+  Error BackwardFrom(const Variable& cost);
+
+  Error Run(Scope* scope, OpContext* context, bool need_backward = true) {
+    // backward.
+    if (need_backward) {
+      if (need_rebuild_net_) {
+        AddBackwardOps();
+        AddOptimizerOps();
+      }
+      return net_->Run(scope, context);
+    }
+    // just forward.
+    return net_->Run(scope, context, 0, last_forward_op_);
+  }
+
+ protected:
+  Error AddBackwardOps();
+  Error AddOptimizerOps();
+
+ private:
+  Net* net_;
+  OpIndex last_forward_op_{-1};
+  bool need_rebuild_net_{true};
+};
+```
+
+## Compatibility with RNN
+
+Benefiting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with a future RNN design;
+for example, we can implement a simple recurrent neural network as follows
+
+```c++
+// copy some `vars` from `source` to `target`
+void Copy(const Scope &source, Scope &target,
+          const std::vector<std::string> &vars);
+
+Scope default_scope;
+// some initial mutations on `default_scope` here.
+
+auto rnn_step_net = PlainNet(rnn_step_net_def);
+
+// Create rnn's states, the last scope is used to store rnn outputs.
+Scope *rnn_states = new Scope[num_states + 1];
+
+for (int i = 0; i < num_states + 1; i++) {
+  // Initialize all rnn state scopes, copy parameters and so on.
+  rnn_states[i].CreateVars(rnn_step_net_def);
+  Copy(default_scope, rnn_states[i], rnn_related_vars);
+  // Prepare rnn's inlinks, just copy inlink variables to each state.
+  Copy(default_scope, rnn_states[i], inlink_vars);
+}
+
+// Run the rnn.
+for (int i = 0; i < num_states; i++) {
+  rnn_step_net.Run(rnn_states[i]);
+  // Copy current state's state variables to next state, the related variables
+  // are named like "previous_state_xxx".
+  Copy(rnn_states[i], rnn_states[i + 1], pre_state_vars);
+}
+
+// Copy rnn's final outputs to `default_scope`.
+Copy(rnn_states[num_states], default_scope, outlink_vars);
+```
diff --git a/paddle/operators/op_documentation/op_markdown_format.md b/paddle/operators/op_documentation/op_markdown_format.md
new file mode 100644
index 0000000000..0ee804d592
--- /dev/null
+++ b/paddle/operators/op_documentation/op_markdown_format.md
@@ -0,0 +1,64 @@
+# Standard Markdown Format for Operators
+The following should be the standard format for documentation for all the operators that will get rendered in the `html`:
+
+```
+Operator Name (In PaddlePaddle)
+
+Operator Name (Standard)
+
+Operator description.
+
+LaTeX equation of how the operator performs an update.
+
+The signature of the operator.
+```
+
+Each section mentioned above has been covered in further detail in the rest of the document.
+
+# PaddlePaddle Operator Name
+This should be in all lowercase letters; in case of multiple words, we separate them with an underscore. For example:
+`array to lod tensor` should be written as `array_to_lod_tensor`.
+
+This naming convention should be standard across all PaddlePaddle operators.
+
+# Standard Operator Name
+This is the standard name of the operator as used in the community. The general standard is usually:
+- Standard abbreviations like `SGD` are written in all capital letters.
+- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
+- Keep numbers inside a word as is, with no boundary delimiters.
+- Follow the name of the operator with the keyword: `Activation Operator.`
+
+# Operator description
+This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section.
+
+# LaTeX equation
+This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same variable name should be separated by an underscore (`_`).
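+
+As a sketch of what such an equation could look like, the `sgd` update used in the example later in this document would be written in LaTeX as (assuming the naming convention above):
+
+```
+param\_out = param - learning\_rate \cdot grad
+```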
+
+# The signature
+This section describes the signature of the operator: a list of Inputs and Outputs, each of which has a small description of what the variable represents and the type of the variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is:
+`Section :
+VariableName : (VariableType) VariableDescription
+...
+...
+`
+
+
+The following example for an `sgd` operator covers the above mentioned sections as they would ideally look in the `html`:
+
+```
+sgd
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+param_out = param - learning_rate * grad
+
+Inputs:
+Param : (Tensor) Input parameter
+LearningRate : (Tensor) Learning rate of SGD
+Grad : (Tensor) Input gradient
+
+Outputs:
+ParamOut : (Tensor) Output parameter
+```
diff --git a/paddle/operators/op_documentation/rnn_design.md b/paddle/operators/op_documentation/rnn_design.md
new file mode 100644
index 0000000000..3d38b9a0ad
--- /dev/null
+++ b/paddle/operators/op_documentation/rnn_design.md
@@ -0,0 +1,239 @@
+# RNN Design for Variable-Length Input
+For learning over variable-length sequences, mainstream frameworks such as TensorFlow, PyTorch, Caffe2 and MXNet all use padding,
+i.e. sequences of different lengths within a mini-batch are zero-padded to a fixed length before computation.
+
+The existing RNN implementations in Paddle, including `RecurrentLayerGroup`, already support variable-length sequences without padding. Based on the ideas of that module, this document designs the variable-length sequence support for the refactored framework.
+
+## Background
+Since a tensor must have a definite shape, tensor-based mainstream frameworks must
+zero-pad variable-length sequences into tensors of fixed shape in order to store them.
+
+Padding is a compromise that frameworks make to implement variable-length sequences; from the user's perspective, the presence of padding is naturally a concern when using RNN models,
+which is why there are lengthy discussions in PyTorch about supporting variable-length sequences without padding [3].
+
+Since padding incurs extra memory and computation cost, TensorFlow and MXNet both use bucketing as an optimization [1][2],
+but whether padding or bucketing, both are an extra burden on the user.
+
+Therefore, **native support for variable-length sequences in Paddle directly satisfies the user's most immediate need, and can be counted as a major advantage among current mainstream platforms**.
+
+However, supporting variable-length sequences requires some modifications to the current framework. The following discusses how to support them with minimal changes.
+
+## Multi-Level Sequence Format `LODTensor`
+Currently Paddle stores the data of a mini-batch in one-dimensional memory,
+and additionally uses `Argument.sequenceStartPositions` to store the boundaries of each sentence.
+
+Paddle uses `Argument.subSequenceStartPositions` to store 2-level sequence information; higher-level sequences cannot be supported directly.
+
+To support the storage of `N-level` sequences, this document defines the sequence information as the following data structure:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+or, more explicitly,
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+Each `level_t` here stores the offsets of one granularity (level), consistent with Paddle's current practice.
+
+To pass sequence information more transparently, we introduce a new kind of tensor called `LODTensor`[4].
+Its tensor-related interfaces are all inherited directly from `Tensor`, with sequence-related interfaces added on top.
+Thus, when operating on a `LODTensor`, an ordinary `Op` can simply treat it as a `Tensor`,
+while a sequence-aware `Op` additionally uses the `LODTensor` interfaces for variable-length sequences.
+
+`LODTensor` is defined as follows:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ =
+        std::make_shared<std::vector<std::vector<int>>>(*other.lod_start_pos_);
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+Here `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying.
+`LODTensor` can be regarded as an extension of `Tensor`, almost fully compatible with the original `Tensor` usage.
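+
+To make the offsets concrete, here is a small illustration of the data structure above (an example, not framework code): a mini-batch of 3 sentences with lengths 4, 2 and 3 stored in one flattened dimension.
+
+```c++
+#include <vector>
+
+// Level 0 stores each sequence's start offset in the flattened data,
+// plus a trailing offset equal to the total length (4 + 2 + 3 = 9).
+std::vector<std::vector<int>> lod_start_pos = {{0, 4, 6, 9}};
+
+// A 2-level example: 2 "paragraphs" containing 2 and 1 sentences
+// respectively, over the same 3 sentences.
+std::vector<std::vector<int>> lod_2level = {{0, 2, 3}, {0, 4, 6, 9}};
+```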
+
+## Framework Support
+### Replacing the framework's existing `Tensor` usages with `LODTensor`
+To propagate `LODTensor`, many `Tensor`s in the framework need to become `LODTensor`s.
+A simple implementation is to **replace all existing `Tensor`s with `LODTensor`; for this we can directly modify the `Tensor` creation interface in `pybind.cc`**.
+
+In addition, users may need to be aware of sequences (e.g. visualizing a sequence requires parsing the sequences in the model's output), so some sequence-manipulation APIs also need to be exposed to the Python layer.
+
+### Passing `lod_start_pos` along the Op call chain
+The framework needs to support the following features to propagate `lod_start_pos`:
+
+1. Pass it by `shared_ptr`
+    - An op that does not modify `lod_start_pos` acts as a consumer
+    - An op that modifies `lod_start_pos` acts as a producer
+    - By convention, a consumer only copies the `shared_ptr` passed to it
+      - A producer creates its own independent memory to store its own modifications, and exposes a `shared_ptr` to subsequent consumers
+    - Since propagation is implemented by copying the `shared_ptr`, the framework only needs to pass `lod_start_pos` once
+
+2. It should be fully transparent to ops that are unaware of `lod_start_pos`
+3. A producer op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data during `Run`
+
+The concrete design is divided into the following three subsections.
+
+#### Passing `lod_start_pos`
+
+- When `lod_start_pos` does not need to be modified, call `LODTensor`'s `ShareConstLODFrom` interface to copy it
+- When it does, call the `ShareMutableLODFrom` interface to allocate separate memory for the modifications
+
+#### Framework transparency
+The propagation step needs to be added to the initialization performed before the network runs, and it only needs to happen once. A preliminary scheme based on the current framework design is as follows:
+
+- Add an attribute `do_mutate_lod_info` to the op's `attrs`, defaulting to `false`
+  - Ops that need to modify `lod_start_pos` set it to `true` when defining their `OpProto`
+- `OperatorBase::InferShape` reads `do_mutate_lod_info` and calls the corresponding `LODTensor` methods to copy `lod_start_pos`.
+- Add a member `is_lod_inited{false}` to `OperatorBase` to guarantee the propagation happens only once
+
+The logic looks roughly as follows:
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find an input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+In this way, the propagation of `lod_start_pos` is completely transparent to the implementation of ops that are not LOD-aware.
+
+#### Updating `lod_start_pos`
+As introduced in the previous subsection, for an op that needs to modify `lod_start_pos`, `OperatorBase` allocates a separate piece of memory to store the modifications.
+The op updates its own `lod_start_pos` in its `Run` implementation,
+and all ops that depend on its outputs automatically pick up the update through the shared pointer.
+
+## Sorting by Length
+After sorting by length, the batch size of successive time steps naturally decreases, so the steps can be fed directly into the Net for batched computation.
+
+For example, given the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there are 4 time steps, and the input of each time step is as follows (arranged vertically):
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+To track how sequences move before and after sorting, we use
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to track the positions of sequences after sorting, and add a new interface (a minimal sketch of the sorting logic follows the declaration below):
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
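+
+A minimal sketch of the sorting logic, assuming only the level-0 offset vector described earlier (the real interface would operate on a `LODTensor` and return `SortedSeqItem`s):
+
+```c++
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+// Return sequence indices ordered by descending length, given level-0 offsets.
+std::vector<int> SortIndicesBySeqLen(const std::vector<int> &offsets) {
+  std::vector<int> idx(offsets.size() - 1);
+  std::iota(idx.begin(), idx.end(), 0);
+  std::sort(idx.begin(), idx.end(), [&](int a, int b) {
+    return offsets[a + 1] - offsets[a] > offsets[b + 1] - offsets[b];
+  });
+  return idx;  // idx[k] is the original position of the k-th longest sequence
+}
+```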
+
+Since the order of input sequences changes, the following existing interfaces need corresponding modifications:
+
+- InitMemories: memories need to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+In addition, since `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp`,
+which is later passed in as an input of `RecurrentGradientOp`.
+
+## InitMemories
+Because the sequence order changes, the order of the elements along the batch dimension of `boot_memories` also needs to be rearranged accordingly.
+
+## SegmentInputs
+`SegmentInputs` relies on the information in `sorted_seqs` to split the original sequences, in their sorted order, horizontally into the inputs of each step.
+
+That is, the following transformation:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the outputs of each time step to the original input order (to prevent the order from being shuffled at inference time)
+- concatenate the sequences into a regular mini-batch representation
+
+## References
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
new file mode 100644
index 0000000000..90c53bd177
--- /dev/null
+++ b/paddle/operators/pad_op.cc
@@ -0,0 +1,140 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pad_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class PadOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of PadOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
+                      "Size of paddings should be equal to 2 * dimension size "
+                      "of input tensor.");
+    std::vector<int64_t> out_dims(x_dim.size());
+    for (int i = 0; i < x_dim.size(); ++i) {
+      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
+    if (out_dims[0] == x_dim[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class PadOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PadOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input of pad op. "
+             "The input should be a k-D tensor(k > 0 and k < 7)");
+    AddOutput("Out",
+              "The output of pad op. "
+              "A tensor with the same shape as X.");
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>) "
+        "A list<int> to describe the padding rules for each dimension. "
+        "For 2-D image tensor, paddings=[0, 1, 2, 3] means "
+        "padding 0 row to top, 1 row to bottom, 2 columns to left "
+        "and 3 columns to right. Size of paddings should be equal to "
+        "2 * dimension size of the input tensor.");
+    AddAttr<float>("pad_value",
+                   "(float, default 0.0) "
+                   "The value to fill the padded areas.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+Pad Operator.
+
+Pad input into output, as specified by paddings and pad_value. 
+The input should be a k-D tensor (k > 0 and k < 7). As an example:
+
+Given:
+
+X = [[1, 2],
+     [3, 4]],
+
+paddings = [0, 1, 1, 2],
+
+and
+
+pad_value = 0,
+
+we have:
+
+Out = [[0, 1, 2, 0, 0],
+       [0, 3, 4, 0, 0],
+       [0, 0, 0, 0, 0]]
+
+)DOC");
+  }
+};
+
+class PadOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+class PadOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* bind = new framework::OpDesc();
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("pad_grad");
+    return std::unique_ptr<framework::OpDesc>(bind);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
+REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu
new file mode 100644
index 0000000000..433b5f1112
--- /dev/null
+++ b/paddle/operators/pad_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/pad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    pad, ops::PadKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    pad_grad, ops::PadGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h
new file mode 100644
index 0000000000..fdf91a5776
--- /dev/null
+++ b/paddle/operators/pad_op.h
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T, size_t D>
+void PadFunction(const framework::ExecutionContext& context) {
+  auto pads = context.Attr<std::vector<int>>("paddings");
+  Eigen::array<std::pair<int, int>, D> paddings;
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    paddings[i].first = pads[i * 2];
+    paddings[i].second = pads[i * 2 + 1];
+  }
+  T pad_value = context.Attr<T>("pad_value");
+
+  auto* x = context.Input<Tensor>("X");
+  auto* out = context.Output<Tensor>("Out");
+  out->mutable_data<T>(context.GetPlace());
+
+  auto x_tensor = EigenTensor<T, D>::From(*x);
+  auto out_tensor = EigenTensor<T, D>::From(*out);
+  auto& place =
+      *context.template device_context<DeviceContext>().eigen_device();
+  out_tensor.device(place) = x_tensor.pad(paddings, pad_value);
+}
+
+template <typename DeviceContext, typename T>
+class PadKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    int rank = context.Input<Tensor>("X")->dims().size();
+    switch (rank) {
+      case 1:
+        PadFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        PadFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        PadFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        PadFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        PadFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        PadFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "PadOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+template <typename DeviceContext, typename T, size_t D>
+void PadGradFunction(const framework::ExecutionContext& context) {
+  auto pads = context.Attr<std::vector<int>>("paddings");
+  Eigen::array<std::pair<int, int>, D> paddings;
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    paddings[i].first = -pads[i * 2];
+    paddings[i].second = -pads[i * 2 + 1];
+  }
+  auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
+  if (d_x != nullptr) {
+    d_x->mutable_data<T>(context.GetPlace());
+    auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
+    auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
+  }
+}
+
+template <typename DeviceContext, typename T>
+class PadGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    size_t rank =
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
+    switch (rank) {
+      case 1:
+        PadGradFunction<DeviceContext, T, 1>(context);
+        break;
+      case 2:
+        PadGradFunction<DeviceContext, T, 2>(context);
+        break;
+      case 3:
+        PadGradFunction<DeviceContext, T, 3>(context);
+        break;
+      case 4:
+        PadGradFunction<DeviceContext, T, 4>(context);
+        break;
+      case 5:
+        PadGradFunction<DeviceContext, T, 5>(context);
+        break;
+      case 6:
+        PadGradFunction<DeviceContext, T, 6>(context);
+        break;
+      default:
+        PADDLE_THROW(
+            "PadOp only support tensors with no more than 6 dimensions.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/parallel_do_op.cc b/paddle/operators/parallel_do_op.cc
new file mode 100644
index 0000000000..67f9854c02
--- /dev/null
+++ b/paddle/operators/parallel_do_op.cc
@@ -0,0 +1,378 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/threadpool.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+static constexpr char kInputs[] = "inputs";
+static constexpr char kParameters[] = "parameters";
+static constexpr char kPlaces[] = "places";
+
+static constexpr char kOutputs[] = "outputs";
+static constexpr char kParallelScopes[] = "parallel_scopes";
+
+static constexpr char kParallelBlock[] = "sub_block";
+
+using LoDTensor = framework::LoDTensor;
+using SelectedRows = framework::SelectedRows;
+
+static void SplitTensorAndMoveTensorToScopes(
+    const framework::Scope &scope, std::vector<framework::Scope *> *sub_scopes,
+    const std::vector<platform::Place> &places,
+    const std::vector<std::string> &names) {
+  size_t num_sub_scopes = 0;
+  for (auto &argu : names) {
+    const auto &tensor =
+        detail::Ref(scope.FindVar(argu),
+                    "Cannot find variable %s in the parent scope", argu)
+            .Get<LoDTensor>();
+    auto lod_tensors = tensor.SplitLoDTensor(places);
+
+    for (auto &lod : lod_tensors) {
+      VLOG(3) << lod.dims();
+    }
+    if (num_sub_scopes == 0) {
+      num_sub_scopes = lod_tensors.size();
+    } else {
+      PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size());
+    }
+    PADDLE_ENFORCE_NE(num_sub_scopes, 0);
+    if (sub_scopes->size() == 0) {
+      sub_scopes->reserve(num_sub_scopes);
+      for (size_t i = 0; i < num_sub_scopes; ++i) {
+        sub_scopes->emplace_back(&scope.NewScope());
+      }
+    }
+
+    for (size_t i = 0; i < lod_tensors.size(); ++i) {
+      *detail::Ref(sub_scopes->at(i)->Var(argu),
+                   "Cannot find variable in the sub-scope", argu)
+           .GetMutable<LoDTensor>() = lod_tensors[i];
+    }
+  }
+}
+
+inline void CopyOrShare(const framework::Variable &src,
+                        const platform::Place &dst_place,
+                        framework::Variable *dst) {
+  if (src.IsType<LoDTensor>()) {
+    if (src.Get<LoDTensor>().place() == dst_place) {
+      dst->GetMutable<LoDTensor>()->ShareDataWith(src.Get<LoDTensor>());
+    } else {
+      Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
+    }
+  } else if (src.IsType<SelectedRows>()) {
+    auto &src_sr = src.Get<SelectedRows>();
+    auto *dst_sr = dst->GetMutable<SelectedRows>();
+    dst_sr->set_rows(src_sr.rows());
+    dst_sr->set_height(src_sr.height());
+    if (src_sr.value().place() == dst_place) {
+      dst_sr->mutable_value()->ShareDataWith(src_sr.value());
+    } else {
+      Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
+    }
+  } else {
+    PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
+  }
+}
+
+void WaitOnPlace(const platform::Place place) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto &dev_ctx = *pool.Get(place);
+  dev_ctx.Wait();
+}
+
+void WaitOnPlaces(const std::vector<platform::Place> places) {
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+
+  for (auto &place : places) {
+    auto &dev_ctx = *pool.Get(place);
+    dev_ctx.Wait();
+  }
+}
+
+class ParallelDoOp : public framework::OperatorBase {
+ public:
+  ParallelDoOp(const std::string &type,
+               const framework::VariableNameMap &inputs,
+               const framework::VariableNameMap &outputs,
+               const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
+    auto *program = block->Program();
+
+    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
+
+    auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
+                            ->GetMutable<std::vector<framework::Scope *>>();
+
+    // split input
+    SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places,
+                                     Inputs(kInputs));
+
+    // copy parameter
+    for (auto &param : Inputs(kParameters)) {
+      PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
+                     "Only support parameter type as LoDTensor");
+      auto &src = scope.FindVar(param)->Get<LoDTensor>();
+      for (size_t i = 0; i < sub_scopes.size(); ++i) {
+        auto &place = places[i];
+        auto *sub_scope = sub_scopes[i];
+        auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
+        framework::Copy(src, place, dst);
+      }
+    }
+    WaitOnPlaces(places);
+
+    std::vector<std::future<void>> workers;
+    workers.reserve(places.size());
+    for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) {
+      auto &place = places[place_idx];
+      auto *cur_scope = sub_scopes[place_idx];
+
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
+    }
+    for (auto &worker : workers) {
+      worker.wait();
+    }
+    WaitOnPlaces(places);
+
+    // merge output
+    for (auto &o_name : Outputs(kOutputs)) {
+      std::vector<const framework::LoDTensor *> lod_tensors;
+      lod_tensors.reserve(sub_scopes.size());
+      for (auto *sub_scope : sub_scopes) {
+        lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
+      }
+
+      auto *lod_tensor_to_be_merged =
+          scope.FindVar(o_name)->GetMutable<LoDTensor>();
+      lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
+    }
+    WaitOnPlaces(places);
+  }
+};
+
+class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInputs, "").AsDuplicable();
+    AddInput(kParameters, "").AsDuplicable();
+    AddInput(kPlaces, "");
+    AddOutput(kOutputs, "").AsDuplicable();
+    AddOutput(kParallelScopes, "");
+    AddAttr<framework::BlockDesc *>(kParallelBlock, "");
+    AddComment(R"DOC(
+ParallelDo Operator.
+)DOC");
+  }
+};
+
+class ParallelDoGradOp : public framework::OperatorBase {
+ public:
+  ParallelDoGradOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
+    auto *program = block->Program();
+
+    auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
+                           ->Get<std::vector<framework::Scope *>>();
+
+    auto &places = scope.FindVar(Input(kPlaces))->Get<platform::PlaceList>();
+
+    // feed output@grad
+    SplitTensorAndMoveTensorToScopes(
+        scope, const_cast<std::vector<framework::Scope *> *>(&sub_scopes),
+        places, Inputs(framework::GradVarName(kOutputs)));
+    WaitOnPlaces(places);
+
+    // exe run
+    std::vector<std::future<void>> workers;
+    for (size_t i = 0; i < sub_scopes.size(); ++i) {
+      auto &place = places[i];
+      auto *cur_scope = sub_scopes[i];
+
+      // execute
+      workers.emplace_back(framework::Async([program, cur_scope, place, block] {
+        framework::Executor executor(place);
+        executor.Run(*program, cur_scope, block->ID(),
+                     false /*create_local_scope*/);
+      }));
+    }
+    for (auto &worker : workers) {
+      worker.wait();
+    }
+    WaitOnPlaces(places);
+
+    AccumulateGrad(scope, place, sub_scopes, places);
+  }
+
+  void AccumulateGrad(const framework::Scope &scope,
+                      const platform::Place &place,
+                      const std::vector<framework::Scope *> &sub_scopes,
+                      const platform::PlaceList &places) const {
+    for (auto &s : Outputs(framework::GradVarName(kParameters))) {
+      std::string tmp_name;
+      auto *tmp = sub_scopes[0]->Var(&tmp_name);
+
+      for (size_t i = 1; i < sub_scopes.size(); ++i) {
+        CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp);
+        WaitOnPlace(places[0]);
+
+        auto sum_op = framework::OpRegistry::CreateOp(
+            "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}},
+            framework::AttributeMap{});
+        VLOG(3) << sum_op->DebugStringEx(sub_scopes[0]);
+        sum_op->Run(*sub_scopes[0], places[0]);
+        WaitOnPlace(places[0]);
+      }
+
+      CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s));
+    }
+    WaitOnPlaces(places);
+  }
+};
+
+std::ostream &operator<<(std::ostream &sout,
+                         const std::vector<std::string> &strs) {
+  std::copy(strs.begin(), strs.end(),
+            std::ostream_iterator<std::string>(sout, ","));
+  return sout;
+}
+
+class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual std::unique_ptr<framework::OpDesc> Apply() const {
+    auto *grad = new framework::OpDesc();
+    grad->SetType("parallel_do_grad");
+    for (auto &input_param : this->InputNames()) {
+      VLOG(3) << input_param;
+      grad->SetInput(input_param, this->Input(input_param));
+      if (input_param != kPlaces) {
+        grad->SetOutput(framework::GradVarName(input_param),
+                        this->InputGrad(input_param, false));
+      }
+    }
+    auto *g_block = this->grad_block_[0];
+
+    // All variable name that needed by gradient operators
+    std::unordered_set<std::string> all_inputs_in_grad_blocks;
+
+    for (size_t i = 0; i < g_block->OpSize(); ++i) {
+      auto *op = g_block->Op(i);
+      for (auto &var_name : op->InputArgumentNames()) {
+        all_inputs_in_grad_blocks.insert(var_name);
+      }
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      if (output_param == kParallelScopes) {
+        grad->SetInput(output_param, this->Output(output_param));
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->Output(output_param));
+      } else {
+        grad->SetInput(output_param, this->Output(output_param));
+        std::vector<std::string> og_names;
+        for (auto &og_name : this->OutputGrad(output_param)) {
+          if (all_inputs_in_grad_blocks.count(og_name) != 0) {
+            // there are some gradient operators who need the OG. So make this
+            // OG as an input of parallel.do
+            og_names.push_back(og_name);
+          }
+          // else, there is no operator who need the OG. Do not use this OG as
+          // an input
+        }
+        grad->SetInput(framework::GradVarName(output_param), og_names);
+      }
+    }
+    grad->SetAttrMap(this->Attrs());
+    grad->SetBlockAttr(kParallelBlock, *grad_block_[0]);
+
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
+class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    std::vector<std::string> input{kParameters, kInputs};
+    std::vector<std::string> output{kOutputs};
+
+    PADDLE_ENFORCE(ctx->HasInputs(kParameters));
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+    PADDLE_ENFORCE(ctx->HasInputs(kInputs));
+
+    for (auto &s : output) {
+      PADDLE_ENFORCE(ctx->HasInputs(s));
+    }
+
+    ctx->SetOutputsDim(framework::GradVarName(kParameters),
+                       ctx->GetInputsDim(kParameters));
+
+    auto i_dims = ctx->GetInputsDim(kInputs);
+    auto ig_names = ctx->Outputs(framework::GradVarName(kInputs));
+
+    for (size_t i = 0; i < ig_names.size(); ++i) {
+      auto &ig_name = ig_names[i];
+      if (ig_name == framework::kEmptyVarName) {
+        continue;
+      }
+
+      ctx->SetDims({ig_name}, {i_dims[i]});
+    }
+
+    if (ctx->HasInputs(kParameters)) {
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+      ctx->SetOutputsDim(framework::GradVarName(kParameters),
+                         ctx->GetInputsDim(kParameters));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
+                  paddle::operators::ParallelDoOpProtoMaker,
+                  paddle::operators::ParallelDoGradOpDescMaker);
+REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
+                  paddle::operators::ParallelDoGradOpShapeInference);
diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc
new file mode 100644
index 0000000000..446fb0819d
--- /dev/null
+++ b/paddle/operators/pool_cudnn_op.cu.cc
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/pool_op.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedPoolingDescriptor = platform::ScopedPoolingDescriptor;
+using DataLayout = platform::DataLayout;
+using PoolingMode = platform::PoolingMode;
+
+template <typename T>
+class PoolCUDNNOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+
+    const Tensor *input = ctx.Input<Tensor>("X");
+    Tensor *output = ctx.Output<Tensor>("Out");
+
+    const T *input_data = input->data<T>();
+    T *output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    if (ctx.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(input->dims()[i + 2]);
+      }
+    }
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedPoolingDescriptor pool_desc;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()));
+
+    PoolingMode pooling_mode;
+    if (pooling_type == "max") {
+      pooling_mode = PoolingMode::kMaximum;
+    } else {
+      pooling_mode = PoolingMode::kAverage;
+    }
+
+    cudnnPoolingDescriptor_t cudnn_pool_desc =
+        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
+
+    // ------------------- cudnn pool algorithm ---------------------
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+    T alpha = 1.0f, beta = 0.0f;
+
+    PADDLE_ENFORCE(platform::dynload::cudnnPoolingForward(
+        handle, cudnn_pool_desc, &alpha, cudnn_input_desc, input_data, &beta,
+        cudnn_output_desc, output_data));
+  }
+};
+
+template <typename T>
+class PoolCUDNNGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+
+    const Tensor *input = ctx.Input<Tensor>("X");
+    const Tensor *output = ctx.Input<Tensor>("Out");
+    const Tensor *output_grad =
+        ctx.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *input_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = ctx.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+
+    if (ctx.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(input->dims()[i + 2]);
+      }
+    }
+
+    const T *input_data = input->data<T>();
+    const T *output_data = output->data<T>();
+    const T *output_grad_data = output_grad->data<T>();
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedPoolingDescriptor pool_desc;
+    DataLayout layout;
+
+    if (strides.size() == 2U) {
+      layout = DataLayout::kNCHW;
+    } else {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
+        layout, framework::vectorize2int(input->dims()));
+    cudnnTensorDescriptor_t cudnn_output_desc = output_desc.descriptor<T>(
+        layout, framework::vectorize2int(output->dims()));
+
+    PoolingMode pooling_mode;
+    if (pooling_type == "max") {
+      pooling_mode = PoolingMode::kMaximum;
+    } else {
+      pooling_mode = PoolingMode::kAverage;
+    }
+
+    cudnnPoolingDescriptor_t cudnn_pool_desc =
+        pool_desc.descriptor(pooling_mode, ksize, paddings, strides);
+
+    // ------------------- cudnn pool algorithm ---------------------
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+    T alpha = 1.0f, beta = 0.0f;
+
+    if (input_grad) {
+      T *input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      // Because beta is zero, it is unnecessary to reset input_grad.
+
+      PADDLE_ENFORCE(platform::dynload::cudnnPoolingBackward(
+          handle, cudnn_pool_desc, &alpha, cudnn_output_desc, output_data,
+          cudnn_output_desc, output_grad_data, cudnn_input_desc, input_data,
+          &beta, cudnn_input_desc, input_grad_data));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_KERNEL(pool2d, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNOpKernel<float>,
+                   ops::PoolCUDNNOpKernel<double>);
+REGISTER_OP_KERNEL(pool2d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNGradOpKernel<float>,
+                   ops::PoolCUDNNGradOpKernel<double>);
+
+REGISTER_OP_KERNEL(pool3d, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNOpKernel<float>,
+                   ops::PoolCUDNNOpKernel<double>);
+REGISTER_OP_KERNEL(pool3d_grad, CUDNN, ::paddle::platform::CUDAPlace,
+                   ops::PoolCUDNNGradOpKernel<float>,
+                   ops::PoolCUDNNGradOpKernel<double>);
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
new file mode 100644
index 0000000000..b97333bb1a
--- /dev/null
+++ b/paddle/operators/pool_op.cc
@@ -0,0 +1,306 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
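+// A quick worked instance of the formula above (illustrative values only):
+// for input_size = 32, filter_size = 3, padding = 1 and stride = 2,
+// output_size = (32 - 3 + 2 * 1) / 2 + 1 = 16. The integer division
+// truncates, which matches floor semantics for non-negative operands.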
+
+void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "Out(Output) of Pooling should not be null.");
+
+  auto in_x_dims = ctx->GetInputDim("X");
+
+  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
+  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                 "Pooling intput should be 4-D or 5-D tensor.");
+
+  if (ctx->Attrs().Get<bool>("global_pooling")) {
+    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      paddings[i] = 0;
+      ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+  }
+
+  PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                 "Input size and pooling size should be consistent.");
+  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                    "Strides size and pooling size should be the same.");
+  PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                    "Paddings size and pooling size should be the same.");
+
+  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+  for (size_t i = 0; i < ksize.size(); ++i) {
+    output_shape.push_back(
+        OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+  }
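+  // Illustrative walk-through (a sketch, not normative): with X of shape
+  // {8, 16, 32, 32} (NCHW), ksize = {2, 2}, strides = {2, 2} and
+  // paddings = {0, 0}, each spatial dimension becomes
+  // (32 - 2 + 2 * 0) / 2 + 1 = 16, so Out has shape {8, 16, 16, 16}.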
+  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  ctx->ShareLoD("X", "Out");
+}
+
+framework::OpKernelType PoolOp::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
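+  // Choose the cuDNN library only when the attribute requests it, the op
+  // runs on a GPU place, and a cuDNN handle is actually available;
+  // otherwise fall back to the plain native library.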
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                 "Input(X@GRAD) should not be null.");
+  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+}
+
+framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
+    const framework::ExecutionContext &ctx) const {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (platform::is_gpu_place(ctx.GetPlace())) {
+    auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  framework::LibraryType library_;
+  if (use_cudnn) {
+    library_ = framework::LibraryType::kCUDNN;
+  } else {
+    library_ = framework::LibraryType::kPlain;
+  }
+
+  std::string data_format = ctx.Attr<std::string>("data_format");
+  framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
+  return framework::OpKernelType(
+      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
+      layout_, library_);
+}
+
+Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "X",
+      "(Tensor) The input tensor of pooling operator. "
+      "The format of input tensor is NCHW, where N is batch size, C is the "
+      "number of channels, H is the height of the feature, "
+      "and W is the width of the feature.");
+  AddOutput("Out",
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCHW, "
+            "where N is batch size, C is the number of channels, "
+            "H is the height of the feature, "
+            "and W is the width of the feature.");
+
+  AddAttr<std::string>("pooling_type",
+                       "(string), pooling type, can be \"max\" for max-pooling "
+                       "and \"avg\" for average-pooling.")
+      .InEnum({"max", "avg"});
+  AddAttr<std::vector<int>>("ksize",
+                            "(vector<int>) The pooling window "
+                            "size(height, width) of the pooling operator. "
+                            "If global_pooling = true, ksize and paddings will "
+                            "be ignored.");  // TODO(Chengduo): Add checker.
+                                             // (Currently,
+  // TypedAttrChecker doesn't support vector type.)
+  AddAttr<bool>("global_pooling",
+                "(bool, default false) Whether to use the global pooling. "
+                "If global_pooling = true, ksize and paddings will be ignored.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>("strides",
+                            "(vector<int>, default {1, 1}), strides(height, "
+                            "width) of pooling operator.")
+      .SetDefault({1, 1});
+  // TODO(Chengduo): Add checker. (Currently,
+  // TypedAttrChecker doesn't support vector type.)
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
+      "operator."
+      "If global_pooling = true, paddings and ksize will be ignored.")
+      .SetDefault({0, 0});
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+
+  AddComment(R"DOC(
+Pool2d Operator.
+
+The pooling2d operation calculates the output based on
+the input, pooling_type, ksize, strides, and paddings parameters.
+Input(X) and output(Out) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature, and W is the width of the feature.
+Parameters (ksize, strides, paddings) each contain two elements,
+which represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: $(N, C, H_{in}, W_{in})$
+  Output:
+       Out shape: $(N, C, H_{out}, W_{out})$
+  Where
+       $$ 
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       $$
+
+)DOC");
+}
+
+Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput("X",
+           "(Tensor) The input tensor of pooling operator. "
+           "The format of input tensor is NCDHW, where N is batch size, C is "
+           "the number of channels, and D, H and W is the depth, height and "
+           "width of "
+           "the feature, respectively.");
+  AddOutput("Out",
+            "(Tensor) The output tensor of pooling operator."
+            "The format of output tensor is also NCDHW, "
+            "where N is batch size, C is "
+            "the number of channels, and D, H and W is the depth, height and "
+            "width of the feature, respectively.");
+
+  AddAttr<std::string>("pooling_type",
+                       "(string) Pooling type, can be \"max\" for max-pooling "
+                       "and \"avg\" for average-pooling.")
+      .InEnum({"max", "avg"});
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "(vector<int>) The pooling window size(depth, height, "
+      "width) of pooling operator. "
+      "If global_pooling = true, ksize and paddings will "
+      "be ignored.");  // TODO(Chengduo): Add checker.
+                       // (Currently,
+  // TypedAttrChecker doesn't support vector type.)
+  AddAttr<bool>(
+      "global_pooling",
+      "(bool, default false) Whether to use the global pooling. "
+      "If global_pooling = true, ksize and paddings wille be ignored.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>(
+      "strides",
+      "(vector<int>, default {1,1,1}) Strides(depth, height, "
+      "width) of the pooling operator.")
+      .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker doesn't support vector type.)
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "(vector<int>, default {0,0,0}), paddings(depth, height, "
+      "width) of pooling operator. "
+      "If global_pooling = true, ksize and paddings will be ignored.")
+      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker doesn't support vector type.)
+
+  AddAttr<bool>(
+      "use_cudnn",
+      "(bool, default false) Only used in cudnn kernel, need install cudnn")
+      .SetDefault(false);
+  AddAttr<std::string>(
+      "data_format",
+      "(string, default NCHW) Only used in "
+      "An optional string from: \"NHWC\", \"NCHW\". "
+      "Defaults to \"NHWC\". Specify the data format of the output data, "
+      "the input will be transformed automatically. ")
+      .SetDefault("AnyLayout");
+  // TODO(dzhwinter): need to registered layout transform function
+
+  AddComment(R"DOC(
+Pool3d Operator.
+
+The pooling3d operation calculates the output based on
+the input, pooling_type, ksize, strides, and paddings parameters.
+Input(X) and output(Out) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively. Parameters (ksize, strides, paddings)
+each contain three elements, which represent depth, height and
+width, respectively. The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
+  Output:
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+  Where
+  $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+  $$
+
+)DOC");
+}
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    pool2d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool2d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>)
+
+REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    pool3d, ops::PoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    pool3d_grad, ops::PoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc
new file mode 100644
index 0000000000..39a9dfbf79
--- /dev/null
+++ b/paddle/operators/pool_op.cu.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    pool2d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool2d_grad,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
+
+REGISTER_OP_CUDA_KERNEL(
+    pool3d, ops::PoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    pool3d_grad,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::PoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
new file mode 100644
index 0000000000..d6ba5e298a
--- /dev/null
+++ b/paddle/operators/pool_op.h
@@ -0,0 +1,183 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(OpProto* proto, OpAttrChecker* op_checker);
+};
+
+template <typename DeviceContext, typename T>
+class PoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    switch (ksize.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool2dFunctor<
+              DeviceContext, paddle::operators::math::MaxPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
+
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool2dFunctor<
+              DeviceContext, paddle::operators::math::AvgPool<T>, T>
+              pool2d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          paddle::operators::math::Pool3dFunctor<
+              DeviceContext, paddle::operators::math::MaxPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::MaxPool<T> pool_process;
+          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
+        } else if (pooling_type == "avg") {
+          paddle::operators::math::Pool3dFunctor<
+              DeviceContext, paddle::operators::math::AvgPool<T>, T>
+              pool3d_forward;
+          paddle::operators::math::AvgPool<T> pool_process;
+          pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process,
+                         out);
+        }
+      } break;
+      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class PoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      paddle::operators::math::SetConstant<DeviceContext, T> set_constant;
+      set_constant(dev_ctx, in_x_grad, 0.0);
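+      // Zero-initialization matters here: the backward functors accumulate
+      // gradient contributions into overlapping pooling windows rather than
+      // overwriting them.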
+
+      switch (ksize.size()) {
+        case 2: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool2dGradFunctor<DeviceContext, T>
+                pool2d_backward;
+            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, in_x_grad);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool2dGradFunctor<
+                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool2d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, pool_process, in_x_grad);
+          }
+        } break;
+        case 3: {
+          if (pooling_type == "max") {
+            paddle::operators::math::MaxPool3dGradFunctor<DeviceContext, T>
+                pool3d_backward;
+            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, in_x_grad);
+          } else if (pooling_type == "avg") {
+            paddle::operators::math::Pool3dGradFunctor<
+                DeviceContext, paddle::operators::math::AvgPoolGrad<T>, T>
+                pool3d_backward;
+            paddle::operators::math::AvgPoolGrad<T> pool_process;
+            pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides,
+                            paddings, pool_process, in_x_grad);
+          }
+        } break;
+        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
new file mode 100644
index 0000000000..1d31d813af
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.cc
@@ -0,0 +1,291 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace paddle {
+namespace operators {
+
+inline int OutputSizeMaxPool(int input_size, int filter_size, int padding,
+                             int stride) {
+  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
+  return output_size;
+}
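+// Same formula as OutputSizePool in pool_op.cc; note the truncating integer
+// division, e.g. input_size = 7, filter_size = 2, padding = 0 and stride = 2
+// give (7 - 2 + 2 * 0) / 2 + 1 = 3 (illustrative values only).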
+
+class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mask"),
+                   "Output(Mask) of Pooling should not be null.");
+
+    auto in_x_dims = ctx->GetInputDim("X");
+
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling intput should be 4-D or 5-D tensor.");
+
+    if (ctx->Attrs().Get<bool>("global_pooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+      }
+    }
+
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and pooling size should be consistent.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "Strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "Paddings size and pooling size should be the same.");
+
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i],
+                                               paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool2dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCHW, where N is batch size, C is the "
+        "number of channels, H is the height of the image, "
+        "and W is the width of the image.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is "
+              "the number of channels, H is the height of the image "
+              "and W is the width of the image.");
+    AddOutput("Mask",
+              "(Tensor) The Mask tensor of pooling operator."
+              "The format of output tensor is also NCHW, "
+              "where N is batch size, C is the number of channels, "
+              "H is the height of the image, "
+              "and W is the width of the image. "
+              "It represents the index in the current feature map.");
+
+    AddAttr<std::vector<int>>("ksize",
+                              "(vector<int>) The pooling window size(height, "
+                              "width) of pooling operator. "
+                              "If global_pooling = true, ksize and paddings "
+                              "will be ignored.");  // TODO(Chengduo): Add
+                                                    // checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+    AddAttr<bool>(
+        "global_pooling",
+        "(bool, default:false) Whether to use the global pooling. "
+        "If global_pooling = true, ksize and paddings will be ignored.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int>, default {1, 1}), strides(height, "
+                              "width) of pooling operator.")
+        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
+        "operator. "
+        "If global_pooling = true, paddings and will be ignored.")
+        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+
+    AddComment(R"DOC(
+MaxPool2d Operator.
+
+The max pooling2d with index operation calculates the output and the mask
+based on the input, ksize, strides, and paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format, where N is batch size, C is the
+number of channels, H is the height of the feature,
+and W is the width of the feature.
+Parameters (ksize, strides, paddings) each contain two elements,
+which represent height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: $(N, C, H_{in}, W_{in})$
+  Output:
+       Out shape: $(N, C, H_{out}, W_{out})$
+       Mask shape: $(N, C, H_{out}, W_{out})$
+  Where
+       $$
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
+       $$
+
+)DOC");
+  }
+};
+
+class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool3dWithIndexOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input tensor of pooling operator. "
+             "The format of input tensor is NCDHW, where N is batch size, C is "
+             "the number of channels, and D, H and W are the depth, height and "
+             "width of "
+             "the image, respectively");
+    AddOutput("Out",
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, "
+              "and D, H and W are the depth, height and "
+              "width of the image, respectively.");
+    AddOutput("Mask",
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCDHW, "
+              "where N is the batch size, C is the number of channels, and "
+              "D, H and W are the depth, height and width "
+              "of the image, respectively. "
+              "It represents the index in the current feature map.");
+
+    AddAttr<std::vector<int>>("ksize",
+                              "(vector<int>) The pooling window size(depth, "
+                              "height, width) of pooling operator. "
+                              "If global_pooling = true, ksize and paddings "
+                              "will be ignored.");  // TODO(Chengduo): Add
+                                                    // checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+    AddAttr<bool>(
+        "global_pooling",
+        "(bool, default false) Whether to use the global pooling. "
+        "If global_pooling = true, ksize and paddings will be ignored.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "(vector<int>, default {1,1,1}), strides(depth, "
+                              "height, width) of pooling operator.")
+        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "(vector, default {0,0,0}), paddings(depth, "
+        "height, width) of pooling operator. "
+        "If global_pooling = true, paddings and ksize will be ignored.")
+        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+    // TypedAttrChecker doesn't support vector type.)
+
+    AddComment(R"DOC(
+MaxPool3d Operator.
+
+The max pooling3d with index operation calculates the output and the mask
+based on the input, ksize, strides, and paddings parameters.
+Input(X) and output(Out, Mask) are in NCDHW format, where N is batch
+size, C is the number of channels, and D, H and W are the depth, height and
+width of the feature, respectively.
+Parameters (ksize, strides, paddings) each contain three elements,
+which represent depth, height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
+  Output:
+       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
+       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
+  Where
+       $$
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
+       $$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
+                                int>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
+                                    int>)
+
+REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, float, int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUDeviceContext, double,
+                                int>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUDeviceContext, double,
+                                    int>)
diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc
new file mode 100644
index 0000000000..4c9804da63
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.cu.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
+                                int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
+                                int>);
+REGISTER_OP_CUDA_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
+                                    int>)
+
+REGISTER_OP_CUDA_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, float,
+                                int>,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CUDADeviceContext, double,
+                                int>);
+REGISTER_OP_CUDA_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, float,
+                                    int>,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CUDADeviceContext, double,
+                                    int>)
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
new file mode 100644
index 0000000000..4f4087d1dd
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.h
@@ -0,0 +1,110 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T1> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    Tensor* mask = context.Output<Tensor>("Mask");
+
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+
+    switch (ksize.size()) {
+      case 2: {
+        paddle::operators::math::MaxPool2dWithIndexFunctor<DeviceContext, T1,
+                                                           T2>
+            pool2d_forward;
+        pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+      } break;
+      case 3: {
+        paddle::operators::math::MaxPool3dWithIndexFunctor<DeviceContext, T1,
+                                                           T2>
+            pool3d_forward;
+        pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask);
+      } break;
+      default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T1, typename T2>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T1> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* mask = context.Input<Tensor>("Mask");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("global_pooling")) {
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        paddings[i] = 0;
+        ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
+      }
+    }
+
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T1>(context.GetPlace());
+      auto& device_ctx = context.template device_context<DeviceContext>();
+      math::set_constant(device_ctx, in_x_grad, 0);
+
+      switch (ksize.size()) {
+        case 2: {
+          paddle::operators::math::MaxPool2dWithIndexGradFunctor<DeviceContext,
+                                                                 T1, T2>
+              pool2d_backward;
+          pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides,
+                          paddings, in_x_grad);
+        } break;
+        case 3: {
+          paddle::operators::math::MaxPool3dWithIndexGradFunctor<DeviceContext,
+                                                                 T1, T2>
+              pool3d_backward;
+          pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides,
+                          paddings, in_x_grad);
+        } break;
+        default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc
new file mode 100644
index 0000000000..5aa5167dbb
--- /dev/null
+++ b/paddle/operators/positive_negative_pair_op.cc
@@ -0,0 +1,179 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/positive_negative_pair_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PositiveNegativePairOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput("Score"),
+        "Input(Score) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("Label"),
+        "Input(Label) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("QueryID"),
+        "Input(QueryID) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("PositivePair"),
+        "Output(PositivePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegativePair"),
+        "Output(NegativePair) of PositiveNegativePairOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NeutralPair"),
+        "Output(NeutralPair) of PositiveNegativePairOp should not be null.");
+    auto scalar_dim = framework::make_ddim({1});
+    if (ctx->HasInput("AccumulatePositivePair") ||
+        ctx->HasInput("AccumulateNegativePair") ||
+        ctx->HasInput("AccumulateNeutralPair")) {
+      PADDLE_ENFORCE(ctx->HasInput("AccumulatePositivePair") &&
+                         ctx->HasInput("AccumulateNegativePair") &&
+                         ctx->HasInput("AccumulateNeutralPair"),
+                     "All optional inputs(AccumulatePositivePair, "
+                     "AccumulateNegativePair, AccumulateNeutralPair) of "
+                     "PositiveNegativePairOp are required if one of them is "
+                     "specified.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulatePositivePair"), scalar_dim,
+                        "Shape of AccumulatePositivePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNegativePair"), scalar_dim,
+                        "Shape of AccumulateNegativePair should be {1}.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("AccumulateNeutralPair"), scalar_dim,
+                        "Shape of AccumulateNeutralPair should be {1}.");
+    }
+
+    auto score_dim = ctx->GetInputDim("Score");
+    auto label_dim = ctx->GetInputDim("Label");
+    auto query_dim = ctx->GetInputDim("QueryID");
+    PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(
+        label_dim[0], score_dim[0],
+        "Tensor Score and Label should have the same height (batch size).");
+    PADDLE_ENFORCE_EQ(label_dim[1], 1,
+                      "The width of Label should be 1, i.e. each item should "
+                      "have a scalar label.");
+    PADDLE_ENFORCE(query_dim == label_dim,
+                   "QueryID should have the same shape as Label.");
+    if (ctx->HasInput("Weight")) {
+      PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
+                     "Weight should have the same shape as Label.");
+    }
+    int column = ctx->Attrs().Get<int>("column");
+    auto depth = score_dim[1];
+    PADDLE_ENFORCE(column < depth && column >= -depth,
+                   "Attribute column should be in the range of [-%l, %l)",
+                   depth, depth);
+
+    ctx->SetOutputDim("PositivePair", scalar_dim);
+    ctx->SetOutputDim("NegativePair", scalar_dim);
+    ctx->SetOutputDim("NeutralPair", scalar_dim);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
+        ctx.device_context());
+  }
+};
+
+class PositiveNegativePairOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PositiveNegativePairOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Score",
+             "(Tensor, float) Model Score on an item (with "
+             "respect to QueryID). It's a 2-D tensor with shape [batch_size, "
+             "depth], where the column specified by the attribute \"column\" "
+             "is used as item score.");
+    AddInput("Label",
+             "(Tensor, float) Label of an item (with repsect to "
+             "QueryId). It's a 2-D tensor with shape [batch_size, 1].");
+    AddInput("QueryID",
+             "(Tensor, int64) Query ID that indicates the context. Its shape "
+             "should be the same as Label.");
+    AddInput(
+        "AccumulatePositivePair",
+        "(float) Optional. The accumulated number of positive pairs over a "
+        "stream of data. If provided, the output PositivePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput(
+        "AccumulateNegativePair",
+        "(float) Optional. The accumulated number of negative pairs over a "
+        "stream of data. If provided, the output NegativePair will be "
+        "initialized with this number rather than 0. it won't be modified "
+        "in place.")
+        .AsDispensable();
+    AddInput("AccumulateNeutralPair",
+             "(float) Optional. The accumulated number of neutral pairs over a "
+             "stream of data. If provided, the output NeutralPair will be "
+             "initialized with this number rather than 0. it won't be modified "
+             "in place.")
+        .AsDispensable();
+    AddInput("Weight",
+             "(float) Optional. Weight of current item. If specified, its "
+             "shape should be the same as Label, and the meaning of the output "
+             "changes from numbers of pairs to the total sum of pairs' "
+             "weights. Weight of a pair of items is the average of their "
+             "weights.")
+        .AsDispensable();
+    AddOutput("PositivePair",
+              "(float) Number of positive pairs, i.e. the pairs of "
+              "items that are ranked correctly.");
+    AddOutput("NegativePair",
+              "(float) Number of negative pairs, i.e. the pairs of "
+              "items that are ranked incorrectly.");
+    AddOutput("NeutralPair",
+              "(float) Number of neutral pairs, i.e. the pairs of items "
+              "that have the same score.")
+        .AsDispensable();
+    AddAttr<int>(
+        "column",
+        "(int, default -1) The column position of Score used to rank items in "
+        "descending order. It must be in the range of [-rank(Score), "
+        "rank(Score)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Noting that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+PositiveNegativePairOp can be used to evaluate a Learning To Rank (LTR)
+model's performance.
+
+Within some context, e.g. the "query", an LTR model generates scores for a
+list of items, which gives a partial order of the items. PositiveNegativePairOp
+takes a reference rank order (Input(Label)) and the model-generated
+scores (Input(Score)) as inputs and counts the pairs that are ranked correctly
+and incorrectly.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(positive_negative_pair,
+                             ops::PositiveNegativePairOp,
+                             ops::PositiveNegativePairOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    positive_negative_pair,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, float>,
+    ops::PositiveNegativePairKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h
new file mode 100644
index 0000000000..977e59b7d2
--- /dev/null
+++ b/paddle/operators/positive_negative_pair_op.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class PositiveNegativePairKernel : public framework::OpKernel<T> {
+ public:
+  struct PredictionResult {
+    PredictionResult(T score, T label, T weight)
+        : score(score), label(label), weight(weight) {}
+    T score;
+    T label;
+    T weight;
+  };
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto score_t = context.Input<Tensor>("Score");
+    auto label_t = context.Input<Tensor>("Label");
+    auto query_t = context.Input<Tensor>("QueryID");
+    auto acc_positive_t = context.Input<Tensor>("AccumulatePositivePair");
+    auto acc_negative_t = context.Input<Tensor>("AccumulateNegativePair");
+    auto acc_neutral_t = context.Input<Tensor>("AccumulateNeutralPair");
+    auto positive_t = context.Output<Tensor>("PositivePair");
+    auto negative_t = context.Output<Tensor>("NegativePair");
+    auto neutral_t = context.Output<Tensor>("NeutralPair");
+    auto weight_t = context.Input<Tensor>("Weight");
+
+    auto score = score_t->data<T>();
+    auto label = label_t->data<T>();
+    auto query = query_t->data<int64_t>();
+    const T* weight = nullptr;
+    if (weight_t != nullptr) {
+      weight = weight_t->data<T>();
+    }
+    T* positive = positive_t->mutable_data<T>(context.GetPlace());
+    T* negative = negative_t->mutable_data<T>(context.GetPlace());
+    T* neutral = neutral_t->mutable_data<T>(context.GetPlace());
+
+    auto score_dim = score_t->dims();
+    auto batch_size = score_dim[0];
+    auto width = score_dim[1];
+    auto column = context.Attr<int32_t>("column");
+    if (column < 0) {
+      column += width;
+    }
+
+    // construct document instances for each query: Query => List[<score#0,
+    // label#0, weight#0>, ...]
+    std::unordered_map<int64_t, std::vector<PredictionResult>> predictions;
+    for (auto i = 0; i < batch_size; ++i) {
+      if (predictions.find(query[i]) == predictions.end()) {
+        predictions.emplace(
+            std::make_pair(query[i], std::vector<PredictionResult>()));
+      }
+      predictions[query[i]].emplace_back(score[i * width + column], label[i],
+                                         weight_t != nullptr ? weight[i] : 1.0);
+    }
+
+    // for each query, accumulate pair counts
+    T pos = 0, neg = 0, neu = 0;
+    if (acc_positive_t != nullptr && acc_negative_t != nullptr &&
+        acc_neutral_t != nullptr) {
+      pos = acc_positive_t->data<T>()[0];
+      neg = acc_negative_t->data<T>()[0];
+      neu = acc_neutral_t->data<T>()[0];
+    }
+    auto evaluate_one_list = [&pos, &neg,
+                              &neu](std::vector<PredictionResult> vec) {
+      for (auto ite1 = vec.begin(); ite1 != vec.end(); ++ite1) {
+        for (auto ite2 = ite1 + 1; ite2 != vec.end(); ++ite2) {
+          if (ite1->label == ite2->label) {  // labels are equal, ignore.
+            continue;
+          }
+          T w = (ite1->weight + ite2->weight) * 0.5;
+          if (ite1->score == ite2->score) {
+            neu += w;
+          }
+          (ite1->score - ite2->score) * (ite1->label - ite2->label) > 0.0
+              ? pos += w
+              : neg += w;
+        }
+      }
+    };
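+    // Worked example (illustrative, unit weights): for one query with
+    // (score, label) pairs (0.8, 2), (0.5, 1) and (0.5, 0), the pairs with
+    // distinct labels are (0.8,2)-(0.5,1) -> positive, (0.8,2)-(0.5,0) ->
+    // positive and (0.5,1)-(0.5,0) -> neutral (equal scores), giving
+    // pos = 2, neg = 0 and neu = 1.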
+    for (const auto& prediction : predictions) {
+      evaluate_one_list(prediction.second);
+    }
+    *positive = pos;
+    *negative = neg;
+    *neutral = neu;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc
new file mode 100644
index 0000000000..f1598d53ca
--- /dev/null
+++ b/paddle/operators/precision_recall_op.cc
@@ -0,0 +1,182 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/precision_recall_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PrecisionRecallOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("MaxProbs"),
+                   "Input(MaxProbs) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchMetrics"),
+                   "Output(BatchMetrics) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("AccumMetrics"),
+                   "Output(AccumMetrics) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("AccumStatesInfo"),
+                   "Output(AccumStatesInfo) should not be null.");
+
+    int64_t cls_num =
+        static_cast<int64_t>(ctx->Attrs().Get<int>("class_number"));
+    auto max_probs_dims = ctx->GetInputDim("MaxProbs");
+    auto labels_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
+                      "Each instance contains one max probability, so the "
+                      "shape of Input(MaxProbs) should be [batch_size, 1].");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims,
+                      "The shape of Input(Indices) should be [batch_size, 1].");
+    PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(MaxProbs) and "
+                      "Input(Labels) both are batch_size and the shape should "
+                      "be the same.");
+    PADDLE_ENFORCE_EQ(labels_dims[1], 1,
+                      "The 2nd dimension of Input(Labels) contains instance "
+                      "label and the shape should be equal to 1.");
+    if (ctx->HasInput("Weights")) {
+      auto weights_dims = ctx->GetInputDim("Weights");
+      PADDLE_ENFORCE_EQ(weights_dims,
+                        framework::make_ddim({max_probs_dims[0], 1}),
+                        "The shape of Input(Weights) should be "
+                        "[batch_size, 1].");
+    }
+    if (ctx->HasInput("StatesInfo")) {
+      auto states_dims = ctx->GetInputDim("StatesInfo");
+      PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
+                        "The shape of Input(StatesInfo) should be "
+                        "[class_number, 4].");
+    }
+
+    // Layouts of BatchMetrics and AccumMetrics both are:
+    // [
+    //  macro average precision, macro average recall, macro average F1 score,
+    //  micro average precision, micro average recall, micro average F1 score
+    // ]
+    ctx->SetOutputDim("BatchMetrics", {6});
+    ctx->SetOutputDim("AccumMetrics", {6});
+    // Shape of AccumStatesInfo is [class_number, 4]
+    // The layout of each row is:
+    // [ TP, FP, TN, FN ]
+    ctx->SetOutputDim("AccumStatesInfo", {cls_num, 4});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
+        ctx.device_context());
+  }
+};
+
+class PrecisionRecallOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PrecisionRecallOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("MaxProbs",
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
+             "where N is the batch size. Each row contains the max probability "
+             "of an instance which computed by the previous top_k (k=1) "
+             "operator.");
+    AddInput("Indices",
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
+             "where N is the batch size. Each row contains the corresponding "
+             "index which computed by the previous top_k (k=1) operator.");
+    AddInput("Labels",
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape N x 1, "
+             "where N is the batch size. Each element is a label and the "
+             "value should be in [0, class_number - 1].");
+    AddInput("Weights",
+             "(Tensor, default Tensor<float>) A 2-D tensor with shape N x 1, "
+             "where N is the batch size. This input is optional. If provided, "
+             "weight of instance would be considered when computing metrics.")
+        .AsDispensable();
+    AddInput("StatesInfo",
+             "(Tensor, default Tensor<int>) A 2-D tensor with shape D x 4, "
+             "where D is the number of classes. This input is optional. If "
+             "provided, current state will be accumulated to this state and "
+             "the accumulation state will be the output state.")
+        .AsDispensable();
+    AddOutput("BatchMetrics",
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for current batch data. "
+              "The layout is [macro average precision, macro average recall, "
+              "macro f1 score, micro average precision, micro average recall, "
+              "micro f1 score].");
+    AddOutput("AccumMetrics",
+              "(Tensor, default Tensor<float>) A 1-D tensor with shape {6}. "
+              "This output tensor contains metrics for accumulated data. "
+              "The layout is [macro average precision, macro average recall, "
+              "macro f1 score, micro average precision, micro average recall, "
+              "micro f1 score].");
+    AddOutput("AccumStatesInfo",
+              "(Tensor, default Tensor<float>) A 2-D tensor with shape D x 4, "
+              "where D is equal to class number. This output tensor contains "
+              "accumulated state variables used to compute metrics. The layout "
+              "for each class is [true positives, false positives, "
+              "true negatives, false negatives].");
+    AddAttr<int>("class_number", "(int) Number of classes to be evaluated.");
+    AddComment(R"DOC(
+Precision Recall Operator.
+
+When given Input(Indices) and Input(Labels), this operator can be used
+to compute various metrics including:
+1. macro average precision
+2. macro average recall
+3. macro f1 score
+4. micro average precision
+5. micro average recall
+6. micro f1 score
+
+To compute the above metrics, we need to count true positives,
+false positives and false negatives. The count of true negatives is not
+strictly necessary, but it may be useful later and is trivial to obtain,
+so the operator provides it as well.
+
+We define the state as a 2-D tensor with shape [class_number, 4]. Each row of
+the state contains the statistic variables of the corresponding class. The
+layout of each row is: TP (true positives), FP (false positives), TN (true
+negatives), FN (false negatives). If Input(Weights) is provided, TP, FP, TN
+and FN will be accumulated by the given weights instead of by instance counts.
+
+This operator also supports computing metrics across batches. To achieve
+this, Input(StatesInfo) should be provided. The state of the current batch
+data will be accumulated onto Input(StatesInfo), and Output(AccumStatesInfo)
+holds the accumulated state.
+
+Output(BatchMetrics) contains the metrics of the current batch data, while
+Output(AccumMetrics) contains the metrics of the accumulated data.
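+
+As a small worked illustration (the numbers are made up for exposition, not
+taken from the source): with class_number = 2 and accumulated states
+[[TP=2, FP=1, TN=3, FN=0], [TP=1, FP=0, TN=4, FN=1]], macro average precision
+averages the per-class precisions, (2/3 + 1/1) / 2 = 0.8333, while micro
+average precision pools the counts first, (2+1) / ((2+1) + (1+0)) = 0.75.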
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(precision_recall, ops::PrecisionRecallOp,
+                             ops::PrecisionRecallOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    precision_recall,
+    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, float>,
+    ops::PrecisionRecallKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h
new file mode 100644
index 0000000000..c0d55405a3
--- /dev/null
+++ b/paddle/operators/precision_recall_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+enum StateVariable { TP = 0, FP, TN, FN };
+
+template <typename DeviceContext, typename T>
+class PrecisionRecallKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in0 = ctx.Input<Tensor>("Indices");
+    auto* in1 = ctx.Input<Tensor>("Labels");
+    auto* in2 = ctx.Input<Tensor>("Weights");
+    auto* in3 = ctx.Input<Tensor>("StatesInfo");
+    auto* out0 = ctx.Output<Tensor>("BatchMetrics");
+    auto* out1 = ctx.Output<Tensor>("AccumMetrics");
+    auto* out2 = ctx.Output<Tensor>("AccumStatesInfo");
+
+    const int* ids_data = in0->data<int>();
+    const int* labels_data = in1->data<int>();
+    size_t cls_num = static_cast<size_t>(ctx.Attr<int>("class_number"));
+    const T* weights_data = in2 ? in2->data<T>() : nullptr;
+    const T* states_data = in3 ? in3->data<T>() : nullptr;
+    double* batch_metrics_data = out0->mutable_data<double>(ctx.GetPlace());
+    double* accum_metrics_data = out1->mutable_data<double>(ctx.GetPlace());
+    out2->mutable_data<T>(ctx.GetPlace());
+    auto accum_states = EigenMatrix<T>::From(*out2);
+    accum_states.setZero();
+    T* accum_states_data = out2->data<T>();
+
+    size_t sample_num = in0->dims()[0];
+    size_t state_var_num = 4;  // TP FP TN FN
+
+    // get states info for current batch
+    for (size_t i = 0; i < sample_num; ++i) {
+      // Read indices as int so that negative values are caught by the
+      // checks below instead of wrapping around in an unsigned type.
+      int idx = ids_data[i];
+      int label = labels_data[i];
+
+      PADDLE_ENFORCE(idx >= 0 && static_cast<size_t>(idx) < cls_num,
+                     "Class index of each instance should be in "
+                     "[0, class_number).");
+      PADDLE_ENFORCE(label >= 0 && static_cast<size_t>(label) < cls_num,
+                     "Label of each instance should be in [0, class_number).");
+
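+      // A correct prediction (idx == label) yields a TP for that class and a
+      // TN for every other class. A wrong prediction yields an FN for the
+      // true class, an FP for the predicted class, and a TN for all remaining
+      // classes. Below this is implemented by first adding w to TN of every
+      // class and then subtracting it from the classes that received a
+      // TP/FP/FN instead.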
+      T w = weights_data ? weights_data[i] : 1.0;
+      if (idx == label) {
+        accum_states_data[idx * state_var_num + TP] += w;
+        for (size_t j = 0; j < cls_num; ++j) {
+          accum_states_data[j * state_var_num + TN] += w;
+        }
+        accum_states_data[idx * state_var_num + TN] -= w;
+      } else {
+        accum_states_data[label * state_var_num + FN] += w;
+        accum_states_data[idx * state_var_num + FP] += w;
+        for (size_t j = 0; j < cls_num; ++j) {
+          accum_states_data[j * state_var_num + TN] += w;
+        }
+        accum_states_data[idx * state_var_num + TN] -= w;
+        accum_states_data[label * state_var_num + TN] -= w;
+      }
+    }
+
+    ComputeMetrics(accum_states_data, batch_metrics_data, state_var_num,
+                   cls_num);
+
+    if (states_data) {
+      for (size_t i = 0; i < cls_num; ++i) {
+        for (size_t j = 0; j < state_var_num; ++j) {
+          size_t idx = i * state_var_num + j;
+          accum_states_data[idx] += states_data[idx];
+        }
+      }
+    }
+
+    ComputeMetrics(accum_states_data, accum_metrics_data, state_var_num,
+                   cls_num);
+  }
+
+  // Exposed as static helpers so they can be reused elsewhere.
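+  // Convention: when a class has no predicted instances (TP + FP == 0) or no
+  // actual instances (TP + FN == 0), precision (respectively recall) falls
+  // back to 1.0, so empty classes do not drag down the macro averages.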
+  static inline T CalcPrecision(T tp_count, T fp_count) {
+    if (tp_count > 0.0 || fp_count > 0.0) {
+      return tp_count / (tp_count + fp_count);
+    }
+    return 1.0;
+  }
+
+  static inline T CalcRecall(T tp_count, T fn_count) {
+    if (tp_count > 0.0 || fn_count > 0.0) {
+      return tp_count / (tp_count + fn_count);
+    }
+    return 1.0;
+  }
+
+  static inline T CalcF1Score(T precision, T recall) {
+    if (precision > 0.0 || recall > 0.0) {
+      return 2 * precision * recall / (precision + recall);
+    }
+    return 0.0;
+  }
+
+ protected:
+  void ComputeMetrics(const T* states_data, double* metrics_data,
+                      size_t state_var_num, size_t cls_num) const {
+    T total_tp_count = 0;
+    T total_fp_count = 0;
+    T total_fn_count = 0;
+    T macro_avg_precision = 0.0;
+    T macro_avg_recall = 0.0;
+
+    for (size_t i = 0; i < cls_num; ++i) {
+      T tp_count = states_data[i * state_var_num + TP];
+      T fp_count = states_data[i * state_var_num + FP];
+      T fn_count = states_data[i * state_var_num + FN];
+      total_tp_count += tp_count;
+      total_fp_count += fp_count;
+      total_fn_count += fn_count;
+      macro_avg_precision += CalcPrecision(tp_count, fp_count);
+      macro_avg_recall += CalcRecall(tp_count, fn_count);
+    }
+    macro_avg_precision /= cls_num;
+    macro_avg_recall /= cls_num;
+    T macro_f1_score = CalcF1Score(macro_avg_precision, macro_avg_recall);
+
+    T micro_avg_precision = CalcPrecision(total_tp_count, total_fp_count);
+    T micro_avg_recall = CalcRecall(total_tp_count, total_fn_count);
+    T micro_f1_score = CalcF1Score(micro_avg_precision, micro_avg_recall);
+
+    // fill metrics data
+    metrics_data[0] = macro_avg_precision;
+    metrics_data[1] = macro_avg_recall;
+    metrics_data[2] = macro_f1_score;
+    metrics_data[3] = micro_avg_precision;
+    metrics_data[4] = micro_avg_recall;
+    metrics_data[5] = micro_f1_score;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
new file mode 100644
index 0000000000..ddc21a6570
--- /dev/null
+++ b/paddle/operators/prelu_op.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/prelu_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PReluOp : public framework::OperatorWithKernel {
+ public:
+  PReluOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
+    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
+                   "Size of weight Alpha must be one.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PReluOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of prelu operator.");
+    AddInput("Alpha", "The alpha weight of prelu operator.");
+    AddOutput("Out", "The output tensor of prelu operator.");
+    AddComment(R"DOC(
+PRelu Operator.
+
+The equation is:
+
+$$
+f(x) =
+\begin{cases}
+\alpha * x, \quad  \text{if} \ x < 0 \\
+x,         \qquad  \text{if} \ x \geq 0
+\end{cases}
+$$
+
+The input `X` may or may not carry LoD (Level of Details) information.
+The output shares the LoD information with the input `X`.
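+
+For example (illustrative values): with $\alpha = 0.25$, $f(-2.0) = -0.5$ and
+$f(3.0) = 3.0$.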
+
+)DOC");
+  }
+};
+
+// The operator to calculate gradients of a prelu operator.
+class PReluGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->SetOutputDim(framework::GradVarName("Alpha"),
+                      ctx->GetInputDim("Alpha"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad,
+            ops::PReluGradOp);
+REGISTER_OP_CPU_KERNEL(
+    prelu, ops::PReluKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    prelu_grad,
+    ops::PReluGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu
new file mode 100644
index 0000000000..1718bb5cd6
--- /dev/null
+++ b/paddle/operators/prelu_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/prelu_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    prelu,
+    paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(prelu_grad,
+                        paddle::operators::PReluGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h
new file mode 100644
index 0000000000..56f9a553ec
--- /dev/null
+++ b/paddle/operators/prelu_op.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using platform::Transform;
+
+template <typename T>
+class PReluFunctor {
+ public:
+  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
+
+  HOSTDEVICE T operator()(const T& x) const {
+    if (x > 0)
+      return x;
+    else
+      return x * (*alpha_);
+  }
+
+ private:
+  const T* alpha_;
+};
+
+template <typename DeviceContext, typename T>
+class PReluKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* out = context.Output<Tensor>("Out");
+
+    const T* x_ptr = x->data<T>();
+    T* o_ptr = out->mutable_data<T>(context.GetPlace());
+
+    auto* alpha_ptr = alpha->data<T>();
+
+    int numel = x->numel();
+
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), x_ptr,
+          x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
+  }
+};
+
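+// Note: PReluGradFunctor below derives dx from the sign of `out` rather than
+// `x`. Since out = alpha * x for x < 0, out > 0 coincides with x > 0 whenever
+// alpha > 0. The gradient w.r.t. Alpha is not computed here (see the TODO in
+// PReluGradKernel).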
+template <typename T>
+class PReluGradFunctor {
+ public:
+  explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
+
+  HOSTDEVICE T operator()(const T& out, const T& dout) const {
+    if (out > 0)
+      return dout;
+    else
+      return dout * (*alpha_);
+  }
+
+ private:
+  const T* alpha_;
+};
+
+template <typename DeviceContext, typename T>
+class PReluGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+
+    auto* out = context.Input<Tensor>("Out");
+    auto* alpha = context.Input<Tensor>("Alpha");
+    auto* alpha_ptr = alpha->data<T>();
+
+    T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+    const T* dout_ptr = dout->data<T>();
+    const T* out_ptr = out->data<T>();
+    int numel = dx->numel();
+
+    Transform<DeviceContext> trans;
+    trans(context.template device_context<DeviceContext>(), out_ptr,
+          out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
+
+    // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/print_op.cc b/paddle/operators/print_op.cc
new file mode 100644
index 0000000000..8b233d64c9
--- /dev/null
+++ b/paddle/operators/print_op.cc
@@ -0,0 +1,283 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <algorithm>
+#include <ctime>
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace operators {
+
+#define CLOG std::cout
+
+const std::string kForward = "FORWARD";
+const std::string kBackward = "BACKWARD";
+const std::string kBoth = "BOTH";
+
+struct Formater {
+  std::string message;
+  std::string name;
+  std::vector<int> dims;
+  std::type_index dtype{typeid(char)};
+  framework::LoD lod;
+  int summarize;
+  void* data{nullptr};
+
+  void operator()(size_t size) {
+    PrintMessage();
+    PrintName();
+    PrintDims();
+    PrintDtype();
+    PrintLod();
+    PrintData(size);
+  }
+
+ private:
+  void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; }
+  void PrintName() {
+    if (!name.empty()) {
+      CLOG << "Tensor[" << name << "]" << std::endl;
+    }
+  }
+  void PrintDims() {
+    if (!dims.empty()) {
+      CLOG << "\tshape: [";
+      for (auto i : dims) {
+        CLOG << i << ",";
+      }
+      CLOG << "]" << std::endl;
+    }
+  }
+  void PrintDtype() {
+    if (dtype.hash_code() != typeid(char).hash_code()) {
+      CLOG << "\tdtype: " << dtype.name() << std::endl;
+    }
+  }
+  void PrintLod() {
+    if (!lod.empty()) {
+      CLOG << "\tLoD: [";
+      for (auto level : lod) {
+        CLOG << "[ ";
+        for (auto i : level) {
+          CLOG << i << ",";
+        }
+        CLOG << " ]";
+      }
+      CLOG << "]" << std::endl;
+    }
+  }
+
+  void PrintData(size_t size) {
+    PADDLE_ENFORCE_NOT_NULL(data);
+    // print float
+    if (dtype.hash_code() == typeid(float).hash_code()) {
+      Display<float>(size);
+    }
+    if (dtype.hash_code() == typeid(double).hash_code()) {
+      Display<double>(size);
+    }
+    if (dtype.hash_code() == typeid(int).hash_code()) {
+      Display<int>(size);
+    }
+    if (dtype.hash_code() == typeid(int64_t).hash_code()) {
+      Display<int64_t>(size);
+    }
+  }
+
+  template <typename T>
+  void Display(size_t size) {
+    auto* d = reinterpret_cast<T*>(data);
+    CLOG << "\tdata: ";
+    if (summarize != -1) {
+      summarize = std::min(size, static_cast<size_t>(summarize));
+      for (int i = 0; i < summarize; i++) {
+        CLOG << d[i] << ",";
+      }
+    } else {
+      for (size_t i = 0; i < size; i++) {
+        CLOG << d[i] << ",";
+      }
+    }
+    CLOG << std::endl;
+  }
+};
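+// A rough illustration of what one invocation prints (timestamp, tensor name
+// and values are made up; dtype names are implementation-defined, e.g. "f"
+// for float with GCC):
+//   1500000000  test  Tensor[fc_0.tmp_0]
+//     shape: [2,3,]
+//     dtype: f
+//     data: 0.1,0.2,0.3,0.4,0.5,0.6,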
+
+// TODO(ChunweiYan) there should be some other printers for TensorArray
+class TensorPrintOp : public framework::OperatorBase {
+ public:
+  TensorPrintOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  TensorPrintOp(const TensorPrintOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented.");
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    const framework::Variable* in_var_ptr = nullptr;
+    std::string phase = kForward;
+    std::string printed_var_name = "";
+
+    auto& inputs = Inputs();
+    if (inputs.find("In") != inputs.end() && !Inputs("In").empty()) {
+      in_var_ptr = scope.FindVar(Input("In"));
+      printed_var_name = Inputs("In").front();
+    } else if (inputs.find("In@GRAD") != inputs.end() &&
+               !Inputs("In@GRAD").empty()) {
+      in_var_ptr = scope.FindVar(Input("In@GRAD"));
+      printed_var_name = Inputs("In@GRAD").front();
+      phase = kBackward;
+    } else {
+      PADDLE_THROW("Unknown phase, should be forward or backward.");
+    }
+
+    PADDLE_ENFORCE_NOT_NULL(in_var_ptr);
+
+    auto& in_tensor = in_var_ptr->Get<framework::LoDTensor>();
+    auto* out_var_ptr = scope.FindVar(Output("Out"));
+    auto& out_tensor = *out_var_ptr->GetMutable<framework::LoDTensor>();
+
+    // No real copy happens here: the output tensor shares its memory and
+    // LoD with the input tensor.
+    out_tensor.ShareDataWith(in_tensor);
+    out_tensor.set_lod(in_tensor.lod());
+
+    std::string print_phase = Attr<std::string>("print_phase");
+    if (print_phase != phase && print_phase != kBoth) {
+      return;
+    }
+
+    int first_n = Attr<int>("first_n");
+    if (first_n > 0 && ++times_ > first_n) return;
+
+    framework::LoDTensor printed_tensor;
+    printed_tensor.set_lod(in_tensor.lod());
+    printed_tensor.Resize(in_tensor.dims());
+
+    if (platform::is_cpu_place(in_tensor.place())) {
+      printed_tensor.ShareDataWith(in_tensor);
+    } else {
+      // copy data to cpu to print
+      platform::CPUPlace place;
+      framework::Copy(in_tensor, place, &printed_tensor);
+    }
+
+    Formater formater;
+    if (Attr<bool>("print_tensor_name")) {
+      formater.name = printed_var_name;
+    }
+    if (Attr<bool>("print_tensor_type")) {
+      formater.dtype = printed_tensor.type();
+    }
+    if (Attr<bool>("print_tensor_shape")) {
+      auto& dims = printed_tensor.dims();
+      formater.dims.resize(dims.size());
+      for (int i = 0; i < dims.size(); ++i) formater.dims[i] = dims[i];
+    }
+    if (Attr<bool>("print_tensor_lod")) {
+      formater.lod = printed_tensor.lod();
+    }
+    formater.summarize = Attr<int>("summarize");
+    formater.data = (void*)printed_tensor.data<void>();
+    formater(printed_tensor.numel());
+  }
+
+ private:
+  mutable int times_{0};
+};
+
+class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("In", "Input tensor to be displayed.");
+    AddAttr<int>("first_n", "Only log `first_n` number of times.");
+    AddAttr<std::string>("message", "A string message to print as a prefix.");
+    AddAttr<int>("summarize", "Number of elements printed.");
+    AddAttr<bool>("print_tensor_name", "Whether to print the tensor name.");
+    AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype.");
+    AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape.");
+    AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod.");
+    AddAttr<std::string>(
+        "print_phase",
+        "(string, default 'BOTH') Which phase to display including 'FORWARD' "
+        "'BACKWARD' and 'BOTH'.")
+        .SetDefault(kBoth)
+        .InEnum({kForward, kBackward, kBoth});
+    AddOutput("Out", "Output tensor with same data as input tensor.");
+    AddComment(R"DOC(
+Creates a print op that will print when a tensor is accessed.
+
+Wraps the tensor passed in so that whenever the tensor is accessed,
+the message `message` is printed, along with the current value of the
+tensor `t`.)DOC");
+  }
+};
+
+class InferShapeForward : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("In"), "Input(In) should not be null.");
+    context->ShareLoD("In", /*->*/ "Out");
+    context->SetOutputDim("Out", context->GetInputDim("In"));
+  }
+};
+
+class InferShapeBackward : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("In@GRAD"),
+                   "Input(In@GRAD) should not be null.");
+    context->ShareLoD("In@GRAD", /*->*/ "Out");
+    context->SetOutputDim("Out", context->GetInputDim("In@GRAD"));
+  }
+};
+
+class InferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {}
+};
+
+class PrintOpProtoAndCheckGradOpMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType("print_grad");
+    op_desc_ptr->SetInput("In@GRAD", OutputGrad("Out"));
+    op_desc_ptr->SetOutput("Out", InputGrad("In"));
+    op_desc_ptr->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(print, ops::TensorPrintOp, ops::PrintOpProtoAndCheckMaker,
+                  ops::PrintOpProtoAndCheckGradOpMaker, ops::InferShapeForward,
+                  ops::InferVarType);
+REGISTER_OPERATOR(print_grad, ops::TensorPrintOp, ops::InferShapeBackward);
diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
new file mode 100644
index 0000000000..105ff4ac3e
--- /dev/null
+++ b/paddle/operators/prior_box_op.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/prior_box_op.h"
+
+namespace paddle {
+namespace operators {
+
+class PriorBoxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of PriorBoxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Image"),
+                   "Input(Image) of PriorBoxOp should not be null.");
+
+    auto image_dims = ctx->GetInputDim("Image");
+    auto input_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE(image_dims.size() == 4, "The layout of Image must be NCHW.");
+    PADDLE_ENFORCE(input_dims.size() == 4, "The layout of Input must be NCHW.");
+
+    PADDLE_ENFORCE_LT(input_dims[2], image_dims[2],
+                      "The height of Input must be smaller than that of "
+                      "Image.");
+
+    PADDLE_ENFORCE_LT(input_dims[3], image_dims[3],
+                      "The width of Input must be smaller than that of "
+                      "Image.");
+
+    auto min_sizes = ctx->Attrs().Get<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx->Attrs().Get<std::vector<int>>("max_sizes");
+    auto variances = ctx->Attrs().Get<std::vector<float>>("variances");
+    auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
+    bool flip = ctx->Attrs().Get<bool>("flip");
+
+    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                      "Size of min_sizes must be at least 1.");
+    for (size_t i = 0; i < min_sizes.size(); ++i) {
+      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
+    }
+
+    std::vector<float> aspect_ratios_vec;
+    ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
+
+    int num_priors = aspect_ratios_vec.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      PADDLE_ENFORCE_EQ(max_sizes.size(), min_sizes.size(),
+                        "The number of min_size and max_size must be equal.");
+      for (size_t i = 0; i < min_sizes.size(); ++i) {
+        PADDLE_ENFORCE_GT(max_sizes[i], min_sizes[i],
+                          "max_size[%d] must be greater than min_size[%d].", i,
+                          i);
+        num_priors += 1;
+      }
+    }
+
+    PADDLE_ENFORCE_EQ(variances.size(), 4,
+                      "Exactly 4 variance values must be provided.");
+    for (size_t i = 0; i < variances.size(); ++i) {
+      PADDLE_ENFORCE_GT(variances[i], 0.0,
+                        "variance[%d] must be greater than 0.", i);
+    }
+
+    const float step_h = ctx->Attrs().Get<float>("step_h");
+    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+    const float step_w = ctx->Attrs().Get<float>("step_w");
+    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
+
+    std::vector<int64_t> dim_vec(4);
+    dim_vec[0] = input_dims[2];
+    dim_vec[1] = input_dims[3];
+    dim_vec[2] = num_priors;
+    dim_vec[3] = 4;
+    ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec));
+    ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec));
+  }
+};
+
+class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PriorBoxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(Tensor, default Tensor<float>), "
+             "the input feature data of PriorBoxOp, The layout is NCHW.");
+    AddInput("Image",
+             "(Tensor, default Tensor<float>), "
+             "the input image data of PriorBoxOp, The layout is NCHW.");
+    AddOutput("Boxes",
+              "(Tensor, default Tensor<float>), the output prior boxes of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddOutput("Variances",
+              "(Tensor, default Tensor<float>), the expanded variances of "
+              "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
+              "H is the height of input, W is the width of input, num_priors "
+              "is the box count of each position.");
+    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
+                              "List of min sizes of generated prior boxes.");
+    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
+                              "List of max sizes of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "aspect_ratios", "(vector<float>) ",
+        "List of aspect ratios of generated prior boxes.");
+    AddAttr<std::vector<float>>(
+        "variances", "(vector<float>) ",
+        "List of variances to be encoded in prior boxes.");
+    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        .SetDefault(true);
+    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+        .SetDefault(true);
+    AddAttr<float>("step_w",
+                   "Prior boxes step across width, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("step_h",
+                   "Prior boxes step across height, 0 for auto calculation.")
+        .SetDefault(0.0);
+    AddAttr<float>("offset",
+                   "(float) "
+                   "Prior boxes center offset.")
+        .SetDefault(0.5);
+    AddComment(R"DOC(
+Prior Box Operator.
+
+Generate prior boxes for the SSD (Single Shot MultiBox Detector) algorithm.
+Each position of the input produces N prior boxes, where N is determined by
+the count of min_sizes, max_sizes and aspect_ratios. The size of each box is
+in the interval (min_size, max_size), and the boxes are generated in
+sequence according to the aspect_ratios.
+
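+For instance (illustrative numbers): min_sizes = [30, 60] and max_sizes =
+[60, 100] with aspect_ratios = [2.] and flip = true expand the ratios to
+[1., 2., 0.5], so each position produces 3 * 2 + 2 = 8 prior boxes.
+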
+Please refer to the following paper for more information:
+https://arxiv.org/abs/1512.02325.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    prior_box, ops::PriorBoxOpKernel<paddle::platform::CPUPlace, float>,
+    ops::PriorBoxOpKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
new file mode 100644
index 0000000000..e0a663ace8
--- /dev/null
+++ b/paddle/operators/prior_box_op.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
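+// Expand the aspect ratios with 1.0 and, when `flip` is set, with the
+// reciprocal of each ratio. E.g. (illustrative): {2.} with flip = true
+// expands to {1., 2., 0.5}; ratios already present (within epsilon) are
+// skipped.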
+inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratio,
+                               bool flip,
+                               std::vector<float>& output_aspect_ratio) {
+  constexpr float epsilon = 1e-6;
+  output_aspect_ratio.clear();
+  output_aspect_ratio.push_back(1.);
+  for (size_t i = 0; i < input_aspect_ratio.size(); ++i) {
+    float ar = input_aspect_ratio[i];
+    bool already_exist = false;
+    for (size_t j = 0; j < output_aspect_ratio.size(); ++j) {
+      if (fabs(ar - output_aspect_ratio[j]) < epsilon) {
+        already_exist = true;
+        break;
+      }
+    }
+    if (!already_exist) {
+      output_aspect_ratio.push_back(ar);
+      if (flip) {
+        output_aspect_ratio.push_back(1. / ar);
+      }
+    }
+  }
+}
+
+template <typename T>
+struct ClipFunctor {
+  HOSTDEVICE T operator()(T in) const {
+    return std::min<T>(std::max<T>(in, 0.), 1.);
+  }
+};
+
+template <typename Place, typename T>
+class PriorBoxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<paddle::framework::Tensor>("Input");
+    auto* image = ctx.Input<paddle::framework::Tensor>("Image");
+    auto* boxes = ctx.Output<paddle::framework::Tensor>("Boxes");
+    auto* vars = ctx.Output<paddle::framework::Tensor>("Variances");
+
+    auto min_sizes = ctx.Attr<std::vector<int>>("min_sizes");
+    auto max_sizes = ctx.Attr<std::vector<int>>("max_sizes");
+    auto input_aspect_ratio = ctx.Attr<std::vector<float>>("aspect_ratios");
+    auto variances = ctx.Attr<std::vector<float>>("variances");
+    auto flip = ctx.Attr<bool>("flip");
+    auto clip = ctx.Attr<bool>("clip");
+
+    std::vector<float> aspect_ratios;
+    ExpandAspectRatios(input_aspect_ratio, flip, aspect_ratios);
+
+    T step_w = static_cast<T>(ctx.Attr<float>("step_w"));
+    T step_h = static_cast<T>(ctx.Attr<float>("step_h"));
+    T offset = static_cast<T>(ctx.Attr<float>("offset"));
+
+    auto img_width = image->dims()[3];
+    auto img_height = image->dims()[2];
+
+    auto feature_width = input->dims()[3];
+    auto feature_height = input->dims()[2];
+
+    T step_width, step_height;
+    if (step_w == 0 || step_h == 0) {
+      step_width = static_cast<T>(img_width) / feature_width;
+      step_height = static_cast<T>(img_height) / feature_height;
+    } else {
+      step_width = step_w;
+      step_height = step_h;
+    }
+
+    int num_priors = aspect_ratios.size() * min_sizes.size();
+    if (max_sizes.size() > 0) {
+      num_priors += max_sizes.size();
+    }
+
+    boxes->mutable_data<T>(ctx.GetPlace());
+    vars->mutable_data<T>(ctx.GetPlace());
+
+    auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
+    for (int h = 0; h < feature_height; ++h) {
+      for (int w = 0; w < feature_width; ++w) {
+        T center_x = (w + offset) * step_width;
+        T center_y = (h + offset) * step_height;
+        T box_width, box_height;
+        int idx = 0;
+        for (size_t s = 0; s < min_sizes.size(); ++s) {
+          int min_size = min_sizes[s];
+          // first prior: aspect_ratio = 1, size = min_size
+          box_width = box_height = min_size;
+          // xmin
+          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          // ymin
+          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          // xmax
+          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          // ymax
+          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+
+          idx++;
+          if (max_sizes.size() > 0) {
+            int max_size = max_sizes[s];
+            // second prior: aspect_ratio = 1,
+            // size = sqrt(min_size * max_size)
+            box_width = box_height = sqrt(min_size * max_size);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+
+          // rest of priors
+          for (size_t r = 0; r < aspect_ratios.size(); ++r) {
+            float ar = aspect_ratios[r];
+            if (fabs(ar - 1.) < 1e-6) {
+              continue;
+            }
+            box_width = min_size * sqrt(ar);
+            box_height = min_size / sqrt(ar);
+            // xmin
+            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            // ymin
+            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            // xmax
+            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            // ymax
+            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            idx++;
+          }
+        }
+      }
+    }
+
+    if (clip) {
+      platform::Transform<platform::CPUDeviceContext> trans;
+      ClipFunctor<T> clip_func;
+      trans(ctx.template device_context<platform::CPUDeviceContext>(),
+            boxes->data<T>(), boxes->data<T>() + boxes->numel(),
+            boxes->data<T>(), clip_func);
+    }
+
+    framework::Tensor var_t;
+    var_t.mutable_data<T>(
+        framework::make_ddim({1, static_cast<int>(variances.size())}),
+        ctx.GetPlace());
+    auto var_et = framework::EigenTensor<T, 2>::From(var_t);
+    for (size_t i = 0; i < variances.size(); ++i) {
+      var_et(0, i) = variances[i];
+    }
+
+    int box_num = feature_height * feature_width * num_priors;
+    auto var_dim = vars->dims();
+    vars->Resize({box_num, static_cast<int>(variances.size())});
+
+    auto e_vars = framework::EigenMatrix<T, Eigen::RowMajor>::From(*vars);
+    e_vars = var_et.broadcast(Eigen::DSizes<int, 2>(box_num, 1));
+
+    vars->Resize(var_dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc
new file mode 100644
index 0000000000..b92f46b5bd
--- /dev/null
+++ b/paddle/operators/proximal_adagrad_op.cc
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/proximal_adagrad_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ProximalAdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of ProximalAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of ProximalAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of ProximalAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of ProximalAdagradOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of ProximalAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("MomentOut"),
+        "Output(MomentOut) of ProximalAdagradOp should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and Grad of ProximalAdagrad Op must have same dimension.");
+
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Moment"),
+        "Param and Moment of ProximalAdagrad Op must have same dimension.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("MomentOut", param_dim);
+  }
+};
+
+class ProximalAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ProximalAdagradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter that has to be updated.");
+    AddInput("Moment",
+             "(Tensor, default Tensor<float>) "
+             "Moment parameter that has to be updated.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("MomentOut", "(Tensor) Output updated moment value.");
+
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+Proximal Adagrad Optimizer.
+
+Optimizer that implements the proximal adagrad algorithm:
+
+$$
+moment = moment + grad * grad \\
+prox\_param = param - learning\_rate * grad * (1 / \sqrt{moment}) \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1 , 0)
+$$
+
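+A one-step illustration (values chosen for readability, not from the source):
+with param = 1.0, grad = 0.5, moment = 0.75, learning_rate = 0.1, l1 = 0.1
+and l2 = 0.0, the moment becomes 0.75 + 0.25 = 1.0,
+prox_param = 1.0 - 0.1 * 0.5 / sqrt(1.0) = 0.95, and
+param = sign(0.95) * max(0.95 - 0.1 * 0.1, 0) / (1 + 0) = 0.94.
+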
+The paper that proposed Proximal GD:
+(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+Here, we use the adagrad learning rate as specified in:
+(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp,
+                             ops::ProximalAdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    proximal_adagrad,
+    ops::ProximalAdagradOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu
new file mode 100644
index 0000000000..42a178f94b
--- /dev/null
+++ b/paddle/operators/proximal_adagrad_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/proximal_adagrad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    proximal_adagrad,
+    ops::ProximalAdagradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h
new file mode 100644
index 0000000000..523924d80e
--- /dev/null
+++ b/paddle/operators/proximal_adagrad_op.h
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ProximalAdagradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* moment_out = ctx.Output<Tensor>("MomentOut");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    moment_out->mutable_data<T>(ctx.GetPlace());
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+
+    auto grad = ctx.Input<Tensor>("Grad");
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto m = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto m_out = EigenVector<T>::Flatten(*moment_out);
+    auto* place = ctx.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+    m_out.device(*place) = m + g * g;
+    auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt();
+    if (l1 > static_cast<T>(0)) {
+      p_out.device(*place) =
+          prox_param.sign() *
+          (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
+                .cwiseMax(static_cast<T>(0.0))) /
+           (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize)));
+    } else {
+      p_out.device(*place) =
+          prox_param / (static_cast<T>(1.0) + (lr * l2).broadcast(grad_dsize));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
new file mode 100644
index 0000000000..2d3bbdaf32
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.cc
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/proximal_gd_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ProximalGDOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of ProximalGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of ProximalGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of ProximalGDOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of ProximalGDOp should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
+                      "Two input of ProximalGD Op's dimension must be same.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+  }
+};
+
+class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ProximalGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+ProximalGD Operator.
+
+Optimizer that implements the proximal gradient descent algorithm:
+
+$$
+prox\_param = param - learning\_rate * grad \\
+param = sign(prox\_param) / (1 + learning\_rate * l2) *
+        \max(|prox\_param| - learning\_rate * l1, 0)
+$$
+
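+A one-step illustration (values chosen for readability, not from the source):
+with param = 1.0, grad = 0.5, learning_rate = 0.1, l1 = 0.2 and l2 = 0.0,
+prox_param = 1.0 - 0.1 * 0.5 = 0.95 and
+param = sign(0.95) * max(0.95 - 0.1 * 0.2, 0) / (1 + 0) = 0.93.
+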
+The paper that proposed Proximal Gradient Descent:
+(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp,
+                             ops::ProximalGDOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    proximal_gd,
+    ops::ProximalGDOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu
new file mode 100644
index 0000000000..b7dd840d19
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/proximal_gd_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    proximal_gd,
+    ops::ProximalGDOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h
new file mode 100644
index 0000000000..64648b3cca
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class ProximalGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+    auto prox_param = p - lr.broadcast(grad_dsize) * g;
+    if (l1 > 0) {
+      p_out.device(place) =
+          prox_param.sign() *
+          (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
+                .cwiseMax(T(0.0))) /
+           (1.0 + (lr * l2).broadcast(grad_dsize)));
+    } else {
+      p_out.device(place) =
+          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
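The kernel above leans on Eigen's `.broadcast()` to tile the size-1 learning-rate tensor to the gradient's length so every expression stays element-wise. A minimal sketch of that mechanism, assuming Eigen's unsupported Tensor module is available:

```cpp
#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> lr(1);  // scalar learning rate as a 1-element tensor
  lr.setValues({0.1f});
  Eigen::Tensor<float, 1> g(4);   // a toy gradient of length 4
  g.setValues({1.f, 2.f, 3.f, 4.f});

  // Tile the 1-element tensor 4 times, mirroring lr.broadcast(grad_dsize).
  Eigen::DSizes<Eigen::DenseIndex, 1> bcast(4);
  Eigen::Tensor<float, 1> scaled = lr.broadcast(bcast) * g;

  for (int i = 0; i < 4; ++i) std::cout << scaled(i) << " ";  // 0.1 0.2 0.3 0.4
  return 0;
}
```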
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
new file mode 100644
index 0000000000..f2164a0f80
--- /dev/null
+++ b/paddle/operators/rank_loss_op.cc
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/rank_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RankLossOp : public framework::OperatorWithKernel {
+ public:
+  RankLossOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
+
+    auto label_dims = ctx->GetInputDim("Label");
+    auto left_dims = ctx->GetInputDim("Left");
+    auto right_dims = ctx->GetInputDim("Right");
+
+    PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
+                   "All inputs must have the same size.");
+    PADDLE_ENFORCE(
+        (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensors with shape [batch_size x 1].");
+    ctx->SetOutputDim("Out", label_dims);
+  }
+};
+
+class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RankLossOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Label",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The label indicating whether doc A is ranked higher than "
+             "doc B.");
+    AddInput("Left",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc A.");
+    AddInput("Right",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc B.");
+    AddOutput("Out",
+              "(2-D Tensor with shape [batch_size x 1]) "
+              "The output loss of RankLoss operator.");
+    AddComment(R"DOC(
+RankLoss Operator.
+
+RankLoss operator for RankNet
+(http://icml.cc/2015/wp-content/uploads/2015/06/icml_ranking.pdf).
+RankNet is a pairwise ranking model; one training sample consists of a pair
+of docs A and B together with a label P indicating whether A should be
+ranked higher than B:
+
+P = {0, 1} or {0, 0.5, 1}, where 0.5 means there is no information about
+the relative rank of the input pair.
+
+The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
+(P_{i,j}), which represent the RankNet scores of the two docs and the label,
+respectively, and computes the rank loss C_{i,j} with the following
+equation:
+
+$$
+  C_{i,j} = -\tilde{P}_{i,j} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
+  o_{i,j} = o_i - o_j \\
+  \tilde{P}_{i,j} \in \left\{0, 0.5, 1\right\} \text{ or } \left\{0, 1\right\}
+$$
+
+The operator can take batch inputs with size batch_size (batch_size >= 1).
+
+)DOC");
+  }
+};
+
+class RankLossGradOp : public framework::OperatorWithKernel {
+ public:
+  RankLossGradOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    auto dims = ctx->GetInputDim("Left");
+    auto left_grad_name = framework::GradVarName("Left");
+    auto right_grad_name = framework::GradVarName("Right");
+
+    if (ctx->HasOutput(left_grad_name)) {
+      ctx->SetOutputDim(left_grad_name, dims);
+    }
+
+    if (ctx->HasOutput(right_grad_name)) {
+      ctx->SetOutputDim(right_grad_name, dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad,
+            ops::RankLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    rank_loss, ops::RankLossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    rank_loss_grad,
+    ops::RankLossGradKernel<paddle::platform::CPUDeviceContext, float>);
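A quick numeric reading of the loss formula above: when the label says A ranks higher (P = 1) and the scores agree (o_i > o_j), the loss should be small. The sketch below checks this with assumed values; it is not part of the operator or its tests:

```cpp
#include <cmath>
#include <cstdio>

int main() {
  double o_i = 2.0, o_j = 1.0;  // RankNet scores for docs A and B (assumed)
  double p = 1.0;               // label: A ranked higher than B

  double o = o_i - o_j;
  double loss = std::log(1.0 + std::exp(o)) - p * o;

  // ~0.3133: small, since the scores agree with the label.
  std::printf("loss=%f\n", loss);
  return 0;
}
```

Flipping the label to p = 0 raises the loss to log(1 + e) ≈ 1.3133, since the scores then contradict the label.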
diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu
new file mode 100644
index 0000000000..294b227383
--- /dev/null
+++ b/paddle/operators/rank_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/rank_loss_op.h"
+
+REGISTER_OP_CUDA_KERNEL(rank_loss,
+                        paddle::operators::RankLossKernel<
+                            paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(rank_loss_grad,
+                        paddle::operators::RankLossGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
new file mode 100644
index 0000000000..bd0c49ca6e
--- /dev/null
+++ b/paddle/operators/rank_loss_op.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class RankLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+    auto* left_t = ctx.Input<framework::Tensor>("Left");
+    auto* right_t = ctx.Input<framework::Tensor>("Right");
+    out_t->mutable_data<T>(ctx.GetPlace());
+
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto left = framework::EigenVector<T>::Flatten(*left_t);
+    auto right = framework::EigenVector<T>::Flatten(*right_t);
+
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    out.device(dev) =
+        (1. + (left - right).exp()).log() - label * (left - right);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class RankLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_left_t =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Left"));
+    auto* d_right_t =
+        ctx.Output<framework::Tensor>(framework::GradVarName("Right"));
+
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+    auto* left_t = ctx.Input<framework::Tensor>("Left");
+    auto* right_t = ctx.Input<framework::Tensor>("Right");
+
+    auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto left = framework::EigenVector<T>::Flatten(*left_t);
+    auto right = framework::EigenVector<T>::Flatten(*right_t);
+
+    // compute d_left
+    if (d_left_t) {
+      d_left_t->mutable_data<T>(ctx.GetPlace());
+      auto d_left = framework::EigenVector<T>::Flatten(*d_left_t);
+      d_left.device(dev) = d_out * (1. / (1. + (right - left).exp()) - label);
+    }
+    // compute d_right
+    if (d_right_t) {
+      d_right_t->mutable_data<T>(ctx.GetPlace());
+      auto d_right = framework::EigenVector<T>::Flatten(*d_right_t);
+      d_right.device(dev) =
+          -d_out * (1.0 / (1. + (right - left).exp()) - label);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
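The backward kernel uses the closed form d(loss)/d(left) = 1 / (1 + exp(right - left)) - label, i.e. sigmoid(left - right) - label. Here is a small finite-difference check of that derivative, with assumed values (not taken from the operator's tests):

```cpp
#include <cmath>
#include <cstdio>

double rank_loss(double left, double right, double label) {
  double o = left - right;
  return std::log(1.0 + std::exp(o)) - label * o;
}

int main() {
  double left = 0.7, right = 0.3, label = 1.0, eps = 1e-6;

  // Analytic gradient, matching RankLossGradKernel's d_left expression.
  double analytic = 1.0 / (1.0 + std::exp(right - left)) - label;

  // Central finite difference for comparison.
  double numeric = (rank_loss(left + eps, right, label) -
                    rank_loss(left - eps, right, label)) /
                   (2.0 * eps);

  std::printf("analytic=%f numeric=%f\n", analytic, numeric);  // ~-0.4013
  return 0;
}
```

d(loss)/d(right) is the negation of the same expression, which is exactly how d_right is computed in the kernel.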
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
new file mode 100644
index 0000000000..a136c5b447
--- /dev/null
+++ b/paddle/operators/recurrent_op.cc
@@ -0,0 +1,635 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace operators {
+constexpr char kInputs[] = "inputs";
+constexpr char kInitialStates[] = "initial_states";
+constexpr char kParameters[] = "parameters";
+constexpr char kOutputs[] = "outputs";
+constexpr char kStepScopes[] = "step_scopes";
+constexpr char kExStates[] = "ex_states";
+constexpr char kStates[] = "states";
+constexpr char kStepBlock[] = "sub_block";
+constexpr char kReverse[] = "reverse";
+constexpr char kIsTrain[] = "is_train";
+#define GRAD_SUFFIX "@GRAD"
+constexpr char kInputGrads[] = "inputs" GRAD_SUFFIX;
+constexpr char kOutputGrads[] = "outputs" GRAD_SUFFIX;
+constexpr char kParamGrads[] = "parameters" GRAD_SUFFIX;
+constexpr char kInitStateGrads[] = "initial_states" GRAD_SUFFIX;
+
+using StepScopeVar = std::vector<framework::Scope *>;
+
+// StepScopes manages the scopes inside the RNN.
+//    StepScopes::CurScope() gets the current scope.
+//    StepScopes::ExScope() gets the ex-scope, i.e. the scope of the previous
+//    time step.
+//    StepScopes::Next() moves to the next time step.
+//
+// If is_train == false, only two scopes are created and reused, and only the
+// forward pass is supported; otherwise len(scopes) == seq_len.
+//
+// If is_backward == true, the scopes are accessed in reverse order;
+// otherwise they are accessed from begin to end.
+class StepScopes {
+ public:
+  StepScopes(const framework::Scope &parent, StepScopeVar *scopes,
+             bool is_train, size_t seq_len, bool is_backward = false)
+      : counter_(is_backward ? seq_len - 1 : 0UL),
+        scopes_(scopes),
+        is_train_(is_train),
+        is_backward_(is_backward) {
+    size_t num_step_scopes = is_train ? seq_len : 2;
+    PADDLE_ENFORCE(is_train || !is_backward,
+                   "Cannot run backward when not training");
+    if (!is_backward_) {
+      PADDLE_ENFORCE(scopes->empty());
+      scopes->reserve(static_cast<size_t>(num_step_scopes));
+      for (size_t i = 0; i < num_step_scopes; ++i) {
+        scopes->emplace_back(&parent.NewScope());
+      }
+    }
+  }
+
+  framework::Scope &CurScope() { return GetScope(counter_); }
+
+  framework::Scope &ExScope() {
+    auto &scope = GetScope(is_backward_ ? counter_ + 1 : counter_ - 1);
+    return scope;
+  }
+
+  void Next() {
+    if (is_backward_) {
+      --counter_;
+    } else {
+      ++counter_;
+    }
+  }
+
+ private:
+  framework::Scope &GetScope(size_t scope_id) const {
+    if (!is_train_) {
+      scope_id %= 2;
+    }
+    PADDLE_ENFORCE_LT(scope_id, scopes_->size());
+    return *(*scopes_)[scope_id];
+  }
+
+  size_t counter_;
+  StepScopeVar *scopes_;
+  bool is_train_;
+  bool is_backward_;
+};
+
+// Base class for RecurrentOp/RecurrentGradOp
+//    Some common protected functions for RecurrentOp/RecurrentGradOp
+class RecurrentBase : public framework::OperatorBase {
+ public:
+  RecurrentBase(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  // Get the sequence length from the scope.
+  //   The sequence length is obtained from the input tensor, whose
+  //   dimensions should be [SEQ_LEN, ..., ...]. The first dimension of the
+  //   tensor's shape is SEQ_LEN; the second could be the batch size or a
+  //   nested sequence length.
+  int64_t GetSequenceLength(const framework::Scope &scope) const {
+    // Dim format SEQ_LEN, BATCH_SIZE, ...
+    int64_t seq_len = -1;
+    auto &all_inputs = Inputs(kInputs);
+    PADDLE_ENFORCE(!all_inputs.empty());
+    for (auto &iname : all_inputs) {
+      auto *var = scope.FindVar(iname);
+      PADDLE_ENFORCE(var != nullptr);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>());
+      auto &dim = var->Get<framework::LoDTensor>().dims();
+      if (seq_len == -1) {
+        seq_len = dim[0];
+      } else {
+        PADDLE_ENFORCE_EQ(seq_len, dim[0]);
+      }
+    }
+    return seq_len;
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   dst_tensor.ShareDataWith(src_tensor)
+  static void LinkTensor(const framework::Scope &src_scope,
+                         const std::vector<std::string> &src_vars,
+                         framework::Scope *dst_scope,
+                         const std::vector<std::string> &dst_vars) {
+    LinkTensorWithCallback(
+        src_scope, src_vars, dst_scope, dst_vars,
+        [&](const framework::Tensor &src, framework::Tensor *dst) {
+          dst->ShareDataWith(src);
+        });
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.Var, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     framework::Scope *dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+    }
+  }
+
+  // for src_tensor, dst_tensor in zip(map(src_scope.FindVar, src_vars),
+  //                                   map(dst_scope.FindVar, dst_vars)):
+  //   callback(src_tensor, &dst_tensor)
+  template <typename Callback>
+  static void LinkTensorWithCallback(const framework::Scope &src_scope,
+                                     const std::vector<std::string> &src_vars,
+                                     const framework::Scope &dst_scope,
+                                     const std::vector<std::string> &dst_vars,
+                                     Callback callback) {
+    PADDLE_ENFORCE_EQ(src_vars.size(), dst_vars.size());
+    for (size_t i = 0; i < dst_vars.size(); ++i) {
+      VLOG(10) << "Link " << src_vars[i] << " to " << dst_vars[i];
+      AccessTensor(src_scope, src_vars[i], dst_scope, dst_vars[i], callback);
+    }
+  }
+
+  // (seq_len, shape) -> return [seq_len] + list(shape)
+  static framework::DDim PrependDims(size_t seq_len,
+                                     const framework::DDim &src) {
+    auto dims = framework::vectorize(src);
+    dims.insert(dims.begin(), static_cast<int64_t>(seq_len));
+    return framework::make_ddim(dims);
+  }
+
+ private:
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           framework::Scope *dst_scope,
+                           const std::string &dst_var_name, Callback callback) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+
+    auto *dst_var = dst_scope->Var(dst_var_name);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+
+  template <typename Callback>
+  static void AccessTensor(const framework::Scope &src_scope,
+                           const std::string &src_var_name,
+                           const framework::Scope &dst_scope,
+                           const std::string &dst_var_name, Callback callback) {
+    auto *src_var = src_scope.FindVar(src_var_name);
+    PADDLE_ENFORCE(src_var != nullptr);
+    auto &src_tensor = src_var->Get<framework::LoDTensor>();
+    auto *dst_var = dst_scope.FindVar(dst_var_name);
+    PADDLE_ENFORCE(dst_var != nullptr);
+    auto *dst_tensor = dst_var->GetMutable<framework::LoDTensor>();
+    callback(src_tensor, dst_tensor);
+  }
+};
+
+class RecurrentOp : public RecurrentBase {
+ public:
+  RecurrentOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : RecurrentBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto seq_len = static_cast<size_t>(this->GetSequenceLength(scope));
+    VLOG(3) << "Static RNN input sequence length = " << seq_len;
+    StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+    auto *program = block->Program();
+
+    for (size_t i = 0; i < seq_len; ++i) {
+      size_t seq_offset = reverse ? seq_len - i - 1 : i;
+      VLOG(3) << "Recurrent operates at time step " << seq_offset;
+
+      auto &cur_scope = scopes.CurScope();
+
+      // Link outside::input --> inside::input
+      //   inside::input = outside::input[seq_offset: seq_offset+1]
+      LinkTensorWithCallback(
+          scope, Inputs(kInputs), &cur_scope, Inputs(kInputs),
+          [&seq_offset](const framework::Tensor &outside,
+                        framework::Tensor *inside) {
+            inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+            auto dims = framework::vectorize(inside->dims());
+            dims.erase(dims.begin());
+            inside->Resize(framework::make_ddim(dims));
+          });
+
+      if (i == 0) {
+        // Link initial states  --> ex_states
+        LinkTensor(scope, Inputs(kInitialStates), &cur_scope,
+                   Attr<std::vector<std::string>>(kExStates));
+      } else {
+        auto &ex_scope = scopes.ExScope();
+        // Link ex_scope::state --> cur_scope::ex_state
+        LinkTensor(ex_scope, Attr<std::vector<std::string>>(kStates),
+                   &cur_scope, Attr<std::vector<std::string>>(kExStates));
+      }
+
+      // All inputs are linked now; execute!
+      executor.Run(*program, &cur_scope, block->ID(),
+                   false /*create_local_scope*/);
+
+      // get device context from pool
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+
+      // Copy inside::output -> outside::output
+      //    outside::output[seq_offset: seq_offset + 1] = inside::output
+      this->LinkTensorWithCallback(
+          cur_scope, Outputs(kOutputs), scope, Outputs(kOutputs),
+          [&](const framework::LoDTensor &src_tensor,
+              framework::LoDTensor *dst_tensor) {
+            if (i == 0) {  // create output tensor at begin
+              dst_tensor->Resize(PrependDims(seq_len, src_tensor.dims()));
+              dst_tensor->mutable_data(place, src_tensor.type());
+            }
+
+            auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
+            // Explicit copy output since the local RNN scope can be destroyed
+            // early.
+            framework::Copy(src_tensor, place, dev_ctx, &dst_out);
+          });
+
+      scopes.Next();
+    }
+  }
+
+ private:
+  StepScopes CreateStepScopes(const framework::Scope &scope,
+                              size_t seq_len) const {
+    auto *var = scope.FindVar(Output(kStepScopes));
+    PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len);
+  }
+};
+
+class RecurrentGradOp : public RecurrentBase {
+ public:
+  RecurrentGradOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : RecurrentBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto seq_len = static_cast<size_t>(GetSequenceLength(scope));
+    StepScopes scopes = CreateStepScopes(scope, seq_len);
+    auto reverse = Attr<bool>(kReverse);
+
+    framework::Executor executor(place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+    auto *program = block->Program();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t step_id = 0; step_id < seq_len; ++step_id) {
+      size_t seq_offset = reverse ? step_id : seq_len - step_id - 1;
+      VLOG(3) << "Recurrent backward operates at time step " << seq_offset;
+      auto &cur_scope = scopes.CurScope();
+      // Link outside::output_grads --> inside::output_grads
+      //   inside::output_grad = outside::output_grad[seq_offset:seq_offset+1]
+      LinkTensorWithCallback(
+          scope, Inputs(kOutputGrads), &cur_scope, Inputs(kOutputGrads),
+          [&](const framework::Tensor &outside, framework::Tensor *inside) {
+            inside->ShareDataWith(outside.Slice(seq_offset, seq_offset + 1));
+            auto dims = framework::vectorize(inside->dims());
+            dims.erase(dims.begin());
+            inside->Resize(framework::make_ddim(dims));
+          });
+      auto og_set = List2Set(Inputs(kOutputGrads));
+
+      if (VLOG_IS_ON(10)) {
+        std::ostringstream sout;
+        std::copy(og_set.begin(), og_set.end(),
+                  std::ostream_iterator<std::string>(sout, ","));
+        VLOG(10) << " RNN output gradients = [" << sout.str() << "]";
+      }
+
+      // Link states
+      //   if cur_scope::cur_state_grad in out_grads:
+      //     cur_scope::cur_state_grad += ex_scope::ex_state_grad
+      //   else:
+      //     ex_scope::ex_state_grad --> cur_scope::cur_state_grad
+      if (step_id != 0) {  // not at beginning
+        auto &ex_scope = scopes.ExScope();
+        auto ex_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kExStates));
+        auto cur_state_grads =
+            GradVarLists(Attr<std::vector<std::string>>(kStates));
+
+        PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size());
+        for (size_t i = 0; i < ex_state_grads.size(); ++i) {
+          auto &cur_grad = cur_state_grads[i];
+          auto &ex_grad = ex_state_grads[i];
+          auto &ex_tensor =
+              ex_scope.FindVar(ex_grad)->Get<framework::LoDTensor>();
+
+          VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad;
+          auto *cur_grad_var = cur_scope.Var(cur_grad);
+          auto cur_grad_tensor =
+              cur_grad_var->GetMutable<framework::LoDTensor>();
+          framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor);
+        }
+      }
+
+      VLOG(5) << "Recurrent memory linking finished ";
+      // Run step block with cur_scope
+      executor.Run(*program, &cur_scope, block->ID(),
+                   false /*create_local_scope*/);
+
+      VLOG(5) << "executor.Run finished ";
+
+      auto local_var_names = LocalVarNames(cur_scope);
+
+      // Accumulate params
+      //   if (step == 0):
+      //      outside::param_grad = 0.0
+      //   outside::param_grad += inside::param_grad
+      {
+        auto &pg_names = Outputs(kParamGrads);
+        auto &p_names = Inputs(kParameters);
+        PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+
+        for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+          auto inside_grad_name = framework::GradVarName(p_names[param_id]);
+
+          // If the gradient of this variable is not computed inside the
+          // RNN step block, just skip it.
+          if (local_var_names.find(inside_grad_name) == local_var_names.end()) {
+            continue;
+          }
+
+          // zero gradient variable in step 0
+          if (step_id == 0) {
+            auto &inside_tensor = cur_scope.FindVar(inside_grad_name)
+                                      ->Get<framework::LoDTensor>();
+            framework::AttributeMap attrs;
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+            attrs["value"] = 0.0f;
+
+            auto zero_op = framework::OpRegistry::CreateOp(
+                "fill_constant", framework::VariableNameMap{},
+                {{"Out", {pg_names[param_id]}}}, attrs);
+            zero_op->Run(scope, place);
+          }
+
+          auto new_inside_name = cur_scope.Rename(inside_grad_name);
+          // sum gradient
+
+          auto sum_op = framework::OpRegistry::CreateOp(
+              "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+              {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+          sum_op->Run(cur_scope, place);
+
+          cur_scope.Rename(new_inside_name, inside_grad_name);
+        }
+      }
+      VLOG(5) << "Accumulate Parameter finished ";
+
+      // Copy input gradient from inside to outside
+      //   outside::input_grad[seq_offset: seq_offset + 1] = inside::input_grad
+      LinkTensorWithCallback(
+          cur_scope, GradVarLists(Inputs(kInputs)), scope, Outputs(kInputGrads),
+          [&](const framework::LoDTensor &inside,
+              framework::LoDTensor *outside) {
+            if (inside.memory_size() == 0) {  // IG is not created.
+              return;
+            }
+            if (step_id == 0) {  // alloc memory
+              outside->Resize(PrependDims(seq_len, inside.dims()));
+              outside->mutable_data(place, inside.type());
+            }
+
+            auto dst = outside->Slice(seq_offset, seq_offset + 1);
+            framework::Copy(inside, place, dev_ctx, &dst);
+          });
+      VLOG(5) << "Link outside gradient finished ";
+
+      if (step_id + 1 == seq_len) {  // at_end
+        // copy initialize states gradient from inside to outside
+        LinkTensorWithCallback(
+            cur_scope, GradVarLists(Attr<std::vector<std::string>>(kExStates)),
+            scope, Outputs(kInitStateGrads),
+            [&](const framework::LoDTensor &inside,
+                framework::LoDTensor *outside) {
+              outside->Resize(inside.dims());
+              outside->mutable_data(place, inside.type());
+              framework::Copy(inside, place, dev_ctx, outside);
+            });
+        VLOG(5) << "Link initialize state gradient finished ";
+      }
+      scopes.Next();
+    }
+  }
+
+ private:
+  StepScopes CreateStepScopes(const framework::Scope &scope,
+                              size_t seq_len) const {
+    auto *var = scope.FindVar(Input(kStepScopes));
+    PADDLE_ENFORCE(var != nullptr);
+    return StepScopes(scope, var->GetMutable<StepScopeVar>(),
+                      Attr<bool>(kIsTrain), seq_len, true /*is_backward*/);
+  }
+
+  std::unordered_set<std::string> List2Set(
+      const std::vector<std::string> &list) const {
+    std::unordered_set<std::string> local_var_name_set;
+    local_var_name_set.reserve(list.size());
+    for (auto &each : list) {
+      local_var_name_set.insert(each);
+    }
+    return local_var_name_set;
+  }
+
+  std::unordered_set<std::string> LocalVarNames(
+      const framework::Scope &scope) const {
+    return this->List2Set(scope.LocalVarNames());
+  }
+  static std::vector<std::string> GradVarLists(
+      const std::vector<std::string> &var_names) {
+    std::vector<std::string> retv;
+    retv.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(), std::back_inserter(retv),
+                   framework::GradVarName);
+    return retv;
+  }
+};
+
+class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecurrentOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kInputs, "rnn inputs").AsDuplicable();
+    AddInput(kInitialStates, "rnn initial states").AsDuplicable();
+    AddInput(kParameters,
+             "Parameters are used by the step block as its input. However, "
+             "the input is not a sequence tensor; at every time step, each "
+             "operator in the step block uses the parameter directly.")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "The output sequences of the RNN. All of them must have the "
+              "same length.")
+        .AsDuplicable();
+    AddOutput(kStepScopes,
+              "StepScopes contains all local variables of each time step.");
+    AddAttr<std::vector<std::string>>(kExStates,
+                                      string::Sprintf(
+                                          R"DOC(The ex-state variable names.
+The ex-state means the state value in the ex-timestep, i.e. the previous
+time step. [%s, %s, %s] must be in the same order.)DOC",
+                                          kExStates, kStates, kInitStateGrads));
+    AddAttr<std::vector<std::string>>(
+        kStates,
+        string::Sprintf(
+            "The state variable names. [%s, %s, %s] must be in the same order",
+            kExStates, kStates, kInitStateGrads));
+    AddAttr<framework::BlockDesc *>(kStepBlock, "The step block inside RNN");
+    AddAttr<bool>(kReverse, R"DOC(Calculate RNN reversely or not.
+By default reverse=False
+
+Assume the input data is [A, B, C, D]
+
+if reverse is False:
+  the computation of RNN is like
+      A          B          C         D
+      |          |          |         |
+      v          v          v         v
+     rnn -----> rnn -----> rnn ----> rnn
+      |          |          |         |
+      v          v          v         v
+      o          o          o         o
+
+if reverse is True
+  the computation of RNN is like
+      A          B          C         D
+      |          |          |         |
+      v          v          v         v
+     rnn <----- rnn <----- rnn <---- rnn
+      |          |          |         |
+      v          v          v         v
+      o          o          o         o
+)DOC").SetDefault(false);
+    AddAttr<bool>(kIsTrain, "").SetDefault(true);
+    AddComment(R"DOC(
+Static Length Recurrent Operator.
+
+The static length recurrent operator can only operate on fixed size sequence
+data, i.e. in each mini-batch, the sequence length of all inputs are the same.
+
+)DOC");
+  }
+};
+
+class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual std::unique_ptr<framework::OpDesc> Apply() const {
+    auto *grad = new framework::OpDesc();
+    grad->SetType("recurrent_grad");
+    for (auto &input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(framework::GradVarName(input_param),
+                      this->InputGrad(input_param, false));
+    }
+
+    for (auto &output_param : this->OutputNames()) {
+      if (output_param == kStepScopes) {
+        grad->SetInput(output_param, this->Output(output_param));
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->Output(output_param));
+      } else {
+        grad->SetInput(output_param, this->Output(output_param));
+        grad->SetInput(framework::GradVarName(output_param),
+                       this->OutputGrad(output_param));
+      }
+    }
+    grad->SetAttrMap(this->Attrs());
+    grad->SetBlockAttr(kStepBlock, *grad_block_[0]);
+
+    return std::unique_ptr<framework::OpDesc>(grad);
+  }
+};
+
+class RecurrentGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    std::vector<std::string> input{kInputs, kInitialStates};
+    std::vector<std::string> output{kOutputs};
+    for (auto &s : input) {
+      PADDLE_ENFORCE(ctx->HasInputs(s));
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
+                     "Cannot find the gradient variable %s",
+                     framework::GradVarName(s));
+    }
+    for (auto &s : output) {
+      PADDLE_ENFORCE(ctx->HasInputs(s));
+    }
+    for (auto &s : input) {
+      ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s));
+    }
+    if (ctx->HasInputs(kParameters)) {
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
+      ctx->SetOutputsDim(framework::GradVarName(kParameters),
+                         ctx->GetInputsDim(kParameters));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(recurrent, paddle::operators::RecurrentOp,
+                  paddle::operators::RecurrentOpProtoMaker,
+                  paddle::operators::RecurrentGradOpDescMaker);
+REGISTER_OPERATOR(recurrent_grad, paddle::operators::RecurrentGradOp,
+                  paddle::operators::RecurrentGradOpShapeInference);
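The scope bookkeeping above is easiest to see through the indexing rule in StepScopes::GetScope: at inference time (is_train = false) only two scopes exist and the step counter is folded onto them modulo 2. A stand-alone sketch of just that rule (illustrative, not Paddle code):

```cpp
#include <cstdio>

int main() {
  const int seq_len = 5;
  // Inference: scope_id %= 2, so the two scopes are reused alternately.
  for (int step = 0; step < seq_len; ++step) {
    int cur = step % 2;
    if (step == 0) {
      std::printf("step %d: cur_scope=%d (initial states are linked in)\n",
                  step, cur);
    } else {
      int ex = (step - 1) % 2;  // previous step's scope, still alive
      std::printf("step %d: cur_scope=%d ex_scope=%d\n", step, cur, ex);
    }
  }
  return 0;
}
```

Training keeps all seq_len scopes instead, because the backward pass must revisit every step's local variables in reverse order.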
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
new file mode 100644
index 0000000000..49e1eb3402
--- /dev/null
+++ b/paddle/operators/recv_op.cc
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <ostream>
+#include <string>
+#include <thread>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/executor.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/operators/detail/grpc_server.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace operators {
+
+constexpr char kOptimizeBlock[] = "OptimizeBlock";
+
+void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
+  service->RunSyncUpdate();
+  VLOG(4) << "RunServer thread end";
+}
+
+static void CreateTensorFromMessageType(framework::Variable *var,
+                                        sendrecv::VarType var_type) {
+  if (var_type == sendrecv::VarType::LOD_TENSOR) {
+    var->GetMutable<framework::LoDTensor>();
+  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
+    var->GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW(
+        "VariableMessage type %d is not in "
+        "[LoDTensor, SelectedRows]",
+        var_type);
+  }
+}
+
+class RecvOp : public framework::OperatorBase {
+ public:
+  RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {
+    if (!rpc_service_) {
+      std::string endpoint = Attr<std::string>("endpoint");
+      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+      server_thread_.reset(new std::thread(RunServer, rpc_service_));
+    }
+  }
+
+  void Stop() override {
+    detail::MessageWithName term_msg;
+    term_msg.first = LISTEN_TERMINATE_MESSAGE;
+    rpc_service_->Push(term_msg);
+    rpc_service_->ShutDown();
+    server_thread_->join();
+  }
+
+  std::string GetGradVarNameForTrainer(const std::string &varname) const {
+    if (grads_counter_.find(varname) == grads_counter_.end()) {
+      grads_counter_[varname] = 0;
+    }
+    return string::Sprintf("%s.trainer_%d", varname, grads_counter_[varname]++);
+  }
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+    framework::Scope &recv_scope = scope.NewScope();
+
+    // FIXME(Yancey1989): initialize the RPC server lazily.
+    rpc_service_->SetScope(&recv_scope);
+    rpc_service_->SetDevCtx(&dev_ctx);
+    auto param_list = Attr<std::vector<std::string>>("ParamList");
+    auto grad_list = Attr<std::vector<std::string>>("GradList");
+    auto fan_in = Attr<int>("Fanin");
+
+    auto *block = Attr<framework::BlockDesc *>(kOptimizeBlock);
+    auto *program = block->Program();
+    framework::Executor executor(dev_place);
+
+    // TODO(typhoonzero): change this to a while_op for every cluster-batch.
+    bool exit_flag = false;
+    while (!exit_flag) {
+      // Gradients arrive from multiple trainers; we don't care about the
+      // order, so add suffixes 0~n and merge the gradients.
+      rpc_service_->SetCond(0);
+      size_t recv_var_cnt = 0;
+      int batch_barrier = 0;
+      while (batch_barrier != fan_in) {
+        const detail::MessageWithName &v = rpc_service_->Get();
+        auto grad_var_name = v.first;
+        if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
+          LOG(INFO) << "received terminate message, exiting";
+          exit_flag = true;
+          break;
+        } else if (grad_var_name == BATCH_BARRIER_MESSAGE) {
+          VLOG(3) << "recv batch barrier message";
+          batch_barrier++;
+          continue;
+        } else {
+          // receive a variable
+          recv_var_cnt++;
+          auto it =
+              std::find(grad_list.begin(), grad_list.end(), grad_var_name);
+          std::string param_var_name;
+          if (it != grad_list.end()) {
+            param_var_name = param_list[it - grad_list.begin()];
+          } else {
+            LOG(ERROR) << "grad has no paired param: " << grad_var_name;
+          }
+          VLOG(3) << "received grad: " << grad_var_name
+                  << " updating param: " << param_var_name;
+
+          if (fan_in > 1) {
+            grad_var_name = this->GetGradVarNameForTrainer(grad_var_name);
+          }
+          auto *var = recv_scope.FindVar(grad_var_name);
+          if (var == nullptr) {
+            LOG(ERROR) << "Cannot find server-side var: " << grad_var_name;
+            PADDLE_THROW("Cannot find server-side var");
+          }
+          detail::DeserializeFromMessage(v.second, dev_ctx, var);
+        }
+      }
+      VLOG(3) << "recv " << recv_var_cnt << " parmeters for one barrier.";
+      // TODO(Yancey1989): merge SelectedRows variables here
+      if (exit_flag) {
+        break;
+      }
+
+      try {
+        executor.Run(*program, &recv_scope, block->ID() /*global_block*/,
+                     false /*create_local_scope*/, false /*create_vars*/);
+      } catch (std::exception &e) {
+        LOG(ERROR) << "run sub program error " << e.what();
+      }
+      rpc_service_->SetCond(1);
+      rpc_service_->WaitClientGet(recv_var_cnt);
+      grads_counter_.clear();
+    }  // while(true)
+  }
+
+ protected:
+  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
+  std::shared_ptr<std::thread> server_thread_;
+  mutable std::unordered_map<std::string, int> grads_counter_;
+};
+
+class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RecvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddComment(R"DOC(
+Recv operator
+
+This operator will receive tensors from send_op.
+)DOC");
+    AddAttr<std::string>("endpoint",
+                         "(string, default 127.0.0.1:6164) "
+                         "The IP address and port to listen on.")
+        .SetDefault("127.0.0.1:6164")
+        .AddCustomChecker([](const std::string &ip) { return !ip.empty(); });
+    AddAttr<framework::BlockDesc *>(
+        kOptimizeBlock, "Serialized ProgramDesc string for recv to run.");
+    AddAttr<std::vector<std::string>>(
+        "ParamList",
+        "(list of string) The parameter names, paired with GradList to map "
+        "each received gradient to the parameter it updates.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>(
+        "GradList",
+        "(list of string) The gradient names, paired with ParamList to map "
+        "each received gradient to the parameter it updates.")
+        .SetDefault({});
+    AddAttr<int>("Fanin", "type int",
+                 "Number of trainers in the current cluster job")
+        .SetDefault(1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker);
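When Fanin > 1, the same gradient name can arrive once per trainer, so RecvOp renames each arrival before merging. A minimal sketch of that suffixing scheme (it mirrors GetGradVarNameForTrainer above; the variable name is an assumed example):

```cpp
#include <cstdio>
#include <string>
#include <unordered_map>

std::unordered_map<std::string, int> grads_counter;

std::string GradNameForTrainer(const std::string &varname) {
  // operator[] default-initializes the counter to 0 on first use.
  int id = grads_counter[varname]++;
  return varname + ".trainer_" + std::to_string(id);
}

int main() {
  std::printf("%s\n", GradNameForTrainer("w@GRAD").c_str());  // w@GRAD.trainer_0
  std::printf("%s\n", GradNameForTrainer("w@GRAD").c_str());  // w@GRAD.trainer_1
  return 0;
}
```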
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
new file mode 100644
index 0000000000..84f24a9095
--- /dev/null
+++ b/paddle/operators/reduce_op.cc
@@ -0,0 +1,214 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class ReduceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReduceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReduceOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    if (dim < 0) dim = x_rank + dim;
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    bool reduce_all = ctx->Attrs().Get<bool>("reduce_all");
+    bool keep_dim = ctx->Attrs().Get<bool>("keep_dim");
+    if (reduce_all) {
+      if (keep_dim)
+        ctx->SetOutputDim(
+            "Out", framework::make_ddim(std::vector<int64_t>(x_rank, 1)));
+      else
+        ctx->SetOutputDim("Out", {1});
+    } else {
+      auto dims_vector = vectorize(x_dims);
+      if (keep_dim || x_rank == 1) {
+        dims_vector[dim] = 1;
+      } else {
+        dims_vector.erase(dims_vector.begin() + dim);
+      }
+      auto out_dims = framework::make_ddim(dims_vector);
+      ctx->SetOutputDim("Out", out_dims);
+      if (dim != 0) {
+        // Only pass LoD when not reducing on the first dim.
+        ctx->ShareLoD("X", /*->*/ "Out");
+      }
+    }
+  }
+};
+
+class ReduceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_rank = x_dims.size();
+    PADDLE_ENFORCE_LE(x_rank, 6, "Tensors with rank at most 6 are supported.");
+    int dim = ctx->Attrs().Get<int>("dim");
+    if (dim < 0) dim = x_rank + dim;
+    PADDLE_ENFORCE_LT(
+        dim, x_rank,
+        "The dim should be in the range [-rank(input), rank(input)).");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
+    }
+  }
+};
+
+class ReduceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReduceOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input tensor. Tensors with rank at most 6 are "
+             "supported.");
+    AddOutput("Out", "(Tensor) The result tensor.");
+    AddAttr<int>(
+        "dim",
+        "(int, default 0) The dimension to reduce. "
+        "Must be in the range [-rank(input), rank(input)). "
+        "If `dim < 0`, the dim to reduce is `rank + dim`. "
+        "Note that reducing on the first dim will make the LoD info lost.")
+        .SetDefault(0);
+    AddAttr<bool>("keep_dim",
+                  "(bool, default false) "
+                  "If true, retain the reduced dimension with length 1.")
+        .SetDefault(false);
+    AddAttr<bool>("reduce_all",
+                  "(bool, default false) "
+                  "If true, output a scalar reduced along all dimensions.")
+        .SetDefault(false);
+    comment_ = R"DOC(
+{ReduceOp} Operator.
+
+This operator computes the {reduce} of the input tensor along the given dimension.
+The result tensor has 1 fewer dimension than the input unless keep_dim is true.
+If reduce_all is true, just reduce along all dimensions and output a scalar.
+
+)DOC";
+    AddComment(comment_);
+  }
+
+ protected:
+  std::string comment_;
+
+  void Replace(std::string &src, const std::string &from,
+               const std::string &to) {
+    for (std::size_t pos = src.find(from); pos != std::string::npos;
+         pos = src.find(from, pos + to.size())) {
+      src.replace(pos, from.size(), to);
+    }
+  }
+
+  void SetComment(std::string name, std::string op) {
+    Replace(comment_, "{ReduceOp}", name);
+    Replace(comment_, "{reduce}", op);
+  }
+};
+
+class ReduceSumOpMaker : public ReduceOpMaker {
+ public:
+  ReduceSumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceSum", "sum");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMeanOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMean", "mean");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMaxOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMax", "max");
+    AddComment(comment_);
+  }
+};
+
+class ReduceMinOpMaker : public ReduceOpMaker {
+ public:
+  ReduceMinOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : ReduceOpMaker(proto, op_checker) {
+    SetComment("ReduceMin", "min");
+    AddComment(comment_);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
+            ops::ReduceGradOp);
+
+REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
+            reduce_mean_grad, ops::ReduceGradOp);
+
+REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
+            ops::ReduceGradOp);
+
+REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
+            ops::ReduceGradOp);
+
+#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)         \
+  REGISTER_OP_CPU_KERNEL(reduce_type,                                          \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           float, ops::functor>,               \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           double, ops::functor>,              \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           int, ops::functor>,                 \
+                         ops::ReduceKernel<paddle::platform::CPUDeviceContext, \
+                                           int64_t, ops::functor>);            \
+  REGISTER_OP_CPU_KERNEL(                                                      \
+      reduce_type##_grad,                                                      \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, float,         \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, double,        \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int,           \
+                            ops::grad_functor>,                                \
+      ops::ReduceGradKernel<paddle::platform::CPUDeviceContext, int64_t,       \
+                            ops::grad_functor>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
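The shape rule implemented by ReduceOp::InferShape is compact enough to restate as a stand-alone helper. The sketch below mirrors that logic with plain vectors (illustrative only; the names are not framework API):

```cpp
#include <cstdio>
#include <vector>

std::vector<long> ReduceOutDims(std::vector<long> x, int dim, bool keep_dim,
                                bool reduce_all) {
  long rank = static_cast<long>(x.size());
  if (reduce_all)
    return keep_dim ? std::vector<long>(rank, 1) : std::vector<long>{1};
  if (dim < 0) dim += rank;  // e.g. dim = -1 on rank 3 reduces dim 2
  if (keep_dim || rank == 1)
    x[dim] = 1;  // keep the reduced axis with length 1
  else
    x.erase(x.begin() + dim);  // drop the reduced axis
  return x;
}

int main() {
  auto d = ReduceOutDims({4, 5, 6}, 1, /*keep_dim=*/false, /*reduce_all=*/false);
  for (long v : d) std::printf("%ld ", v);  // prints: 4 6
  std::printf("\n");
  return 0;
}
```

Note the LoD caveat in the operator itself: the LoD is only propagated when dim != 0, because reducing the first dimension destroys the sequence layout it describes.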
diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu
new file mode 100644
index 0000000000..4ed1e051db
--- /dev/null
+++ b/paddle/operators/reduce_op.cu
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/reduce_op.h"
+
+namespace ops = paddle::operators;
+
+#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)    \
+  REGISTER_OP_CUDA_KERNEL(                                                \
+      reduce_type, ops::ReduceKernel<paddle::platform::CUDADeviceContext, \
+                                     float, ops::functor>,                \
+      ops::ReduceKernel<paddle::platform::CUDADeviceContext, double,      \
+                        ops::functor>,                                    \
+      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int,         \
+                        ops::functor>,                                    \
+      ops::ReduceKernel<paddle::platform::CUDADeviceContext, int64_t,     \
+                        ops::functor>);                                   \
+  REGISTER_OP_CUDA_KERNEL(                                                \
+      reduce_type##_grad,                                                 \
+      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, float,   \
+                            ops::grad_functor>,                           \
+      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,  \
+                            ops::grad_functor>,                           \
+      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,     \
+                            ops::grad_functor>,                           \
+      ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t, \
+                            ops::grad_functor>);
+
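+// As on the CPU side, instantiate CUDA kernels for each reduce functor
+// (reduce_sum/mean/max/min) listed in FOR_EACH_KERNEL_FUNCTOR.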
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
new file mode 100644
index 0000000000..da5f397776
--- /dev/null
+++ b/paddle/operators/reduce_op.h
@@ -0,0 +1,257 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
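+// Reduce functors are invoked as functor(place, x, y, dim); gradient
+// functors as functor(place, x, y, dx, dy, dim, size), where dim is the
+// Eigen dimension being reduced (or broadcast in the backward pass) and
+// size is the number of elements along that dimension.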
+struct SumFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.sum(dim);
+  }
+};
+
+struct SumGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    dx.device(place) = dy.broadcast(dim);
+  }
+};
+
+struct MeanFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.mean(dim);
+  }
+};
+
+struct MeanGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    dx.device(place) = dy.broadcast(dim) / dx.constant(size);
+  }
+};
+
+struct MaxFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.maximum(dim);
+  }
+};
+
+struct MinFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) {
+    y.device(place) = x.minimum(dim);
+  }
+};
+
+struct MaxOrMinGradFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename DX,
+            typename DY, typename Dim>
+  void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy,
+                  const Dim& dim, int size) {
+    auto equals = x == y.broadcast(dim);
+    auto ones = dx.constant(1);
+    auto zeros = dx.constant(0);
+    // If there are multiple minimum or maximum elements, the subgradient of
+    // each is the set [0, 1], and we pass the gradient to all of them here.
+    dx.device(place) = dy.broadcast(dim) * equals.select(ones, zeros);
+  }
+};
+
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      // Flatten the input to 1-D and reduce it to a scalar
+      auto* input = context.Input<Tensor>("X");
+      auto* output = context.Output<Tensor>("Out");
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input);
+      auto out = EigenScalar<T>::From(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto reduce_dim = Eigen::array<int, 1>({{0}});
+      Functor functor;
+      functor(place, x, out, reduce_dim);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
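+      // Eigen needs the tensor rank at compile time, so dispatch the
+      // runtime rank (1 to 6) to the templated ReduceCompute<D>.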
+      switch (rank) {
+        case 1:
+          ReduceCompute<1>(context);
+          break;
+        case 2:
+          ReduceCompute<2>(context);
+          break;
+        case 3:
+          ReduceCompute<3>(context);
+          break;
+        case 4:
+          ReduceCompute<4>(context);
+          break;
+        case 5:
+          ReduceCompute<5>(context);
+          break;
+        case 6:
+          ReduceCompute<6>(context);
+          break;
+      }
+    }
+  }
+
+ private:
+  template <size_t D>
+  void ReduceCompute(const framework::ExecutionContext& context) const {
+    auto* input = context.Input<Tensor>("X");
+    auto* output = context.Output<Tensor>("Out");
+    output->mutable_data<T>(context.GetPlace());
+
+    auto x = EigenTensor<T, D>::From(*input);
+    auto x_rank = static_cast<int>(x.dimensions().size());
+    int dim = static_cast<int>(context.Attr<int>("dim"));
+    if (dim < 0) dim = x_rank + dim;
+    auto reduce_dim = Eigen::array<int, 1>({{dim}});
+    // construct the squeezed output tensor
+    bool keep_dim = context.Attr<bool>("keep_dim");
+    DDim dims = output->dims();
+    auto dims_vector = vectorize(dims);
+    if (keep_dim && x_rank > 1) {
+      dims_vector.erase(dims_vector.begin() + dim);
+      dims = framework::make_ddim(dims_vector);
+    }
+
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    Functor functor;
+
+    if (D == 1) {
+      auto out = EigenScalar<T>::From(*output);
+      functor(place, x, out, reduce_dim);
+    } else {
+      auto out = EigenTensor<T, (D - 1)>::From(*output, dims);
+      functor(place, x, out, reduce_dim);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T, typename Functor>
+class ReduceGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    if (reduce_all) {
+      auto* input0 = context.Input<Tensor>("X");
+      auto* input1 = context.Input<Tensor>("Out");
+      auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+      auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+      output->mutable_data<T>(context.GetPlace());
+      auto x = EigenVector<T>::Flatten(*input0);
+      auto x_reduce = EigenVector<T>::From(*input1);
+      auto x_reduce_grad = EigenVector<T>::From(*input2);
+      auto x_grad = EigenVector<T>::Flatten(*output);
+      auto& place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      auto broadcast_dim =
+          Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
+      Functor functor;
+      functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+              broadcast_dim[0]);
+    } else {
+      int rank = context.Input<Tensor>("X")->dims().size();
+      switch (rank) {
+        case 1:
+          ReduceGradCompute<1>(context);
+          break;
+        case 2:
+          ReduceGradCompute<2>(context);
+          break;
+        case 3:
+          ReduceGradCompute<3>(context);
+          break;
+        case 4:
+          ReduceGradCompute<4>(context);
+          break;
+        case 5:
+          ReduceGradCompute<5>(context);
+          break;
+        case 6:
+          ReduceGradCompute<6>(context);
+          break;
+      }
+    }
+  }
+
+ private:
+  template <size_t D>
+  void ReduceGradCompute(const framework::ExecutionContext& context) const {
+    auto* input0 = context.Input<Tensor>("X");
+    auto* input1 = context.Input<Tensor>("Out");
+    auto* input2 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* output = context.Output<Tensor>(framework::GradVarName("X"));
+
+    output->mutable_data<T>(context.GetPlace());
+    auto x = EigenTensor<T, D>::From(*input0);
+    auto x_grad = EigenTensor<T, D>::From(*output);
+    auto x_rank = static_cast<int>(x.dimensions().size());
+    int dim = static_cast<int>(context.Attr<int>("dim"));
+    if (dim < 0) dim = x_rank + dim;
+    DDim dims = input0->dims();
+    dims[dim] = 1;
+    auto x_reduce = EigenTensor<T, D>::From(*input1, dims);
+    auto x_reduce_grad = EigenTensor<T, D>::From(*input2, dims);
+
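+    // Broadcast dY (and the saved output Y) along the reduced axis so they
+    // match the shape of X when computing dX.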
+    Eigen::array<int, D> broadcast_dim;
+    for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1;
+    broadcast_dim[dim] = input0->dims()[dim];
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    Functor functor;
+    functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim,
+            broadcast_dim[dim]);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
+  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
+  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
+  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
+  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor);
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
new file mode 100644
index 0000000000..3c30447949
--- /dev/null
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -0,0 +1,270 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+class ReorderLoDTensorByRankTableOpProtoMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
+                                          OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), the input lod tensor to be reordered according to "
+             "Input(RankTable).");
+    AddInput("RankTable",
+             "(LoDRankTable), the rank table according to which Input(X) is "
+             "reordered.");
+    AddOutput("Out", "(LoDTensor), the reordered lod tensor.");
+    AddComment(R"DOC(ReorderLoDTensorByRankTable operator.
+
+Input(X) is a batch of sequences. Input(RankTable) stores new orders of the
+input sequence batch. The reorder_lod_tensor_by_rank operator reorders the
+Input(X) according to the information provided by Input(RankTable).
+
+For example:
+
+If the indices stored in Input(RankTable) are [3, 0, 2, 1], Input(X) will be
+reordered so that its fourth sequence becomes the first one, followed by the
+original first, third, and second sequences.
+
+This is:
+X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1].
+Out =  [Seq3, Seq0, Seq2, Seq1] with a new LoD information.
+
+If the LoD information of Input(X) is empty, Input(X) is not sequence data.
+This is equivalent to a batch of sequences where each sequence has a fixed
+length of 1. In this case, the reorder_lod_tensor_by_rank operator reorders
+each slice of Input(X) along the first axis according to Input(RankTable).
+
+This is:
+X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The
+indices in RankTable are [3, 0, 2, 1].
+Out = [Slice3, Slice0, Slice2, Slice1], and no LoD information is appended.
+
+NOTE: This operator reorders Input(X) according to a given LoDRankTable that
+does not have to be computed from Input(X) itself; the table may be computed
+from a different sequence batch, and Input(X) is then sorted by it.
+
+)DOC");
+  }
+};
+
+class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
+ public:
+  ReorderLoDTensorByRankTableBase(const std::string &type,
+                                  const framework::VariableNameMap &inputs,
+                                  const framework::VariableNameMap &outputs,
+                                  const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto &x =
+        detail::Ref(scope.FindVar(Input("X")),
+                    "Cannot find input lod tensor variable %s", Input("X"))
+            .Get<framework::LoDTensor>();
+    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")),
+                                   "Cannot find input rank table variable %s",
+                                   Input("RankTable"))
+                           .Get<framework::LoDRankTable>();
+    auto &out =
+        *detail::Ref(scope.FindVar(Output("Out")),
+                     "Cannot find output lod tensor variable %s", Output("Out"))
+             .GetMutable<framework::LoDTensor>();
+
+    out.Resize(x.dims());
+    out.mutable_data(x.place(), x.type());
+    this->process(place, x, rank_table, &out);
+  }
+
+ protected:
+  virtual void process(const platform::Place &place,
+                       const framework::LoDTensor &x,
+                       const framework::LoDRankTable &rank_table,
+                       framework::LoDTensor *out) const = 0;
+
+  struct AbsoluteRankTableItem {
+    size_t offset;  // the absolute/accumulated offset.
+    size_t length;  // the length
+    framework::LoD lod;
+  };
+
+  std::vector<AbsoluteRankTableItem> GetAbsoluteOffsetAndLengthByLoDRankTable(
+      const framework::LoDTensor &x) const {
+    std::vector<AbsoluteRankTableItem> absolute_table;
+
+    if (x.lod().empty()) {
+      // For Tensor without lod, such as the output of sequence_pool_op
+      size_t size = x.dims()[0];
+      absolute_table.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        absolute_table.emplace_back();
+        absolute_table.back().length = 1;
+        absolute_table.back().offset = i;
+      }
+    } else {
+      size_t level = 0;
+      size_t size = x.lod()[level].size();
+
+      for (size_t i = 0; i < size - 1; ++i) {
+        auto lod_offset =
+            framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
+
+        auto &offset = lod_offset.second;
+
+        absolute_table.emplace_back();
+        absolute_table.back().length = offset.second - offset.first;
+        absolute_table.back().offset = offset.first;
+        absolute_table.back().lod = lod_offset.first;
+      }
+    }
+
+    return absolute_table;
+  }
+
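+  // Copies one sequence (its tensor slice plus LoD) from x into out at
+  // out_offset and returns the offset just past the copied slice.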
+  size_t CopyTensorAndLod(const platform::Place &place,
+                          const AbsoluteRankTableItem &item,
+                          const framework::LoDTensor &x,
+                          framework::LoDTensor *out, size_t out_offset) const {
+    auto &out_lod = *out->mutable_lod();
+    auto len = item.length;
+    auto x_offset = item.offset;
+
+    if (out_lod.empty()) {
+      for (size_t i = 0; i < item.lod.size(); ++i) {
+        out_lod.push_back(std::vector<size_t>({0}));
+      }
+    }
+
+    for (size_t i = 0; i < out_lod.size(); ++i) {
+      auto &out_v = out_lod[i];
+      auto &new_lod_v = item.lod[i];
+
+      for (auto &detail : new_lod_v) {
+        out_v.push_back(out_v.back() + detail);
+      }
+    }
+
+    auto x_sliced = x.Slice(x_offset, x_offset + len);
+    auto out_sliced = out->Slice(out_offset, out_offset + len);
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+    framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
+    out_offset += len;
+    return out_offset;
+  }
+};
+
+class ReorderLoDTensorByRankTableOp : public ReorderLoDTensorByRankTableBase {
+ public:
+  ReorderLoDTensorByRankTableOp(const std::string &type,
+                                const framework::VariableNameMap &inputs,
+                                const framework::VariableNameMap &outputs,
+                                const framework::AttributeMap &attrs)
+      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  void process(const platform::Place &place, const framework::LoDTensor &x,
+               const framework::LoDRankTable &rank_table,
+               framework::LoDTensor *out) const override {
+    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
+    size_t out_offset = 0;
+    out->mutable_lod()->clear();
+    for (auto &item : rank_table.items()) {
+      PADDLE_ENFORCE_LT(item.index, absolute_table.size());
+      out_offset = CopyTensorAndLod(place, absolute_table[item.index], x, out,
+                                    out_offset);
+    }
+  }
+};
+
+class IdentityInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ReorderLodTensorByRankGradOpMaker
+    : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("reorder_lod_tensor_by_rank_grad");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetInput("RankTable", Input("RankTable"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class ReorderLoDTensorByRankGradOp : public ReorderLoDTensorByRankTableBase {
+ public:
+  ReorderLoDTensorByRankGradOp(const std::string &type,
+                               const framework::VariableNameMap &inputs,
+                               const framework::VariableNameMap &outputs,
+                               const framework::AttributeMap &attrs)
+      : ReorderLoDTensorByRankTableBase(type, inputs, outputs, attrs) {}
+
+ protected:
+  void process(const platform::Place &place, const framework::LoDTensor &x,
+               const framework::LoDRankTable &rank_table,
+               framework::LoDTensor *out) const override {
+    auto absolute_table = GetAbsoluteOffsetAndLengthByLoDRankTable(x);
+
+    // offsets = enumerate([item.index for item in rank_table.items()])
+    std::vector<std::pair<size_t, size_t>> offsets;
+    offsets.reserve(rank_table.items().size());
+    for (size_t i = 0; i < rank_table.items().size(); ++i) {
+      offsets.push_back({i, rank_table.items()[i].index});
+    }
+
+    // offsets.sort(key=lambda x: x[1])
+    std::sort(
+        offsets.begin(), offsets.end(),
+        [](const std::pair<size_t, size_t> &a,
+           const std::pair<size_t, size_t> &b) { return a.second < b.second; });
+
+    // Copy TensorAndLod
+    size_t out_offset = 0;
+    for (auto &offset : offsets) {
+      out_offset = this->CopyTensorAndLod(place, absolute_table[offset.first],
+                                          x, out, out_offset);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(reorder_lod_tensor_by_rank,
+                  ops::ReorderLoDTensorByRankTableOp,
+                  ops::ReorderLodTensorByRankGradOpMaker,
+                  ops::ReorderLoDTensorByRankTableOpProtoMaker,
+                  ops::IdentityInferShape);
+REGISTER_OPERATOR(reorder_lod_tensor_by_rank_grad,
+                  ops::ReorderLoDTensorByRankGradOp, ops::IdentityInferShape);
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
new file mode 100644
index 0000000000..b9743a5df1
--- /dev/null
+++ b/paddle/operators/reshape_op.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/reshape_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ReshapeOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs,
+            const framework::VariableNameMap &outputs,
+            const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // input check
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ReshapeOp should not be null.");
+
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty.");
+    auto x_dims = ctx->GetInputDim("X");
+
+    std::vector<size_t> neg_dims_idx;
+    // a dimension may be set to -1 to mark its size as unknown
+    const int unknown_size = -1;
+    for (size_t i = 0; i < shape.size(); ++i) {
+      PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size,
+                     "Each dimension of Attr(shape) must be positive or %d.",
+                     unknown_size);
+      if (shape[i] == unknown_size) {
+        neg_dims_idx.push_back(i);
+        PADDLE_ENFORCE(neg_dims_idx.size() <= 1,
+                       "Only one dimension of Attr(shape) can be unknown.");
+      }
+    }
+
+    int64_t capacity =
+        std::accumulate(shape.begin(), shape.end(), 1, std::multiplies<int>());
+    int64_t in_size = framework::product(x_dims);
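+    // e.g. with in_size = 24 and shape = [2, -1, 4], capacity is -8, so the
+    // unknown dimension is inferred as 24 / 8 = 3.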
+    if (neg_dims_idx.size() == 1) {
+      // dim infer
+      shape[neg_dims_idx[0]] = in_size / (-capacity);
+      // recalculate capacity
+      capacity = shape[neg_dims_idx[0]] * (-capacity);
+    }
+    // capacity check
+    PADDLE_ENFORCE(capacity == in_size,
+                   "The size of Input(X) does not match Attr(shape).");
+    // resize output
+    std::vector<int64_t> shape_int64(shape.size(), 0);
+    std::transform(shape.begin(), shape.end(), shape_int64.begin(),
+                   [](int a) { return static_cast<int64_t>(a); });
+    auto out_dims = framework::make_ddim(shape_int64);
+    ctx->SetOutputDim("Out", out_dims);
+    if (shape[0] == x_dims[0]) {
+      // Only pass LoD when the first dimension is equal between
+      // output and input.
+      ctx->ShareLoD("X", /*->*/ "Out");
+    }
+  }
+};
+
+class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input tensor of reshape operator.");
+    AddOutput("Out", "The output tensor of reshape operator.");
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "Target shape of reshape operator.");
+    AddComment(R"DOC(
+Reshape Operator.
+
+Reshape Input(X) into the shape specified by Attr(shape).
+
+An example:
+Given a 2-D tensor X with 2 rows and 2 columns: [[1, 2], [3, 4]]
+
+and target shape = [1, 4], the reshape operator will transform
+the tensor X into a 2-D tensor: [[1, 2, 3, 4]]
+
+One dimension in the target shape can be set to -1, indicating that its
+size is unknown. In this case, the real dimension will be inferred from
+the original shape of Input(X) and the other dimensions in the target shape.
+)DOC");
+  }
+};
+
+class ReshapeGradOp : public framework::OperatorWithKernel {
+ public:
+  ReshapeGradOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+
+REGISTER_OP(reshape, ops::ReshapeOp, ops::ReshapeOpMaker, reshape_grad,
+            ops::ReshapeGradOp);
+REGISTER_OP_CPU_KERNEL(reshape,
+                       ops::ReshapeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    reshape_grad, ops::ReshapeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu
new file mode 100644
index 0000000000..f487e43b99
--- /dev/null
+++ b/paddle/operators/reshape_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/reshape_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    reshape,
+    paddle::operators::ReshapeKernel<paddle::platform::CUDAPlace, float>);
+REGISTER_OP_CUDA_KERNEL(
+    reshape_grad,
+    paddle::operators::ReshapeGradKernel<paddle::platform::CUDAPlace, float>);
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
new file mode 100644
index 0000000000..d884b03cad
--- /dev/null
+++ b/paddle/operators/reshape_op.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class ReshapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto out_dims = out->dims();
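+    // framework::Copy resets Out's dims to those of X, so cache the inferred
+    // output dims here and restore them after the copy.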
+    out->mutable_data<T>(ctx.GetPlace());
+    framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out);
+    out->Resize(out_dims);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class ReshapeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const {
+    auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+    d_x->mutable_data<T>(ctx.GetPlace());
+
+    auto in_dims = d_x->dims();
+    framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+    d_x->Resize(in_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
new file mode 100644
index 0000000000..f7c250bf91
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/rmsprop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RmspropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
+                   "Input(MeanSquare) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of RmspropOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
+                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and Grad inputs of RmspropOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
+                      "Param and Moment inputs of RmspropOp "
+                      "should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
+                      "Param and MeanSquare inputs of RmspropOp "
+                      "should have the same dimension.");
+
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("MomentOut", param_dim);
+    ctx->SetOutputDim("MeanSquareOut", param_dim);
+  }
+};
+
+class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RmspropOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("MeanSquare",
+             "(Tensor, default Tensor<float>)"
+             " The mean square value that gets updated.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("Moment",
+             "(Tensor, default Tensor<float>) The moment that gets updated.");
+
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    AddOutput("MomentOut", "(Tensor) Output updated moment.");
+    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value.");
+
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-10) Constant "
+                   "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    AddAttr<float>("decay",
+                   "(float, default 0.9) "
+                   "Discounting factor for the squared-gradient moving average.")
+        .SetDefault(0.9f);
+    AddAttr<float>("momentum", "(float, default 0.0) Constant value.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+Rmsprop Optimizer. 
+
+$$
+MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad \\
+MomentOut = momentum * Moment +
+            \frac{LearningRate * Grad}{\sqrt{MeanSquareOut + epsilon}} \\
+ParamOut = Param - MomentOut
+$$
+
+The original slides that proposed Rmsprop: Slide 29 of
+http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu
new file mode 100644
index 0000000000..0295dc262f
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/rmsprop_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    rmsprop, ops::RmspropOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h
new file mode 100644
index 0000000000..16a561835d
--- /dev/null
+++ b/paddle/operators/rmsprop_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class RmspropOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* moment_out = ctx.Output<Tensor>("MomentOut");
+    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    moment_out->mutable_data<T>(ctx.GetPlace());
+    mean_square_out->mutable_data<T>(ctx.GetPlace());
+
+    float epsilon = ctx.Attr<float>("epsilon");
+    float rho = ctx.Attr<float>("decay");
+    float momentum = ctx.Attr<float>("momentum");
+
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto mom_out = EigenVector<T>::Flatten(*moment_out);
+    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
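+    // RMSProp update: maintain a moving average of the squared gradient,
+    // then take a momentum step scaled by lr / sqrt(ms_out + epsilon).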
+    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+    mom_out.device(place) =
+        momentum * mom +
+        lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    p_out.device(place) = p - mom_out;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rnn_memory_helper_op.cc b/paddle/operators/rnn_memory_helper_op.cc
new file mode 100644
index 0000000000..eb55ed6a05
--- /dev/null
+++ b/paddle/operators/rnn_memory_helper_op.cc
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+class RNNMemoryHelperOp : public framework::OperatorBase {
+ public:
+  RNNMemoryHelperOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto mem_var_name = Input("X");
+    auto *mem_var = scope.FindVar(mem_var_name);
+    PADDLE_ENFORCE(mem_var != nullptr,
+                   "Cannot find mem_var in scope, mem_var_name is %s",
+                   mem_var_name);
+
+    auto out_name = this->Output("Out");
+    auto *out_var = scope.FindVar(out_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot find out_var in scope, out_var_name is %s",
+                   out_name);
+
+    auto *out_tensor = out_var->GetMutable<framework::LoDTensor>();
+    auto &mem_tensor = mem_var->Get<framework::LoDTensor>();
+    out_tensor->ShareDataWith(mem_tensor);
+    out_tensor->set_lod(mem_tensor.lod());
+  }
+};
+
+class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class RNNMemoryHelperOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RNNMemoryHelperOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddOutput("Out", "");
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddComment("");
+  }
+};
+
+class RNNMemoryHelperGradOp : public framework::OperatorBase {
+ public:
+  RNNMemoryHelperGradOp(const std::string &type,
+                        const framework::VariableNameMap &inputs,
+                        const framework::VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto out_grad_var_name = Input(framework::GradVarName("Out"));
+    auto *out_grad_var = scope.FindVar(out_grad_var_name);
+
+    auto in_grad_var_name = Output(framework::GradVarName("X"));
+    auto *in_grad_var = scope.FindVar(in_grad_var_name);
+    PADDLE_ENFORCE(in_grad_var != nullptr,
+                   "Cannot find in_grad_var in scope, name is %s",
+                   in_grad_var_name);
+
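+    // Out@GRAD may be absent (e.g. the memory is unused downstream); in that
+    // case start from an all-zero gradient with X's shape and dtype.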
+    if (out_grad_var == nullptr) {
+      VLOG(5) << "Using fill constant 0 as starting gradient";
+      auto in_var_name = Input("X");
+      auto *in_var = scope.FindVar(in_var_name);
+      auto &in_var_tensor = in_var->Get<framework::LoDTensor>();
+
+      framework::AttributeMap attrs;
+      attrs["dtype"] = framework::ToDataType(in_var_tensor.type());
+      attrs["shape"] = framework::vectorize2int(in_var_tensor.dims());
+      attrs["value"] = 0.0f;
+
+      auto zero_op = framework::OpRegistry::CreateOp(
+          "fill_constant", {}, {{"Out", {in_grad_var_name}}}, attrs);
+      zero_op->Run(scope, dev_place);
+    } else {
+      auto &out_grad_tensor = out_grad_var->Get<framework::LoDTensor>();
+      auto *in_grad_tensor = in_grad_var->GetMutable<framework::LoDTensor>();
+      in_grad_tensor->ShareDataWith(out_grad_tensor);
+      in_grad_tensor->set_lod(out_grad_tensor.lod());
+    }
+  }
+};
+
+class RNNMemoryHelperGradOpInfoMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  RNNMemoryHelperGradOpInfoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(framework::GradVarName("Out"), "");
+    AddInput("X", "");
+    AddInput("Out", "");
+    AddOutput(framework::GradVarName("X"), "");
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::DataType::FP32);
+    AddComment("");
+  }
+};
+
+class RNNMemoryHelperGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    auto x_grad_name = framework::GradVarName("X");
+    PADDLE_ENFORCE(ctx->HasOutput(x_grad_name), "");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "");
+    ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ x_grad_name);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(rnn_memory_helper, paddle::operators::RNNMemoryHelperOp,
+                  paddle::operators::RNNMemoryHelperOpInfoMaker,
+                  paddle::operators::RNNMemoryHelperOpShapeInference,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(rnn_memory_helper_grad,
+                  paddle::operators::RNNMemoryHelperGradOp,
+                  paddle::operators::RNNMemoryHelperGradOpInfoMaker,
+                  paddle::operators::RNNMemoryHelperGradOpShapeInference);
diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc
new file mode 100644
index 0000000000..a7351f11c5
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cc
@@ -0,0 +1,165 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kROISize = 5;
+
+class ROIPoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("ROIs"),
+                   "Input(ROIs) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ROIPoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Argmax"),
+                   "Output(Argmax) of ROIPoolOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+    auto rois_dims = ctx->GetInputDim("ROIs");
+
+    PADDLE_ENFORCE(input_dims.size() == 4,
+                   "The format of input tensor is NCHW.");
+    PADDLE_ENFORCE(rois_dims.size() == 2,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5) "
+                   "given as [[batch_id, x1, y1, x2, y2], ...].");
+    PADDLE_ENFORCE(rois_dims[1] == kROISize,
+                   "ROIs should be a 2-D tensor of shape (num_rois, 5) "
+                   "given as [[batch_id, x1, y1, x2, y2], ...].");
+
+    int pooled_height = ctx->Attrs().Get<int>("pooled_height");
+    int pooled_width = ctx->Attrs().Get<int>("pooled_width");
+    float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
+
+    PADDLE_ENFORCE_GT(pooled_height, 0,
+                      "The pooled output height must be greater than 0.");
+    PADDLE_ENFORCE_GT(pooled_width, 0,
+                      "The pooled output width must be greater than 0.");
+    PADDLE_ENFORCE_GT(spatial_scale, 0.0f,
+                      "The spatial scale must be greater than 0.");
+
+    auto out_dims = input_dims;
+    out_dims[0] = rois_dims[0];
+    out_dims[1] = input_dims[1];
+    out_dims[2] = pooled_height;
+    out_dims[3] = pooled_width;
+
+    ctx->SetOutputDim("Out", out_dims);
+    ctx->SetOutputDim("Argmax", out_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ROIPoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor), "
+             "the input of ROIPoolOp. "
+             "The format of input tensor is NCHW. Where N is batch size, "
+             "C is the number of input channels, "
+             "H is the height of the feature, and "
+             "W is the width of the feature.");
+    AddInput("ROIs",
+             "(Tensor), "
+             "ROIs (Regions of Interest) to pool over. "
+             "Should be a 2-D tensor of shape (num_rois, 5) "
+             "given as [[batch_id, x1, y1, x2, y2], ...]. "
+             "Where batch_id is the id of the data, "
+             "(x1, y1) are the top left coordinates, and "
+             "(x2, y2) are the bottom right coordinates.");
+    AddOutput("Out",
+              "(Tensor), "
+              "The output of ROIPoolOp is a 4-D tensor with shape "
+              "(num_rois, channels, pooled_h, pooled_w).");
+    AddOutput("Argmax",
+              "(Tensor), "
+              "Argmaxes corresponding to indices in X used "
+              "for gradient computation. Only output "
+              "if arg \"is_test\" is false.")
+        .AsIntermediate();
+    AddAttr<float>("spatial_scale",
+                   "(float, default 1.0), "
+                   "Multiplicative spatial scale factor "
+                   "to translate ROI coords from their input scale "
+                   "to the scale used when pooling.")
+        .SetDefault(1.0);
+    AddAttr<int>("pooled_height",
+                 "(int, default 1), "
+                 "The pooled output height.")
+        .SetDefault(1);
+    AddAttr<int>("pooled_width",
+                 "(int, default 1), "
+                 "The pooled output width.")
+        .SetDefault(1);
+    AddComment(R"DOC(
+ROIPool operator
+
+ROI Pooling for Faster-RCNN. The link below provides a further introduction:
+https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad,
+            ops::ROIPoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIPoolOpKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    roi_pool_grad,
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::CPUROIPoolGradOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu
new file mode 100644
index 0000000000..a874befe4d
--- /dev/null
+++ b/paddle/operators/roi_pool_op.cu
@@ -0,0 +1,209 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/roi_pool_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+static constexpr int kNumCUDAThreads = 512;
+static constexpr int kNumMaximumNumBlocks = 4096;
+static constexpr int kROISize = 5;
+
+static inline int NumBlocks(const int N) {
+  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
+                  kNumMaximumNumBlocks);
+}
+
+template <typename T>
+__global__ void GPUROIPoolForward(const int nthreads, const T* input_data,
+                                  const int64_t* input_rois,
+                                  const float spatial_scale, const int channels,
+                                  const int height, const int width,
+                                  const int pooled_height,
+                                  const int pooled_width, T* output_data,
+                                  int64_t* argmax_data) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (; index < nthreads; index += offset) {
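+    // Decompose the linear output index into (n, c, ph, pw).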
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int roi_start_w = round(offset_input_rois[1] * spatial_scale);
+    int roi_start_h = round(offset_input_rois[2] * spatial_scale);
+    int roi_end_w = round(offset_input_rois[3] * spatial_scale);
+    int roi_end_h = round(offset_input_rois[4] * spatial_scale);
+
+    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
+    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
+    T bin_size_h = static_cast<T>(roi_height) / static_cast<T>(pooled_height);
+    T bin_size_w = static_cast<T>(roi_width) / static_cast<T>(pooled_width);
+
+    int hstart = static_cast<int>(floor(static_cast<T>(ph) * bin_size_h));
+    int wstart = static_cast<int>(floor(static_cast<T>(pw) * bin_size_w));
+    int hend = static_cast<int>(ceil(static_cast<T>(ph + 1) * bin_size_h));
+    int wend = static_cast<int>(ceil(static_cast<T>(pw + 1) * bin_size_w));
+
+    hstart = min(max(hstart + roi_start_h, 0), height);
+    hend = min(max(hend + roi_start_h, 0), height);
+    wstart = min(max(wstart + roi_start_w, 0), width);
+    wend = min(max(wend + roi_start_w, 0), width);
+    bool is_empty = (hend <= hstart) || (wend <= wstart);
+
+    T maxval = is_empty ? 0 : -std::numeric_limits<T>::max();
+    int maxidx = -1;
+    const T* offset_input_data =
+        input_data + (roi_batch_ind * channels + c) * height * width;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_data_index = h * width + w;
+        if (offset_input_data[input_data_index] > maxval) {
+          maxval = offset_input_data[input_data_index];
+          maxidx = input_data_index;
+        }
+      }
+    }
+    output_data[index] = maxval;
+    if (argmax_data) {
+      argmax_data[index] = maxidx;
+    }
+  }
+}
+
+template <typename T>
+__global__ void GPUROIPoolBackward(
+    const int nthreads, const int64_t* input_rois, const T* output_grad,
+    const int64_t* argmax_data, const int num_rois, const float spatial_scale,
+    const int channels, const int height, const int width,
+    const int pooled_height, const int pooled_width, T* input_grad) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (; index < nthreads; index += offset) {
+    int pw = index % pooled_width;
+    int ph = (index / pooled_width) % pooled_height;
+    int c = (index / pooled_width / pooled_height) % channels;
+    int n = index / pooled_width / pooled_height / channels;
+
+    const int64_t* offset_input_rois = input_rois + n * kROISize;
+    int roi_batch_ind = offset_input_rois[0];
+    int input_offset = (roi_batch_ind * channels + c) * height * width;
+    int output_offset = (n * channels + c) * pooled_height * pooled_width;
+    const T* offset_output_grad = output_grad + output_offset;
+    T* offset_input_grad = input_grad + input_offset;
+    const int64_t* offset_argmax_data = argmax_data + output_offset;
+
+    int argmax = offset_argmax_data[ph * pooled_width + pw];
+    if (argmax != -1) {
+      platform::CudaAtomicAdd(
+          offset_input_grad + argmax,
+          static_cast<T>(offset_output_grad[ph * pooled_width + pw]));
+    }
+  }
+}
+
+template <typename Place, typename T>
+class GPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* out = ctx.Output<Tensor>("Out");
+    auto* argmax = ctx.Output<Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+
+    size_t rois_num = rois->dims()[0];
+    if (rois_num == 0) return;
+
+    int output_size = out->numel();
+    int blocks = NumBlocks(output_size);
+    int threads = kNumCUDAThreads;
+
+    GPUROIPoolForward<
+        T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+        output_size, in->data<T>(), rois->data<int64_t>(), spatial_scale,
+        channels, height, width, pooled_height, pooled_width,
+        out->mutable_data<T>(ctx.GetPlace()),
+        argmax->mutable_data<int64_t>(ctx.GetPlace()));
+  }
+};
+
+template <typename Place, typename T>
+class GPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<Tensor>("X");
+    auto* rois = ctx.Input<Tensor>("ROIs");
+    auto* argmax = ctx.Input<Tensor>("Argmax");
+
+    auto* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    size_t rois_num = rois->dims()[0];
+    int channels = in->dims()[1];
+    int height = in->dims()[2];
+    int width = in->dims()[3];
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<Place, T> set_zero;
+      set_zero(ctx.cuda_device_context(), x_grad, static_cast<T>(0));
+
+      int output_grad_size = out_grad->numel();
+      int blocks = NumBlocks(output_grad_size);
+      int threads = kNumCUDAThreads;
+
+      if (output_grad_size > 0) {
+        GPUROIPoolBackward<
+            T><<<blocks, threads, 0, ctx.cuda_device_context().stream()>>>(
+            output_grad_size, rois->data<int64_t>(), out_grad->data<T>(),
+            argmax->data<int64_t>(), rois_num, spatial_scale, channels, height,
+            width, pooled_height, pooled_width,
+            x_grad->mutable_data<T>(ctx.GetPlace()));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    roi_pool,
+    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIPoolOpKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    roi_pool_grad,
+    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GPUROIPoolGradOpKernel<paddle::platform::CUDADeviceContext, double>);
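
[Editorial sketch] Both kernels above decode a flat output index into (n, c, ph, pw) coordinates and recompute the pooling bin per thread. A minimal host-side sketch of that arithmetic, with hypothetical names and not part of the operator, can be used to cross-check kernel output on the CPU:

// Host-side mirror of the index decomposition and bin computation in
// GPUROIPoolForward. All names here are illustrative only.
#include <algorithm>
#include <cmath>

struct RoiBin {
  int n, c, ph, pw;                // ROI index, channel, pooled row, pooled col
  int hstart, hend, wstart, wend;  // input-space bin boundaries
};

inline RoiBin DecodeRoiPoolIndex(int index, int channels, int height, int width,
                                 int pooled_height, int pooled_width,
                                 int roi_start_h, int roi_start_w,
                                 int roi_height, int roi_width) {
  RoiBin b;
  b.pw = index % pooled_width;
  b.ph = (index / pooled_width) % pooled_height;
  b.c = (index / pooled_width / pooled_height) % channels;
  b.n = index / pooled_width / pooled_height / channels;

  float bin_size_h = static_cast<float>(roi_height) / pooled_height;
  float bin_size_w = static_cast<float>(roi_width) / pooled_width;
  // Clip each bin to the feature map, exactly as the kernel does.
  b.hstart = std::min(std::max(static_cast<int>(std::floor(b.ph * bin_size_h)) +
                                   roi_start_h, 0), height);
  b.hend = std::min(std::max(static_cast<int>(std::ceil((b.ph + 1) * bin_size_h)) +
                                 roi_start_h, 0), height);
  b.wstart = std::min(std::max(static_cast<int>(std::floor(b.pw * bin_size_w)) +
                                   roi_start_w, 0), width);
  b.wend = std::min(std::max(static_cast<int>(std::ceil((b.pw + 1) * bin_size_w)) +
                                 roi_start_w, 0), width);
  return b;
}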
diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h
new file mode 100644
index 0000000000..09a9d3d870
--- /dev/null
+++ b/paddle/operators/roi_pool_op.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class CPUROIPoolOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    auto* argmax = ctx.Output<framework::Tensor>("Argmax");
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+    auto spatial_scale = ctx.Attr<float>("spatial_scale");
+
+    auto in_dims = in->dims();
+    int batch_size = in_dims[0];
+    int channels = in_dims[1];
+    int height = in_dims[2];
+    int width = in_dims[3];
+    int rois_num = rois->dims()[0];
+
+    auto in_stride = framework::stride(in_dims);
+    auto argmax_stride = framework::stride(argmax->dims());
+    auto roi_stride = framework::stride(rois->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    const T* input_data = in->data<T>();
+    const int64_t* rois_data = rois->data<int64_t>();
+    T* output_data = out->mutable_data<T>(ctx.GetPlace());
+    int64_t* argmax_data = argmax->mutable_data<int64_t>(ctx.GetPlace());
+
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      PADDLE_ENFORCE_GE(roi_batch_id, 0);
+      PADDLE_ENFORCE_LT(roi_batch_id, batch_size);
+      rois_data += roi_stride[0];
+    }
+
+    rois_data = rois->data<int64_t>();
+    for (int n = 0; n < rois_num; ++n) {
+      int roi_batch_id = rois_data[0];
+      int roi_start_w = round(rois_data[1] * spatial_scale);
+      int roi_start_h = round(rois_data[2] * spatial_scale);
+      int roi_end_w = round(rois_data[3] * spatial_scale);
+      int roi_end_h = round(rois_data[4] * spatial_scale);
+
+      // Force malformed ROIs to be 1x1
+      int roi_height = std::max(roi_end_h - roi_start_h + 1, 1);
+      int roi_width = std::max(roi_end_w - roi_start_w + 1, 1);
+
+      const float bin_size_h =
+          static_cast<float>(roi_height) / static_cast<float>(pooled_height);
+      const float bin_size_w =
+          static_cast<float>(roi_width) / static_cast<float>(pooled_width);
+
+      const T* batch_data = input_data + roi_batch_id * in_stride[0];
+
+      for (int c = 0; c < channels; ++c) {
+        for (int ph = 0; ph < pooled_height; ++ph) {
+          for (int pw = 0; pw < pooled_width; ++pw) {
+            //  Compute pooling region for this output unit:
+            //  start (included) = floor(ph * roi_height / pooled_height_)
+            //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
+            int hstart =
+                static_cast<int>(floor(static_cast<float>(ph) * bin_size_h));
+            int wstart =
+                static_cast<int>(floor(static_cast<float>(pw) * bin_size_w));
+            int hend =
+                static_cast<int>(ceil(static_cast<float>(ph + 1) * bin_size_h));
+            int wend =
+                static_cast<int>(ceil(static_cast<float>(pw + 1) * bin_size_w));
+
+            hstart = std::min(std::max(hstart + roi_start_h, 0), height);
+            hend = std::min(std::max(hend + roi_start_h, 0), height);
+            wstart = std::min(std::max(wstart + roi_start_w, 0), width);
+            wend = std::min(std::max(wend + roi_start_w, 0), width);
+
+            const int pool_index = ph * pooled_width + pw;
+
+            // Define an empty pooling region to be zero
+            bool is_empty = (hend <= hstart) || (wend <= wstart);
+            output_data[pool_index] =
+                is_empty ? 0 : -std::numeric_limits<T>::max();
+            argmax_data[pool_index] = -1;
+
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int index = h * width + w;
+                if (batch_data[index] > output_data[pool_index]) {
+                  output_data[pool_index] = batch_data[index];
+                  argmax_data[pool_index] = index;
+                }
+              }
+            }
+          }
+        }
+
+        batch_data += in_stride[1];
+        output_data += out_stride[1];
+        argmax_data += argmax_stride[1];
+      }
+      // Increment ROI data pointer
+      rois_data += roi_stride[0];
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class CPUROIPoolGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto* rois = ctx.Input<framework::Tensor>("ROIs");
+    auto* argmax = ctx.Input<framework::Tensor>("Argmax");
+    auto* out_grad =
+        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
+
+    auto pooled_height = ctx.Attr<int>("pooled_height");
+    auto pooled_width = ctx.Attr<int>("pooled_width");
+
+    if (in_grad) {
+      const int64_t* rois_data = rois->data<int64_t>();
+      const T* out_grad_data = out_grad->data<T>();
+      const int64_t* argmax_data = argmax->data<int64_t>();
+      T* in_grad_data = in_grad->mutable_data<T>(ctx.GetPlace());
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), in_grad,
+               static_cast<T>(0));
+
+      auto in_stride = framework::stride(in->dims());
+      auto argmax_stride = framework::stride(argmax->dims());
+      auto roi_stride = framework::stride(rois->dims());
+      auto out_stride = framework::stride(out_grad->dims());
+
+      int rois_num = rois->dims()[0];
+      int channels = in->dims()[1];
+
+      for (int n = 0; n < rois_num; ++n) {
+        int roi_batch_idx = rois_data[0];
+        T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0];
+        for (int c = 0; c < channels; ++c) {
+          for (int ph = 0; ph < pooled_height; ++ph) {
+            for (int pw = 0; pw < pooled_width; ++pw) {
+              int pool_index = ph * pooled_width + pw;
+              if (argmax_data[pool_index] >= 0) {
+                auto index = argmax_data[pool_index];
+                batch_grad_data[index] += out_grad_data[pool_index];
+              }
+            }
+          }
+          batch_grad_data += in_stride[1];
+          out_grad_data += out_stride[1];
+          argmax_data += argmax_stride[1];
+        }
+        rois_data += roi_stride[0];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
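
[Editorial sketch] The Argmax output is what makes the backward pass cheap: each output gradient flows back to exactly the one input element that won the max. A minimal sketch of that scatter for a single channel, with illustrative names:

// Argmax-based gradient scatter, the idea behind both ROI pool backward
// kernels above. Names are illustrative only.
#include <vector>

void ScatterRoiPoolGrad(const std::vector<float>& out_grad,  // pooled_h * pooled_w
                        const std::vector<long>& argmax,     // same shape, -1 if bin empty
                        std::vector<float>& in_grad) {       // height * width, pre-zeroed
  for (size_t i = 0; i < out_grad.size(); ++i) {
    if (argmax[i] >= 0) {
      in_grad[argmax[i]] += out_grad[i];  // accumulate: bins and ROIs may overlap
    }
  }
}

On the GPU the same accumulation goes through platform::CudaAtomicAdd, since bins from overlapping ROIs can target the same input element concurrently.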
diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc
new file mode 100644
index 0000000000..68f4e35315
--- /dev/null
+++ b/paddle/operators/row_conv_op.cc
@@ -0,0 +1,259 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/row_conv_op.h"
+#include "paddle/framework/eigen.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using framework::Tensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class RowConvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of RowConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) of RowConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of RowConvOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Filter)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(
+        x_dims[1], filter_dims[1],
+        "The 2nd dimension of Input(X) and Input(Filter) should be same.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class RowConvGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of output(Out) should not be null.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      auto x_dims = ctx->GetInputDim("X");
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+
+    auto filter_grad_name = framework::GradVarName("Filter");
+    if (ctx->HasOutput(filter_grad_name)) {
+      auto filter_dims = ctx->GetInputDim("Filter");
+      ctx->SetOutputDim(filter_grad_name, filter_dims);
+    }
+  }
+};
+
+class RowConvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RowConvOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), the input(X) is a LodTensor, which supports "
+             "variable time-length input sequences. The underlying tensor "
+             "in this LoDTensor is a matrix with shape (T x N), where T "
+             "is the total time steps in this mini-batch and N is the input "
+             "data dimension.");
+    AddInput("Filter",
+             "(Tensor), the input(Filter) is a learnable parameter. It "
+             "is a 2-D tensor with shape (future_context x N), where, "
+             "future_context is the future context length and N is the data "
+             "dimension.");
+    AddOutput("Out",
+              "(LoDTensor), the output(Out) is a LodTensor, which supports "
+              "variable time-length input sequences. The underlying tensor "
+              "in this LodTensor is a matrix with shape T x N, i.e., the "
+              "same shape as X.");
+    AddComment(R"DOC(
+Row-convolution Operator.
+
+The row convolution is also called lookahead convolution. This operator was
+introduced in the following paper for DeepSpeech2:
+http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf
+
+The main motivation is that a bidirectional RNN, useful in DeepSpeech-like
+speech models, learns representations for a sequence by performing a
+forward and a backward pass through the entire sequence. However, unlike
+unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online
+and low-latency setting. The lookahead convolution incorporates information
+from future subsequences in a computationally efficient manner to improve
+unidirectional recurrent neural networks. The row convolution operator is
+different from the 1D sequence convolution, and is computed as follows:
+
+Given an input sequence $in$ of length $t$ and input dimension $d$,
+and a filter ($W$) of size $context \times d$,
+the output sequence is convolved as:
+
+$$
+out_{i, :} = \sum_{j=i}^{i + context - 1} in_{j,:} \odot W_{j-i, :}
+$$
+
+where $\odot$ denotes the element-wise product along the $d$ input dimensions.
+
+)DOC");
+  }
+};
+
+template <typename T>
+class RowConvKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<LoDTensor>("X");
+    auto *filter = context.Input<Tensor>("Filter");
+    auto *out = context.Output<LoDTensor>("Out");
+
+    out->mutable_data<T>(context.GetPlace());
+
+    auto batch_indices = x->lod()[0];
+    auto input_dim = x->dims()[1];  // 'x' is of size T x N
+    size_t num_sequence = batch_indices.size() - 1;
+
+    auto future_context = filter->dims()[0];
+    auto weights = EigenMatrix<T>::From(*filter);
+
+    for (size_t i = 0; i < num_sequence; i++) {
+      int start = static_cast<int>(batch_indices[i]);
+      int end = static_cast<int>(batch_indices[i + 1]);
+      int current_timesteps = end - start;
+      Tensor cur_input_sequence =
+          x->Slice(start, end);  // Current input sequence
+      Tensor cur_output_sequence =
+          out->Slice(start, end);  // Current output sequence
+      auto cip_seq = EigenMatrix<T>::From(cur_input_sequence);
+      auto cot_seq = EigenMatrix<T>::From(cur_output_sequence);
+
+      for (int k = 0; k < current_timesteps;
+           k++) {  // For different time steps in the same sequence
+        for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+             w++) {
+          for (int d = 0; d < input_dim; d++) {
+            if (w == 0) {
+              cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d);
+            } else {
+              cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class RowConvGradKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *x = context.Input<LoDTensor>("X");
+    auto *filter = context.Input<Tensor>("Filter");
+    auto *d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto *dx = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto *d_filter = context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    auto input_dim = x->dims()[1];  // 'x' is of size T x N
+    auto batch_indices = x->lod()[0];
+    size_t num_sequence = batch_indices.size() - 1;
+    auto future_context = filter->dims()[0];
+
+    if (d_filter) {
+      d_filter->mutable_data<T>(context.GetPlace());
+      auto dweights =
+          EigenMatrix<T>::From(*d_filter);  // Gradient of weight matrix
+      dweights.setZero();
+
+      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
+        int start = static_cast<int>(batch_indices[i]);
+        int end = static_cast<int>(batch_indices[i + 1]);
+
+        Tensor cur_input = x->Slice(start, end);  // Current input sequence
+        Tensor cur_doutput =
+            d_out->Slice(start, end);  // Current output grad sequence
+
+        auto cur_ip = EigenMatrix<T>::From(cur_input);
+        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
+        int current_timesteps = end - start;
+
+        for (int k = 0; k < current_timesteps;
+             k++) {  // For different time steps in the same sequence
+          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+               w++) {
+            // For dweights (Updating the gradient of weight matrix)
+            for (int d = 0; d < input_dim; d++) {
+              dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d);
+            }
+          }
+        }
+      }
+    }
+
+    if (dx) {
+      dx->mutable_data<T>(context.GetPlace());
+      auto weights = EigenMatrix<T>::From(*filter);
+      for (size_t i = 0; i < num_sequence; i++) {  // For different sequences
+        int start = static_cast<int>(batch_indices[i]);
+        int end = static_cast<int>(batch_indices[i + 1]);
+
+        Tensor cur_doutput =
+            d_out->Slice(start, end);  // Current output grad sequence
+        Tensor cur_dinput =
+            dx->Slice(start, end);  // Current input grad sequence
+
+        auto cur_dout = EigenMatrix<T>::From(cur_doutput);
+        auto cur_dip = EigenMatrix<T>::From(cur_dinput);
+        cur_dip.setZero();
+        int current_timesteps = end - start;
+
+        for (int k = 0; k < current_timesteps;
+             k++) {  // For different time steps in the same sequence
+          for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+               w++) {
+            // For dinput (Updating the gradient wrt input)
+            for (int d = 0; d < input_dim; d++) {
+              cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad,
+            ops::RowConvGradOp);
+REGISTER_OP_CPU_KERNEL(
+    row_conv, ops::RowConvKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    row_conv_grad,
+    ops::RowConvGradKernel<paddle::platform::CPUDeviceContext, float>);
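
[Editorial sketch] A plain-C++ reference of the lookahead convolution computed by the kernel above, for one sequence stored row-major; names are illustrative, and this is only a cross-checking sketch, not part of the operator:

// Reference row (lookahead) convolution matching the DOC formula:
// out[k] = sum over w in [0, context) of filter[w] (element-wise *) in[k + w],
// truncated at the end of the sequence. Data layout: in[t * dim + d].
#include <vector>

std::vector<float> RowConvReference(const std::vector<float>& in, int timesteps,
                                    int dim, const std::vector<float>& filter,
                                    int context) {
  std::vector<float> out(timesteps * dim, 0.0f);
  for (int k = 0; k < timesteps; ++k) {
    for (int w = 0; w < context && k + w < timesteps; ++w) {
      for (int d = 0; d < dim; ++d) {
        out[k * dim + d] += filter[w * dim + d] * in[(k + w) * dim + d];
      }
    }
  }
  return out;
}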
diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu
new file mode 100644
index 0000000000..41f2c5b9de
--- /dev/null
+++ b/paddle/operators/row_conv_op.cu
@@ -0,0 +1,410 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/row_conv_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using framework::Tensor;
+
+namespace {
+
+inline int DivUp(int x, int y) { return (x + y - 1) / y; }
+
+// Forward prop (shared memory version, for small future_context)
+template <typename T>
+__global__ void RowConvForwardSharedMemory(const T *in, const T *wt,
+                                           int num_sequence, int input_dim,
+                                           int future_context,
+                                           const size_t *batch_indices,
+                                           T *out) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int d = blockIdx.x * blx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+  T *sw = mem;
+
+  if (thy < future_context) {
+    sw[thy * blx + thx] =
+        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+           w++) {
+        sum += (d < input_dim)
+                   ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d]
+                   : static_cast<T>(0);
+      }
+      if (d < input_dim) {
+        out[(start + k) * input_dim + d] = sum;
+      }
+    }
+  }
+}
+
+// Forward prop (naive version)
+template <typename T>
+__global__ void RowConvForward(const T *in, const T *wt, int num_sequence,
+                               int input_dim, int future_context,
+                               const size_t *batch_indices, T *out) {
+  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
+  int bly = blockDim.y;
+  int thy = threadIdx.y;
+
+  if (d >= input_dim) return;
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k + w) < current_timesteps);
+           w++) {
+        sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]);
+      }
+      out[(start + k) * input_dim + d] = sum;
+    }
+  }
+}
+
+// Compute input gradient (shared memory version, for small future_context)
+template <typename T>
+__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt,
+                                             int num_sequence, int input_dim,
+                                             int future_context,
+                                             const size_t *batch_indices,
+                                             T *din) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int d = blockIdx.x * blx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+  T *sw = mem;
+  if (thy < future_context) {
+    sw[thy * blx + thx] =
+        (d < input_dim) ? wt[thy * input_dim + d] : static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
+        sum += (d < input_dim)
+                   ? (sw[w * blx + thx] * dout[(k + start - w) * input_dim + d])
+                   : static_cast<T>(0);
+      }
+      if (d < input_dim) {
+        din[(k + start) * input_dim + d] = sum;
+      }
+    }
+  }
+}
+
+// Compute input gradient (Naive version)
+template <typename T>
+__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence,
+                                 int input_dim, int future_context,
+                                 const size_t *batch_indices, T *din) {
+  int d = blockIdx.x * blockDim.x + threadIdx.x;  // index along input_dim
+  int bly = blockDim.y;
+  int thy = threadIdx.y;
+
+  if (d >= input_dim) return;
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    for (int k = thy; k < current_timesteps; k += bly) {
+      T sum = 0;
+      for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) {
+        sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]);
+      }
+      din[(k + start) * input_dim + d] = sum;
+    }
+  }
+}
+
+// Compute W gradient (small future_context version)
+template <typename T>
+__global__ void RowConvGradFilterImproved(const T *in, const T *dout,
+                                          int num_sequence, int input_dim,
+                                          int future_context, int block_x,
+                                          int block_y,
+                                          const size_t *batch_indices,
+                                          T *dfilter) {
+  int blx = blockDim.x;
+  int bly = blockDim.y;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int gx = blockIdx.x * blx;
+  int d = gx + thx;  // index along input dim
+
+  extern __shared__ T mem[];
+
+  int xdim_sh_in = block_y;
+  int xdim_sh_dout = block_y;
+  int ydim_sh_in = block_x;
+  int ydim_sh_dout = block_x + future_context - 1;
+  int ydim_sh_dfilter = block_y;
+
+  T *sh_in = mem;
+  T *sh_dout = &mem[xdim_sh_in * ydim_sh_in];
+  T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout];
+
+  if (thy < future_context) {
+    sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast<T>(0);
+  }
+  __syncthreads();
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    int scaled_cur_steps =
+        ((current_timesteps + block_x - 1) / block_x) * block_x;
+
+    for (int k = thy; k < scaled_cur_steps; k += block_x) {
+      int pos = start + k;
+      sh_in[thx * ydim_sh_in + thy] =
+          (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0);
+      sh_dout[thx * ydim_sh_dout + thy + future_context - 1] =
+          (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0);
+      __syncthreads();
+
+      if (thy < future_context - 1) {
+        int pos_offset = pos - future_context + 1;
+        sh_dout[thx * ydim_sh_dout + thy] =
+            (d < input_dim && pos_offset >= start)
+                ? dout[pos_offset * input_dim + d]
+                : T(0);
+      }
+      __syncthreads();
+
+      for (int w = 0; w < future_context; w++) {
+        T val = sh_in[thy * ydim_sh_in + thx] *
+                sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w];
+        __syncthreads();
+
+        for (int offset = 16; offset > 0;
+             offset = offset / 2) {  // blockDim.x is 32.
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
+
+        if (thx == 0) {
+          sh_dfilter[w * ydim_sh_dfilter + thy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+  for (int w = thy; (w < future_context) && (d < input_dim); w += bly) {
+    dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx];
+  }
+}
+
+// Compute weight(filter) gradient
+template <typename T>
+__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence,
+                                  int input_dim, int future_context,
+                                  int block_x, int block_y,
+                                  const size_t *batch_indices, T *dfilter) {
+  int blx = blockDim.x;
+  int thx = threadIdx.x;
+  int thy = threadIdx.y;
+  int gx = blockIdx.x * blx;
+  int d = gx + thx;  // index along input dim
+  extern __shared__ T mem[];
+  T *sh_in = mem;
+  T *sh_dout = &mem[block_x * block_y];
+
+  for (int i = 0; i < num_sequence; i++) {
+    int start = static_cast<int>(batch_indices[i]);
+    int end = static_cast<int>(batch_indices[i + 1]);
+    int current_timesteps = end - start;
+    int scaled_cur_steps =
+        ((current_timesteps + block_x - 1) / block_x) * block_x;
+
+    for (int k = thy; k < scaled_cur_steps; k += block_x) {
+      int pos = start + k;
+      sh_in[thx * block_y + thy] =
+          (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0;
+      __syncthreads();
+
+      for (int w = 0; w < future_context; w++) {
+        sh_dout[thx * block_y + thy] =
+            (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps)
+                ? dout[(pos - w) * input_dim + d]
+                : 0.0;
+        __syncthreads();
+
+        T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx];
+        __syncthreads();
+
+        for (int offset = 16; offset > 0;
+             offset = offset / 2) {  // blockDim.x is 32.
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
+
+        if (thx == 0 && (gx + thy) < input_dim) {
+          dfilter[w * input_dim + gx + thy] += val;
+        }
+      }
+    }
+  }
+}
+
+}  // namespace
+
+template <typename T>
+class RowConvKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<LoDTensor>("X");
+    auto *Filter = context.Input<Tensor>("Filter");
+    auto *Out = context.Output<LoDTensor>("Out");
+
+    const T *in = X->data<T>();
+    const T *weight = Filter->data<T>();
+    T *out = Out->mutable_data<T>(context.GetPlace());
+
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
+    int num_sequence = batch_indices.size() - 1;
+    int future_context = Filter->dims()[0];
+    size_t *idx = batch_indices.data();
+    auto stream = context.cuda_device_context().stream();
+
+    if (future_context <= 32) {
+      dim3 block_dim = dim3(32, 32);
+      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+      int mem_per_block = (future_context * block_dim.x) * sizeof(T);
+      RowConvForwardSharedMemory<
+          T><<<grid_dim, block_dim, mem_per_block, stream>>>(
+          in, weight, num_sequence, input_dim, future_context, idx, out);
+    } else {
+      dim3 block_dim = dim3(32, 32);
+      dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+      RowConvForward<T><<<grid_dim, block_dim, 0, stream>>>(
+          in, weight, num_sequence, input_dim, future_context, idx, out);
+    }
+  }
+};
+
+template <typename T>
+class RowConvGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<LoDTensor>("X");
+    auto *Filter = context.Input<Tensor>("Filter");
+    auto *dOut = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    const T *in = X->data<T>();
+    const T *weights = Filter->data<T>();
+    const T *dout = dOut->data<T>();
+
+    Tensor *dX = context.Output<LoDTensor>(framework::GradVarName("X"));
+    Tensor *dFilter = context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    auto batch_indices = X->lod()[0];
+    int input_dim = X->dims()[1];
+    int num_sequence = batch_indices.size() - 1;
+    int future_context = Filter->dims()[0];
+    size_t *idx = batch_indices.data();
+
+    auto &device_ctx = context.cuda_device_context();
+    math::SetConstant<platform::CUDADeviceContext, T> zero;
+
+    if (dFilter) {
+      T *dfilter = dFilter->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, dFilter, static_cast<T>(0.0));
+
+      if (future_context <= 32) {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int block_x = block_dim.x;
+        int block_y = block_dim.y;
+        int mem_per_block =
+            (block_y * block_x + block_y * (block_x + future_context - 1) +
+             future_context * block_y) *
+            sizeof(T);
+        RowConvGradFilterImproved<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
+            idx, dfilter);
+      } else {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int block_x = block_dim.x;
+        int block_y = block_dim.y;
+        int mem_per_block =
+            (block_x * block_y * 2) * sizeof(T);  // For 2 arrays of size 32x32
+        RowConvGradFilter<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            in, dout, num_sequence, input_dim, future_context, block_x, block_y,
+            idx, dfilter);
+      }
+    }
+
+    if (dX) {
+      T *din = dX->mutable_data<T>(context.GetPlace());
+      if (future_context <= 32) {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        int mem_per_block = (future_context * block_dim.x) * sizeof(T);
+        RowConvGradInputSharedMemory<
+            T><<<grid_dim, block_dim, mem_per_block, device_ctx.stream()>>>(
+            dout, weights, num_sequence, input_dim, future_context, idx, din);
+      } else {
+        dim3 block_dim = dim3(32, 32);
+        dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1);
+        RowConvGradInput<T><<<grid_dim, block_dim, 0, device_ctx.stream()>>>(
+            dout, weights, num_sequence, input_dim, future_context, idx, din);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    row_conv, ops::RowConvKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    row_conv_grad,
+    ops::RowConvGradKernel<paddle::platform::CUDADeviceContext, float>);
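
[Editorial sketch] The dynamic shared-memory sizes passed at the launch sites above are computed inline; restated as hypothetical helpers (assuming the 32x32 blocks the kernels use), the three cases are:

// Shared-memory sizing for the row_conv CUDA launches above; hypothetical
// helpers for documentation, not part of the operator.
#include <cstddef>

inline size_t ForwardSharedMem(int future_context, int block_x, size_t elem) {
  // Cached filter rows: sw[future_context][block_x].
  return future_context * block_x * elem;
}
inline size_t GradFilterImprovedSharedMem(int future_context, int block_x,
                                          int block_y, size_t elem) {
  // sh_in + sh_dout (padded by future_context - 1 columns) + sh_dfilter.
  return (block_y * block_x + block_y * (block_x + future_context - 1) +
          future_context * block_y) * elem;
}
inline size_t GradFilterSharedMem(int block_x, int block_y, size_t elem) {
  // Two tiles, sh_in and sh_dout, each block_x * block_y.
  return 2 * block_x * block_y * elem;
}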
diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h
new file mode 100644
index 0000000000..10d435ab08
--- /dev/null
+++ b/paddle/operators/row_conv_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class RowConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+
+template <typename DeviceContext, typename T>
+class RowConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+}  // namespace operators
+}  // namespace paddle
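
[Editorial sketch] This header declares Compute without defining it; the full per-device specializations live in row_conv_op.cc and row_conv_op.cu, so the generic template is never instantiated. A condensed sketch of the pattern with stand-in types:

// Declare-then-specialize kernel pattern, with stand-in types instead of the
// framework classes. Instantiating the primary template would fail to link,
// which is intentional: only the specializations are meant to be used.
struct CPUCtx {};
struct GPUCtx {};

template <typename DeviceContext, typename T>
struct MyKernel {
  void Compute() const;  // declared only; each device supplies its own body
};

template <typename T>
struct MyKernel<CPUCtx, T> {
  void Compute() const { /* CPU implementation goes here */ }
};

template <typename T>
struct MyKernel<GPUCtx, T> {
  void Compute() const { /* CUDA implementation goes here */ }
};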
diff --git a/paddle/operators/save_combine_op.cc b/paddle/operators/save_combine_op.cc
new file mode 100644
index 0000000000..bffa2908bc
--- /dev/null
+++ b/paddle/operators/save_combine_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+#include <sstream>
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(sidgoyal78): These functions are needed by other files (save_op); move
+// them to the paddle::filesystem namespace (as noted by yuyang18 in save_op).
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755)) {
+    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+  }
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+class SaveCombineOp : public framework::OperatorBase {
+ public:
+  SaveCombineOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+
+    bool is_present = FileExists(filename);
+    if (is_present && !overwrite) {
+      PADDLE_THROW("%s exists!, cannot save_combine to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto inp_var_names = Inputs("X");
+    PADDLE_ENFORCE_GT(static_cast<int>(inp_var_names.size()), 0,
+                      "The number of input variables should be greater than 0");
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    for (size_t i = 0; i < inp_var_names.size(); i++) {
+      auto *var = scope.FindVar(inp_var_names[i]);
+
+      PADDLE_ENFORCE(var != nullptr,
+                     "Cannot find variable %s for save_combine_op",
+                     inp_var_names[i]);
+      PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                     "SaveCombineOp only supports LoDTensor, %s has wrong type",
+                     inp_var_names[i]);
+
+      auto &tensor = var->Get<framework::LoDTensor>();
+      // Serialize tensor
+      framework::SerializeToStream(fout, tensor, dev_ctx);
+    }
+    fout.close();
+  }
+};
+
+class SaveCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveCombineOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(vector) Input LoDTensors that need to be saved together in a file.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+SaveCombine operator
+
+This operator will serialize and write a list of input LoDTensor variables 
+to a file on disk.
+)DOC");
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if it exists.")
+        .SetDefault(true);
+    AddAttr<std::string>(
+        "file_path",
+        "(string)"
+        "The \"file_path\" where the LoDTensor variables will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(save_combine, ops::SaveCombineOp,
+                  ops::SaveCombineOpProtoMaker);
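
[Editorial sketch] save_combine serializes each input tensor back-to-back into one stream, and load_combine must consume them in the same order. A format-agnostic sketch of that idea using a simple length-prefixed layout; Paddle's actual on-disk tensor format is defined by framework::SerializeToStream and is not reproduced here:

// Back-to-back serialization into a single stream. The length-prefixed
// layout below is purely illustrative.
#include <cstdint>
#include <fstream>
#include <string>

void WriteBlob(std::ofstream& out, const std::string& blob) {
  uint64_t len = blob.size();
  out.write(reinterpret_cast<const char*>(&len), sizeof(len));
  out.write(blob.data(), len);
}

std::string ReadBlob(std::ifstream& in) {
  uint64_t len = 0;
  in.read(reinterpret_cast<char*>(&len), sizeof(len));
  std::string blob(len, '\0');
  in.read(&blob[0], len);
  return blob;  // blobs come back in the order they were written
}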
diff --git a/paddle/operators/save_load_combine_op_test.cc b/paddle/operators/save_load_combine_op_test.cc
new file mode 100644
index 0000000000..f3ddc4a6c5
--- /dev/null
+++ b/paddle/operators/save_load_combine_op_test.cc
@@ -0,0 +1,180 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <iostream>
+#include <string>
+#include <vector>
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+USE_NO_KERNEL_OP(save_combine);
+USE_NO_KERNEL_OP(load_combine);
+
+int* CreateForSaveCombineOp(int x, int y, const std::vector<int>& lod_info,
+                            std::string var_name,
+                            paddle::platform::CPUPlace& place,
+                            paddle::framework::Scope& scope,
+                            paddle::framework::LoD& expect_lod) {
+  auto var = scope.Var(var_name);
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({x, y});
+  expect_lod.resize(1);
+  for (size_t i = 0; i < lod_info.size(); i++) {
+    expect_lod[0].push_back(lod_info[i]);
+  }
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  return expect;
+}
+
+paddle::framework::LoDTensor* GeneratePlaceholderBeforeLoad(
+    const std::string out_var_name, paddle::framework::Scope& scope) {
+  auto load_var = scope.Var(out_var_name);
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  return target;
+}
+
+int* GetValuesAfterLoadCombineOp(paddle::framework::LoDTensor* target,
+                                 paddle::framework::Scope& scope,
+                                 paddle::framework::LoD& actual_lod) {
+  int* actual = target->data<int>();
+  actual_lod = target->lod();
+  return actual;
+}
+
+void CheckValues(int* expect, int* actual, paddle::framework::LoD expect_lod,
+                 paddle::framework::LoD actual_lod, const int& numel) {
+  for (int64_t i = 0; i < numel; ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
+
+// Here, we create 4 LoDTensors and use save_combine_op to save them together
+// in a single file. Then, we use load_combine_op to load them back sequentially.
+TEST(SaveLoadCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  int* expect1 = CreateForSaveCombineOp(10, 10, lod1, "test_var1", place, scope,
+                                        expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  int* expect2 = CreateForSaveCombineOp(10, 20, lod2, "test_var2", place, scope,
+                                        expect_lod2);
+
+  std::vector<int> lod3 = {0, 2, 3, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  int* expect3 = CreateForSaveCombineOp(20, 200, lod3, "test_var3", place,
+                                        scope, expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  int* expect4 = CreateForSaveCombineOp(20, 50, lod4, "test_var4", place, scope,
+                                        expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto target1 = GeneratePlaceholderBeforeLoad("out_var1", scope);
+  auto target2 = GeneratePlaceholderBeforeLoad("out_var2", scope);
+  auto target3 = GeneratePlaceholderBeforeLoad("out_var3", scope);
+  auto target4 = GeneratePlaceholderBeforeLoad("out_var4", scope);
+
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  int* actual1 = GetValuesAfterLoadCombineOp(target1, scope, actual_lod1);
+  int* actual2 = GetValuesAfterLoadCombineOp(target2, scope, actual_lod2);
+  int* actual3 = GetValuesAfterLoadCombineOp(target3, scope, actual_lod3);
+  int* actual4 = GetValuesAfterLoadCombineOp(target4, scope, actual_lod4);
+
+  CheckValues(expect1, actual1, expect_lod1, actual_lod1, numel1);
+  CheckValues(expect2, actual2, expect_lod2, actual_lod2, numel2);
+  CheckValues(expect3, actual3, expect_lod3, actual_lod3, numel3);
+  CheckValues(expect4, actual4, expect_lod4, actual_lod4, numel4);
+}
+
+// Test with original SaveLoadTest
+TEST(SaveLoadTestWithCombineOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("check_t.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  int* actual = target->data<int>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/operators/save_load_op_test.cc b/paddle/operators/save_load_op_test.cc
new file mode 100644
index 0000000000..d829d5da17
--- /dev/null
+++ b/paddle/operators/save_load_op_test.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+USE_NO_KERNEL_OP(save);
+USE_NO_KERNEL_OP(load);
+
+TEST(SaveLoadOp, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  auto var = scope.Var("test_var");
+  auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+  tensor->Resize({3, 10});
+  paddle::framework::LoD expect_lod;
+  expect_lod.resize(1);
+  expect_lod[0].push_back(0);
+  expect_lod[0].push_back(1);
+  expect_lod[0].push_back(2);
+  expect_lod[0].push_back(3);
+
+  tensor->set_lod(expect_lod);
+  int* expect = tensor->mutable_data<int>(place);
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    expect[i] = static_cast<int>(i);
+  }
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string("tensor.save")});
+
+  auto save_op = paddle::framework::OpRegistry::CreateOp(
+      "save", {{"X", {"test_var"}}}, {}, attrs);
+  save_op->Run(scope, place);
+
+  auto load_var = scope.Var("out_var");
+  auto target = load_var->GetMutable<paddle::framework::LoDTensor>();
+  auto load_op = paddle::framework::OpRegistry::CreateOp(
+      "load", {}, {{"Out", {"out_var"}}}, attrs);
+  load_op->Run(scope, place);
+  int* actual = target->data<int>();
+  for (int64_t i = 0; i < tensor->numel(); ++i) {
+    EXPECT_EQ(expect[i], actual[i]);
+  }
+  auto& actual_lod = target->lod();
+  EXPECT_EQ(expect_lod.size(), actual_lod.size());
+  for (size_t i = 0; i < expect_lod.size(); ++i) {
+    for (size_t j = 0; j < expect_lod[i].size(); ++j) {
+      EXPECT_EQ(expect_lod[i][j], actual_lod[i][j]);
+    }
+  }
+}
diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc
new file mode 100644
index 0000000000..4b1cbe8883
--- /dev/null
+++ b/paddle/operators/save_op.cc
@@ -0,0 +1,128 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <stdint.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <numeric>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+// TODO(yuyang18): If the functions below are needed by other files, move them
+// to paddle::filesystem namespace.
+constexpr char kSEP = '/';
+static bool FileExists(const std::string &filepath) {
+  struct stat buffer;
+  return (stat(filepath.c_str(), &buffer) == 0);
+}
+
+static std::string DirName(const std::string &filepath) {
+  auto pos = filepath.rfind(kSEP);
+  if (pos == std::string::npos) {
+    return "";
+  }
+  return filepath.substr(0, pos);
+}
+
+static void MkDir(const char *path) {
+  if (mkdir(path, 0755)) {
+    PADDLE_ENFORCE_EQ(errno, EEXIST, "%s mkdir failed!", path);
+  }
+}
+
+static void MkDirRecursively(const char *fullpath) {
+  if (*fullpath == '\0') return;  // empty string
+  if (FileExists(fullpath)) return;
+
+  MkDirRecursively(DirName(fullpath).c_str());
+  MkDir(fullpath);
+}
+
+class SaveOp : public framework::OperatorBase {
+ public:
+  SaveOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto filename = Attr<std::string>("file_path");
+    auto overwrite = Attr<bool>("overwrite");
+
+    if (FileExists(filename) && !overwrite) {
+      PADDLE_THROW("%s is existed, cannot save to it when overwrite=false",
+                   filename, overwrite);
+    }
+
+    MkDirRecursively(DirName(filename).c_str());
+
+    // FIXME(yuyang18): We currently save the variable to a local file; this
+    // should be changed to write to an output stream instead.
+    std::ofstream fout(filename);
+    PADDLE_ENFORCE(static_cast<bool>(fout), "Cannot open %s to write",
+                   filename);
+
+    auto iname = Input("X");
+    auto *var = scope.FindVar(iname);
+    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s for save_op",
+                   iname);
+
+    PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
+                   "SaveOp only support LoDTensor, %s has wrong type", iname);
+
+    auto &tensor = var->Get<framework::LoDTensor>();
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    framework::SerializeToStream(fout, tensor, dev_ctx);
+  }
+};
+
+class SaveOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor ) Input tensor to be saved");
+    AddComment(R"DOC(
+Save operator
+
+This operator will serialize and write a tensor variable to file on disk.
+)DOC");
+    AddAttr<bool>("overwrite",
+                  "(boolean, default true)"
+                  "Overwrite the output file if exist")
+        .SetDefault(true);
+    AddAttr<std::string>("file_path",
+                         "(string)"
+                         "The \"file_path\" where the variable will be saved.")
+        .AddCustomChecker(
+            [](const std::string &path) { return !path.empty(); });
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(save, ops::SaveOp, ops::SaveOpProtoMaker);
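Aside: `MkDirRecursively` above walks up the path with `DirName` until it finds an existing directory, then creates each missing component on the way back down. A minimal standalone sketch of that pattern (POSIX-only; all names here are illustrative and not part of the operator):

```cpp
// Standalone sketch of the recursive-mkdir idea used by save_op (POSIX only).
#include <sys/stat.h>
#include <cerrno>
#include <cstdio>
#include <string>

static bool Exists(const std::string &path) {
  struct stat buffer;
  return stat(path.c_str(), &buffer) == 0;
}

static std::string Dir(const std::string &path) {
  auto pos = path.rfind('/');
  return pos == std::string::npos ? "" : path.substr(0, pos);
}

static void MkDirRecursive(const std::string &path) {
  if (path.empty() || Exists(path)) return;
  MkDirRecursive(Dir(path));  // ensure the parent exists first
  if (mkdir(path.c_str(), 0755) != 0 && errno != EEXIST) {
    std::perror(path.c_str());  // surface the failure
  }
}

int main() {
  MkDirRecursive("tmp_save_op/a/b");  // creates tmp_save_op, then a, then b
  std::printf("created: %d\n", Exists("tmp_save_op/a/b"));
  return 0;
}
```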
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
new file mode 100644
index 0000000000..c0e614743a
--- /dev/null
+++ b/paddle/operators/scale_op.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/scale_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ScaleOp : public framework::OperatorWithKernel {
+ public:
+  ScaleOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of ScaleOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ScaleOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename AttrType>
+class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScaleOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of scale operator.");
+    AddOutput("Out", "(Tensor) Output tensor of scale operator.");
+    AddComment(R"DOC(
+Scale operator
+
+$$Out = scale*X$$
+)DOC");
+    AddAttr<AttrType>("scale",
+                      "(float, default 1.0)"
+                      "The scaling factor of the scale operator.")
+        .SetDefault(1.0);
+  }
+};
+
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
+                  ops::ScaleGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    scale, ops::ScaleKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::ScaleKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu
new file mode 100644
index 0000000000..7202c0de70
--- /dev/null
+++ b/paddle/operators/scale_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/scale_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    scale,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
+    paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>);
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
new file mode 100644
index 0000000000..395268c2ee
--- /dev/null
+++ b/paddle/operators/scale_op.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class ScaleKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    tensor->mutable_data<T>(in->place());
+
+    auto scale = static_cast<T>(context.Attr<float>("scale"));
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*tensor);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& dev =
+        *context.template device_context<DeviceContext>().eigen_device();
+    eigen_out.device(dev) = scale * eigen_in;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
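Aside: the kernel casts the float `scale` attribute to the element type `T` before multiplying, so integer kernels use a truncated factor. A standalone sketch of that behavior (illustrative names, not the operator's API):

```cpp
// Sketch of the kernel's scale handling: the "scale" attribute is stored as
// float and cast to the kernel's element type T, so integer kernels truncate.
#include <cstdio>

template <typename T>
void ScaleFlat(const T* in, T* out, int n, float scale_attr) {
  const T scale = static_cast<T>(scale_attr);  // same cast as in the kernel
  for (int i = 0; i < n; ++i) out[i] = scale * in[i];
}

int main() {
  const int x_i[3] = {1, 2, 3};
  int y_i[3];
  ScaleFlat(x_i, y_i, 3, 2.5f);  // scale truncates to 2 for T = int
  std::printf("%d %d %d\n", y_i[0], y_i[1], y_i[2]);  // prints: 2 4 6

  const float x_f[3] = {1, 2, 3};
  float y_f[3];
  ScaleFlat(x_f, y_f, 3, 2.5f);
  std::printf("%g %g %g\n", y_f[0], y_f[1], y_f[2]);  // prints: 2.5 5 7.5
  return 0;
}
```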
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
new file mode 100644
index 0000000000..55555300fc
--- /dev/null
+++ b/paddle/operators/scatter.cu.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void ScatterCUDAKernel(const T* params, const int* indices,
+                                  T* output, size_t index_size,
+                                  size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int scatter_i = indices[indices_i];
+    int out_i = scatter_i * slice_size + slice_i;
+    *(output + out_i) = *(params + i);
+  }
+}
+
+/**
+ * A thin wrapper around a GPU tensor.
+ * Returns an updated tensor, scatter-assigned from the source tensor
+ * according to index: output[index[i]] = src[i]
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                      const Tensor& index, Tensor* output) {
+  // PADDLE_ENFORCE(platform::is_gpu_place(place));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  auto src_dims = src.dims();
+  framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+
+  ScatterCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
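Aside: `CUDA_1D_KERNEL_LOOP` is a grid-stride loop: each thread starts at its global index and advances by the total thread count, so a fixed-size grid covers any `n`. A host-side simulation (plain C++, illustrative only) showing that every element is visited exactly once:

```cpp
// Host-side simulation of the grid-stride loop used by ScatterCUDAKernel.
// Each "thread" starts at its global id and advances by blockDim * gridDim,
// so all n elements are covered exactly once regardless of grid size.
#include <cassert>
#include <vector>

int main() {
  const int block = 4, grid = 3, n = 25;  // deliberately n > block * grid
  std::vector<int> visits(n, 0);
  for (int b = 0; b < grid; ++b) {
    for (int t = 0; t < block; ++t) {
      for (int i = b * block + t; i < n; i += block * grid) {
        ++visits[i];
      }
    }
  }
  for (int v : visits) assert(v == 1);  // every index visited exactly once
  return 0;
}
```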
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
new file mode 100644
index 0000000000..c1fb844ebd
--- /dev/null
+++ b/paddle/operators/scatter.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cstring>
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+/**
+ * Return an updated tensor from the source tensor, scattered according to
+ * index: dst[index[i]] = src[i]
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                   const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  auto src_dims = src.dims();
+  auto dst_dims = output->dims();
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  // src and dst shapes must match on every dimension except the first
+  for (int i = 1; i < src_dims.size(); i++)
+    PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
+
+  // slice size
+  size_t slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
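Aside: a standalone sketch of the row copy `ScatterAssign` performs, `output[index[i]] = src[i]`, using the same shapes as the unit test further below (illustrative, not the operator's API):

```cpp
// Standalone sketch of ScatterAssign's row copy: output[index[i]] = src[i].
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
  const int slice_size = 4;                         // elements per row
  std::vector<float> src = {0, 1, 2, 3};            // one 1x4 source row
  std::vector<int> index = {1};                     // scatter it to row 1
  std::vector<float> output(4 * slice_size, 0.0f);  // 4x4 destination

  for (size_t i = 0; i < index.size(); ++i) {
    std::memcpy(&output[index[i] * slice_size], &src[i * slice_size],
                slice_size * sizeof(float));
  }
  // Row 1 now holds {0,1,2,3}; all other rows stay zero, matching the
  // expectations checked in scatter_test.cc below.
  for (int r = 0; r < 4; ++r, std::printf("\n"))
    for (int c = 0; c < slice_size; ++c)
      std::printf("%g ", output[r * slice_size + c]);
  return 0;
}
```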
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
new file mode 100644
index 0000000000..b653348906
--- /dev/null
+++ b/paddle/operators/scatter_op.cc
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/scatter_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class ScatterOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Ref"),
+                   "Input(Ref) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Index"),
+                   "Input(Index) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Updates"),
+                   "Input(Updates) of ScatterOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of ScatterOp should not be null.");
+
+    auto updates_dims = ctx->GetInputDim("Updates");
+    auto ref_dims = ctx->GetInputDim("Ref");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Index").size(), 1,
+                      "Update Index should be 1-D.");
+    PADDLE_ENFORCE_EQ(ref_dims.size(), updates_dims.size(),
+                      "Reference and Updates should have the same shape size");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0],
+                      ctx->GetInputDim("Index")[0],
+                      "Updates and Index should have same batch-size.");
+    framework::DDim data_dim(ref_dims);
+    for (int i = 1; i < data_dim.size(); ++i) {
+      PADDLE_ENFORCE_EQ(data_dim[i], updates_dims[i]);
+    }
+    ctx->SetOutputDim("Out", ref_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
+  }
+};
+
+class ScatterGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    ctx->SetOutputDim(framework::GradVarName("Updates"),
+                      ctx->GetInputDim("Updates"));
+    ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
+        ctx.device_context());
+  }
+};
+
+class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ScatterOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Ref", "The source input of scatter op");
+    AddInput("Index",
+             "The index input of scatter op where Ref will be updated");
+    AddInput("Updates", "The updated value of updates op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Scatter Operator.
+
+This operator produces its output by updating the input at the selected indices along the first axis:
+
+$$
+Out = Ref \\
+Out[Index] = Ref[Index] + Updates
+$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
+            ops::ScatterGradOp);
+REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
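Aside: a standalone sketch of the shape contract `ScatterOp::InferShape` enforces, assuming Ref is `(N, D...)`, Index is `(M)`, and Updates is `(M, D...)` (the helper below is hypothetical, for illustration only):

```cpp
// Sketch of the shape contract enforced by ScatterOp::InferShape:
// Ref:(N,D...), Index:(M), Updates:(M,D...), Out takes Ref's shape.
#include <cassert>
#include <cstdint>
#include <vector>

bool ScatterShapesOk(const std::vector<int64_t>& ref,
                     const std::vector<int64_t>& index,
                     const std::vector<int64_t>& updates) {
  if (index.size() != 1) return false;             // Index must be 1-D
  if (ref.size() != updates.size()) return false;  // same rank
  if (updates[0] != index[0]) return false;        // same batch size
  for (size_t i = 1; i < ref.size(); ++i)          // trailing dims match
    if (ref[i] != updates[i]) return false;
  return true;
}

int main() {
  assert(ScatterShapesOk({4, 4}, {1}, {1, 4}));   // the scatter_test case
  assert(!ScatterShapesOk({4, 4}, {1}, {2, 4}));  // batch-size mismatch
  return 0;
}
```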
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
new file mode 100644
index 0000000000..0c198d2258
--- /dev/null
+++ b/paddle/operators/scatter_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+
+    Out->ShareDataWith(*Ref);
+
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+  }
+};
+
+template <typename T>
+class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates = dO[Index]
+    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
+REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
new file mode 100644
index 0000000000..1a4f6f99bf
--- /dev/null
+++ b/paddle/operators/scatter_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "gather.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "scatter.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+class ScatterOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+
+    // In place output: Out shares Ref's storage, then selected rows change
+    Out->ShareDataWith(*Ref);
+    // Apply scatter update: Out[Index] = Updates (ScatterAssign overwrites)
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+  }
+};
+
+template <typename T>
+class ScatterGradientOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates = dO[Index]
+    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
new file mode 100644
index 0000000000..00dbdacbfe
--- /dev/null
+++ b/paddle/operators/scatter_test.cc
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/scatter.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <string>
+
+TEST(scatter, ScatterUpdate) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  Tensor* src = new Tensor();
+  Tensor* index = new Tensor();
+  Tensor* output = new Tensor();
+
+  float* p_src = nullptr;
+  int* p_index = nullptr;
+  p_src = src->mutable_data<float>(make_ddim({1, 4}), CPUPlace());
+  p_index = index->mutable_data<int>(make_ddim({1}), CPUPlace());
+
+  for (size_t i = 0; i < 4; ++i) p_src[i] = float(i);
+  p_index[0] = 1;
+
+  float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
+
+  paddle::platform::CPUPlace cpu_place;
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  ScatterAssign<float>(ctx, *src, *index, output);
+
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], float(i - 4));
+  for (size_t i = 4; i < 8; ++i)
+    EXPECT_EQ(output->data<float>()[i], float(i - 4));
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(p_output[i], float(0));
+  for (size_t i = 8; i < 16; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
+
+  delete src;
+  delete index;
+  delete output;
+}
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
new file mode 100644
index 0000000000..be41b527f2
--- /dev/null
+++ b/paddle/operators/send_op.cc
@@ -0,0 +1,105 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <ostream>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+
+#include <future>
+#include "paddle/operators/detail/grpc_client.h"
+
+namespace paddle {
+namespace operators {
+
+class SendOp : public framework::OperatorBase {
+ public:
+  SendOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    auto ins = Inputs("X");
+    auto outs = Outputs("Out");
+    std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
+    std::vector<std::string> endpoints =
+        Attr<std::vector<std::string>>("endpoints");
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto& ctx = *pool.Get(place);
+
+    auto client_var_name = Output("RPCClient");
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
+                            "Can not find variable '%s' in the scope.",
+                            client_var_name);
+    auto* client_var = scope.FindVar(client_var_name);
+    detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
+
+    for (size_t i = 0; i < ins.size(); i++) {
+      VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
+      rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+
+    for (auto& ep : endpoints) {
+      VLOG(3) << "batch barrier, ep: " << ep;
+      rpc_client->AsyncSendBatchBarrier(ep);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+
+    for (size_t i = 0; i < outs.size(); i++) {
+      VLOG(3) << "getting " << outs[i] << " from " << epmap[i];
+      rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
+    }
+    PADDLE_ENFORCE(rpc_client->Wait());
+  }
+};
+
+class SendOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SendOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor to be sent").AsDuplicable();
+    AddOutput("Out", "(Tensor) Output tensor to be received from server")
+        .AsDuplicable();
+    AddOutput("RPCClient",
+              "(RPCClient) The RPC client object which is"
+              "initialized at most once.");
+    AddComment(R"DOC(
+Send operator
+
+This operator sends tensors to the recv_op running on the parameter server.
+)DOC");
+    AddAttr<std::vector<std::string>>("endpoints",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints to send variables to.")
+        .SetDefault({});
+    AddAttr<std::vector<std::string>>("epmap",
+                                      "(string vector, default 127.0.0.1:6164)"
+                                      "Server endpoints in the order of input "
+                                      "variables for mapping")
+        .SetDefault({});
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker);
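Aside: `SendOp::Run` follows an issue-all-then-wait pattern: every RPC in a phase is launched asynchronously, then a single `Wait()` joins them before the next phase begins. A generic sketch of that pattern, with `std::async` standing in for the `RPCClient` calls (all names illustrative):

```cpp
// Generic sketch of the issue-all-then-wait pattern SendOp::Run follows.
// std::async stands in for RPCClient's Async* calls; names are illustrative.
#include <cstdio>
#include <future>
#include <string>
#include <vector>

bool SendVariable(const std::string& ep, const std::string& var) {
  std::printf("sending %s to %s\n", var.c_str(), ep.c_str());
  return true;  // a real client would issue a non-blocking RPC here
}

int main() {
  std::vector<std::string> epmap = {"127.0.0.1:6164", "127.0.0.1:6165"};
  std::vector<std::string> ins = {"w@GRAD", "b@GRAD"};

  // Phase 1: launch every send without blocking...
  std::vector<std::future<bool>> pending;
  for (size_t i = 0; i < ins.size(); ++i)
    pending.push_back(
        std::async(std::launch::async, SendVariable, epmap[i], ins[i]));

  // ...then wait for all of them, mirroring rpc_client->Wait().
  for (auto& f : pending)
    if (!f.get()) return 1;
  return 0;
}
```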
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
new file mode 100644
index 0000000000..045a0f5434
--- /dev/null
+++ b/paddle/operators/send_recv_op_test.cc
@@ -0,0 +1,206 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <unistd.h>
+#include <string>
+#include <thread>
+
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/string/printf.h"
+
+USE_NO_KERNEL_OP(send);
+USE_NO_KERNEL_OP(recv);
+USE_OP(sum);
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+// global for simplicity.
+std::unique_ptr<f::OperatorBase> recv_op;
+
+void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+  for (int i = 0; i < 2; ++i) {
+    auto var_name = paddle::string::Sprintf("x%d", i);
+    auto var = scope.Var(var_name);
+    auto tensor = var->GetMutable<f::LoDTensor>();
+    tensor->Resize({10, 10});
+    float *expect = tensor->mutable_data<float>(place);
+    for (int64_t i = 0; i < tensor->numel(); ++i) {
+      expect[i] = static_cast<float>(i);
+    }
+  }
+
+  auto out_var = scope.Var("Out");
+  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
+  out_tensor->Resize({10, 10});
+  out_tensor->mutable_data<float>(place);  // allocate
+}
+
+void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+  int64_t height = 10;
+  int64_t row_numel = 10;
+  m::SetConstant<p::CPUDeviceContext, float> set_one;
+  // init x0
+  std::vector<int64_t> rows0{0, 4, 7};
+  auto x0_var = scope.Var("x0");
+  auto x0 = x0_var->GetMutable<f::SelectedRows>();
+  x0->set_rows(rows0);
+  x0->set_height(height);
+  auto x0_value = x0->mutable_value();
+  x0_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows0.size()), row_numel}), place);
+  set_one(ctx, x0_value, 1.0);
+
+  // init x1
+  std::vector<int64_t> rows1{2, 9};
+  auto x1_var = scope.Var("x1");
+  auto x1 = x1_var->GetMutable<f::SelectedRows>();
+  x1->set_rows(rows1);
+  x1->set_height(height);
+  auto x1_value = x1->mutable_value();
+  x1_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
+  set_one(ctx, x1_value, 1.0);
+
+  auto out_var = scope.Var("Out");
+  auto out = out_var->GetMutable<f::SelectedRows>();
+  auto out_value = out->mutable_value();
+  out->set_height(height);
+  out_value->mutable_data<float>(f::make_ddim({5, 10}), place);
+}
+
+void AddOp(const std::string &type, const f::VariableNameMap &inputs,
+           const f::VariableNameMap &outputs, f::AttributeMap attrs,
+           f::BlockDesc *block) {
+  // insert output
+  for (auto kv : outputs) {
+    for (auto v : kv.second) {
+      auto var = block->Var(v);
+      var->SetDataType(f::proto::DataType::FP32);
+    }
+  }
+
+  // insert op
+  auto op = block->AppendOp();
+  op->SetType(type);
+  for (auto &kv : inputs) {
+    op->SetInput(kv.first, kv.second);
+  }
+  for (auto &kv : outputs) {
+    op->SetOutput(kv.first, kv.second);
+  }
+  op->SetAttrMap(attrs);
+}
+
+void StartServerNet(bool is_sparse) {
+  f::Scope scope;
+  p::CPUPlace place;
+  if (is_sparse) {
+    InitSelectedRowsInScope(scope, place);
+  } else {
+    InitTensorsInScope(scope, place);
+  }
+
+  // sub-program run by recv_op; for a simple test we use sum
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
+  // X for server side tensors, RX for received tensors; they must have the same shape.
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);
+
+  f::AttributeMap attrs;
+  attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
+  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
+  attrs.insert({"GradList", std::vector<std::string>({"x1"})});
+  attrs.insert({"OptimizeBlock", block});
+  recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
+  recv_op->Run(scope, place);
+}
+
+TEST(SendRecvOp, CPUDense) {
+  std::thread server_thread(StartServerNet, false);
+  sleep(10);  // wait for the server to start
+  // local net
+  f::Scope scope;
+  p::CPUPlace place;
+  InitTensorsInScope(scope, place);
+
+  f::AttributeMap attrs;
+  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
+  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
+  send_op->Run(scope, place);
+
+  auto in_var = scope.Var("x1");
+  auto tensor = in_var->GetMutable<f::LoDTensor>();
+  float *expected = tensor->data<float>();
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<f::LoDTensor>();
+  // Out = x0 + x1 = 2 * x1, since x0 and x1 were initialized identically
+  EXPECT_NE(target->memory_size(), size_t(0));
+  float *actual = target->data<float>();
+  for (int64_t i = 0; i < target->numel(); ++i) {
+    EXPECT_EQ(expected[i] * 2, actual[i]);
+  }
+  recv_op->Stop();
+  server_thread.join();
+  recv_op.reset(nullptr);
+}
+
+TEST(SendRecvOp, CPUSparse) {
+  std::thread server_thread(StartServerNet, true);
+  sleep(3);  // wait for the server to start
+  // local net
+  f::Scope scope;
+  p::CPUPlace place;
+  p::CPUDeviceContext ctx(place);
+  InitSelectedRowsInScope(scope, place);
+  f::AttributeMap attrs;
+  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
+  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
+  send_op->Run(scope, place);
+
+  auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
+  auto x1 = scope.Var("x1")->GetMutable<f::SelectedRows>();
+  auto out = scope.Var("Out")->GetMutable<f::SelectedRows>();
+  auto actual = out->mutable_value();
+
+  std::unique_ptr<f::SelectedRows> expect{new f::SelectedRows()};
+  auto expect_value = expect->mutable_value();
+  expect_value->mutable_data<float>(f::make_ddim({5, 10}), place);
+
+  m::SelectedRowsAdd<p::CPUDeviceContext, float> add_functor;
+  add_functor(ctx, *x0, *x1, expect.get());
+
+  EXPECT_EQ(actual->numel(), expect_value->numel());
+  EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size());
+
+  for (int64_t i = 0; i < expect_value->numel(); ++i) {
+    EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
+              actual->mutable_data<float>(place)[i]);
+  }
+  recv_op->Stop();
+  server_thread.join();
+  recv_op.reset();
+}
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
new file mode 100644
index 0000000000..2f0aad2003
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_concat_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"),
+                   "Inputs(X) of SequenceConcatOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceConcatOp should not be null.");
+    const size_t level = static_cast<size_t>(ctx->Attrs().Get<int>("level"));
+    const size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
+    PADDLE_ENFORCE(level == 0UL || level == 1UL,
+                   "The sequence_concat operator only accepts sequence "
+                   "or a nested sequence as its input.");
+    auto ins_dims = ctx->GetInputsDim("X");
+    framework::DDim out_dims = ins_dims[0];
+    const size_t n = ins_dims.size();
+    for (size_t i = 1; i < n; ++i) {
+      out_dims[axis] += ins_dims[i][axis];
+    }
+    ctx->SetOutputDim("Out", out_dims);
+  }
+};
+
+class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceConcatOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LodTensorArray) Input is a vector of LoDTensor, "
+             "each of which is a variable-length sequence or nested sequence.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "(LoDTensor), Variable-length output of "
+              "sequence_concat Op.");
+    AddAttr<int>("axis",
+                 "(int, default 0) "
+                 "The axis along which the inputs will be joined. "
+                 "If axis is 0, the inputs will be joined with LoD index.")
+        .SetDefault(0);
+    AddAttr<int>("level",
+                 "(int, default 0) "
+                 "The level at which the inputs will be joined. "
+                 "If the level is 0, the inputs will be joined at the nested "
+                 "sequence level. "
+                 "If the level is 1, the inputs will be joined at the "
+                 "sequence level. "
+                 "The level should be less than the level number of inputs.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+The sequence_concat operator concatenates multiple LoDTensors.
+It only supports a sequence (a LoDTensor with LoD level 1)
+or a nested sequence (a LoDTensor with LoD level 2) as its input.
+- Case1:
+  If the axis is not 0 (here, axis is 1 and level is 1),
+  each input should have the same LoD information, and the LoD
+  information of the output is the same as that of the inputs.
+
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
+  LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
+
+- Case2:
+  If the axis is 0 (here, level is 0), the inputs are concatenated along
+  the time steps, and the LoD information of the output needs to be
+  recomputed. The level-1 LoD information should be the same.
+
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,2,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,4}, {0,2,5,8,11}}; Dims(Out) = (11,3,4)
+
+- Case3:
+  If the axis is 0 (here, level is 1).
+
+  LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,3,4}, {0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,5,8}, {0,1,2,3,5,7,8,9,11}}; Dims(Out) = (11,3,4)
+
+- Case4:
+  If the LoD level number is 1, axis is 0, and level is 0:
+
+  LoD(x0) = {{0,1,2,3,4}}; Dims(x0) = (4,3,4)
+  LoD(x1) = {{0,1,3,5,7}}; Dims(x1) = (7,3,4)
+  LoD(Out) = {{0,2,5,8,11}}; Dims(Out) = (11,3,4)
+
+NOTE: The levels of all the inputs should be the same.
+    )DOC");
+  }
+};
+
+class SequenceConcatGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_EX(sequence_concat, ops::SequenceConcatOp,
+               ops::SequenceConcatOpMaker, sequence_concat_grad,
+               ops::SequenceConcatGradOp, false);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat,
+    ops::SequenceConcatOpKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat_grad,
+    ops::SequenceConcatGradOpKernel<paddle::platform::CPUDeviceContext, float>);
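Aside: a worked sketch of Case 4 from the DOC above, concatenating two level-1 LoDs along axis 0: each output segment length is the sum of the corresponding input segment lengths (standalone, illustrative):

```cpp
// Worked sketch of Case 4 above: concatenating two level-1 LoDs along axis 0.
// Each output segment length is the sum of the corresponding input lengths.
#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> a = {0, 1, 2, 3, 4};  // LoD(x0)
  std::vector<size_t> b = {0, 1, 3, 5, 7};  // LoD(x1)
  std::vector<size_t> out(a.size());
  out[0] = 0;
  for (size_t i = 1; i < a.size(); ++i) {
    // segment i of the output = segment i of x0 followed by segment i of x1
    out[i] = out[i - 1] + (a[i] - a[i - 1]) + (b[i] - b[i - 1]);
  }
  for (size_t v : out) std::printf("%zu ", v);  // prints: 0 2 5 8 11
  std::printf("\n");
  return 0;
}
```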
diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc
new file mode 100644
index 0000000000..144bdb5af6
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_concat_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_concat,
+    ops::SequenceConcatOpKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(sequence_concat_grad,
+                        ops::SequenceConcatGradOpKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
new file mode 100644
index 0000000000..8445224f46
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.h
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename T>
+LoD ConcatLoD(const std::vector<const T*>& ins, const size_t level) {
+  auto out_lod = ins[0]->lod();
+  auto numLevels = ins[0]->NumLevels();
+  const size_t n = ins.size();
+  const size_t level_idx = ins[0]->NumLevels() - 1 - level;
+  for (size_t i = 1; i < n; ++i) {
+    for (size_t j = 0; j < ins[i]->lod()[level_idx].size(); ++j) {
+      out_lod[level_idx][j] += ins[i]->lod()[level_idx][j];
+    }
+  }
+
+  for (size_t i = level_idx; i < numLevels - 1; ++i) {
+    size_t lod_len = 1;
+    for (size_t j = 0; j < n; ++j) {
+      lod_len += ins[j]->lod()[i + 1].size() - 1;
+    }
+    out_lod[i + 1].clear();
+    out_lod[i + 1].resize(lod_len);
+
+    size_t idx = 1;
+    for (size_t j = 0; j < ins[0]->lod()[i].size() - 1; ++j) {
+      for (size_t k = 0; k < n; ++k) {
+        for (size_t m = ins[k]->lod()[i][j]; m < ins[k]->lod()[i][j + 1]; ++m) {
+          out_lod[i + 1][idx] = out_lod[i + 1][idx - 1] +
+                                ins[k]->lod()[i + 1][m + 1] -
+                                ins[k]->lod()[i + 1][m];
+          idx++;
+        }
+      }
+    }
+  }
+
+  return out_lod;
+}
+
+template <typename DeviceContext, typename T>
+class SequenceConcatOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    const size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    const size_t level = static_cast<size_t>(ctx.Attr<int>("level"));
+    const size_t n = ins.size();
+
+    for (size_t i = 1; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(),
+                        "The levels of all the input LoDTensors "
+                        "should be the same.");
+      PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(),
+                        "The dimension size of all the input LoDTensors "
+                        "should be the same.");
+
+      const size_t dims_size = ins[i]->dims().size();
+      for (size_t j = 0; j < dims_size; ++j) {
+        if (j == axis) continue;
+        PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j],
+                          "Except for the dimension of the specified "
+                          "axis along which all the inputs are concatenated, "
+                          "dimensions of all the other axises of the input "
+                          "LoDTensors should be the same.");
+      }
+    }
+    PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level,
+                      "The levels of all the input LoDTensors "
+                      "should be greater than the specify level");
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = ins[0]->lod();
+    if (axis == 0) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
+    out->set_lod(out_lod);
+
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
+    for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
+      Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
+                                static_cast<int>(out_lod_level[i + 1]));
+      auto out_stride = framework::stride(out_t.dims());
+      size_t offset = 0;
+      for (size_t j = 0; j < n; ++j) {
+        auto in_lod_level = framework::ToAbsOffset(ins[j]->lod())[level_idx];
+        auto in_stride = framework::stride(ins[j]->dims());
+        Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
+                                    static_cast<int>(in_lod_level[i + 1]));
+        size_t axis_dim = in_t.dims()[axis];
+        StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
+                         in_t.dims(), out_stride, out_t.data<T>() + offset);
+        offset += axis_dim * in_stride[axis];
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto x_grads =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    size_t level = static_cast<size_t>(ctx.Attr<int>("level"));
+    const size_t n = x_grads.size();
+
+    // Set Grad(X) LoD as X
+    for (size_t i = 0; i < n; i++) {
+      x_grads[i]->set_lod(ins[i]->lod());
+      x_grads[i]->mutable_data<T>(ctx.GetPlace());
+    }
+    auto out_lod = ins[0]->lod();
+    if (axis == 0UL) {
+      out_lod = ConcatLoD<LoDTensor>(ins, level);
+    }
+    const size_t level_idx = out_lod.size() - level - 1;
+    auto out_lod_level = framework::ToAbsOffset(out_lod)[level_idx];
+
+    for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
+      Tensor out_grad_t =
+          out_grad->Slice(static_cast<int>(out_lod_level[i]),
+                          static_cast<int>(out_lod_level[i + 1]));
+      auto out_grad_stride = framework::stride(out_grad_t.dims());
+      size_t offset = 0;
+
+      for (size_t j = 0; j < n; ++j) {
+        auto x_grad_lod_level =
+            framework::ToAbsOffset(x_grads[j]->lod())[level_idx];
+        auto x_grad_stride = framework::stride(x_grads[j]->dims());
+        Tensor x_grad_t =
+            x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
+                              static_cast<int>(x_grad_lod_level[i + 1]));
+        size_t axis_dim = x_grad_t.dims()[axis];
+        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>() + offset,
+                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                         x_grad_t.data<T>());
+        offset += axis_dim * out_grad_stride[axis];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc
new file mode 100644
index 0000000000..c5b7c81bd7
--- /dev/null
+++ b/paddle/operators/sequence_conv_op.cc
@@ -0,0 +1,187 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_conv_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceConvOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                   "Input(Filter) of SequenceConvOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceConvOp should not be null.");
+
+    int context_length = ctx->Attrs().Get<int>("contextLength");
+    int context_start = ctx->Attrs().Get<int>("contextStart");
+
+    auto in_dims = ctx->GetInputDim("X");
+    auto filter_dims = ctx->GetInputDim("Filter");
+    PADDLE_ENFORCE(ctx->Attrs().Get<int>("contextStride") == 1,
+                   "Currently, SequenceConvOp only supports contextStride=1.");
+    PADDLE_ENFORCE(in_dims.size() == 2 && filter_dims.size() == 2,
+                   "Input(X, Filter) should be 2-D tensor.");
+    PADDLE_ENFORCE(filter_dims[0] == context_length * in_dims[1],
+                   "Filter's height should be context_length * "
+                   "input_hidden_size .");
+
+    if (ctx->Attrs().Get<bool>("paddingTrainable")) {
+      PADDLE_ENFORCE(
+          ctx->HasInput("PaddingData"),
+          "Input(PaddingData) of SequenceConvOp should not be null.");
+      framework::DDim padding_dim = ctx->GetInputDim("PaddingData");
+      int up_pad = std::max(0, -context_start);
+      int down_pad = std::max(0, context_start + context_length - 1);
+      int total_pad = up_pad + down_pad;
+      int input_width = static_cast<int>(in_dims[1]);
+
+      if (context_start == 0 && context_length == 1) {
+        PADDLE_THROW(
+            "If context_start is 0 and context_length is 1, paddingTrainable "
+            "should be false.");
+      }
+      PADDLE_ENFORCE(padding_dim.size() == 2,
+                     "Input(PaddingData) should be 2-D tensor.");
+      PADDLE_ENFORCE(
+          padding_dim[0] == total_pad && padding_dim[1] == input_width,
+          "Input(PaddingData)'s shape is not consistent with 'context_start' "
+          "and 'context_length'.");
+    }
+
+    in_dims[1] = filter_dims[1];
+    ctx->SetOutputDim("Out", in_dims);
+    ctx->ShareLoD("X", "Out");
+  }
+};
+
+class SequenceConvGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of output(Out) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "The input(X) should not be null.");
+
+    if (ctx->Attrs().Get<bool>("paddingTrainable") &&
+        ctx->HasOutput(framework::GradVarName("PaddingData"))) {
+      ctx->SetOutputDim(framework::GradVarName("PaddingData"),
+                        ctx->GetInputDim("PaddingData"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+      ctx->ShareLoD("X", framework::GradVarName("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+      ctx->SetOutputDim(framework::GradVarName("Filter"),
+                        ctx->GetInputDim("Filter"));
+    }
+  }
+};
+
+class SequenceConvOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceConvOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(LoDTensor) the input(X) is a LodTensor, which supports "
+        "variable-time length input sequence. The underlying tensor in "
+        "this LoDTensor is a matrix with shape (T, N), where T is the "
+        "total time steps in this mini-batch and N is the input_hidden_size.");
+    AddInput("PaddingData",
+             "(Tensor, optional) the input(PaddingData) is an optional "
+             "parameter, and it is learnable. "
+             "This is a tensor with shape (P, N), where P is the "
+             "top_pad + bottom_pad, N is the input_hidden_size. In order to "
+             "ensure the equal length of sequence before and after "
+             "convolution, it is necessary to fill the top and bottom of each "
+             "sequence according to context_length, context_stride and "
+             "context_start")
+        .AsDispensable();
+    AddInput(
+        "Filter",
+        "(Tensor) the input(Filter) is an learnable parameter."
+        "This is a tensor with shape (K, M), where K is the "
+        "context_length * input_hidden_size, M is the output feature size.");
+    AddOutput(
+        "Out",
+        "(LoDTensor) the output(Out) is a LodTensor, which support "
+        "variable-time length output sequence. The underlying tensor in "
+        "this LoDTensor is a matrix with shape (T, M), where, T is the "
+        "total time steps in this mini-batch, M is the output feature size.");
+
+    AddAttr<bool>("paddingTrainable",
+                  "(bool, default:false) the padding data of SequenceConvOp "
+                  "is trainable or not.")
+        .SetDefault(false);
+    AddAttr<int>("contextLength",
+                 "(int) the contextLength of SequenceConvOp is the "
+                 "height of the convolution kernel.")
+        .GreaterThan(0);
+    AddAttr<int>("contextStart",
+                 "(int, default:0) the contextStart of SequenceConvOp "
+                 "represents the beginning of the convolution of the number of "
+                 "rows of sequence, which can be negative. The negative number "
+                 "means to pad contextStart time-steps of zeros or learnable "
+                 "parameters at the beginning of each instance. The positive "
+                 "number means to skip contextStart time-steps of each "
+                 "instance.")
+        .SetDefault(0);
+    AddAttr<int>("contextStride",
+                 "(int, default:1) the contextStride of SequenceConvOp "
+                 "represents the stride length of convolution kernel. "
+                 "Currently, SequenceConvOp only supports"
+                 "contextStride=1.")
+        .SetDefault(1)
+        .GreaterThan(0);
+
+    AddComment(R"DOC(
+Sequence Conv Operator.
+
+SequenceConvOp performs a convolution over the features of contextLength
+time-steps of each instance. The convolution calculates the output based on
+the input, filter, strides and paddings parameters.
+The size of each dimension of the parameters is checked during infer-shape.
+In order to ensure that each sequence has the same length before and after
+convolution, it is necessary to pad the top and bottom of each sequence based
+on context_length, context_stride and context_start.
+
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker,
+            sequence_conv_grad, ops::SequenceConvGradOp);
+
+REGISTER_OP_CPU_KERNEL(
+    sequence_conv,
+    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceConvKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_conv_grad,
+    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CPUDeviceContext, double>);
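The kernels registered above derive the padding rows from contextStart and contextLength: up_pad = max(0, -contextStart) rows at the top, down_pad = max(0, contextStart + contextLength - 1) rows at the bottom, and the im2col buffer gives every time-step one full context window. Below is a minimal standalone C++ sketch of that arithmetic; the concrete sizes are made up and nothing in it is part of the patch.

    // Sketch of SequenceConvOp's padding arithmetic; names mirror the kernel
    // code above, but the values are hypothetical.
    #include <algorithm>
    #include <cassert>

    int main() {
      int context_start = -1;  // window starts one step before the current row
      int context_length = 3;  // height of the convolution kernel
      int up_pad = std::max(0, -context_start);                        // = 1
      int down_pad = std::max(0, context_start + context_length - 1);  // = 1
      // PaddingData has shape (P, N) with P = top_pad + bottom_pad.
      assert(up_pad + down_pad == 2);
      // The im2col buffer has shape (T, context_length * N): each time-step
      // row holds its whole context window.
      int T = 7, N = 4;
      int col_rows = T, col_cols = context_length * N;
      assert(col_rows == 7 && col_cols == 12);
      return 0;
    }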
diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc
new file mode 100644
index 0000000000..0b8f2c6955
--- /dev/null
+++ b/paddle/operators/sequence_conv_op.cu.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_conv_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_conv,
+    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceConvKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_conv_grad,
+    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceConvGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h
new file mode 100644
index 0000000000..bb584b7bfa
--- /dev/null
+++ b/paddle/operators/sequence_conv_op.h
@@ -0,0 +1,160 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/context_project.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class SequenceConvKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    auto filter = *context.Input<Tensor>("Filter");
+
+    out->mutable_data<T>(context.GetPlace());
+    context.ShareLoD("X", "Out");
+
+    int context_start = context.Attr<int>("contextStart");
+    int context_length = context.Attr<int>("contextLength");
+    int context_stride = context.Attr<int>("contextStride");
+    bool padding_trainable = context.Attr<bool>("paddingTrainable");
+
+    PADDLE_ENFORCE_EQ(in->lod().size(), 1UL,
+                      "Only support one level sequence now.");
+
+    const Tensor* padding_data = nullptr;
+    if (padding_trainable) {
+      padding_data = context.Input<Tensor>("PaddingData");
+    }
+
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    int sequence_width = static_cast<int>(in->dims()[1]);
+
+    framework::DDim col_shape = {in->dims()[0],
+                                 context_length * sequence_width};
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    // If padding_trainable is false, the padding data should be zeros.
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    set_zero(dev_ctx, &col, static_cast<T>(0));
+
+    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
+
+    seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+                        context_start, context_length, context_stride, up_pad,
+                        down_pad, &col);
+
+    math::matmul<DeviceContext, T>(dev_ctx, col, false, filter, false,
+                                   static_cast<T>(1.0), out,
+                                   static_cast<T>(0.0));
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceConvGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto* out_g = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* filter_g = context.Output<Tensor>(framework::GradVarName("Filter"));
+    auto* padding_data_g =
+        context.Output<Tensor>(framework::GradVarName("PaddingData"));
+    auto* in = context.Input<LoDTensor>("X");
+    auto* filter = context.Input<Tensor>("Filter");
+
+    int context_start = context.Attr<int>("contextStart");
+    int context_length = context.Attr<int>("contextLength");
+    int context_stride = context.Attr<int>("contextStride");
+    bool padding_trainable = context.Attr<bool>("paddingTrainable");
+
+    PADDLE_ENFORCE_EQ(in->lod().size(), 1UL,
+                      "Only support one level sequence now.");
+    auto lod_g_level_0 = in->lod()[0];
+
+    int up_pad = std::max(0, -context_start);
+    int down_pad = std::max(0, context_start + context_length - 1);
+    int sequence_width = static_cast<int>(in->dims()[1]);
+
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    // use col_shape in the im2col calculation
+    framework::DDim col_shape = {in->dims()[0],
+                                 sequence_width * context_length};
+    Tensor col;
+
+    if (in_g || filter_g || (padding_trainable && padding_data_g)) {
+      col.mutable_data<T>(col_shape, context.GetPlace());
+      // If padding_trainable is false, the padding data should be zeros.
+      set_zero(dev_ctx, &col, static_cast<T>(0));
+      math::matmul<DeviceContext, T>(dev_ctx, *out_g, false, *filter, true,
+                                     T(1.0), &col, T(1.0));
+    }
+    math::ContextProjectFunctor<DeviceContext, T> seq_project_functor;
+    math::ContextProjectGradFunctor<DeviceContext, T> seq_project_grad_functor;
+
+    if (in_g) {
+      in_g->mutable_data<T>(context.GetPlace());
+      in_g->set_lod(in->lod());
+      set_zero(dev_ctx, in_g, static_cast<T>(0));
+
+      seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start,
+                               context_length, context_stride, up_pad, down_pad,
+                               false, true, padding_data_g, &col);
+    }
+
+    if (padding_trainable && padding_data_g) {
+      padding_data_g->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, padding_data_g, static_cast<T>(0));
+
+      LoDTensor* input = const_cast<LoDTensor*>(in);
+      seq_project_grad_functor(
+          dev_ctx, *input, padding_trainable, context_start, context_length,
+          context_stride, up_pad, down_pad, true, false, padding_data_g, &col);
+    }
+
+    if (filter_g) {
+      filter_g->mutable_data<T>(context.GetPlace());
+      set_zero(dev_ctx, filter_g, static_cast<T>(0));
+
+      Tensor filter_grad = *filter_g;
+      LoDTensor out_grad = *out_g;
+
+      const Tensor* padding_data = nullptr;
+      if (padding_trainable) {
+        padding_data = context.Input<Tensor>("PaddingData");
+      }
+
+      seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable,
+                          context_start, context_length, context_stride, up_pad,
+                          down_pad, &col);
+
+      math::matmul<DeviceContext, T>(dev_ctx, col, true, out_grad, false,
+                                     T(1.0), &filter_grad, T(1.0));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
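The gradient kernel above reduces to two matrix products: d(Col) = d(Out) x Filter^T (the matmul with transB = true) and d(Filter) += Col^T x d(Out) (the matmul with transA = true). The plain-loop sketch below spells out the same shape contract with made-up sizes, purely to make the transpose flags concrete; it is not Paddle code.

    // Hedged sketch of the two matmuls in SequenceConvGradKernel; T, K, M
    // and all data are hypothetical.
    #include <vector>

    int main() {
      const int T = 7, K = 12, M = 5;  // time-steps, context_length*N, out features
      std::vector<float> col(T * K, 1.f), d_out(T * M, 1.f);
      std::vector<float> d_col(T * K, 0.f), filter(K * M, 1.f), d_filter(K * M, 0.f);
      // d_col[T,K] = d_out[T,M] x filter[K,M]^T   (transB = true)
      for (int t = 0; t < T; ++t)
        for (int k = 0; k < K; ++k)
          for (int m = 0; m < M; ++m)
            d_col[t * K + k] += d_out[t * M + m] * filter[k * M + m];
      // d_filter[K,M] += col[T,K]^T x d_out[T,M]  (transA = true)
      for (int k = 0; k < K; ++k)
        for (int m = 0; m < M; ++m)
          for (int t = 0; t < T; ++t)
            d_filter[k * M + m] += col[t * K + k] * d_out[t * M + m];
      return 0;
    }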
diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/operators/sequence_erase_op.cc
new file mode 100644
index 0000000000..aa0c00aa6f
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.cc
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_erase_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceEraseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceEraseOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceEraseOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
+                   "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
+                   "with the 2nd dimension equal to 1.");
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(2-D LoDTensor with the 2nd dim. equal to 1) "
+             "Input LoDTensor of SequenceEraseOp.");
+    AddOutput("Out",
+              "(2-D LoDTensor with the 2nd dim. equal to 1) "
+              "Output LoDTensor of SequenceEraseOp.");
+    AddAttr<std::vector<int>>("tokens",
+                              "(vector<int>) Tokens need to be erased from "
+                              "input sequences.");
+    AddComment(R"DOC(
+Sequence Erase Operator.
+
+Sequence erase operator erases tokens specified by Attr(tokens) from the input
+sequences Input(X), outputs the remaining data, and modifies the LoD
+information accordingly. For example, given a 2-D LoDTensor
+
+    X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
+
+with lod = [[0, 3, 6, 10]], there are three sequences in the input:
+   
+     X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T.
+
+If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing 
+operation, the three sequences become
+
+    X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
+
+Hence the LoDTensor Output(Out) should be
+
+    Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
+
+with lod = [[0, 1, 3, 7]].
+
+A typical usage of this operator is to remove special tokens, such as the
+blank, start, and end tokens, when computing the edit distance between two
+strings.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
+                             ops::SequenceEraseOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sequence_erase,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
new file mode 100644
index 0000000000..f1e3b96acd
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.cu
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/sequence_erase_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void LabelErasedIdx(const T* in_dat, const int64_t in_len,
+                               const int* tokens, const size_t tokens_len,
+                               size_t* num_erased) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < in_len) {
+    for (size_t i = 0; i < tokens_len; ++i) {
+      if (in_dat[index] == tokens[i]) {
+        num_erased[index + 1] = 1;
+        break;
+      }
+    }
+  }
+}
+
+__global__ void GetOutLod(const size_t* num_erased, const size_t* in_lod,
+                          const size_t lod_len, size_t* out_lod0) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < lod_len) {
+    out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
+  }
+}
+
+template <typename T>
+__global__ void SetOutput(const T* in_dat, const int64_t in_len,
+                          const size_t* num_erased, T* out_dat) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < in_len) {
+    if (num_erased[index] == num_erased[index + 1]) {
+      out_dat[index - num_erased[index]] = in_dat[index];
+    }
+  }
+}
+
+template <typename T>
+class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = in->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+                      "The actual size mismatches with the LoD information.");
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
+    auto in_len = in->numel();
+    auto in_dat = in->data<T>();
+    // Copy tokens to GPU
+    thrust::device_vector<int> dev_tokens(tokens.begin(), tokens.end());
+    int* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
+
+    // Count number of elements to be erased
+    thrust::device_vector<size_t> num_erased(in_len + 1, 0);
+    size_t* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
+    auto stream = ctx.cuda_device_context().stream();
+    LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                     PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        in_dat, in_len, dev_tokens_ptr, tokens.size(), num_erased_ptr);
+    thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
+                           num_erased.begin() + 1);
+
+    // Copy LoD to GPU
+    auto lod0 = lod[0];
+    auto lod_len = lod0.size();
+    thrust::device_vector<size_t> dev_in_lod = lod0;
+    size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
+
+    // Calc output LoD
+    thrust::device_vector<size_t> dev_out_lod(lod_len);
+    size_t* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
+    GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
+
+    // Set LoD for output
+    thrust::host_vector<size_t> out_lod0 = dev_out_lod;
+    framework::LoD out_lod;
+    out_lod.push_back(out_lod0);
+    out->set_lod(out_lod);
+
+    // Set output
+    out->Resize({static_cast<int64_t>(out_lod0.back()), 1});
+    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
+    SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
+                                                      num_erased_ptr, out_dat);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(sequence_erase,
+                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>,
+                        paddle::operators::SequenceEraseOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/sequence_erase_op.h b/paddle/operators/sequence_erase_op.h
new file mode 100644
index 0000000000..cb2d7be009
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SequenceEraseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    auto lod = in->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+                      "The actual size mismatches with the LoD information.");
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
+    auto in_len = in->numel();
+    auto in_dat = in->data<T>();
+    auto lod0 = lod[0];
+
+    std::vector<size_t> num_erased(in_len + 1, 0);
+    std::vector<size_t> out_lod0(1, 0);
+    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+      size_t num_out = 0;
+      for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) {
+        num_erased[j] = num_erased[j - 1];
+        if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
+            tokens.end()) {
+          num_erased[j] += 1;
+        } else {
+          num_out += 1;
+        }
+      }
+      out_lod0.push_back(out_lod0.back() + num_out);
+    }
+
+    auto out_len = in_len - num_erased[in_len];
+    out->Resize({static_cast<int64_t>(out_len), 1});
+    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
+
+    for (int64_t i = 0; i < in_len; ++i) {
+      if (num_erased[i] == num_erased[i + 1]) {
+        out_dat[i - num_erased[i]] = in_dat[i];
+      }
+    }
+    framework::LoD out_lod;
+    out_lod.push_back(out_lod0);
+    out->set_lod(out_lod);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
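Both the CPU and CUDA kernels above rely on the same prefix sum: num_erased[i] counts erased elements before position i, a kept element moves to position i - num_erased[i], and each LoD boundary maps to in_lod[i] - num_erased[in_lod[i]]. A self-contained sketch reproducing the example from the DOC comment in sequence_erase_op.cc under that scheme (illustrative only, not part of the patch):

    // Host-side sketch of the sequence_erase prefix-sum scheme.
    #include <algorithm>
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<int> x = {2, 2, 6, 1, 3, 9, 6, 1, 0, 1};
      std::vector<size_t> lod = {0, 3, 6, 10};
      std::vector<int> tokens = {2, 3, 5};
      // num_erased[j] = number of erased elements among the first j inputs.
      std::vector<size_t> num_erased(x.size() + 1, 0);
      for (size_t j = 1; j <= x.size(); ++j) {
        bool hit = std::find(tokens.begin(), tokens.end(), x[j - 1]) != tokens.end();
        num_erased[j] = num_erased[j - 1] + (hit ? 1 : 0);
      }
      // A kept element at position i lands at i - num_erased[i].
      std::vector<int> out;
      for (size_t i = 0; i < x.size(); ++i)
        if (num_erased[i] == num_erased[i + 1]) out.push_back(x[i]);
      // Each LoD boundary shifts left by the erasures before it.
      std::vector<size_t> out_lod(lod.size());
      for (size_t i = 0; i < lod.size(); ++i) out_lod[i] = lod[i] - num_erased[lod[i]];
      assert((out == std::vector<int>{6, 1, 9, 6, 1, 0, 1}));
      assert((out_lod == std::vector<size_t>{0, 1, 3, 7}));
      return 0;
    }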
diff --git a/paddle/operators/sequence_expand_op.cc b/paddle/operators/sequence_expand_op.cc
new file mode 100644
index 0000000000..d34dbd35b6
--- /dev/null
+++ b/paddle/operators/sequence_expand_op.cc
@@ -0,0 +1,153 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_expand_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SequenceExpandOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"));
+    PADDLE_ENFORCE(ctx->HasOutput("Out"));
+    PADDLE_ENFORCE(ctx->HasInput("Y"));
+    framework::DDim out_dim;
+    out_dim = ctx->GetInputDim("Y");
+    ctx->ShareLoD("Y", "Out");
+    ctx->SetOutputDim("Out", out_dim);
+  }
+};
+
+class SequenceExpandOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceExpandOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor or LoDTensor) The input(X) of this operator can be a "
+             "LoDTensor or a base Tensor.");
+    AddInput("Y",
+             "(LoDTensor)The reference input(Y) of sequence_expand op."
+             "It must be a LoDTensor with k-level(k>0)."
+             "The input(X) will be expanded according to LOD of input(Y)."
+             "The element numbers of last level in input(Y) "
+             "must be equal to dims[0] of input(X).");
+    AddOutput("Out",
+              "(LodTensor)The output of sequence_expand op."
+              "The lod of output will be as same as input(Y)'s lod.");
+    AddComment(R"DOC(
+Sequence Expand Operator.
+
+This operator expands input(X) according to the LoD of input(Y).
+The following cases explain how this works:
+Case 1:
+
+Given a 2-level LoDTensor input(X)
+    X.lod = [[0,       2, 3],
+             [0, 1,    3, 4]]
+    X.data = [a, b, c, d]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0,    2,    4],
+             [0, 3, 6, 7, 8]]
+with condition len(Y.lod[-1]) - 1 == X.dims[0]
+then we get 2-level LoDTensor
+    Out.lod = [[0,                2,    4],
+               [0,       3,       6, 7, 8]]
+    Out.data = [a, a, a, b, b, b, c, d]
+    Out.dims = [8, 1]
+
+Case 2:
+
+Given a common Tensor input(X)
+    X.data = [a, b, c]
+    X.dims = [3, 1]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+with condition len(Y.lod[-1]) - 1 == X.dims[0]
+then we get 1-level LoDTensor
+    Out.lod = [[0,    2, 3,      6]]
+    Out.data = [a, a, b, c, c, c]
+    Out.dims = [6, 1]
+
+Case 3:
+
+Given a common Tensor input(X)
+    X.data = [[a, b], [c, d], [e, f]]
+    X.dims = [3, 2]
+and input(Y)
+    Y.lod = [[0, 2, 3, 6]]
+with condition len(Y.lod[-1]) - 1 == X.dims[0]
+then we get 1-level LoDTensor
+    Out.lod = [[0,           2,     3,                     6]]
+    Out.data = [[a, b], [a, b], [c, d], [e, f], [e, f], [e, f]]
+    Out.dims = [6, 2]
+
+Case 4:
+
+Given a 2-level LoDTensor input(X)
+    X.lod = [[0,       2, 3],
+             [0, 1,    3, 4]]
+    X.data = [a, b, c, d]
+    X.dims = [4, 1]
+and input(Y)
+    Y.lod = [[0,    2,    4],
+             [0, 3, 6, 6, 8]]
+with condition len(Y.lod[-1]) - 1 == X.dims[0]
+then we get 2-level LoDTensor
+    Out.lod = [[0,                2,    4],
+               [0,       3,       6, 6, 8]]
+    Out.data = [a, a, a, b, b, b, d, d]
+    Out.dims = [8, 1]
+
+
+)DOC");
+  }
+};
+
+class SequenceExpandOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"));
+    PADDLE_ENFORCE(ctx->HasInput("Out"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_grad_name = framework::GradVarName("X");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_expand, ops::SequenceExpandOp, ops::SequenceExpandOpMaker,
+            sequence_expand_grad, ops::SequenceExpandOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CPUDeviceContext, float>);
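Stripped of the framework plumbing, the forward pass simply repeats row i of X exactly Y.lod[-1][i+1] - Y.lod[-1][i] times. A standalone sketch of Case 2 from the DOC above, with strings standing in for rows; the data is hypothetical and this is not Paddle code.

    // Sketch of sequence_expand, Case 2: X.data = [a, b, c], Y.lod = [[0, 2, 3, 6]].
    #include <cassert>
    #include <string>
    #include <vector>

    int main() {
      std::vector<std::string> x = {"a", "b", "c"};  // X.dims = [3, 1]
      std::vector<size_t> y_lod = {0, 2, 3, 6};      // last LoD level of Y
      assert(y_lod.size() - 1 == x.size());          // required condition
      std::vector<std::string> out;
      for (size_t i = 0; i + 1 < y_lod.size(); ++i) {
        size_t repeat = y_lod[i + 1] - y_lod[i];
        for (size_t r = 0; r < repeat; ++r) out.push_back(x[i]);
      }
      assert((out == std::vector<std::string>{"a", "a", "b", "c", "c", "c"}));
      return 0;
    }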
diff --git a/paddle/operators/sequence_expand_op.cu b/paddle/operators/sequence_expand_op.cu
new file mode 100644
index 0000000000..0b9638b2ce
--- /dev/null
+++ b/paddle/operators/sequence_expand_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sequence_expand_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand,
+    ops::SequenceExpandKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_expand_grad,
+    ops::SequenceExpandGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_expand_op.h b/paddle/operators/sequence_expand_op.h
new file mode 100644
index 0000000000..6021526eee
--- /dev/null
+++ b/paddle/operators/sequence_expand_op.h
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class SequenceExpandKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    const T* x_data = x->data<T>();
+    auto x_dims = x->dims();
+    auto* y = context.Input<LoDTensor>("Y");
+    PADDLE_ENFORCE(!y->lod().empty(), "y should have lod");
+    PADDLE_ENFORCE_EQ(static_cast<size_t>(x_dims[0]),
+                      y->lod().back().size() - 1,
+                      "The size of last lod level in Input(Y)"
+                      "must be equal to dims[0] of Input(X).");
+    out->set_lod(y->lod());
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
+
+    for (size_t i = 0; i < out_starts.size() - 1; i++) {
+      int scale = out_starts[i + 1] - out_starts[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          x_t(x_data, 1, element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          out_t(out_data, scale, element_len);
+      Eigen::array<int, 2> cast({{scale, 1}});
+      out_t.device(*place) = x_t.broadcast(cast);
+      x_data += element_len;
+      out_data += element_len * scale;
+    }
+  }
+};
+
+/*
+ *Given Grad(Out)
+ *
+ *    Grad(Out).lod = [[0,                            2],
+ *                     [0,              3,            6]]
+ *    Grad(Out).data = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
+ * Then
+ *    Grad(X).data = [(0.1 + 0.2 + 0.3), (0.4 + 0.5 + 0.6)]
+ *                 = [0.6, 1.5]
+ *    Grad(X).lod = Input(X).lod
+ *
+ * */
+template <typename DeviceContext, typename T>
+class SequenceExpandGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = context.Input<LoDTensor>("X");
+    auto* out = context.Input<LoDTensor>("Out");
+    auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+    auto out_last_level = out->lod().back();
+    d_x->set_lod(x->lod());
+    const T* d_out_data = d_out->data<T>();
+    T* d_x_data = d_x->mutable_data<T>(context.GetPlace());
+    size_t element_len = d_out->numel() / d_out->dims()[0];
+    for (size_t i = 0; i < out_last_level.size() - 1; ++i) {
+      size_t repeat = out_last_level[i + 1] - out_last_level[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+      d_out_t(d_out_data, static_cast<int>(repeat), element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, Eigen::DenseIndex>>
+      d_x_t(d_x_data, static_cast<int>(element_len));
+      auto place =
+          context.template device_context<DeviceContext>().eigen_device();
+      d_x_t.device(*place) = d_out_t.sum(Eigen::array<int, 1>({{0}}));
+      d_out_data += (repeat * element_len);
+      d_x_data += element_len;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
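The gradient kernel is the transpose of that broadcast: each row of Grad(X) sums its repeated rows in Grad(Out). A small sketch checking the numeric example from the comment block above (not part of the patch):

    // Sketch of SequenceExpandGradKernel's reduction on the comment's example.
    #include <cassert>
    #include <cmath>
    #include <vector>

    int main() {
      std::vector<double> d_out = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6};
      std::vector<size_t> out_lod = {0, 3, 6};  // last LoD level of Out
      std::vector<double> d_x;
      for (size_t i = 0; i + 1 < out_lod.size(); ++i) {
        double s = 0;
        for (size_t j = out_lod[i]; j < out_lod[i + 1]; ++j) s += d_out[j];
        d_x.push_back(s);  // Grad(X)[i] = sum of its repeated rows
      }
      assert(std::abs(d_x[0] - 0.6) < 1e-9 && std::abs(d_x[1] - 1.5) < 1e-9);
      return 0;
    }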
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
new file mode 100644
index 0000000000..549d9620ef
--- /dev/null
+++ b/paddle/operators/sequence_pool_op.cc
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequencePoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequencePoolOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequencePoolOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    if (ctx->Attrs().Get<std::string>("pooltype") == "MAX") {
+      PADDLE_ENFORCE(ctx->HasOutput("MaxIndex"),
+                     "Output(MaxIndex) of SequencePoolOp should not be null.");
+      ctx->SetOutputDim("MaxIndex", ctx->GetInputDim("X"));
+    }
+  }
+};
+
+class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequencePoolOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The variable-length input of SequencePoolOp");
+    AddOutput("Out",
+              "(Tensor) The output of SequencePoolOp does not contain LoD "
+              "infomation.");
+    AddOutput("MaxIndex",
+              "(Tensor<int>) This tensor is used for the sequence max-pooling "
+              "to record the max indexes.")
+        .AsIntermediate();
+    AddAttr<std::string>(
+        "pooltype",
+        "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.")
+        .SetDefault("AVERAGE")
+        .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"});
+    AddComment(R"DOC(
+Sequence Pool Operator.
+
+The SequencePoolOp pools features of all time-steps of each instance.
+It supports six pooling types:
+1. AVERAGE: $$Out[i] = \frac{\sum_j X_{ij}}{N_i}$$, with $N_i$ the length of sequence i
+2. SUM:     $$Out[i] = \sum_j X_{ij}$$
+3. SQRT:    $$Out[i] = \frac{\sum_j X_{ij}}{\sqrt{N_i}}$$
+4. LAST:    Out[i] = the last instance in the i-th sequence X[i]
+5. FIRST:   Out[i] = the first instance in the i-th sequence X[i]
+6. MAX:     $$Out[i] = \max_j(X_{ij})$$
+
+The following example explains how this works:
+For a mini-batch of 3 variable-length sentences,
+containing 2, 3, and 2 time-steps:
+
+Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
+Besides, for the sake of simplicity, we assume M=1 and N=1,
+and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
+
+Thus, Out is a [3,1,1] Tensor without LoD information.
+And for different pooltype, the value of Out is as follows:
+
+- AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+- SUM: [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+- SQRT: [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+           6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+- MAX: [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+- LAST: [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+- FIRST: [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
+    )DOC");
+  }
+};
+
+class SequencePoolGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null.");
+    auto og_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE_EQ(og_dims.size(), x_dims.size(),
+                      "The rank of the output gradient must equal the rank "
+                      "of Input(X).");
+    for (int64_t i = 1; i < og_dims.size(); ++i) {
+      PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
+    }
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequencePoolGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType("sequence_pool_grad");
+    op_desc_ptr->SetInput("X", Input("X"));
+    if (boost::get<std::string>(GetAttr("pooltype")) == "MAX") {
+      op_desc_ptr->SetInput("MaxIndex", Output("MaxIndex"));
+    }
+    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op_desc_ptr->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker,
+                  ops::SequencePoolGradOpMaker);
+REGISTER_OPERATOR(sequence_pool_grad, ops::SequencePoolGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_pool,
+    ops::SequencePoolKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::CPUDeviceContext, float>);
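For the non-MAX pool types, SequencePoolKernel reduces each sequence slice with a single Eigen expression. The host-side sketch below computes the same six reductions for the DOC example X = [[1, 3], [2, 4, 6], [5, 1]], using plain loops instead of Eigen; it is illustrative only, not part of the patch.

    // Sketch of the six SequencePoolOp reductions on the DOC example.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<double> x = {1, 3, 2, 4, 6, 5, 1};  // flattened sequences
      std::vector<size_t> lod = {0, 2, 5, 7};         // X->lod()[0]
      for (size_t i = 0; i + 1 < lod.size(); ++i) {
        size_t h = lod[i + 1] - lod[i];  // length of the i-th sequence
        double sum = 0, mx = x[lod[i]];
        for (size_t j = lod[i]; j < lod[i + 1]; ++j) {
          sum += x[j];
          mx = std::max(mx, x[j]);
        }
        std::printf("seq %zu: AVERAGE=%.2f SUM=%.2f SQRT=%.2f MAX=%.2f "
                    "LAST=%.2f FIRST=%.2f\n",
                    i, sum / h, sum, sum / std::sqrt(double(h)), mx,
                    x[lod[i + 1] - 1], x[lod[i]]);
      }
      return 0;
    }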
diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu
new file mode 100644
index 0000000000..265f695935
--- /dev/null
+++ b/paddle/operators/sequence_pool_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sequence_pool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_pool,
+    ops::SequencePoolKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_pool_grad,
+    ops::SequencePoolGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
new file mode 100644
index 0000000000..7519aa1d72
--- /dev/null
+++ b/paddle/operators/sequence_pool_op.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class SequencePoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    std::string pooltype = context.Attr<std::string>("pooltype");
+
+    auto dims = in->dims();
+    auto lod = in->lod();
+    int64_t w = in->numel() / dims[0];
+
+    // InferShape by lod
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_GE(
+        dims[0],
+        /*batch size = */ static_cast<int64_t>(lod[0].size() - 1),
+        "The first dimension of Input(X) must be large than batch size.");
+    dims[0] = lod[0].size() - 1;
+    out->Resize({dims});
+
+    auto lod_level_0 = lod[0];
+
+    out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolFunctor<DeviceContext, T> max_pool;
+      auto* index = context.Output<Tensor>("MaxIndex");
+      index->Resize({dims});
+      index->mutable_data<int>(context.GetPlace());
+      max_pool(dev_ctx, *in, out, index);
+      return;
+    }
+
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
+      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
+                              static_cast<int>(lod_level_0[i + 1]));
+      Tensor out_t = out->Slice(i, i + 1);
+      int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
+      auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
+      auto out_e = EigenVector<T>::Flatten(out_t);
+
+      if (pooltype == "AVERAGE") {
+        out_e.device(place) = in_e.mean(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SUM") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
+      } else if (pooltype == "SQRT") {
+        out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
+                              std::sqrt(static_cast<T>(h));
+      } else if (pooltype == "LAST") {
+        out_e.device(place) = in_e.chip(h - 1, 0);
+      } else if (pooltype == "FIRST") {
+        out_e.device(place) = in_e.chip(0, 0);
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequencePoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out_g = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* in_g = context.Output<LoDTensor>(framework::GradVarName("X"));
+    std::string pooltype = context.Attr<std::string>("pooltype");
+
+    auto dims = in->dims();
+    auto lod = in->lod()[0];
+    int64_t w = in->numel() / dims[0];
+
+    in_g->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+
+    if (pooltype == "MAX") {
+      math::MaxSeqPoolGradFunctor<DeviceContext, T> max_pool_grad;
+      auto* index = context.Input<Tensor>("MaxIndex");
+      max_pool_grad(dev_ctx, *out_g, *index, in_g);
+      return;
+    }
+
+    if (pooltype == "LAST" || pooltype == "FIRST") {
+      // set X@Grad be zero at first when pooltype is LAST/FIRST
+      math::SetConstant<DeviceContext, T> functor;
+      functor(dev_ctx, in_g, 0);
+    }
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
+      auto in_g_t =
+          in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+      auto out_g_t = out_g->Slice(i, i + 1);
+      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
+      auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
+      auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
+      auto out_g_e_v = EigenVector<T>::Flatten(out_g_t);
+      Eigen::DSizes<int, 2> bcast(h, 1);
+
+      if (pooltype == "AVERAGE") {
+        in_g_e.device(place) = (out_g_e / static_cast<T>(h)).broadcast(bcast);
+      } else if (pooltype == "SUM") {
+        in_g_e.device(place) = (out_g_e).broadcast(bcast);
+      } else if (pooltype == "SQRT") {
+        in_g_e.device(place) =
+            (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
+      } else if (pooltype == "LAST") {
+        in_g_e.chip(h - 1, 0).device(place) = out_g_e_v;
+      } else if (pooltype == "FIRST") {
+        in_g_e.chip(0, 0).device(place) = out_g_e_v;
+      } else {
+        PADDLE_THROW("unsupported pooling pooltype");
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_reshape_op.cc b/paddle/operators/sequence_reshape_op.cc
new file mode 100644
index 0000000000..d89a46a712
--- /dev/null
+++ b/paddle/operators/sequence_reshape_op.cc
@@ -0,0 +1,135 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/operators/sequence_reshape_op.h"
+#include "paddle/framework/ddim.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceReshapeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceReshapeOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    auto x_numel = product(x_dims);
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2U, "Rank of Input(X) should be 2.");
+    int new_dim = ctx->Attrs().Get<int>("new_dim");
+    if (ctx->IsRuntime()) {
+      ctx->SetOutputDim("Out",
+                        {x_numel / new_dim, static_cast<int64_t>(new_dim)});
+    } else {
+      // At compile time the batch size is undetermined, so it is set to -1.
+      ctx->SetOutputDim("Out", {-1, static_cast<int64_t>(new_dim)});
+    }
+  }
+};
+
+class SequenceReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceReshapeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with shape "
+             "being [N, M].");
+    AddOutput("Out",
+              "(LoDTensor, default LoDTensor<float>) A 2-D LoDTensor with "
+              "shape [T, new_dim] where T is calculated based on X.lod, M and "
+              "new_dim.");
+    AddAttr<int>("new_dim", "Sequence dimension of the output LoDTensor.");
+    AddComment(R"DOC(
+Sequence Reshape Operator.
+
+This operator reshapes the input sequences. The new dimension is set by the
+attribute new_dim; the length of each sequence may become longer or shorter,
+depending on the original length, the original dimension, and the new
+dimension. The following example illustrates what this operator does:
+
+x is a LoDTensor:
+    x.lod  = [[0, 2, 6]]
+    x.data = [[1, 2], [3, 4],
+              [5, 6], [7, 8], [9, 10], [11, 12]]
+    x.dims = [6, 2]
+
+set new_dim = 4
+
+then out is a LoDTensor:
+    out.lod  = [[0, 1, 3]]
+    out.data = [[1, 2, 3, 4],
+                [5, 6, 7, 8], [9, 10, 11, 12]]
+    out.dims = [3, 4]
+
+Currently, only 1-level LoDTensors are supported. Please make sure that
+(original length * original dimension) is divisible by new_dim for each
+sequence.
+
+)DOC");
+  }
+};
+
+class SequenceReshapeGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceReshapeGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceReshapeGradOp should  not be null.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
+  }
+};
+
+class SequenceReshapeGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op_desc_ptr = new framework::OpDesc();
+    op_desc_ptr->SetType("sequence_reshape_grad");
+    op_desc_ptr->SetInput("X", Input("X"));
+    op_desc_ptr->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op_desc_ptr->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op_desc_ptr->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op_desc_ptr);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sequence_reshape, ops::SequenceReshapeOp,
+                  ops::SequenceReshapeOpMaker, ops::SequenceReshapeGradOpMaker);
+REGISTER_OPERATOR(sequence_reshape_grad, ops::SequenceReshapeGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SequenceReshapeKernel<paddle::platform::CPUDeviceContext, int64_t>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_reshape_grad,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int64_t>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CPUDeviceContext, int>);
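The only subtle part of the kernels registered above is the LoD recomputation: a sequence of length L over width M becomes L * M / new_dim rows, and the product must divide evenly. A standalone sketch of that arithmetic for the DOC example (in_width = 2, new_dim = 4); not part of the patch.

    // Sketch of SequenceReshapeKernel's LoD recomputation.
    #include <cassert>
    #include <vector>

    int main() {
      std::vector<size_t> in_lod = {0, 2, 6};  // two sequences of lengths 2 and 4
      const size_t in_width = 2, new_dim = 4;
      std::vector<size_t> out_lod = {0};
      for (size_t i = 0; i + 1 < in_lod.size(); ++i) {
        size_t seq_len = in_lod[i + 1] - in_lod[i];
        assert(seq_len * in_width % new_dim == 0);  // divisibility requirement
        out_lod.push_back(out_lod.back() + seq_len * in_width / new_dim);
      }
      assert((out_lod == std::vector<size_t>{0, 1, 3}));  // matches the DOC
      return 0;
    }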
diff --git a/paddle/operators/sequence_reshape_op.cu b/paddle/operators/sequence_reshape_op.cu
new file mode 100644
index 0000000000..d9c2f7e9a4
--- /dev/null
+++ b/paddle/operators/sequence_reshape_op.cu
@@ -0,0 +1,30 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_reshape_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reshape,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SequenceReshapeKernel<paddle::platform::CUDADeviceContext, int64_t>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_reshape_grad,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext,
+                                   int64_t>,
+    ops::SequenceReshapeGradKernel<paddle::platform::CUDADeviceContext, int>);
diff --git a/paddle/operators/sequence_reshape_op.h b/paddle/operators/sequence_reshape_op.h
new file mode 100644
index 0000000000..aaae7ab292
--- /dev/null
+++ b/paddle/operators/sequence_reshape_op.h
@@ -0,0 +1,86 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+template <typename DeviceContext, typename T>
+class SequenceReshapeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<LoDTensor>("X");
+    auto* out = context.Output<LoDTensor>("Out");
+    int out_width = context.Attr<int>("new_dim");
+
+    auto in_dims = in->dims();
+    int64_t in_width = in_dims[1];
+    auto& in_lod = in->lod();
+
+    PADDLE_ENFORCE_EQ(in_lod.size(), 1UL,
+                      "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(
+        (uint64_t)in_dims[0], in_lod[0].back(),
+        "Inconsistent size between X.shape[0] and X.lod()[0].back().");
+
+    auto in_lod_l0 = in_lod[0];
+    int seq_num = in_lod_l0.size() - 1;
+
+    if (in_width == out_width) {
+      out->set_lod(in->lod());
+    } else {
+      auto& out_lod = *out->mutable_lod();
+      out_lod.resize(1);
+      out_lod[0].resize(seq_num + 1);
+      out_lod[0][0] = 0;
+      for (int i = 0; i < seq_num; ++i) {
+        size_t seq_len = in_lod_l0[i + 1] - in_lod_l0[i];
+        size_t offset = (seq_len * in_width) / out_width;
+        PADDLE_ENFORCE_EQ(offset * out_width, seq_len * in_width,
+                          "Please make sure (sequence_length * dimension) can "
+                          "be divided by new_dim with no remainder for each "
+                          "sequence. The %dth sequence is invalid.",
+                          i + 1);
+        out_lod[0][i + 1] = out_lod[0][i] + offset;
+      }
+    }
+
+    framework::Copy(*in, context.GetPlace(), out);
+    out->Resize({static_cast<int64_t>(out->lod()[0].back()), out_width});
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceReshapeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x_tensor_ptr = context.Input<LoDTensor>("X");
+    auto* outg_tensor_ptr =
+        context.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* xg_tensor_ptr =
+        context.Output<LoDTensor>(framework::GradVarName("X"));
+
+    xg_tensor_ptr->mutable_data<T>(context.GetPlace());
+    framework::Copy(*outg_tensor_ptr, context.GetPlace(), xg_tensor_ptr);
+    xg_tensor_ptr->Resize(x_tensor_ptr->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc
new file mode 100644
index 0000000000..f79106ff0f
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSliceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Offset"),
+                   "Input(Offset) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Length"),
+                   "Input(Length) of SequenceSliceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSliceOp should not be null.");
+    auto input_dims = ctx->GetInputDim("X");
+
+    auto offset_dim = ctx->GetInputDim("Offset");
+    auto length_dim = ctx->GetInputDim("Length");
+
+    PADDLE_ENFORCE_EQ(
+        offset_dim.size(), 2UL,
+        "Only one-level sequences are supported for now; the rank of Offset "
+        "must be 2.");
+    PADDLE_ENFORCE_EQ(
+        length_dim.size(), 2UL,
+        "Only one-level sequences are supported for now; the rank of Length "
+        "must be 2.");
+
+    // Initialize the output's dims to the maximum; the kernel re-sets them
+    // to the real dims based on the values of Offset and Length.
+    ctx->SetOutputDim("Out", input_dims);
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
+        ctx.device_context());
+  }
+};
+
+class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSliceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor), "
+             "the input of SequenceSliceOp.");
+    AddInput("Offset",
+             "(Tensor), "
+             "a vector<int> to describe the offset of every input sequence for "
+             "sub sequence item.");
+    AddInput("Length",
+             "(Tensor), "
+             "a vector<int> to describe the length of every input sequence for "
+             "sub sequence item.");
+    AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp.");
+    AddComment(R"DOC(
+Sequence slice operator
+
+The operator crops a sub-sequence from each sequence of the input, given a
+start offset and a sub-sequence length per sequence.
+It only supports sequences (LoD tensors with a LoD level of 1).
+- Case:
+    X = [[a1, a2;
+          b1, b2;
+          c1, c2]
+         [d1, d2;
+          e1, e2]]
+    LoD(X) = {{0, 3, 5}}; Dims(X) = (5, 2)
+    Offset = [[0], [1]]; Length = [[2], [1]]
+
+    Out = [[a1, a2;
+            b1, b2]
+           [e1, e2]]
+    LoD(Out) = {{0, 2, 3}}; Dims(Out) = (3, 2)
+NOTE: The first dimension sizes of Offset and Length must both equal the
+number of sequences in X. Offsets start from 0.
+    )DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker,
+            sequence_slice_grad, ops::SequenceSliceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::CPUDeviceContext, float>);
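
The DOC example above can be checked with a small standalone sketch of the LoD bookkeeping the kernel performs (values copied from the example; illustration only, not part of the patch):

    #include <cstdio>
    #include <vector>

    int main() {
      // From the DOC example: LoD(X) = {0, 3, 5}, Offset = [0, 1],
      // Length = [2, 1].
      std::vector<size_t> lod = {0, 3, 5};
      std::vector<size_t> offset = {0, 1}, length = {2, 1};

      std::vector<size_t> out_lod = {0};
      for (size_t i = 0; i + 1 < lod.size(); ++i) {
        // Rows [lod[i] + offset[i], lod[i] + offset[i] + length[i]) survive.
        std::printf("seq %zu keeps rows [%zu, %zu)\n", i, lod[i] + offset[i],
                    lod[i] + offset[i] + length[i]);
        out_lod.push_back(out_lod.back() + length[i]);
      }
      // Prints 0 2 3, i.e. LoD(Out) = {0, 2, 3} as in the DOC example.
      for (size_t v : out_lod) std::printf("%zu ", v);
      std::printf("\n");
      return 0;
    }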
diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu
new file mode 100755
index 0000000000..43a21d619f
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_slice_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_slice,
+    ops::SequenceSliceOpKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_slice_grad,
+    ops::SequenceSliceGradOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h
new file mode 100644
index 0000000000..0e4e4cf65f
--- /dev/null
+++ b/paddle/operators/sequence_slice_op.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+template <typename T>
+inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data,
+                            const int64_t* length_data) {
+  auto out_lod = in.lod();
+  size_t lod_offset = 0;
+
+  auto n = in.lod()[0].size() - 1;
+  out_lod[0][0] = 0;
+  for (size_t i = 0; i < n; ++i) {
+    lod_offset += length_data[i];
+    out_lod[0][i + 1] = lod_offset;
+  }
+  return out_lod;
+}
+
+template <typename DeviceContext, typename T>
+class SequenceSliceOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = in->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL,
+                      "Only one-level sequences are supported for now.");
+    auto n = lod[0].size() - 1;
+
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(length->dims()[0]),
+        "The number of input sequences and the size of the length array "
+        "should be the same.");
+    PADDLE_ENFORCE_EQ(
+        n, static_cast<size_t>(offset->dims()[0]),
+        "The number of input sequences and the size of the offset array "
+        "should be the same.");
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<int64_t>(offset->dims(), platform::CPUPlace());
+      framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
+                      &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<int64_t>(length->dims(), platform::CPUPlace());
+      framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
+                      &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    for (size_t i = 0; i < n; ++i) {
+      // An offset of zero and a slice reaching the end of a sequence are
+      // both valid (see the DOC example), so the bounds are inclusive.
+      PADDLE_ENFORCE_LE(0, offset_data[i],
+                        "The offset[%d] must be no less than zero.", i);
+      PADDLE_ENFORCE_LT(0, length_data[i],
+                        "The length[%d] must be greater than zero.", i);
+      PADDLE_ENFORCE_LE(lod[0][i] + offset_data[i] + length_data[i],
+                        lod[0][i + 1],
+                        "The target tensor's length overflows.");
+    }
+
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = SequenceSliceLoD(*in, offset_data, length_data);
+    auto out_dims = in->dims();
+    out_dims[0] = out_lod[0][out_lod[0].size() - 1];
+    out->Resize(out_dims);
+    out->set_lod(out_lod);
+
+    auto in_stride = framework::stride(in->dims());
+    auto out_stride = framework::stride(out->dims());
+
+    size_t out_offset = 0;
+    for (size_t i = 0; i < n; ++i) {
+      Tensor in_t = in->Slice(
+          static_cast<int>(lod[0][i] + offset_data[i]),
+          static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+      StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
+                       in_t.dims(), out_stride, out->data<T>() + out_offset);
+      out_offset += length_data[i] * in_stride[0];
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* offset = ctx.Input<Tensor>("Offset");
+    auto* length = ctx.Input<Tensor>("Length");
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
+
+    const int64_t* offset_data = offset->data<int64_t>();
+    const int64_t* length_data = length->data<int64_t>();
+    framework::Tensor offset_cpu;
+    framework::Tensor length_cpu;
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+      offset_cpu.mutable_data<int64_t>(offset->dims(), platform::CPUPlace());
+      framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
+                      &offset_cpu);
+      offset_data = offset_cpu.data<int64_t>();
+
+      length_cpu.mutable_data<int64_t>(length->dims(), platform::CPUPlace());
+      framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
+                      &length_cpu);
+      length_data = length_cpu.data<int64_t>();
+    }
+
+    auto lod = in->lod();
+    auto out_lod = out_grad->lod();
+
+    if (x_grad) {
+      x_grad->mutable_data<T>(ctx.GetPlace());
+      x_grad->set_lod(in->lod());
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(ctx.template device_context<DeviceContext>(), x_grad,
+               static_cast<T>(0));
+
+      for (size_t i = 0; i < out_lod[0].size() - 1; ++i) {
+        Tensor out_grad_t =
+            out_grad->Slice(static_cast<int>(out_lod[0][i]),
+                            static_cast<int>(out_lod[0][i + 1]));
+        auto out_grad_stride = framework::stride(out_grad_t.dims());
+
+        auto x_grad_stride = framework::stride(x_grad->dims());
+
+        Tensor x_grad_t = x_grad->Slice(
+            static_cast<int>(lod[0][i] + offset_data[i]),
+            static_cast<int>(lod[0][i] + offset_data[i] + length_data[i]));
+
+        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>(),
+                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                         x_grad_t.data<T>());
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
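
How the StridedMemcpy loop above packs the surviving rows contiguously can be sketched in plain C++ (hypothetical data matching the example shapes from the .cc file; not part of the patch):

    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical X of shape (5, 2): row r holds {2r+1, 2r+2}.
      std::vector<float> x = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
      std::vector<size_t> lod = {0, 3, 5};
      std::vector<size_t> offset = {0, 1}, length = {2, 1};
      const size_t width = 2;

      std::vector<float> out;
      for (size_t i = 0; i + 1 < lod.size(); ++i) {
        size_t begin = (lod[i] + offset[i]) * width;
        // Append length[i] whole rows of sequence i to the packed output.
        out.insert(out.end(), x.begin() + begin,
                   x.begin() + begin + length[i] * width);
      }
      for (float v : out) std::printf("%g ", v);  // prints: 1 2 3 4 9 10
      std::printf("\n");
      return 0;
    }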
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
new file mode 100644
index 0000000000..b74766f012
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -0,0 +1,108 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSoftmaxOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
+             "of length 1.");
+    AddOutput("Out",
+              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
+              "of length 1.");
+    AddComment(R"DOC(
+Sequence Softmax Operator.
+
+SequenceSoftmaxOp computes the softmax activation among all time-steps for each
+sequence. The dimension of each time-step should be 1. Thus, the shape of
+input Tensor can be either [N, 1] or [N], where N is the sum of the length
+of all sequences.
+
+The algorithm works as follows:
+
+    for i-th sequence in a mini-batch:
+
+$$
+Out(X[lod[i]:lod[i+1]], :) = \
+\frac{\exp(X[lod[i]:lod[i+1], :])} \
+{\sum(\exp(X[lod[i]:lod[i+1], :]))}
+$$
+
+For example, for a mini-batch of 3 variable-length sequences containing
+2, 3, and 2 time-steps respectively, with lod [0, 2, 5, 7],
+then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
+and N turns out to be 7.
+
+)DOC");
+  }
+};
+
+class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) of SequenceSoftmaxOp should not be null.");
+
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
+        "the same shape.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
+            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
+            ops::SequenceSoftmaxGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
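
A standalone sketch of the per-sequence softmax described in the DOC above, using the hypothetical lod [0, 2, 5, 7]; for brevity it omits the max-subtraction a production softmax would apply for numerical stability (illustration only, not part of the patch):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      // Hypothetical N = 7 time-steps, lod = [0, 2, 5, 7] (three sequences).
      std::vector<float> x = {1, 2, 0, 1, 2, 3, 3};
      std::vector<int> lod = {0, 2, 5, 7};
      std::vector<float> out(x.size());

      for (size_t i = 0; i + 1 < lod.size(); ++i) {
        float sum = 0.f;
        for (int j = lod[i]; j < lod[i + 1]; ++j) sum += std::exp(x[j]);
        for (int j = lod[i]; j < lod[i + 1]; ++j) out[j] = std::exp(x[j]) / sum;
      }
      // Each of the segments out[0:2], out[2:5], out[5:7] sums to 1.
      for (float v : out) std::printf("%.3f ", v);
      std::printf("\n");
      return 0;
    }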
diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc
new file mode 100644
index 0000000000..5f65b4daf9
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_softmax_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
new file mode 100644
index 0000000000..e889e88cb3
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext, typename T>
+class SequenceSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = x->lod();
+    auto dims = x->dims();
+
+    const size_t level = lod.size() - 1;
+    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
+                      "The first dimension of Input(X) should be equal to the "
+                      "sum of all sequences' lengths.");
+    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");
+
+    out->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor x_i = x->Slice(start_pos, end_pos);
+      Tensor out_i = out->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxFunctor<DeviceContext, T>()(
+          ctx.template device_context<DeviceContext>(), &x_i, &out_i);
+    }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+
+    auto lod = x->lod();
+    const size_t level = lod.size() - 1;
+
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+
+      Tensor out_i = out->Slice(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradFunctor<DeviceContext, T>()(
+          ctx.template device_context<DeviceContext>(), &out_i, &out_grad_i,
+          &x_grad_i);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
new file mode 100644
index 0000000000..a11c9624ce
--- /dev/null
+++ b/paddle/operators/sgd_op.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sgd_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SGDOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of SGDOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+    auto param_dim = ctx->GetInputDim("Param");
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and run time.
+    ctx->SetOutputDim("ParamOut", param_dim);
+  }
+};
+
+class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SGDOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("LearningRate", "(Tensor) Learning rate of SGD");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddComment(R"DOC(
+
+SGD operator
+
+This operator implements one step of the stochastic gradient descent algorithm.
+
+$$param\_out = param - learning\_rate * grad$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker);
+REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
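
The update rule in the DOC above is a one-liner per element; a standalone sketch with hypothetical values (not part of the patch):

    #include <cstdio>

    int main() {
      // param_out = param - learning_rate * grad, element-wise.
      float param[3] = {1.0f, -2.0f, 0.5f};      // hypothetical parameters
      const float grad[3] = {0.1f, 0.4f, -0.2f};
      const float lr = 0.01f;

      for (int i = 0; i < 3; ++i) param[i] -= lr * grad[i];
      for (float p : param) std::printf("%g ", p);  // 0.999 -2.004 0.502
      std::printf("\n");
      return 0;
    }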
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
new file mode 100644
index 0000000000..42f8f8b2f0
--- /dev/null
+++ b/paddle/operators/sgd_op.cu
@@ -0,0 +1,118 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sgd_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+
+template <typename T>
+__global__ void SGDKernel(const T* g, const T* p, const T* learning_rate,
+                          const int num, T* p_out) {
+  T lr = learning_rate[0];
+  int grid_size = blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; i += grid_size) {
+    T g_data = g[i];
+    T p_data = p[i];
+    p_out[i] = p_data - lr * g_data;
+  }
+}
+
+template <typename T, int block_size>
+__global__ void SparseSGDFunctorKernel(const T* selected_rows,
+                                       const int64_t* rows,
+                                       const T* learning_rate, T* tensor_out,
+                                       int64_t row_numel) {
+  const int ty = blockIdx.y;
+  int tid = threadIdx.x;
+
+  selected_rows += ty * row_numel;
+  tensor_out += rows[ty] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since row indices in a SelectedRows instance can be duplicated, we
+    // have to use an atomic operation to avoid concurrent write errors.
+    paddle::platform::CudaAtomicAdd(
+        tensor_out + index, -1.0 * learning_rate[0] * selected_rows[index]);
+  }
+}
+}  // namespace
+
+template <typename T>
+class SGDOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param = ctx.Input<framework::Tensor>("Param");
+    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    auto* grad_var = ctx.InputVar("Grad");
+    // Actually, all tensors are LoDTensor except SelectedRows.
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      param_out->mutable_data<T>(ctx.GetPlace());
+      auto* grad = ctx.Input<framework::Tensor>("Grad");
+      auto* grad_data = grad->data<T>();
+      auto* param_data = param->data<T>();
+      auto* param_out_data = param_out->data<T>();
+
+      int block = 512;
+      int grid = (param->numel() + block - 1) / block;
+
+      SGDKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
+          grad_data, param_data, learning_rate->data<T>(), param->numel(),
+          param_out_data);
+
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // TODO(qijun): In the sparse SGD operator, an in-place update is
+      // enforced. This manual optimization makes it difficult to track data
+      // dependencies. It's better to find a more elegant solution.
+      PADDLE_ENFORCE_EQ(param, param_out);
+      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+
+      auto in_height = grad->height();
+      auto out_dims = param_out->dims();
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+
+      auto& in_value = grad->value();
+      auto& in_rows = grad->rows();
+
+      int64_t in_row_numel = in_value.numel() / in_rows.size();
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+
+      auto* in_data = in_value.data<T>();
+      auto* out_data = param_out->data<T>();
+
+      const int block_size = 256;
+      dim3 threads(block_size, 1);
+      dim3 grid(1, in_rows.size());
+      SparseSGDFunctorKernel<
+          T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
+          in_data, in_rows.data(), learning_rate->data<T>(), out_data,
+          in_row_numel);
+
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel<float>,
+                        ops::SGDOpCUDAKernel<double>);
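
Why the sparse CUDA path needs CudaAtomicAdd can be shown serially: duplicate row indices must accumulate into the same parameter row. A standalone sketch with hypothetical values (not part of the patch):

    #include <cstdio>
    #include <vector>

    int main() {
      // A SelectedRows gradient may list the same row twice; both updates
      // must accumulate, hence the atomic add in the kernel above.
      std::vector<float> param = {1.0f, 1.0f};  // 2 rows, 1 column
      std::vector<long> rows = {1, 1};          // duplicated row index
      std::vector<float> grad = {0.3f, 0.2f};
      const float lr = 1.0f;

      for (size_t i = 0; i < rows.size(); ++i)
        param[rows[i]] -= lr * grad[i];         // serial here, so no race
      std::printf("%g %g\n", param[0], param[1]);  // prints: 1 0.5
      return 0;
    }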
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
new file mode 100644
index 0000000000..a6c544591e
--- /dev/null
+++ b/paddle/operators/sgd_op.h
@@ -0,0 +1,76 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/selected_rows.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class SGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param = ctx.Input<framework::Tensor>("Param");
+    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    auto* grad_var = ctx.InputVar("Grad");
+    // Actually, all tensors are LoDTensor except SelectedRows.
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      param_out->mutable_data<T>(ctx.GetPlace());
+      auto* grad = ctx.Input<framework::Tensor>("Grad");
+
+      auto p = framework::EigenVector<T>::Flatten(*param);
+      auto g = framework::EigenVector<T>::Flatten(*grad);
+      auto o = framework::EigenVector<T>::Flatten(*param_out);
+      auto* lr = learning_rate->data<T>();
+
+      o = p - lr[0] * g;
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // TODO(qijun): In the sparse SGD operator, an in-place update is
+      // enforced. This manual optimization makes it difficult to track data
+      // dependencies. It's better to find a more elegant solution.
+      PADDLE_ENFORCE_EQ(param, param_out);
+      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+
+      auto in_height = grad->height();
+      auto out_dims = param_out->dims();
+      PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+
+      auto& in_value = grad->value();
+      auto& in_rows = grad->rows();
+
+      int64_t in_row_numel = in_value.numel() / in_rows.size();
+      PADDLE_ENFORCE_EQ(in_row_numel, param_out->numel() / in_height);
+
+      auto* in_data = in_value.data<T>();
+      auto* out_data = param_out->data<T>();
+      auto* lr = learning_rate->data<T>();
+
+      for (size_t i = 0; i < in_rows.size(); i++) {
+        for (int64_t j = 0; j < in_row_numel; j++) {
+          out_data[in_rows[i] * in_row_numel + j] -=
+              lr[0] * in_data[i * in_row_numel + j];
+        }
+      }
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
new file mode 100644
index 0000000000..bf870115a4
--- /dev/null
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/operators/array_operator.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+class ShrinkRNNMemoryOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryOp(const std::string &type,
+                    const framework::VariableNameMap &inputs,
+                    const framework::VariableNameMap &outputs,
+                    const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr, "Input X must be set");
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    size_t offset = this->GetOffset(scope, place);
+    auto *rank_table_var = scope.FindVar(Input("RankTable"));
+    PADDLE_ENFORCE(rank_table_var != nullptr, "RankTable must be set");
+    auto &rank_table = rank_table_var->Get<framework::LoDRankTable>();
+
+    auto &rank_items = rank_table.items();
+    int dst_num_rows =
+        std::lower_bound(rank_items.begin(), rank_items.end(), offset,
+                         [](const framework::LoDRankTable::TableItem &a,
+                            size_t b) { return a.length > b; }) -
+        rank_items.begin();
+
+    auto *out_var = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set.");
+    auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();
+
+    size_t height = dst_num_rows;
+
+    // do shrink for the top level LoD
+    if (x_tensor.lod().size() > 0 &&
+        x_tensor.lod()[0].size() > static_cast<size_t>(dst_num_rows)) {
+      auto lod_offset = framework::GetSubLoDAndAbsoluteOffset(x_tensor.lod(), 0,
+                                                              dst_num_rows, 0);
+      height = lod_offset.second.second;
+      auto out_lod = out_tensor.mutable_lod();
+      framework::AppendLoD(out_lod, lod_offset.first);
+    }
+
+    if (dst_num_rows != 0) {
+      out_tensor.ShareDataWith(x_tensor.Slice(0, height));
+    }
+  }
+};
+
+class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ShrinkRNNMemoryOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) The RNN step memory to be shrinked.");
+    AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN.");
+    AddInput("I",
+             "(LoDTensor) The step index. The RNN step memory 'X' will be "
+             "shrinked to match the size of the input of the index'th step.");
+    AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
+    AddComment(R"DOC(
+This operator is used to shrink the output batch of a memory defined in dynamic RNN.
+
+Dynamic RNN is able to handle variable-length sequences: the sequences in a
+mini-batch are first sorted by length, so the longest sequence becomes the
+first one in the sorted batch, followed by the second longest, the third
+longest, and so on. Dynamic RNN then slices a batch input timestep by
+timestep from the sorted input. Once any sequence in the input batch reaches
+its end, the memory defined in dynamic RNN has to shrink its output to adapt
+to the input batch size of the next time step.
+)DOC");
+  }
+};
+
+class ShrinkRNNMemoryInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasInput("I"));
+    PADDLE_ENFORCE(context->HasInput("RankTable"));
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+};
+
+class ShrinkRNNMemoryGradOp : public ArrayOp {
+ public:
+  ShrinkRNNMemoryGradOp(const std::string &type,
+                        const framework::VariableNameMap &inputs,
+                        const framework::VariableNameMap &outputs,
+                        const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *dout_var = scope.FindVar(Input(framework::GradVarName("Out")));
+    auto *dx_var = scope.FindVar(Output(framework::GradVarName("X")));
+    PADDLE_ENFORCE(dx_var != nullptr, "Output(X@GRAD) must be set.");
+    auto *x_var = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x_var != nullptr);
+
+    auto &x_tensor = x_var->Get<framework::LoDTensor>();
+    auto &dx_tensor = *dx_var->GetMutable<framework::LoDTensor>();
+    dx_tensor.Resize(x_tensor.dims());
+    dx_tensor.mutable_data(x_tensor.place(), x_tensor.type());
+
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(place);
+
+    if (dout_var == nullptr) {  // fill dx_tensor with zeros
+      math::set_constant(dev_ctx, &dx_tensor, 0.0f);
+    } else {
+      auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
+      auto height = dout_tensor.dims()[0];
+      auto slice = dx_tensor.Slice(0, static_cast<int>(height));
+      framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+      if (dx_tensor.dims()[0] > height) {
+        auto rest_tensor = dx_tensor.Slice(
+            static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
+        math::set_constant(dev_ctx, &rest_tensor, 0.0f);
+      }
+    }
+    dx_tensor.set_lod(x_tensor.lod());
+  }
+};
+
+class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"));
+    PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
+    context->SetOutputDim(framework::GradVarName("X"),
+                          context->GetInputDim("X"));
+    context->ShareLoD("X", framework::GradVarName("X"));
+  }
+};
+
+class ShrinkRNNGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *op = new framework::OpDesc();
+    op->SetType("shrink_rnn_memory_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(shrink_rnn_memory, ops::ShrinkRNNMemoryOp,
+                  ops::ShrinkRNNMemoryInferShape,
+                  ops::ShrinkRNNMemoryOpProtoMaker, ops::ShrinkRNNGradOpMaker);
+REGISTER_OPERATOR(shrink_rnn_memory_grad, ops::ShrinkRNNMemoryGradOp,
+                  ops::ShrinkRNNMemoryGradInferShape);
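
A standalone sketch of the std::lower_bound computation in Run() above, over a hypothetical rank table (illustration only, not part of the patch):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      // Rank-table sequence lengths, sorted longest-first (hypothetical).
      std::vector<size_t> lengths = {5, 3, 3, 2};

      for (size_t step = 0; step < 6; ++step) {
        // As in Run() above: count the sequences whose length is still
        // greater than the current step index.
        int alive = static_cast<int>(
            std::lower_bound(lengths.begin(), lengths.end(), step,
                             [](size_t len, size_t s) { return len > s; }) -
            lengths.begin());
        std::printf("step %zu -> memory shrinks to %d rows\n", step, alive);
      }
      // Prints 4, 4, 3, 1, 1, 0 rows for steps 0..5.
      return 0;
    }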
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
new file mode 100644
index 0000000000..c526a88a12
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
+                      "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class SigmoidCrossEntropyWithLogitsGradOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shoudl be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Label");
+    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
+                      "Input(Label)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
+                      "Input(Out@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Label) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
+                      "The 1st dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+
+class SigmoidCrossEntropyWithLogitsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  SigmoidCrossEntropyWithLogitsOpMaker(OpProto* proto,
+                                       OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
+             "where N is the batch size and D is the number of classes. "
+             "This input is a tensor of logits computed by the previous "
+             " operator. Logits are unscaled log probabilities given as "
+             "log(p/(1-p)).");
+    AddInput("Label",
+             "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
+             "and shape as X. This input is a tensor of probabalistic labels "
+             "for each logit");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
+              " of elementwise logistic losses.");
+    AddComment(R"DOC(
+SigmoidCrossEntropyWithLogits Operator.
+
+This measures the element-wise probability error in classification tasks
+in which each class is independent. This can be thought of as predicting labels
+for a data-point, where labels are not mutually exclusive.
+For example, a news article can be about politics, technology or sports
+at the same time or none of these.
+
+The logistic loss is given as follows:
+
+       $$loss = -Labels * \log(\sigma(X)) - (1 - Labels) * \log(1 - \sigma(X))$$
+
+We know that $$\sigma(X) = (1 / (1 + \exp(-X)))$$. By substituting this we get:
+
+       $$loss = X - X * Labels + \log(1 + \exp(-X))$$
+
+For stability and to prevent overflow of $$\exp(-X)$$ when X < 0,
+we reformulate the loss as follows:
+
+       $$loss = \max(X, 0) - X * Labels + \log(1 + \exp(-|X|))$$
+
+Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
+However the output only shares the LoD with input `X`.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sigmoid_cross_entropy_with_logits,
+            ops::SigmoidCrossEntropyWithLogitsOp,
+            ops::SigmoidCrossEntropyWithLogitsOpMaker,
+            sigmoid_cross_entropy_with_logits_grad,
+            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
+                       ops::SigmoidCrossEntropyWithLogitsKernel<
+                           paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::CPUDeviceContext, float>);
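
A standalone sketch of the numerically stable form derived in the DOC above (hypothetical inputs; not part of the patch):

    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    // The stable form from the DOC:
    //   loss = max(x, 0) - x * label + log(1 + exp(-|x|))
    float sigmoid_xent(float x, float label) {
      return std::max(x, 0.0f) - x * label +
             std::log1p(std::exp(-std::fabs(x)));
    }

    int main() {
      std::printf("%f\n", sigmoid_xent(2.0f, 1.0f));    // ~0.126928
      // With the naive log(1 + exp(-x)), std::exp(100.0f) is inf in float;
      // the rewritten form stays finite.
      std::printf("%f\n", sigmoid_xent(-100.0f, 0.0f)); // ~0.000000
      return 0;
    }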
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
new file mode 100644
index 0000000000..3f393265f4
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits,
+                        ops::SigmoidCrossEntropyWithLogitsKernel<
+                            paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                        ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
new file mode 100644
index 0000000000..b78bcc436e
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+template <typename DeviceContext, typename T>
+class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto &place = *context.device_context<DeviceContext>().eigen_device();
+
+    // term1 = max(x, 0)
+    auto term1 = x.cwiseMax(static_cast<T>(0));
+    // term2 = x * labels
+    auto term2 = x * labels;
+    // term3 = log(1 + exp(-abs(x)))
+    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
+
+    out.device(place) = term1 - term2 + term3;
+  }
+};
+
+// dX = sigmoid(X) - labels
+template <typename DeviceContext, typename T>
+class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
+    const framework::Tensor *dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor *dX =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto &place =
+        *context.template device_context<DeviceContext>().eigen_device();
+
+    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+    dx.device(place) = dout * (sigmoid_x - labels);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc
new file mode 100644
index 0000000000..f63eaa4464
--- /dev/null
+++ b/paddle/operators/sign_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sign_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SignOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SignOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SignOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+template <typename AttrType>
+class SignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SignOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of sign operator.");
+    AddOutput("Out", "(Tensor) Output tensor of sign operator.");
+    AddComment(R"DOC(
+Sign operator
+
+$$Out = X.sign()$$
+)DOC");
+  }
+};
+
+class SignGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", 0.0f);
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker<float>,
+                  ops::SignGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    sign, ops::SignKernel<paddle::platform::CPUDeviceContext, float>);
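
A standalone sketch of the forward computation, and of why the backward pass above lowers to a scale op with scale = 0 (hypothetical values; not part of the patch):

    #include <cstdio>

    int main() {
      // Element-wise sign, as the kernel computes via Eigen's sign():
      // 1 for positive, -1 for negative, 0 for zero. The derivative is zero
      // almost everywhere, which is why SignGradMaker emits scale = 0.
      const float x[4] = {-3.5f, 0.0f, 2.0f, 7.25f};
      for (float v : x)
        std::printf("%g ", v > 0.0f ? 1.0f : (v < 0.0f ? -1.0f : 0.0f));
      std::printf("\n");  // prints: -1 0 1 1
      return 0;
    }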
diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu
new file mode 100644
index 0000000000..f224880cff
--- /dev/null
+++ b/paddle/operators/sign_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sign_op.h"
+
+REGISTER_OP_CUDA_KERNEL(
+    sign,
+    paddle::operators::SignKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h
new file mode 100644
index 0000000000..9fe49ae1a2
--- /dev/null
+++ b/paddle/operators/sign_op.h
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SignKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& context) const {
+    auto* out = context.Output<framework::Tensor>("Out");
+    auto* in = context.Input<framework::Tensor>("X");
+    out->mutable_data<T>(in->place());
+
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    eigen_out.device(place) = eigen_in.sign();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
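
The sign kernel above maps every element to -1, 0, or +1 via Eigen's sign(),
and SignGradMaker encodes the fact that this mapping is piecewise constant:
its gradient is identically zero, which is why the backward pass is lowered to
a scale op with scale = 0.0f. A minimal standalone sketch of the same
elementwise computation (plain C++ without Paddle dependencies; the function
name is illustrative):

    #include <algorithm>
    #include <vector>

    // Illustrative sketch of the elementwise sign computed by SignKernel.
    // The function is piecewise constant, so its derivative is zero almost
    // everywhere; hence the scale-by-zero gradient above.
    std::vector<float> SignForward(const std::vector<float>& x) {
      std::vector<float> out(x.size());
      std::transform(x.begin(), x.end(), out.begin(), [](float v) {
        return static_cast<float>((0.f < v) - (v < 0.f));  // -1, 0, or +1
      });
      return out;
    }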
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
new file mode 100644
index 0000000000..dcb18d729d
--- /dev/null
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -0,0 +1,144 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/smooth_l1_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SmoothL1LossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims,
+                      "The shape of Input(X) and Input(Y) should be the same.");
+    PADDLE_ENFORCE_GE(x_dims.size(), 2,
+                      "The tensor rank of Input(X) should not be less than 2.");
+    if (ctx->HasInput("InsideWeight")) {
+      PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
+                     "If weights are provided, must specify both "
+                     "inside and outside weights.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
+    }
+
+    ctx->SetOutputDim("Diff", x_dims);
+    // loss is a two-rank tensor
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+  }
+};
+
+template <typename AttrType>
+class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SmoothL1LossOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The input value of smooth l1 loss op with shape "
+             "[batch_size, dim1, ..., dimN].");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "The target value of smooth l1 loss op with same shape as X.");
+    AddInput("InsideWeight",
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the result of (X - Y) will be multiplied "
+             "by this tensor element by element.")
+        .AsDispensable();
+    AddInput("OutsideWeight",
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
+             "This input is optional and should have same shape with X. "
+             "If provided, the out smooth l1 loss will be multiplied by this "
+             "tensor element by element.")
+        .AsDispensable();
+    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
+        .AsIntermediate();
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
+              "The output smooth l1 loss with shape [batch_size, 1].");
+    AddAttr<AttrType>("sigma",
+                      "Hyper parameter of smooth l1 loss op."
+                      "A float scalar with default value 3.0.")
+        .SetDefault(3.0);
+    AddComment(R"DOC(
+Smooth L1 Loss Operator.
+
+This operator computes the smooth l1 loss for X and Y.
+The operator takes the first dimension of X and Y as batch size.
+For each instance, it computes the smooth l1 loss element by element first
+and then sums all the losses. So the shape of Out is [batch_size, 1].
+
+The equation is:
+$$
+Out_{\sigma}(X, Y)_i = \begin{cases}
+0.5 * (\sigma * (X_i - Y_i))^2, & \text{if } |X_i - Y_i| < \frac{1}{\sigma^2} \\
+|X_i - Y_i| - \frac{0.5}{\sigma^2}, & \text{otherwise}
+\end{cases}
+$$
+
+In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the
+$i$-th elements of Out, X and Y, respectively.
+
+)DOC");
+  }
+};
+
+class SmoothL1LossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    auto in_dims = ctx->GetInputDim("X");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_GE(out_dims.size(), 2,
+                      "The tensor rank of Input(Out@Grad) should be at least 2.");
+    PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0],
+                      "The 1st dimension of Input(Out@Grad) must be "
+                      "same as input.");
+    PADDLE_ENFORCE_EQ(out_dims[1], 1,
+                      "The 2nd dimension of Input(Out@Grad) must be 1.");
+
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, in_dims);
+    }
+    if (ctx->HasOutput(y_grad_name)) {
+      ctx->SetOutputDim(y_grad_name, in_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp,
+            ops::SmoothL1LossOpMaker<float>, smooth_l1_loss_grad,
+            ops::SmoothL1LossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    smooth_l1_loss,
+    ops::SmoothL1LossKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    smooth_l1_loss_grad,
+    ops::SmoothL1LossGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu
new file mode 100644
index 0000000000..213429bc37
--- /dev/null
+++ b/paddle/operators/smooth_l1_loss_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/smooth_l1_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    smooth_l1_loss,
+    ops::SmoothL1LossKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    smooth_l1_loss_grad,
+    ops::SmoothL1LossGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h
new file mode 100644
index 0000000000..3facfae116
--- /dev/null
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct SmoothL1LossForward {
+  HOSTDEVICE SmoothL1LossForward(const T& sigma2) : sigma2(sigma2) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val < 1.0 / sigma2) {
+      return 0.5 * val * val * sigma2;
+    } else {
+      return abs_val - 0.5 / sigma2;
+    }
+  }
+
+  T sigma2;
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class SmoothL1LossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* in2 = context.Input<Tensor>("InsideWeight");
+    auto* in3 = context.Input<Tensor>("OutsideWeight");
+    auto* out0 = context.Output<Tensor>("Diff");
+    auto* out1 = context.Output<Tensor>("Out");
+
+    out0->mutable_data<T>(context.GetPlace());
+    out1->mutable_data<T>(context.GetPlace());
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto sigma = static_cast<T>(context.Attr<AttrType>("sigma"));
+    T sigma2 = sigma * sigma;
+    bool has_weight = (in2 != nullptr) && (in3 != nullptr);
+
+    auto x = EigenVector<T>::Flatten(*in0);
+    auto y = EigenVector<T>::Flatten(*in1);
+    auto diff = EigenVector<T>::Flatten(*out0);
+
+    diff.device(*place) = x - y;
+    // multiply inside weight
+    if (has_weight) {
+      auto inside_weight = EigenVector<T>::Flatten(*in2);
+      // cache diff, reused in bp
+      diff.device(*place) = diff * inside_weight;
+    }
+
+    auto in_counts = in0->numel();
+    Tensor ptensor_errors;
+    ptensor_errors.mutable_data<T>({static_cast<int>(in_counts)},
+                                   context.GetPlace());
+    auto errors = EigenVector<T>::Flatten(ptensor_errors);
+    // apply smooth l1 forward
+    errors.device(*place) = diff.unaryExpr(SmoothL1LossForward<T>(sigma2));
+
+    // multiply outside weight
+    if (has_weight) {
+      auto outside_weight = EigenVector<T>::Flatten(*in3);
+      errors.device(*place) = errors * outside_weight;
+    }
+    auto loss = EigenVector<T>::Flatten(*out1);
+    // first dimension of 'X' is the number of samples
+    auto mat_dims =
+        framework::make_ddim({static_cast<int>(in0->dims()[0]),
+                              static_cast<int>(in_counts / in0->dims()[0])});
+    auto errors_mat_view = EigenMatrix<T>::From(ptensor_errors, mat_dims);
+    loss.device(*place) = errors_mat_view.sum(Eigen::array<int, 1>({{1}}));
+  }
+};
+
+template <typename T>
+struct SmoothL1LossBackward {
+  HOSTDEVICE SmoothL1LossBackward(const T& sigma2) : sigma2(sigma2) {}
+
+  HOSTDEVICE T operator()(const T& val) const {
+    T abs_val = std::abs(val);
+    if (abs_val < 1.0 / sigma2) {
+      return sigma2 * val;
+    } else {
+      return (0 < val) - (val < 0);
+    }
+  }
+
+  T sigma2;
+};
+
+template <typename DeviceContext, typename T, typename AttrType = T>
+class SmoothL1LossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("InsideWeight");
+    auto* in1 = context.Input<Tensor>("OutsideWeight");
+    auto* in2 = context.Input<Tensor>("Diff");
+    auto* og = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto sigma = static_cast<T>(context.Attr<AttrType>("sigma"));
+    T sigma2 = sigma * sigma;
+    bool has_weight = (in0 != nullptr) && (in1 != nullptr);
+
+    auto* place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    auto in_dims = in2->dims();
+    auto counts = in2->numel();
+    auto cols = counts / in_dims[0];
+    auto mat_dims = framework::make_ddim(
+        {static_cast<int>(in_dims[0]), static_cast<int>(cols)});
+
+    Tensor ptensor_diff;
+    ptensor_diff.mutable_data<T>({static_cast<int>(counts)},
+                                 context.GetPlace());
+    auto diff = EigenVector<T>::Flatten(ptensor_diff);
+    // apply smooth l1 backward
+    diff.device(*place) = EigenVector<T>::Flatten(*in2).unaryExpr(
+        SmoothL1LossBackward<T>(sigma2));
+
+    // compute weights
+    Tensor ptensor_weights;
+    ptensor_weights.mutable_data<T>(mat_dims, context.GetPlace());
+    auto weights = EigenMatrix<T>::From(ptensor_weights);
+    // initialize to 1.0
+    weights.device(*place) = weights.constant(static_cast<T>(1.0));
+    if (has_weight) {
+      auto inside_weight = EigenMatrix<T>::From(*in0, mat_dims);
+      auto outside_weight = EigenMatrix<T>::From(*in1, mat_dims);
+      weights.device(*place) = inside_weight * outside_weight;
+    }
+
+    // compute gradients
+    auto out_grad = EigenMatrix<T>::From(*og);
+    auto diff_mat_view = EigenMatrix<T>::From(ptensor_diff, mat_dims);
+    auto gradients = out_grad.broadcast(
+                         Eigen::array<int, 2>({{1, static_cast<int>(cols)}})) *
+                     weights * diff_mat_view;
+
+    auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* out1 = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    if (out0) {
+      out0->mutable_data<T>(context.GetPlace());
+      auto x_grad = EigenMatrix<T>::From(*out0, mat_dims);
+      x_grad.device(*place) = gradients;
+    }
+
+    if (out1) {
+      out1->mutable_data<T>(context.GetPlace());
+      auto y_grad = EigenMatrix<T>::From(*out1, mat_dims);
+      y_grad.device(*place) = -1 * gradients;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
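
The SmoothL1LossForward and SmoothL1LossBackward functors above implement the
piecewise definition directly: a quadratic zone for |x| < 1/sigma^2 and a
linear zone elsewhere. A standalone sketch (plain C++; function names are
illustrative) that can be used to check that the loss is continuous and that
the backward functor is its derivative at the switch point:

    #include <cassert>
    #include <cmath>

    // Illustrative scalar versions of the two functors above.
    float SmoothL1(float x, float sigma2) {
      float abs_x = std::fabs(x);
      return abs_x < 1.f / sigma2 ? 0.5f * x * x * sigma2   // quadratic zone
                                  : abs_x - 0.5f / sigma2;  // linear zone
    }

    float SmoothL1Grad(float x, float sigma2) {
      float abs_x = std::fabs(x);
      // Derivative of the quadratic zone is sigma2 * x; of the linear zone,
      // sign(x).
      return abs_x < 1.f / sigma2
                 ? sigma2 * x
                 : static_cast<float>((0.f < x) - (x < 0.f));
    }

    int main() {
      float sigma2 = 9.f;         // sigma = 3.0, the attribute's default
      float edge = 1.f / sigma2;  // boundary between the two zones
      // Loss and gradient agree on both sides of the boundary.
      assert(std::fabs(SmoothL1(edge - 1e-5f, sigma2) -
                       SmoothL1(edge + 1e-5f, sigma2)) < 1e-4f);
      assert(std::fabs(SmoothL1Grad(edge - 1e-5f, sigma2) -
                       SmoothL1Grad(edge + 1e-5f, sigma2)) < 1e-3f);
      return 0;
    }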
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
new file mode 100644
index 0000000000..cef1f1fc99
--- /dev/null
+++ b/paddle/operators/softmax_op.cc
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SoftmaxOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(x_dims.size() == 2UL,
+                   "The input of softmax op must be a matrix.");
+    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftmaxOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "The input tensor of softmax. "
+             "2-D with shape [batch_size, input_feature_dimensions].");
+    AddOutput("Out", "The normalized values with the same shape as X.");
+    AddComment(R"DOC(
+Softmax Operator.
+
+The input of the softmax operator is a 2-D tensor with shape N x K (N is the
+batch_size, K is the dimension of the input feature). The output tensor has
+the same shape as the input tensor.
+
+For each row of the input tensor, the softmax operator squashes the
+K-dimensional vector of arbitrary real values to a K-dimensional vector of
+real values in the range [0, 1] that add up to 1. It computes the exponential
+of each entry of the vector and the sum of the exponentials over all entries.
+The output of the softmax operator is then the ratio of the exponential of
+each entry to that sum.
+
+For each row $i$ and each column $j$ in Input(X), we have:
+    $$Out[i, j] = \frac{\exp(X[i, j])}{\sum_{j'} \exp(X[i, j'])}$$
+
+)DOC");
+  }
+};
+
+class SoftmaxOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"), "Input(Out) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Out"),
+                      ctx->GetInputDim(framework::GradVarName("Out")),
+                      "Input(Out) and its gradients should have a same shape.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad,
+            ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    softmax, ops::SoftmaxKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc
new file mode 100644
index 0000000000..e7da40f3e8
--- /dev/null
+++ b/paddle/operators/softmax_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/softmax_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_CUDA_KERNEL(
+    softmax, ops::SoftmaxKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    softmax_grad,
+    ops::SoftmaxGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
new file mode 100644
index 0000000000..63e379a3b3
--- /dev/null
+++ b/paddle/operators/softmax_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename DeviceContext, typename T>
+class SoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* X = context.Input<Tensor>("X");
+    auto* Out = context.Output<Tensor>("Out");
+
+    // allocate memory on device.
+    Out->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), X, Out);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class SoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* Out = context.Input<Tensor>("Out");
+    auto* dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
+
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxGradFunctor<DeviceContext, T>()(
+        context.template device_context<DeviceContext>(), Out, dOut, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
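
Both kernels above delegate the actual math to math::SoftmaxFunctor and
math::SoftmaxGradFunctor. The standard way to make the DOC formula safe on
real data, which functors like these typically use, is to subtract the row
maximum before exponentiating so std::exp never overflows; the shift cancels
in the ratio. A single-row sketch (plain C++; the function name is
illustrative):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Illustrative numerically stable softmax over one row.
    std::vector<float> SoftmaxRow(const std::vector<float>& logits) {
      float max_logit = *std::max_element(logits.begin(), logits.end());
      std::vector<float> out(logits.size());
      float sum = 0.f;
      for (size_t j = 0; j < logits.size(); ++j) {
        out[j] = std::exp(logits[j] - max_logit);  // always <= exp(0) = 1
        sum += out[j];
      }
      for (float& v : out) v /= sum;  // each row now sums to 1
      return out;
    }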
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
new file mode 100644
index 0000000000..7135780c92
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/softmax_with_cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxWithCrossEntropyOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftmaxWithCrossEntropyOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "(Tensor, default: Tensor<float>), The unscaled log probabilities "
+             "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
+             "and K is the class number.");
+    AddInput("Label",
+             "(Tensor) The ground truth which is a 2-D tensor. If soft_label "
+             "is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
+             "soft_label is set to true, Label is a Tensor<float/double> with "
+             "shape [N x K].");
+    AddOutput(
+        "Softmax",
+        "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
+        "The outputs value of softmax activation by given the input batch, "
+        "which will be used in backward calculation.")
+        .AsIntermediate();
+    AddOutput("Loss",
+              "(Tensor, default: Tensor<float>), A 2-D tensor. The cross "
+              "entropy loss with shape [N x 1].");
+    AddAttr<bool>(
+        "soft_label",
+        "(bool, default: false), A flag to indicate whether to interpretate "
+        "the given labels as soft labels.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+Softmax With Cross Entropy Operator.
+
+Cross entropy loss with softmax is extensively used as the output layer. This
+operator computes the softmax normalized values for each row of the input
+tensor, after which the cross-entropy loss is computed. This fused form gives
+a more numerically stable gradient.
+
+Because this operator performs a softmax on the logits internally, it expects
+unscaled logits. This operator should not be used with the output of the
+softmax operator, since that would produce incorrect results.
+
+When the attribute soft_label is set to false, this operator expects mutually
+exclusive hard labels: each sample in a batch is in exactly one class with a
+probability of 1.0, so each sample has a single label.
+
+The equation is as follows:
+
+1) Hard label (one-hot label, so every sample has exactly one class)
+
+$$Loss_j = -\text{Logit}_{Label_j} +
+\log\left(\sum_{i=1}^{K}\exp(\text{Logit}_i)\right),
+j = 1, ..., N$$
+
+2) Soft label (each sample can have a distribution over all classes)
+
+$$Loss_j = -\sum_{i=1}^{K}\text{Label}_i\left(\text{Logit}_i -
+\log\left(\sum_{i'=1}^{K}\exp(\text{Logit}_{i'})\right)\right),
+j = 1, ..., N$$
+
+)DOC");
+  }
+};
+
+class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Softmax"),
+                   "Output(Softmax) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null.");
+
+    auto logits_dims = ctx->GetInputDim("Logits");
+    auto labels_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(
+        logits_dims.size(), 2UL,
+        "The input of softmax_with_cross_entropy should be a 2-D tensor.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
+                      "The labels should be a 2-D tensor.");
+
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1],
+                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(Logits) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
+                        "If Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx->SetOutputDim("Softmax", logits_dims);
+    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
+
+    ctx->ShareLoD("Logits", /*->*/ "Softmax");
+    ctx->ShareLoD("Logits", /*->*/ "Loss");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
+  }
+};
+
+class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@Grad) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Softmax"),
+                   "Input(Softmax) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Output(Logits@Grad) should be not null.");
+
+    auto softmax_dims = ctx->GetInputDim("Softmax");
+    auto labels_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
+                      "The labels should be a 2-D tensor.");
+
+    if (ctx->Attrs().Get<bool>("soft_label")) {
+      PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1],
+                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "Input(Softmax) and Input(Label) should be equal.");
+    } else {
+      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
+                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "Input(Label) should be 1.");
+    }
+
+    ctx->SetOutputDim(framework::GradVarName("Logits"),
+                      ctx->GetInputDim("Softmax"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<Tensor>(framework::GradVarName("Loss"))->type()),
+        ctx.device_context());
+  }
+};
+
+class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* grad_op = new framework::OpDesc();
+    grad_op->SetType("softmax_with_cross_entropy_grad");
+    grad_op->SetInput("Label", Input("Label"));
+    grad_op->SetInput("Softmax", Output("Softmax"));
+    grad_op->SetInput("Loss", Output("Loss"));
+    grad_op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
+    grad_op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    grad_op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
+                  ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker);
+REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
+                  ops::SoftmaxWithCrossEntropyOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
+                       ops::SoftmaxWithCrossEntropyKernel<float>,
+                       ops::SoftmaxWithCrossEntropyKernel<double>);
+REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
+                       ops::SoftmaxWithCrossEntropyGradKernel<float>,
+                       ops::SoftmaxWithCrossEntropyGradKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
new file mode 100644
index 0000000000..61583c6161
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/softmax_with_cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+namespace {
+template <typename T>
+__global__ void CrossEntropyGrad(T* logit_grad, const T* loss_grad,
+                                 const int64_t* labels, const int batch_size,
+                                 const int class_num) {
+  int tid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (tid < batch_size * class_num) {
+    int sample_idx = tid / class_num;
+    int class_idx = tid % class_num;
+    PADDLE_ASSERT(labels[sample_idx] >= 0 && labels[sample_idx] < class_num);
+    // d(loss)/d(logit) = (softmax - one_hot(label)) * d(loss)/d(out).
+    // Each thread reads and writes only its own element, so no cross-block
+    // synchronization is needed (__syncthreads() would not suffice here,
+    // since it only synchronizes threads within one block).
+    T grad = logit_grad[tid];
+    if (class_idx == static_cast<int>(labels[sample_idx])) {
+      grad -= static_cast<T>(1.);
+    }
+    logit_grad[tid] = grad * loss_grad[sample_idx];
+  }
+}
+
+template <typename T>
+__global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
+                                               const T* loss_grad,
+                                               const T* labels,
+                                               const int batch_size,
+                                               const int class_num) {
+  int ids = blockIdx.x * blockDim.x + threadIdx.x;
+  if (ids < batch_size * class_num) {
+    int row_ids = ids / class_num;
+    logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
+  }
+}
+}  // namespace
+
+template <typename T>
+class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* softmax = context.Output<Tensor>("Softmax");
+
+    Tensor* loss = context.Output<Tensor>("Loss");
+    softmax->mutable_data<T>(context.GetPlace());
+    loss->mutable_data<T>(context.GetPlace());
+
+    math::SoftmaxFunctor<platform::CUDADeviceContext, T>()(
+        context.cuda_device_context(), logits, softmax);
+    math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
+        context.cuda_device_context(), loss, softmax, labels,
+        context.Attr<bool>("soft_label"));
+  }
+};
+
+template <typename T>
+class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    const T* loss_grad_data =
+        context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
+    Tensor* logit_grad =
+        context.Output<Tensor>(framework::GradVarName("Logits"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+    T* logit_grad_data = logit_grad->data<T>();
+
+    const int batch_size = logit_grad->dims()[0];
+    const int class_num = logit_grad->dims()[1];
+    int block = 512;
+    int grid = (batch_size * class_num + block - 1) / block;
+
+    if (context.Attr<bool>("soft_label")) {
+      const T* label_data = labels->data<T>();
+      SoftCrossEntropyGradientKernel<
+          T><<<grid, block, 0,
+               context.template device_context<platform::CUDADeviceContext>()
+                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
+                                batch_size, class_num);
+    } else {
+      const int64_t* label_data = labels->data<int64_t>();
+      CrossEntropyGrad<
+          T><<<grid, block, 0,
+               context.template device_context<platform::CUDADeviceContext>()
+                   .stream()>>>(logit_grad_data, loss_grad_data, label_data,
+                                batch_size, class_num);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy,
+                        ops::SoftmaxWithCrossEntropyCUDAKernel<float>,
+                        ops::SoftmaxWithCrossEntropyCUDAKernel<double>);
+REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad,
+                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<float>,
+                        ops::SoftmaxWithCrossEntropyGradCUDAKernel<double>);
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
new file mode 100644
index 0000000000..6bde0f37e0
--- /dev/null
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
+                   "This kernel only runs on CPU.");
+    const Tensor* logits = context.Input<Tensor>("Logits");
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* softmax = context.Output<Tensor>("Softmax");
+    Tensor* loss = context.Output<Tensor>("Loss");
+
+    softmax->mutable_data<T>(context.GetPlace());
+    loss->mutable_data<T>(context.GetPlace());
+
+    auto& dev_ctx =
+        context.template device_context<platform::CPUDeviceContext>();
+    math::SoftmaxFunctor<platform::CPUDeviceContext, T>()(dev_ctx, logits,
+                                                          softmax);
+    math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
+        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"));
+  }
+};
+
+template <typename T>
+class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Loss"));
+    const Tensor* labels = context.Input<Tensor>("Label");
+    Tensor* logit_grad =
+        context.Output<Tensor>(framework::GradVarName("Logits"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
+
+    const int class_num = logit_grad->dims()[1];
+    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
+    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+    auto& place = *context.template device_context<platform::CPUDeviceContext>()
+                       .eigen_device();
+    if (context.Attr<bool>("soft_label")) {
+      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      logit_grad_mat.device(place) =
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
+          (logit_grad_mat - lbl_mat);
+    } else {
+      logit_grad_mat.device(place) =
+          logit_grad_mat *
+          out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
+
+      const int batch_size = logit_grad->dims()[0];
+      const int64_t* label_data = labels->data<int64_t>();
+      T* logit_grad_data = logit_grad->data<T>();
+      const T* out_grad_data = out_grad->data<T>();
+      for (int i = 0; i < batch_size; ++i) {
+        logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
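
For hard labels, the loop at the end of SoftmaxWithCrossEntropyGradKernel
applies the well-known identity d(loss)/d(logit) = (softmax - one_hot(label))
* d(loss)/d(out), reusing the softmax values cached in the forward pass. A
single-sample sketch (plain C++; the function name is illustrative):

    #include <cstdint>
    #include <vector>

    // Illustrative hard-label gradient: start from the cached softmax row,
    // subtract 1 at the true class, then scale by the incoming loss gradient.
    std::vector<float> HardLabelGrad(const std::vector<float>& softmax,
                                     int64_t label, float loss_grad) {
      std::vector<float> logit_grad(softmax.size());
      for (size_t i = 0; i < softmax.size(); ++i) {
        float one_hot = (static_cast<int64_t>(i) == label) ? 1.f : 0.f;
        logit_grad[i] = (softmax[i] - one_hot) * loss_grad;
      }
      return logit_grad;
    }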
diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc
new file mode 100644
index 0000000000..bd93c49201
--- /dev/null
+++ b/paddle/operators/split_lod_tensor_op.cc
@@ -0,0 +1,190 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+
+struct CopyRange {
+  size_t begin;
+  size_t end;
+};
+
+using LoD = framework::LoD;
+
+class SplitLoDTensorOp : public framework::OperatorBase {
+ public:
+  SplitLoDTensorOp(const std::string &type,
+                   const framework::VariableNameMap &inputs,
+                   const framework::VariableNameMap &outputs,
+                   const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &mask = scope.FindVar(Input("Mask"))->Get<framework::LoDTensor>();
+    auto *out_true =
+        scope.FindVar(Output("OutTrue"))->GetMutable<framework::LoDTensor>();
+    auto *out_false =
+        scope.FindVar(Output("OutFalse"))->GetMutable<framework::LoDTensor>();
+    auto level = static_cast<size_t>(Attr<int>("level"));
+    auto &x_lod = x.lod();
+    auto &mask_dim = mask.dims();
+
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
+
+    std::unique_ptr<framework::LoDTensor> cpu_mask{new framework::LoDTensor()};
+    if (platform::is_cpu_place(mask.place())) {
+      cpu_mask->ShareDataWith(mask);
+    } else if (platform::is_gpu_place(mask.place())) {
+#ifdef PADDLE_WITH_CUDA
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+#else
+      PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
+#endif
+    }
+    auto *mask_data = cpu_mask->data<bool>();
+
+    std::vector<std::vector<CopyRange>> copy_ranges(mask_dim[0]);
+
+    // set out_true/out_false lod
+    for (size_t t = 0; t < 2; t++) {
+      LoD *lod = nullptr;
+      if (t == 0) {
+        lod = out_false->mutable_lod();
+      } else {
+        lod = out_true->mutable_lod();
+      }
+      lod->clear();
+      for (size_t i = 0; i < static_cast<size_t>(mask_dim[0]); i++) {
+        if (static_cast<size_t>(mask_data[i]) == t) {
+          size_t start_idx = i;
+          auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
+              x_lod, start_idx, start_idx + 1, level);
+
+          auto &lod_length = lod_and_offset.first;
+          framework::AppendLoD(lod, lod_length);
+
+          size_t start_offset = lod_and_offset.second.first;
+          size_t end_offset = lod_and_offset.second.second;
+          copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
+        }
+      }
+    }
+
+    for (size_t t = 0; t < 2; ++t) {
+      framework::LoDTensor *out;
+      if (t == 0) {
+        out = out_false;
+      } else {
+        out = out_true;
+      }
+      auto &ranges = copy_ranges[t];
+      size_t height = std::accumulate(
+          ranges.begin(), ranges.end(), 0UL,
+          [](size_t a, const CopyRange &b) { return a + b.end - b.begin; });
+      auto x_dim = x.dims();
+      x_dim[0] = static_cast<int64_t>(height);
+      out->Resize(x_dim);
+      out->mutable_data(x.place(), x.type());
+      size_t offset = 0;
+      for (auto &each_range : ranges) {
+        size_t len = each_range.end - each_range.begin;
+        if (len == 0) {
+          continue;
+        }
+        // out[offset: offset+len] = x[each_range.begin: each_range.end]
+        auto slice = out->Slice(static_cast<int>(offset),
+                                static_cast<int>(offset + len));
+        framework::Copy(x.Slice(static_cast<int>(each_range.begin),
+                                static_cast<int>(each_range.end)),
+                        x.place(), dev_ctx, &slice);
+        offset += len;
+      }
+    }
+  }
+};
+
+class SplitLoDTensorOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitLoDTensorOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input LoDTensor");
+    AddInput("Mask", "A bool column vector which mask the input");
+    AddOutput("OutTrue", "True branch of input LoDTensor");
+    AddOutput("OutFalse", "False branch of input LoDTensor");
+    AddAttr<int>("level", "(int) the specific lod level to split.")
+        .SetDefault(0)
+        .EqualGreaterThan(0);
+    AddComment(
+        R"DOC(
+        Split a LoDTensor with a Mask at certain level. The input LoDTensor
+        has 3 sequence at certain lod level. The Mask is a bool column vector,
+        such as [0, 1, 0] at the same level. The first and third sequence will
+        be send to False Output LoDTensor; whereas the second sequence will
+        be send to True Output LoDTensor. Please refer to MergeLoDTensorOp.)DOC");
+  }
+};
+
+class SplitLoDTensorInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "SplitLoDTensorOp must has input X.");
+    PADDLE_ENFORCE(context->HasInput("Mask"),
+                   "SplitLoDTensorOp must has input Mask.");
+    PADDLE_ENFORCE(context->HasOutput("OutTrue"),
+                   "SplitLoDTensorOp must has output OutTrue.");
+    PADDLE_ENFORCE(context->HasOutput("OutFalse"),
+                   "SplitLoDTensorOp must has output OutFalse.");
+
+    auto mask_dim = context->GetInputDim("Mask");
+    PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
+    PADDLE_ENFORCE_EQ(mask_dim[1], 1);
+
+    context->SetOutputDim("OutTrue", context->GetInputDim("X"));
+    context->SetOutputDim("OutFalse", context->GetInputDim("X"));
+  }
+};
+
+class SplitLoDTensorArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("merge_lod_tensor");
+    grad_op->SetInput("InTrue", OutputGrad("OutTrue"));
+    grad_op->SetInput("InFalse", OutputGrad("OutFalse"));
+    grad_op->SetInput("Mask", Input("Mask"));
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_lod_tensor, ops::SplitLoDTensorOp,
+                  ops::SplitLoDTensorOpProtoMaker,
+                  ops::SplitLoDTensorInferShape,
+                  ops::SplitLoDTensorArrayGradMaker);
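
The Run method above works in two passes: it first walks the mask and
partitions the sequences at the given LoD level into a "false" group and a
"true" group, recording the absolute offset range of each selected sequence,
and then copies every recorded range into the corresponding output tensor.
Stripped of the LoD bookkeeping, the first pass is a masked partition of
ranges (plain C++ sketch; names are illustrative):

    #include <cstddef>
    #include <utility>
    #include <vector>

    // Illustrative version of the CopyRange collection above: group the
    // per-sequence [begin, end) ranges by mask value, preserving order.
    using Range = std::pair<std::size_t, std::size_t>;

    std::vector<std::vector<Range>> PartitionByMask(
        const std::vector<Range>& seq_ranges, const std::vector<bool>& mask) {
      std::vector<std::vector<Range>> groups(2);  // [0]: false, [1]: true
      for (std::size_t i = 0; i < seq_ranges.size(); ++i) {
        groups[mask[i] ? 1 : 0].push_back(seq_ranges[i]);
      }
      return groups;
    }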
diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc
new file mode 100644
index 0000000000..8d55ae5dd7
--- /dev/null
+++ b/paddle/operators/split_op.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_op.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SplitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SplitOp should not be null.");
+    PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
+                      "Outputs(Out) of SplitOp should not be empty.");
+    auto in_dims = ctx->GetInputDim("X");
+    auto outs_names = ctx->Outputs("Out");
+    size_t axis = static_cast<size_t>(ctx->Attrs().Get<int>("axis"));
+    size_t num = static_cast<size_t>(ctx->Attrs().Get<int>("num"));
+    std::vector<int> sections = static_cast<std::vector<int>>(
+        ctx->Attrs().Get<std::vector<int>>("sections"));
+    const size_t outs_number = outs_names.size();
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(outs_number);
+
+    if (num > 0) {
+      int64_t in_axis_dim = in_dims[axis];
+      PADDLE_ENFORCE_EQ(in_axis_dim % num, 0,
+                        "tensor split does not result"
+                        " in an equal division");
+      size_t out_axis_dim = in_axis_dim / num;
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[axis] = out_axis_dim;
+        outs_dims.push_back(dim);
+      }
+    } else if (sections.size() > 0) {
+      PADDLE_ENFORCE_EQ(sections.size(), outs_number,
+                        "The size of sections should be equal to the "
+                        "number of outputs.");
+      for (size_t i = 0; i < outs_number; ++i) {
+        auto dim = in_dims;
+        dim[axis] = sections[i];
+        outs_dims.push_back(dim);
+      }
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+    if (axis != 0) {
+      // Only pass LoD when not splitting along the first dim.
+      for (size_t i = 0; i < outs_number; ++i) {
+        ctx->ShareLoD("X", "Out", 0, i);
+      }
+    }
+  }
+};
+
+class SplitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input tensor of the split operator.");
+    AddOutput("Out", "(Tensor) Output tensors of the split operator.")
+        .AsDuplicable();
+    AddComment(R"DOC(
+Split operator
+
+This operator splits the input tensor into multiple sub-tensors.
+
+Example:
+  Input = [[1,2],
+           [3,4],
+           [5,6]]
+  sections = [2,1]
+  axis = 0
+  Output[0] = [[1,2],
+               [3,4]]
+  Output[1] = [[5,6]]
+
+    )DOC");
+    AddAttr<std::vector<int>>("sections",
+                              "(vector<int>) "
+                              "the length of each output along the "
+                              "specified axis.")
+        .SetDefault(std::vector<int>{});
+    AddAttr<int>("num",
+                 "(int, default 0)"
+                 "Number of sub-tensors. This must evenly divide "
+                 "Input.dims()[axis]")
+        .SetDefault(0);
+    AddAttr<int>("axis",
+                 "(int, default 0) "
+                 "The axis which the input will be splited on.")
+        .SetDefault(0);
+  }
+};
+
+class SplitGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto op = new framework::OpDesc();
+    op->SetType("concat");
+    op->SetInput("X", OutputGrad("Out"));
+    op->SetOutput("Out", InputGrad("X"));
+    op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+USE_CPU_ONLY_OP(concat);
+
+REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split, ops::SplitOpKernel<paddle::platform::CPUDeviceContext, float>);
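
InferShape above accepts two mutually exclusive descriptions of the split:
num equal parts (the axis dimension must divide evenly) or an explicit
sections list. A standalone sketch of just the dimension arithmetic (plain
C++; the function name is illustrative):

    #include <cassert>
    #include <vector>

    // Illustrative version of SplitOp::InferShape's axis-dimension logic:
    // return the size of each output along the split axis.
    std::vector<int> SplitAxisDims(int axis_dim, int num,
                                   const std::vector<int>& sections) {
      if (num > 0) {
        assert(axis_dim % num == 0 && "split must divide the axis evenly");
        return std::vector<int>(num, axis_dim / num);
      }
      return sections;  // output i gets sections[i] along the axis
    }

    // E.g. SplitAxisDims(3, 0, {2, 1}) yields {2, 1}, matching the DOC
    // example above.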
diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc
new file mode 100644
index 0000000000..dbad0bbf68
--- /dev/null
+++ b/paddle/operators/split_op.cu.cc
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split, ops::SplitOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h
new file mode 100644
index 0000000000..a38c435d53
--- /dev/null
+++ b/paddle/operators/split_op.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SplitOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::Tensor>("X");
+    auto outs = ctx.MultiOutput<framework::Tensor>("Out");
+    auto in_stride = framework::stride(in->dims());
+    int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
+    const size_t n = outs.size();
+    size_t input_offset = 0;
+    for (size_t i = 0; i < n; i++) {
+      auto& out = outs[i];
+      out->mutable_data<T>(ctx.GetPlace());
+      size_t axis_dim = out->dims()[axis];
+      auto out_stride = framework::stride(out->dims());
+      StridedMemcpy<T>(ctx.device_context(), in->data<T>() + input_offset,
+                       in_stride, out->dims(), out_stride, out->data<T>());
+      input_offset += axis_dim * in_stride[axis];
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
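
The kernel walks the outputs once, advancing input_offset by
axis_dim * in_stride[axis] after each strided copy. For the common case of
splitting along axis 0 of a contiguous row-major tensor, this reduces to a
sequence of plain sub-array copies. A sketch of that special case (plain C++;
names are illustrative):

    #include <cstddef>
    #include <vector>

    // Illustrative axis-0 split over contiguous row-major data: each output
    // takes rows_per_out[i] * row_elems consecutive elements, and the source
    // offset advances by that count, mirroring input_offset above.
    std::vector<std::vector<float>> SplitAxis0(
        const std::vector<float>& in, std::size_t row_elems,
        const std::vector<std::size_t>& rows_per_out) {
      std::vector<std::vector<float>> outs;
      std::size_t input_offset = 0;
      for (std::size_t rows : rows_per_out) {
        std::size_t len = rows * row_elems;
        outs.emplace_back(in.begin() + input_offset,
                          in.begin() + input_offset + len);
        input_offset += len;
      }
      return outs;
    }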
diff --git a/paddle/operators/split_selected_rows_op.cc b/paddle/operators/split_selected_rows_op.cc
new file mode 100644
index 0000000000..0515ea13aa
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_selected_rows_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SplitSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SplitSelectedRowsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input SelectedRows.");
+    AddOutput("Out", "The outputs of input SelectedRows.").AsDuplicable();
+    AddAttr<std::vector<int>>("height_sections",
+                              "Height for each output SelectedRows.")
+        .SetDefault(std::vector<int>({}));
+
+    AddComment(R"DOC(
+Split a SelectedRows with a specified rows section.
+height_sections is only needed when need to split the dims of the original tensor.
+
+Example:
+  Input:
+    X.rows = {7, 5}
+    X.height = 12
+  Attr:
+    height_sections = {4, 8}
+  Out:
+    out0.rows = {}
+    out0.height = 4
+
+    out1.rows = {5, 7}
+    out2.height = 8
+
+)DOC");
+  }
+};
+
+class SplitSelectedRowsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "SplitSelectedRowsOp must have input X.");
+    PADDLE_ENFORCE(ctx->HasOutputs("Out"),
+                   "SplitSelectedRowsOp must have output Out.");
+
+    std::vector<int> height_sections =
+        ctx->Attrs().Get<std::vector<int>>("height_sections");
+    int64_t n = ctx->Outputs("Out").size();
+
+    std::vector<framework::DDim> outs_dims;
+    outs_dims.reserve(n);
+
+    // make output dims
+    for (int64_t i = 0; i < n; ++i) {
+      auto dims = ctx->GetInputDim("X");
+      if (height_sections.size()) {
+        PADDLE_ENFORCE_EQ(
+            height_sections.size(), static_cast<size_t>(n),
+            "The size of height section should be the same with height"
+            " section size.");
+        dims[0] = height_sections[i];
+      }
+      outs_dims.push_back(dims);
+    }
+    ctx->SetOutputsDim("Out", outs_dims);
+  }
+};
+
+class SplitSelectedRowsGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("sum");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(split_selected_rows, ops::SplitSelectedRowsOp,
+                  ops::SplitSelectedRowsOpMaker,
+                  ops::SplitSelectedRowsGradMaker);
+REGISTER_OP_CPU_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/split_selected_rows_op.cu b/paddle/operators/split_selected_rows_op.cu
new file mode 100644
index 0000000000..983285480f
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/split_selected_rows_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    split_selected_rows,
+    ops::SplitSelectedRowsOpKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/split_selected_rows_op.h b/paddle/operators/split_selected_rows_op.h
new file mode 100644
index 0000000000..12e64e2901
--- /dev/null
+++ b/paddle/operators/split_selected_rows_op.h
@@ -0,0 +1,88 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+static int FindOutIdx(int row, const std::vector<int>& height_sections) {
+  int offset = 0;
+  for (size_t i = 0; i < height_sections.size(); ++i) {
+    if (row >= offset && row < (offset + height_sections[i])) {
+      return i;
+    }
+    offset += height_sections[i];
+  }
+  return -1;
+}
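+
+// For example (illustrative): with height_sections = {4, 8} the row space is
+// partitioned into [0, 4) -> output 0 and [4, 12) -> output 1, so
+// FindOutIdx(5, {4, 8}) returns 1 and FindOutIdx(12, {4, 8}) returns -1.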
+
+template <typename DeviceContext, typename T>
+class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<framework::SelectedRows>("X");
+    auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
+    auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
+
+    auto x_rows = x->rows();
+    std::vector<std::vector<int>> outs_rows_idx;
+    outs_rows_idx.resize(outs.size());
+
+    auto row_numel = x->value().numel() / x->value().dims()[0];
+    auto src = x->value().data<T>();
+
+    for (size_t i = 0; i < x_rows.size(); ++i) {
+      int out_idx = FindOutIdx(x_rows[i], height_sections);
+      outs_rows_idx[out_idx].push_back(i);
+    }
+    auto place = ctx.GetPlace();
+
+    for (size_t i = 0; i < outs_rows_idx.size(); ++i) {
+      auto rows_idx = outs_rows_idx[i];
+      if (rows_idx.size() > 0) {
+        auto dims = x->GetCompleteDims();
+        dims[0] = rows_idx.size();
+        outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
+        for (auto idx : rows_idx) {
+          outs[i]->mutable_rows()->push_back(x_rows[idx]);
+        }
+        auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
+        for (size_t j = 0; j < rows_idx.size(); j++) {
+          if (platform::is_cpu_place(place)) {
+            memory::Copy(platform::CPUPlace(), dst + j * row_numel,
+                         platform::CPUPlace(), src + rows_idx[j] * row_numel,
+                         sizeof(T) * row_numel);
+          } else {
+#ifdef PADDLE_WITH_CUDA
+            auto stream = ctx.cuda_device_context().stream();
+            memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
+                         platform::CUDAPlace(), src + rows_idx[j] * row_numel,
+                         sizeof(T) * row_numel, stream);
+#else
+            PADDLE_THROW("Paddle is not compiled with GPU");
+#endif
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/spp_op.cc b/paddle/operators/spp_op.cc
new file mode 100644
index 0000000000..c0aa87b0f0
--- /dev/null
+++ b/paddle/operators/spp_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+namespace paddle {
+namespace operators {
+
+class SppOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SppOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of spp operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of spp operator."
+              "N * M."
+              "M = C * H * W");
+    AddAttr<int>("pyramid_height", "(int), multi level pooling");
+    AddAttr<std::string>(
+        "pooling_type",
+        "(string), pooling type, can be \"max\" for max-pooling "
+        "and \"avg\" for average-pooling.")
+        .InEnum({"max", "avg"});
+    AddComment(R"DOC(
+        "With spatial pyramid pooling, the input image can
+        be of any sizes. This not only allows arbitrary aspect
+        ratios, but also allows arbitrary scales. We can resize
+        the input image to any scale (e.g., min(w, h)=180, 224,
+        ...) and apply the same deep network. When the
+        input image is at different scales, the network (with
+        the same filter sizes) will extract features at different
+        scales. The scales play important roles in traditional
+        methods.
+        Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = N \\
+            W_{out} = (((4^pyramid_height) - 1) / (4 - 1))$ * C_{in}
+          $$
+        paper https://arxiv.org/pdf/1406.4729v4.pdf
+        )DOC");
+  }
+};
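+
+// Shape sketch (illustrative): with pyramid_height = 2 the pyramid uses
+// 1x1 and 2x2 bins, so each channel contributes 1 + 4 = (4^2 - 1) / 3 = 5
+// pooled values, and an input of shape [N, C, H, W] maps to [N, 5 * C].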
+
+class SppOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SppOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SppOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    int pyramid_height = ctx->Attrs().Get<int>("pyramid_height");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Input of SppOp must be 4-dimensional.");
+    int outlen = ((std::pow(4, pyramid_height) - 1) / (4 - 1)) * in_x_dims[1];
+    std::vector<int64_t> output_shape({in_x_dims[0], outlen});
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class SppOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(spp, ops::SppOp, ops::SppOpMaker, spp_grad, ops::SppOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/spp_op.cu.cc b/paddle/operators/spp_op.cu.cc
new file mode 100644
index 0000000000..761e4d6c4a
--- /dev/null
+++ b/paddle/operators/spp_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/spp_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    spp, ops::SppKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    spp_grad, ops::SppGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SppGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/spp_op.h b/paddle/operators/spp_op.h
new file mode 100644
index 0000000000..f35b305d02
--- /dev/null
+++ b/paddle/operators/spp_op.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class SppKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    out->mutable_data<T>(context.GetPlace());
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t output_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // pooling output shape
+      framework::Tensor out_level;
+      std::vector<int64_t> output_shape_vec(
+          {in_x->dims()[0], in_x->dims()[1], bins, bins});
+      framework::DDim output_shape(framework::make_ddim(output_shape_vec));
+      out_level.mutable_data<T>(output_shape, context.GetPlace());
+      // pooling
+      if (pooling_type == "max") {
+        math::Pool2dFunctor<DeviceContext, math::MaxPool<T>, T> pool_forward;
+        math::MaxPool<T> max_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, max_process, &out_level);
+      } else if (pooling_type == "avg") {
+        math::Pool2dFunctor<DeviceContext, math::AvgPool<T>, T> pool_forward;
+        math::AvgPool<T> avg_process;
+        pool_forward(context.template device_context<DeviceContext>(), *in_x,
+                     kernel_size, strides, paddings, avg_process, &out_level);
+      }
+      // flatten pooling output shape
+      int output_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> output_flatten_shape_vec(
+          {in_x->dims()[0], output_flatten_w});
+      framework::DDim output_flatten_shape(
+          framework::make_ddim(output_flatten_shape_vec));
+      out_level.Resize(output_flatten_shape);
+      // concat
+      auto out_level_stride = framework::stride(out_level.dims());
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_level.data<T>(), out_level_stride, out_level.dims(),
+                       out_stride, out->data<T>() + output_offset);
+      output_offset += out_level.dims()[1] * out_level_stride[1];
+    }
+  }
+};
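+// Per-level arithmetic (illustrative): for a 5x5 input at level p = 1,
+// bins = 2, kernel_size = ceil(5 / 2) = 3, padding = (3 * 2 - 5 + 1) / 2 = 1,
+// so the 5x5 feature map is covered by a 2x2 grid of 3x3 windows.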
+template <typename DeviceContext, typename T>
+class SppGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    int pyramid_height = context.template Attr<int>("pyramid_height");
+    std::string pooling_type =
+        context.template Attr<std::string>("pooling_type");
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    zero(device_ctx, in_x_grad, static_cast<T>(0));
+    auto out_stride = framework::stride(out->dims());
+    int input_h = in_x->dims()[2];
+    int input_w = in_x->dims()[3];
+    size_t out_offset = 0;
+    for (int p = 0; p < pyramid_height; ++p) {
+      int bins = std::pow(2, p);
+      int kernel_size_h = std::ceil(input_h / static_cast<double>(bins));
+      int kernel_size_w = std::ceil(input_w / static_cast<double>(bins));
+      int padding_h = (kernel_size_h * bins - input_h + 1) / 2;
+      int padding_w = (kernel_size_w * bins - input_w + 1) / 2;
+      std::vector<int> kernel_size({kernel_size_h, kernel_size_w});
+      std::vector<int> strides({kernel_size_h, kernel_size_w});
+      std::vector<int> paddings({padding_h, padding_w});
+      // split out and out_grad into the flattened tensors of this level
+      framework::Tensor out_level;
+      framework::Tensor outgrad_level;
+      int out_flatten_w = in_x->dims()[1] * bins * bins;
+      std::vector<int64_t> out_flatten_shape_vec(
+          {in_x->dims()[0], out_flatten_w});
+      framework::DDim out_flatten_shape(
+          framework::make_ddim(out_flatten_shape_vec));
+      out_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      outgrad_level.mutable_data<T>(out_flatten_shape, context.GetPlace());
+      auto flatten_stride = framework::stride(out_level.dims());
+      // memcpy
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out->data<T>() + out_offset, out_stride,
+                       out_level.dims(), flatten_stride, out_level.data<T>());
+
+      StridedMemcpy<T>(context.template device_context<DeviceContext>(),
+                       out_grad->data<T>() + out_offset, out_stride,
+                       outgrad_level.dims(), flatten_stride,
+                       outgrad_level.data<T>());
+      out_offset += out_level.dims()[1] * out_stride[1];
+      // flatten backward to nchw
+
+      std::vector<int64_t> out_shape_vec({in_x->dims()[0], in_x->dims()[1]});
+      out_shape_vec.push_back(
+          (input_h - kernel_size_h + 2 * padding_h) / kernel_size_h + 1);
+      out_shape_vec.push_back(
+          (input_w - kernel_size_w + 2 * padding_w) / kernel_size_w + 1);
+      framework::DDim out_shape(framework::make_ddim(out_shape_vec));
+      out_level.Resize(out_shape);
+      outgrad_level.Resize(out_shape);
+      // pooling backward
+      if (pooling_type == "max") {
+        math::MaxPool2dGradFunctor<DeviceContext, T> pool2d_backward;
+        pool2d_backward(context.template device_context<DeviceContext>(),
+                        *in_x, out_level, outgrad_level, kernel_size, strides,
+                        paddings, in_x_grad);
+      } else if (pooling_type == "avg") {
+        math::Pool2dGradFunctor<DeviceContext, math::AvgPoolGrad<T>, T>
+            pool_backward;
+        math::AvgPoolGrad<T> avg_process;
+        pool_backward(context.template device_context<DeviceContext>(),
+                      *in_x, out_level, outgrad_level, kernel_size, strides,
+                      paddings, avg_process, in_x_grad);
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
new file mode 100644
index 0000000000..9e097176f3
--- /dev/null
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/squared_l2_distance_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SquaredL2DistanceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"),
+                   "Input(Y) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("sub_result"),
+        "Output(sub_result) of SquaredL2DistanceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SquaredL2DistanceOp should not be null.");
+
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+
+    PADDLE_ENFORCE_EQ(framework::arity(x_dims), framework::arity(y_dims),
+                      "Tensor rank of both SquaredL2DistanceOp's "
+                      "inputs must be the same.");
+
+    int rank = framework::arity(x_dims);
+    PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
+    PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0],
+                      "Product of dimensions except the first dimension of "
+                      "input and target must be equal.");
+    PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
+                   "First dimension of target must be equal to input "
+                   "or to 1.");
+
+    ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
+    ctx->SetOutputDim("Out", {x_dims[0], 1});
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class SquaredL2DistanceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SquaredL2DistanceOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) Input of SquaredL2DistanceOp.");
+    AddInput("Y", "(Tensor) Target of SquaredL2DistanceOp.");
+    AddOutput("sub_result",
+              "(Tensor) Buffering subtraction result which "
+              "will be reused in backward.")
+        .AsIntermediate();
+    AddOutput("Out", "(Tensor) Squared l2 distance between input and target.");
+    AddComment(R"DOC(
+SquaredL2Distance operator
+
+This operator will calculate the squared L2 distance between the input and
+the target. The number of distance values equals the first dimension of the
+input. The first dimension of the target can be equal to that of the input
+or to 1. If the first dimension of the target is 1, the operator broadcasts
+the target's first dimension to the input's first dimension. During backward
+propagation, the user can decide whether to calculate the gradient of the
+input, the target, or both.
+
+Both the input X and Y can carry the LoD (Level of Details) information. 
+However, the output only shares the LoD information with input X.
+    )DOC");
+  }
+};
+
+class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Gradient of Out should not be null");
+    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    auto x_dims = ctx->GetInputDim("X");
+    auto y_dims = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0],
+                      "First dimension of output gradient and "
+                      "input value must be equal.");
+    PADDLE_ENFORCE_EQ(out_dims[1], 1,
+                      "Second dimension of output gradient "
+                      "must be 1.");
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims);
+    if (ctx->HasOutput(y_grad_name)) ctx->SetOutputDim(y_grad_name, y_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp,
+            ops::SquaredL2DistanceOpMaker, squared_l2_distance_grad,
+            ops::SquaredL2DistanceGradOp);
+REGISTER_OP_CPU_KERNEL(
+    squared_l2_distance,
+    ops::SquaredL2DistanceKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad,
+                       ops::SquaredL2DistanceGradKernel<
+                           paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu
new file mode 100644
index 0000000000..f2648dde5e
--- /dev/null
+++ b/paddle/operators/squared_l2_distance_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/squared_l2_distance_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    squared_l2_distance,
+    ops::SquaredL2DistanceKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad,
+                        ops::SquaredL2DistanceGradKernel<
+                            paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h
new file mode 100644
index 0000000000..5bd5f4819a
--- /dev/null
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class SquaredL2DistanceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("X");
+    auto* in1 = context.Input<Tensor>("Y");
+    auto* out0 = context.Output<Tensor>("sub_result");
+    auto* out1 = context.Output<Tensor>("Out");
+
+    auto in0_dims = in0->dims();
+    auto in1_dims = in1->dims();
+
+    int cols = in0->numel() / in0_dims[0];
+    // reduce dimensions except the first
+    auto x =
+        EigenMatrix<T>::From(*in0, framework::make_ddim({in0_dims[0], cols}));
+    auto y =
+        EigenMatrix<T>::From(*in1, framework::make_ddim({in1_dims[0], cols}));
+
+    out0->mutable_data<T>(context.GetPlace());
+    out1->mutable_data<T>(context.GetPlace());
+    auto sub_result = EigenMatrix<T>::From(*out0);
+    auto z = EigenVector<T>::Flatten(*out1);
+
+    auto& place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    auto x_dims = x.dimensions();
+    auto y_dims = y.dimensions();
+    // buffer the subtraction result
+    if (y_dims[0] == 1 && x_dims[0] > y_dims[0]) {
+      sub_result.device(place) =
+          x -
+          y.broadcast(Eigen::array<int, 2>({{static_cast<int>(x_dims[0]), 1}}));
+    } else {
+      sub_result.device(place) = x - y;
+    }
+    auto sub_res_pow2 = sub_result * sub_result;
+    z.device(place) = sub_res_pow2.sum(Eigen::array<int, 1>({{1}}));
+  }
+};
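+
+// Numeric sketch (illustrative): X = [[1, 2]], Y = [[0, 0]] gives
+// sub_result = [[1, 2]] and Out = [1^2 + 2^2] = [5].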
+
+template <typename DeviceContext, typename T>
+class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in0 = context.Input<Tensor>("sub_result");
+    auto* in1 = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
+    auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto sub_result = EigenMatrix<T>::From(*in0);
+    auto out_grad = EigenMatrix<T>::From(*in1);
+
+    // x_g and y_g may be null when the corresponding gradient is not
+    // requested, so derive the column count from sub_result instead of
+    // dereferencing them unconditionally.
+    int cols = in0->dims()[1];
+    // calculate gradient
+    auto grad_mat = 2 *
+                    (out_grad.broadcast(Eigen::array<int, 2>({{1, cols}}))) *
+                    sub_result;
+
+    // propagate back to input
+    auto& eigen_place =
+        *context.template device_context<DeviceContext>().eigen_device();
+    if (x_g) {
+      auto x_dims = x_g->dims();
+      x_g->mutable_data<T>(context.GetPlace());
+      // eigen matrix
+      auto x_grad =
+          EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
+      // dimensions are same with subResult
+      x_grad.device(eigen_place) = grad_mat;
+    }
+
+    if (y_g) {
+      auto y_dims = y_g->dims();
+      y_g->mutable_data<T>(context.GetPlace());
+
+      PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
+                        "First dimension of gradient must be greater than or "
+                        "equal to the first dimension of target.");
+
+      if (sub_result.dimensions()[0] == y_dims[0]) {
+        auto y_grad =
+            EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
+        y_grad.device(eigen_place) = -1 * grad_mat;
+      } else {
+        auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
+        auto y_grad = EigenVector<T>::Flatten(*y_g);
+        y_grad.device(eigen_place) = col_sum_res;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc
new file mode 100644
index 0000000000..6626bf0375
--- /dev/null
+++ b/paddle/operators/squared_l2_norm_op.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/squared_l2_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SquaredL2NormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+
+    ctx->SetOutputDim("Out", {1});
+  }
+};
+
+class SquaredL2NormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+class SquaredL2NormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SquaredL2NormOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input of squared_l2_norm op.");
+    AddOutput("Out", "(Scalar) The output of squared_l2_norm op.");
+    AddComment(R"DOC(
+SquaredL2Norm Operator.
+
+Computes the squared L2 norm of a tensor.
+
+$$Out = \sum_{i} X_{i}^2$$
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker,
+            squared_l2_norm_grad, ops::SquaredL2NormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    squared_l2_norm,
+    ops::SquaredL2NormKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    squared_l2_norm_grad,
+    ops::SquaredL2NormGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu
new file mode 100644
index 0000000000..b222113a8c
--- /dev/null
+++ b/paddle/operators/squared_l2_norm_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/squared_l2_norm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    squared_l2_norm,
+    ops::SquaredL2NormKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    squared_l2_norm_grad,
+    ops::SquaredL2NormGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h
new file mode 100644
index 0000000000..1ce26c775e
--- /dev/null
+++ b/paddle/operators/squared_l2_norm_op.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out = sum(square(X))
+template <typename DeviceContext, typename T>
+class SquaredL2NormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto out = framework::EigenScalar<T>::From(*Out);
+    auto *place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    out.device(*place) = x.square().sum();
+  }
+};
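+
+// Numeric sketch (illustrative): X = [1, 2, 3] gives Out = 1 + 4 + 9 = 14;
+// the backward pass below then yields dX = 2 * dOut * X = dOut * [2, 4, 6].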
+
+// dX = 2 * dOut * X
+template <typename DeviceContext, typename T>
+class SquaredL2NormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    PADDLE_ENFORCE(dOut->numel() == 1,
+                   "Squared L2 Norm Gradient should be scalar");
+    framework::Tensor *dX =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto *place =
+        context.template device_context<DeviceContext>().eigen_device();
+
+    Eigen::DSizes<int, 1> x_dsize(X->numel());
+    dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast<T>(2.0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/strided_memcpy.h b/paddle/operators/strided_memcpy.h
new file mode 100644
index 0000000000..735cabcd97
--- /dev/null
+++ b/paddle/operators/strided_memcpy.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/detail/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+// Strided memory copy from src to dst.
+//
+// The src and dst should both be on dev_ctx.GetPlace(); otherwise, a
+// segmentation fault will occur.
+//
+// The stride of an array (also referred to as increment, pitch or step size)
+// is the number of locations in memory between the beginnings of successive
+// array elements.
+//
+// For example, for a tensor of shape [1, 3, 300, 300] with no padding, the
+// stride is [270000, 90000, 300, 1].
+//
+// NOTE: When using GPU, the memcpy is asynchronous. To synchronize it,
+// invoke `dev_ctx.Wait()`.
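+//
+// Usage sketch (illustrative, mirroring strided_memcpy_test.cc): cropping a
+// 2x2 window out of a 3x5 row-major buffer uses src_stride = {5, 1},
+// dst_dim = {2, 2} and dst_stride = {2, 1}.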
+template <typename T>
+inline void StridedMemcpy(const platform::DeviceContext& dev_ctx, const T* src,
+                          const framework::DDim& src_stride,
+                          const framework::DDim& dst_dim,
+                          const framework::DDim& dst_stride, T* dst) {
+  using namespace detail;
+  StridedCopyDimVisitor<T> func(dev_ctx, src, src_stride, dst_stride, dst);
+  boost::apply_visitor(func, dst_dim);
+}
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
new file mode 100644
index 0000000000..06d8118855
--- /dev/null
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/strided_memcpy.h"
+#include "gtest/gtest.h"
+#include "paddle/memory/memory.h"
+
+namespace paddle {
+namespace operators {
+
+TEST(StridedMemcpy, CPUCrop) {
+  // clang-format off
+  int src[] = {
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
+  };
+  // clang-format on
+
+  framework::DDim src_stride({5, 1});
+
+  int dst[4];
+  framework::DDim dst_dim({2, 2});
+  framework::DDim dst_stride({2, 1});
+
+  platform::CPUDeviceContext ctx;
+  StridedMemcpy<int>(ctx, src + 1, src_stride, dst_dim, dst_stride, dst);
+
+  ASSERT_EQ(1, dst[0]);
+  ASSERT_EQ(2, dst[1]);
+  ASSERT_EQ(3, dst[2]);
+  ASSERT_EQ(4, dst[3]);
+}
+
+TEST(StridedMemcpy, CPUConcat) {
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+
+  int dst[8];
+
+  framework::DDim src_stride({2, 1});
+  framework::DDim dst_dim({2, 2});
+  framework::DDim dst_stride({4, 1});
+  platform::CPUDeviceContext ctx;
+
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst);
+  StridedMemcpy<int>(ctx, src, src_stride, dst_dim, dst_stride, dst + 2);
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(StridedMemcpy, GPUCrop) {
+  // clang-format off
+  int src[] = {
+      0, 1, 2, 0, 0,
+      0, 3, 4, 0, 0,
+      0, 0, 0, 0, 0,
+  };
+  // clang-format on
+
+  platform::CUDAPlace gpu0(0);
+  platform::CPUPlace cpu;
+
+  platform::CUDADeviceContext ctx(gpu0);
+
+  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+
+  framework::DDim src_stride({5, 1});
+
+  int dst[4];
+  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+
+  framework::DDim dst_dim({2, 2});
+  framework::DDim dst_stride({2, 1});
+
+  StridedMemcpy<int>(ctx, gpu_src + 1, src_stride, dst_dim, dst_stride,
+                     gpu_dst);
+
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();
+
+  ASSERT_EQ(1, dst[0]);
+  ASSERT_EQ(2, dst[1]);
+  ASSERT_EQ(3, dst[2]);
+  ASSERT_EQ(4, dst[3]);
+
+  memory::Free(gpu0, gpu_dst);
+  memory::Free(gpu0, gpu_src);
+}
+
+TEST(StridedMemcpy, GPUConcat) {
+  // clang-format off
+  int src[] = {
+      1, 2,
+      3, 4
+  };
+  // clang-format on
+
+  platform::CUDAPlace gpu0(0);
+  platform::CPUPlace cpu;
+  platform::CUDADeviceContext ctx(gpu0);
+
+  int* gpu_src = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(src)));
+  memory::Copy(gpu0, gpu_src, cpu, src, sizeof(src), ctx.stream());
+
+  int dst[8];
+  int* gpu_dst = reinterpret_cast<int*>(memory::Alloc(gpu0, sizeof(dst)));
+
+  framework::DDim src_stride({2, 1});
+  framework::DDim dst_dim({2, 2});
+  framework::DDim dst_stride({4, 1});
+
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride, gpu_dst);
+  StridedMemcpy<int>(ctx, gpu_src, src_stride, dst_dim, dst_stride,
+                     gpu_dst + 2);
+
+  memory::Copy(cpu, dst, gpu0, gpu_dst, sizeof(dst), ctx.stream());
+  ctx.Wait();
+
+  // clang-format off
+  int expect_dst[] = {
+      1, 2, 1, 2,
+      3, 4, 3, 4
+  };
+  // clang-format on
+  for (size_t i = 0; i < sizeof(expect_dst) / sizeof(int); ++i) {
+    ASSERT_EQ(expect_dst[i], dst[i]);
+  }
+
+  memory::Free(gpu0, gpu_dst);
+  memory::Free(gpu0, gpu_src);
+}
+
+#endif
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
new file mode 100644
index 0000000000..88ed67f7ba
--- /dev/null
+++ b/paddle/operators/sum_op.cc
@@ -0,0 +1,200 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sum_op.h"
+#include <vector>
+#include "paddle/framework/var_type_inference.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+
+class SumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
+
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SumOp should not be null.");
+    if (ctx->IsRuntime() &&
+        ctx->GetOutputsVarType("Out")[0] ==
+            framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
+      return;  // skip runtime InferShape when the output is a tensor array
+    }
+
+    auto x_dims = ctx->GetInputsDim("X");
+    size_t N = x_dims.size();
+    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should be greater than 1.");
+
+    framework::DDim in_dim({0});
+    for (auto& x_dim : x_dims) {
+      if (framework::product(x_dim) == 0) {
+        continue;
+      }
+      if (framework::product(in_dim) == 0) {
+        in_dim = x_dim;
+      } else {
+        PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
+      }
+    }
+    ctx->SetOutputDim("Out", in_dim);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    auto x_vars = ctx.MultiInputVar("X");
+    if (x_vars[0]->IsType<framework::LoDTensor>()) {
+      int dtype = -1;
+      for (auto& x_var : x_vars) {
+        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
+        if (lod_tensor.numel() == 0) {
+          continue;
+        }
+        if (dtype == -1) {
+          dtype = framework::ToDataType(lod_tensor.type());
+        } else {
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+        }
+      }
+      PADDLE_ENFORCE_NE(dtype, -1,
+                        "Sum operator should have at least one tensor");
+
+      return framework::OpKernelType(
+          static_cast<framework::proto::DataType>(dtype), ctx.device_context());
+    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
+      return framework::OpKernelType(
+          framework::ToDataType(
+              x_vars[0]->Get<framework::SelectedRows>().value().type()),
+          ctx.device_context());
+    } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
+      for (auto& x_var : x_vars) {
+        auto& array = x_var->Get<framework::LoDTensorArray>();
+        for (auto& each : array) {
+          if (each.numel() != 0) {
+            return framework::OpKernelType(framework::ToDataType(each.type()),
+                                           ctx.device_context());
+          }
+        }
+      }
+      PADDLE_THROW("Cannot find the input data type by all input data");
+    }
+    PADDLE_THROW("Unexpected branch. Input type is %s",
+                 x_vars[0]->Type().name());
+  }
+};
+
+class SumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SumOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(vector<Tensor>) The input tensors of sum operator.")
+        .AsDuplicable();
+    AddOutput("Out", "(Tensor) The output tensor of sum operator.");
+    AddComment(R"DOC(
+Sum operator.
+
+This operator sums the input tensors. All the inputs can carry the
+LoD (Level of Details) information. However, the output only shares
+the LoD information with the first input.
+)DOC");
+  }
+};
+
+class SumOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto& inputs = op_desc.Input("X");
+    auto var_type = framework::proto::VarDesc::SELECTED_ROWS;
+
+    for (auto& name : op_desc.Input("X")) {
+      VLOG(10) << name << " "
+               << block->FindRecursiveOrCreateVar(name).GetType();
+    }
+
+    bool any_input_is_lod_tensor = std::any_of(
+        inputs.begin(), inputs.end(), [block](const std::string& name) {
+          return block->FindRecursiveOrCreateVar(name).GetType() ==
+                 framework::proto::VarDesc::LOD_TENSOR;
+        });
+
+    auto is_tensor_array = [block](const std::string& name) {
+      return block->FindRecursiveOrCreateVar(name).GetType() ==
+             framework::proto::VarDesc::LOD_TENSOR_ARRAY;
+    };
+
+    bool any_input_is_tensor_array =
+        std::any_of(inputs.begin(), inputs.end(), is_tensor_array);
+    bool all_inputs_are_tensor_array =
+        std::all_of(inputs.begin(), inputs.end(), is_tensor_array);
+
+    if (any_input_is_tensor_array) {
+      if (!all_inputs_are_tensor_array) {
+        std::ostringstream os;
+        for (auto& each : inputs) {
+          os << "    " << each << " type is "
+             << block->FindRecursiveOrCreateVar(each).GetType() << "\n";
+        }
+        PADDLE_ENFORCE(all_inputs_are_tensor_array,
+                       "Not all inputs are tensor array:\n%s", os.str());
+      }
+      var_type = framework::proto::VarDesc::LOD_TENSOR_ARRAY;
+    } else if (any_input_is_lod_tensor) {
+      var_type = framework::proto::VarDesc::LOD_TENSOR;
+    }
+
+    auto out_var_name = op_desc.Output("Out").front();
+    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
+    out_var.SetType(var_type);
+    auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
+    out_var.SetDataType(in_var.GetDataType());
+  }
+};
+
+class SumGradMaker : public framework::GradOpDescMakerBase {
+ public:
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<framework::OpDesc>> operator()() const override {
+    auto x_grads = InputGrad("X", false);
+    std::vector<std::unique_ptr<framework::OpDesc>> grad_ops;
+    grad_ops.reserve(x_grads.size());
+    auto og = OutputGrad("Out");
+    std::transform(x_grads.begin(), x_grads.end(), std::back_inserter(grad_ops),
+                   [&og](const std::string& x_grad) {
+                     auto* grad_op = new framework::OpDesc();
+                     grad_op->SetType("scale");
+                     grad_op->SetInput("X", og);
+                     grad_op->SetOutput("Out", {x_grad});
+                     grad_op->SetAttr("scale", 1.0f);
+                     return std::unique_ptr<framework::OpDesc>(grad_op);
+                   });
+    return grad_ops;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker,
+                  ops::SumOpVarTypeInference);
+REGISTER_OP_CPU_KERNEL(
+    sum, ops::SumKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, double>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, int>,
+    ops::SumKernel<paddle::platform::CPUDeviceContext, int64_t>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
new file mode 100644
index 0000000000..873155076c
--- /dev/null
+++ b/paddle/operators/sum_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sum_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
+    ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
new file mode 100644
index 0000000000..48201b344d
--- /dev/null
+++ b/paddle/operators/sum_op.h
@@ -0,0 +1,131 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using SelectedRows = framework::SelectedRows;
+using LoDTensor = framework::LoDTensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class SumKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto in_vars = context.MultiInputVar("X");
+    int N = in_vars.size();
+    auto out_var = context.OutputVar("Out");
+
+    bool in_place = out_var == in_vars[0];
+
+    if (out_var->IsType<framework::LoDTensor>()) {
+      auto *out = context.Output<LoDTensor>("Out");
+      if (!in_place) {
+        out->mutable_data<T>(context.GetPlace());
+      }
+      auto result = EigenVector<T>::Flatten(*out);
+      if (!in_place) {
+        math::SetConstant<DeviceContext, T> constant_functor;
+        constant_functor(context.template device_context<DeviceContext>(), out,
+                         0.0);
+      }
+
+      math::SelectedRowsAddToTensor<DeviceContext, T> functor;
+      auto &place =
+          *context.template device_context<DeviceContext>().eigen_device();
+      // If in_place, just skip the first tensor
+      for (int i = in_place ? 1 : 0; i < N; i++) {
+        if (in_vars[i]->IsType<framework::LoDTensor>()) {
+          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
+          if (in_t.numel() == 0) {
+            continue;
+          }
+          auto in = EigenVector<T>::Flatten(in_t);
+          result.device(place) = result + in;
+        } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
+          auto &in_t = in_vars[i]->Get<framework::SelectedRows>();
+          functor(context.template device_context<DeviceContext>(), in_t, out);
+        } else {
+          PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+        }
+      }
+    } else if (out_var->IsType<framework::SelectedRows>()) {
+      PADDLE_ENFORCE(!in_place, "SelectedRows does not support in-place sum for now");
+      auto *out = context.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
+      auto *out_value = out->mutable_value();
+
+      // Runtime InferShape
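+      // The output concatenates the rows of all inputs, so its first
+      // dimension is the total number of input rows.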
+      size_t first_dim = 0;
+      for (int i = 0; i < N; i++) {
+        first_dim += in_vars[i]->Get<SelectedRows>().rows().size();
+      }
+      auto in_dim = in_vars[0]->Get<SelectedRows>().value().dims();
+      auto in_dim_vec = framework::vectorize(in_dim);
+      in_dim_vec[0] = static_cast<int64_t>(first_dim);
+
+      out_value->Resize(framework::make_ddim(in_dim_vec));
+      out_value->mutable_data<T>(context.GetPlace());
+
+      math::SelectedRowsAddTo<DeviceContext, T> functor;
+
+      int64_t offset = 0;
+      for (int i = 0; i < N; i++) {
+        PADDLE_ENFORCE_EQ(out->height(),
+                          in_vars[i]->Get<SelectedRows>().height());
+        functor(context.template device_context<DeviceContext>(),
+                in_vars[i]->Get<SelectedRows>(), offset, out);
+        offset += in_vars[i]->Get<SelectedRows>().value().numel();
+      }
+    } else if (out_var->IsType<framework::LoDTensorArray>()) {
+      auto &out_array = *out_var->GetMutable<framework::LoDTensorArray>();
+      for (size_t i = in_place ? 1 : 0; i < in_vars.size(); ++i) {
+        PADDLE_ENFORCE(in_vars[i]->IsType<framework::LoDTensorArray>(),
+                       "Only LoDTensorArray inputs are supported when the "
+                       "output is a LoDTensorArray");
+        auto &in_array = in_vars[i]->Get<framework::LoDTensorArray>();
+
+        for (size_t j = 0; j < in_array.size(); ++j) {
+          if (in_array[j].numel() != 0) {
+            if (j >= out_array.size()) {
+              out_array.resize(j + 1);
+            }
+            if (out_array[j].numel() == 0) {
+              framework::Copy(in_array[j], in_array[j].place(),
+                              context.device_context(), &out_array[j]);
+              out_array[j].set_lod(in_array[j].lod());
+            } else {
+              PADDLE_ENFORCE(out_array[j].lod() == in_array[j].lod());
+              auto in = EigenVector<T>::Flatten(in_array[j]);
+              auto result = EigenVector<T>::Flatten(out_array[j]);
+              result.device(*context.template device_context<DeviceContext>()
+                                 .eigen_device()) = result + in;
+            }
+          }
+        }
+      }
+    } else {
+      PADDLE_THROW("Unexpected branch, output variable type is %s",
+                   out_var->Type().name());
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc
new file mode 100644
index 0000000000..a70be8b875
--- /dev/null
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -0,0 +1,220 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/array_operator.h"
+#include "paddle/operators/detail/safe_ref.h"
+namespace paddle {
+namespace operators {
+
+class WriteToArrayOp : public ArrayOp {
+ public:
+  WriteToArrayOp(const std::string &type,
+                 const framework::VariableNameMap &inputs,
+                 const framework::VariableNameMap &outputs,
+                 const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *x = scope.FindVar(Input("X"));
+    if (x == nullptr) return;
+    auto &x_tensor = x->Get<framework::LoDTensor>();
+    size_t offset = GetOffset(scope, place);
+    auto *out =
+        scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
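+    // Writing past the current end grows the array so that subscript
+    // 'offset' becomes valid (see the resize below).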
+    if (offset >= out->size()) {
+      VLOG(10) << "Resize " << Output("Out") << " from " << out->size()
+               << " to " << offset + 1;
+      out->resize(offset + 1);
+    }
+    if (x_tensor.memory_size() > 0) {
+      auto *out_tensor = &out->at(offset);
+
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+
+      Copy(x_tensor, place, dev_ctx, out_tensor);
+      out_tensor->set_lod(x_tensor.lod());
+    } else {
+      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                  "nothing has been written to output array["
+               << offset << "].";
+    }
+  }
+};
+
+class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WriteToArrayOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(LoDTensor) the tensor will be written to tensor array");
+    AddInput(
+        "I",
+        "(Tensor) the subscript index in tensor array. The number of element "
+        "should be 1");
+    AddOutput("Out", "(TensorArray) the tensor array will be written");
+    AddComment(R"DOC(
+WriteToArray Operator.
+
+This operator writes a LoDTensor to a LoDTensor array.
+
+Assume $T$ is a LoDTensor, $i$ is the subscript of the array, and $A$ is the
+array. The equation is
+
+$$A[i] = T$$
+
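+If $i$ is beyond the current size of $A$, the array is first grown to
+$i + 1$ elements.
+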
+)DOC");
+  }
+};
+
+class WriteToArrayInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
+    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
+                      "The number of element of subscript index must be 1");
+    if (!context->HasInput("X")) {
+      return;
+    }
+    PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
+    context->SetOutputDim("Out", context->GetInputDim("X"));
+  }
+
+ protected:
+  virtual const char *NotHasXError() const { return "Must set the lod tensor"; }
+
+  virtual const char *NotHasOutError() const {
+    return "Must set the lod tensor array";
+  }
+};
+
+class WriteToArrayInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
+    auto &out = block->FindRecursiveOrCreateVar(out_name);
+    out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
+    auto *x = block->FindVarRecursive(x_name);
+    if (x != nullptr) {
+      out.SetDataType(x->GetDataType());
+    }
+  }
+};
+
+class ReadFromArrayOp : public ArrayOp {
+ public:
+  ReadFromArrayOp(const std::string &type,
+                  const framework::VariableNameMap &inputs,
+                  const framework::VariableNameMap &outputs,
+                  const framework::AttributeMap &attrs)
+      : ArrayOp(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::Place &place) const override {
+    auto *x = scope.FindVar(Input("X"));
+    PADDLE_ENFORCE(x != nullptr, "X must be set");
+    auto &x_array = x->Get<framework::LoDTensorArray>();
+    auto *out = scope.FindVar(Output("Out"));
+    PADDLE_ENFORCE(out != nullptr, "Out must be set");
+    size_t offset = GetOffset(scope, place);
+    if (offset < x_array.size()) {
+      auto *out_tensor = out->GetMutable<framework::LoDTensor>();
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(place);
+      framework::Copy(x_array[offset], place, dev_ctx, out_tensor);
+      out_tensor->set_lod(x_array[offset].lod());
+    } else {
+      VLOG(10) << "offset " << offset << " >= " << x_array.size();
+    }
+  }
+};
+
+class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ReadFromArrayProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(TensorArray) the array will be read from.");
+    AddInput("I",
+             "(Tensor) the subscript index in tensor array. The number of "
+             "element should be 1");
+    AddOutput("Out", "(LoDTensor) the tensor will be read from.");
+    AddComment(R"DOC(
+ReadFromArray Operator.
+
+Read a LoDTensor from a LoDTensor Array.
+
+Assume $T$ is a LoDTensor, $i$ is the subscript of the array, and $A$ is the
+array. The equation is
+
+$$T = A[i]$$
+
+)DOC");
+  }
+};
+
+class ReadFromArrayInferShape : public WriteToArrayInferShape {
+ protected:
+  const char *NotHasXError() const override {
+    return "The input array X must be set";
+  }
+  const char *NotHasOutError() const override {
+    return "The output tensor out must be set";
+  }
+};
+
+class WriteToArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
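+    // Gradient of writing A[i] = x is reading x_grad = A_grad[i].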
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("read_from_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+class ReadFromArrayGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
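+    // Gradient of reading x = A[i] is writing A_grad[i] = x_grad.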
+    auto *grad_op = new framework::OpDesc();
+    grad_op->SetType("write_to_array");
+    grad_op->SetInput("I", Input("I"));
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttrMap(Attrs());
+    return std::unique_ptr<framework::OpDesc>(grad_op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(write_to_array, ops::WriteToArrayOp,
+                  ops::WriteToArrayInferShape, ops::WriteToArrayOpProtoMaker,
+                  ops::WriteToArrayGradMaker, ops::WriteToArrayInferVarType);
+REGISTER_OPERATOR(read_from_array, ops::ReadFromArrayOp,
+                  ops::ReadFromArrayInferShape, ops::ReadFromArrayProtoMaker,
+                  ops::ReadFromArrayGradMaker);
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
new file mode 100644
index 0000000000..a8ddd72973
--- /dev/null
+++ b/paddle/operators/top_k_op.cc
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/top_k_op.h"
+
+namespace paddle {
+namespace operators {
+
+class TopkOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of TopkOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Indices"),
+                   "Output(Indices) of TopkOp should not be null.");
+
+    auto input_dims = ctx->GetInputDim("X");
+    const int k = static_cast<int>(ctx->Attrs().Get<int>("k"));
+
+    PADDLE_ENFORCE_GE(k, 1, "k must be >= 1");
+    PADDLE_ENFORCE_GE(input_dims.size(), 1, "input must be at least 1-D");
+    PADDLE_ENFORCE_GE(input_dims[input_dims.size() - 1], k,
+                      "input must have at least k columns");
+
+    framework::DDim dims = input_dims;
+    dims[dims.size() - 1] = k;
+    ctx->SetOutputDim("Out", dims);
+    ctx->SetOutputDim("Indices", dims);
+    ctx->ShareLoD("X", "Out");
+    ctx->ShareLoD("X", "Indices");
+  }
+};
+
+class TopkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TopkOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input of Topk op");
+    AddOutput("Out", "(Tensor) The output tensor of Topk op");
+    AddOutput("Indices", "(Tensor) The indices of Topk elements of input");
+    AddComment(R"DOC(
+Top K operator
+
+If the input is a vector (1-D tensor), this operator finds the k largest
+entries in the vector and outputs their values and indices as vectors.
+Thus values[j] is the j-th largest entry in the input, and its index is indices[j].
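+
+For example, given the input [1, 3, 2] and k = 2, the output values are
+[3, 2] and the corresponding indices are [1, 2].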
+
+For matrices, this operator computes the top k entries in each row. )DOC");
+    AddAttr<int>("k",
+                 "(int, default 1) Number of top elements to look for along "
+                 "the last dimension (along each row for matrices).")
+        .SetDefault(1);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(top_k, ops::TopkOp, ops::TopkOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(top_k,
+                       ops::TopkKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
new file mode 100644
index 0000000000..f7bf58e721
--- /dev/null
+++ b/paddle/operators/top_k_op.cu
@@ -0,0 +1,320 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+struct Pair {
+  __device__ __forceinline__ Pair() {}
+  __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {}
+
+  __device__ __forceinline__ void set(T value, int64_t id) {
+    v = value;
+    this->id = id;  // disambiguate from the parameter shadowing the member
+  }
+
+  __device__ __forceinline__ void operator=(const Pair<T>& in) {
+    v = in.v;
+    id = in.id;
+  }
+
+  __device__ __forceinline__ bool operator<(const T value) const {
+    return (v < value);
+  }
+
+  __device__ __forceinline__ bool operator<(const Pair<T>& in) const {
+    return (v < in.v) || ((v == in.v) && (id > in.id));
+  }
+
+  __device__ __forceinline__ bool operator>(const Pair<T>& in) const {
+    return (v > in.v) || ((v == in.v) && (id < in.id));
+  }
+
+  T v;
+  int64_t id;
+};
+
+template <typename T>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
+                                      int beam_size) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (topk[k] < p) {
+      topk[k + 1] = topk[k];
+    } else {
+      topk[k + 1] = p;
+      return;
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int beam_size>
+__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p) {
+  for (int k = beam_size - 2; k >= 0; k--) {
+    if (topk[k] < p) {
+      topk[k + 1] = topk[k];
+    } else {
+      topk[k + 1] = p;
+      return;
+    }
+  }
+  topk[0] = p;
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, int beam_size) {
+  while (idx < dim) {
+    if (topk[beam_size - 1] < src[idx]) {
+      Pair<T> tmp(src[idx], idx);
+      AddTo<T>(topk, tmp, beam_size);
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* src, int idx,
+                                        int dim, const Pair<T>& max,
+                                        int beam_size) {
+  while (idx < dim) {
+    if (topk[beam_size - 1] < src[idx]) {
+      Pair<T> tmp(src[idx], idx);
+      if (tmp < max) {
+        AddTo<T>(topk, tmp, beam_size);
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
+                                        int idx, int dim, int beam_size) {
+  while (idx < dim) {
+    if (topk[beam_size - 1] < val[idx]) {
+      Pair<T> tmp(val[idx], col[idx]);
+      AddTo<T>(topk, tmp, beam_size);
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int BlockSize>
+__device__ __forceinline__ void GetTopK(Pair<T> topk[], const T* val, int* col,
+                                        int idx, int dim, const Pair<T>& max,
+                                        int beam_size) {
+  while (idx < dim) {
+    if (topk[beam_size - 1] < val[idx]) {
+      Pair<T> tmp(val[idx], col[idx]);
+      if (tmp < max) {
+        AddTo<T>(topk, tmp, beam_size);
+      }
+    }
+    idx += BlockSize;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+                                              int beam_size, const T* src,
+                                              bool& firstStep, bool& is_empty,
+                                              Pair<T>& max, int dim,
+                                              const int tid) {
+  if (beam > 0) {
+    int length = beam < beam_size ? beam : beam_size;
+    if (firstStep) {
+      firstStep = false;
+      GetTopK<T, BlockSize>(topk, src, tid, dim, length);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - beam) {
+          topk[k] = topk[k + beam];
+        } else {
+          topk[k].set(-INFINITY, -1);
+        }
+      }
+      if (!is_empty) {
+        GetTopK<T, BlockSize>(topk + MaxLength - beam, src, tid, dim, max,
+                              length);
+      }
+    }
+
+    max = topk[MaxLength - 1];
+    if (max.v == -1) is_empty = true;
+    beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int& beam,
+                                              int beam_size, const T* val,
+                                              int* col, bool& firstStep,
+                                              bool& is_empty, Pair<T>& max,
+                                              int dim, const int tid) {
+  if (beam > 0) {
+    int length = beam < beam_size ? beam : beam_size;
+    if (firstStep) {
+      firstStep = false;
+      GetTopK<T, BlockSize>(topk, val, col, tid, dim, length);
+    } else {
+      for (int k = 0; k < MaxLength; k++) {
+        if (k < MaxLength - beam) {
+          topk[k] = topk[k + beam];
+        } else {
+          topk[k].set(-INFINITY, -1);
+        }
+      }
+      if (!is_empty) {
+        GetTopK<T, BlockSize>(topk + MaxLength - beam, val, col, tid, dim, max,
+                              length);
+      }
+    }
+
+    max = topk[MaxLength - 1];
+    if (max.v == -1) is_empty = true;
+    beam = 0;
+  }
+}
+
+template <typename T, int MaxLength, int BlockSize>
+__device__ __forceinline__ void BlockReduce(Pair<T>* sh_topk, int* maxid,
+                                            Pair<T> topk[], T** topVal,
+                                            int64_t** topIds, int& beam, int& k,
+                                            const int tid, const int warp) {
+  while (true) {
+    __syncthreads();
+    if (tid < BlockSize / 2) {
+      if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) {
+        maxid[tid] = tid + BlockSize / 2;
+      } else {
+        maxid[tid] = tid;
+      }
+    }
+    __syncthreads();
+    for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) {
+      if (tid < stride) {
+        if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) {
+          maxid[tid] = maxid[tid + stride];
+        }
+      }
+      __syncthreads();
+    }
+    __syncthreads();
+
+    if (tid == 0) {
+      **topVal = sh_topk[maxid[0]].v;
+      **topIds = sh_topk[maxid[0]].id;
+      (*topVal)++;
+      (*topIds)++;
+    }
+    if (tid == maxid[0]) beam++;
+    if (--k == 0) break;
+    __syncthreads();
+
+    if (tid == maxid[0]) {
+      if (beam < MaxLength) {
+        sh_topk[tid] = topk[beam];
+      }
+    }
+    if (maxid[0] / 32 == warp) {
+      if (__shfl(beam, (maxid[0]) % 32, 32) == MaxLength) break;
+    }
+  }
+}
+
+/**
+ * Each block computes one sample.
+ * In a block:
+ * 1. every thread gets its top MaxLength values;
+ * 2. merge them into sh_topk, block-reduce and pop the max value;
+ * 3. go to the second step until one thread's top-k values are exhausted;
+ * 4. go to the first step until all k values have been produced.
+ */
+template <typename T, int MaxLength, int BlockSize>
+__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
+                             const T* src, int lds, int dim, int k) {
+  __shared__ Pair<T> sh_topk[BlockSize];
+  __shared__ int maxid[BlockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  output += blockIdx.x * output_stride;
+  indices += blockIdx.x * k;
+
+  Pair<T> topk[MaxLength];
+  int beam = MaxLength;
+  Pair<T> max;
+  bool is_empty = false;
+  bool firststep = true;
+
+  for (int j = 0; j < MaxLength; j++) {
+    topk[j].set(-INFINITY, -1);
+  }
+  while (k) {
+    ThreadGetTopK<T, MaxLength, BlockSize>(topk, beam, k,
+                                           src + blockIdx.x * lds, firststep,
+                                           is_empty, max, dim, tid);
+
+    sh_topk[tid] = topk[0];
+    BlockReduce<T, MaxLength, BlockSize>(sh_topk, maxid, topk, &output,
+                                         &indices, beam, k, tid, warp);
+  }
+}
+
+template <typename T>
+class TopkOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use CUDAPlace.");
+    auto* input = ctx.Input<Tensor>("X");
+    auto* output = ctx.Output<Tensor>("Out");
+    auto* indices = ctx.Output<Tensor>("Indices");
+    size_t k = static_cast<int>(ctx.Attr<int>("k"));
+
+    const T* input_data = input->data<T>();
+
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    // FIXME(typhoonzero): data is always converted to type T?
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    size_t input_height = input->dims()[0];
+    size_t input_width = input->dims()[1];
+    if (k > input_width) k = input_width;
+
+    // NOTE: pass lds and dim same to input width.
+    // NOTE: old matrix implementation of stride is different to eigen.
+    // TODO(typhoonzero): refine this kernel.
+    dim3 threads(256, 1);
+    dim3 grid(input_height, 1);
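+    // Each block handles one row of the input; within a block, every thread
+    // keeps up to MaxLength (here 5) local candidates before the block-wide
+    // reduction in KeMatrixTopK emits the k results.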
+
+    KeMatrixTopK<T, 5, 256><<<
+        grid, threads, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
+                              ctx.device_context())
+                              .stream()>>>(output_data, output->dims()[1],
+                                           indices_data, input_data,
+                                           input_width, input_width, int(k));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>);
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
new file mode 100644
index 0000000000..bf42e15e6b
--- /dev/null
+++ b/paddle/operators/top_k_op.h
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename DeviceContext, typename T>
+class TopkKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    // Get the top k elements of each row of the input tensor.
+    // FIXME: only deals with matrices (2-D tensors).
+    auto* input = ctx.Input<LoDTensor>("X");
+    auto* output = ctx.Output<LoDTensor>("Out");
+    auto* indices = ctx.Output<LoDTensor>("Indices");
+    // k is determined by Attr
+    const size_t k = static_cast<int>(ctx.Attr<int>("k"));
+
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+    int64_t* indices_data = indices->mutable_data<int64_t>(ctx.GetPlace());
+
+    auto eg_input = EigenMatrix<T>::From(*input);
+
+    // reshape input to a flattened matrix (like flat_inner_dims)
+    framework::DDim inputdims = input->dims();
+    const size_t row = framework::product(
+        framework::slice_ddim(inputdims, 0, inputdims.size() - 1));
+    const size_t col = inputdims[inputdims.size() - 1];
+    Eigen::DSizes<int, 2> flat2dims(row, col);
+    // NOTE: eigen shape doesn't affect paddle tensor.
+    eg_input.reshape(flat2dims);
+
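+    // For each row, collect (value, column) pairs and partially sort so the
+    // k largest values come first in descending order.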
+    for (size_t i = 0; i < row; i++) {
+      std::vector<std::pair<T, size_t>> vec;
+      for (size_t j = 0; j < col; j++) {
+        vec.push_back(std::pair<T, size_t>(eg_input(i, j), j));
+      }
+
+      std::partial_sort(
+          vec.begin(), vec.begin() + k, vec.end(),
+          [](const std::pair<T, size_t>& l, const std::pair<T, size_t>& r) {
+            return l.first > r.first;
+          });
+      for (size_t j = 0; j < k; j++) {
+        output_data[i * k + j] = vec[j].first;
+        indices_data[i * k + j] = int64_t(vec[j].second);
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
new file mode 100644
index 0000000000..c7ae162638
--- /dev/null
+++ b/paddle/operators/transpose_op.cc
@@ -0,0 +1,126 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/transpose_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class TransposeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    std::vector<int> axis = ctx->Attrs().Get<std::vector<int>>("axis");
+    size_t x_rank = x_dims.size();
+    size_t axis_size = axis.size();
+
+    PADDLE_ENFORCE_EQ(x_rank, axis_size,
+                      "The input tensor's rank(%d) "
+                      "should be equal to the axis's size(%d)",
+                      x_rank, axis_size);
+
+    std::vector<int> count(axis_size, 0);
+    for (size_t i = 0; i < axis_size; i++) {
+      PADDLE_ENFORCE(
+          axis[i] < static_cast<int>(axis_size) && ++count[axis[i]] == 1,
+          "Each element of Attribute axis should be a unique value "
+          "range from 0 to (dims - 1), "
+          "where the dims is the axis's size");
+    }
+
+    framework::DDim out_dims(x_dims);
+    for (size_t i = 0; i < axis_size; i++) {
+      out_dims[i] = x_dims[axis[i]];
+    }
+    ctx->SetOutputDim("Out", out_dims);
+  }
+};
+
+class TransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TransposeOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor, tensors with rank up to 6 are supported.");
+    AddOutput("Out", "(Tensor)The output tensor.");
+    AddAttr<std::vector<int>>(
+        "axis",
+        "(vector<int>) A list of values, and the size of the list should be "
+        "the same with the input tensor rank. This operator permutes the input "
+        "tensor's axes according to the values given.");
+    AddComment(R"DOC(
+Transpose Operator.
+
+The input tensor will be permuted according to the axes given.
+The behavior of this operator is similar to how `numpy.transpose` works.
+
+- Suppose the input `X` is a 2-D tensor:
+    $$
+    X = \begin{pmatrix}
+    0 &1 &2 \\
+    3 &4 &5
+    \end{pmatrix}$$
+
+    the given `axis` is $[1, 0]$, and $Y$ = transpose($X$, axis)
+
+    then the output $Y$ is:
+
+    $$
+    Y = \begin{pmatrix}
+         0 &3 \\
+         1 &4  \\
+         2 &5
+    \end{pmatrix}$$
+
+- Given an input tensor with shape $(N, C, H, W)$ and the `axis`
+$[0, 2, 3, 1]$, the shape of the output tensor will be $(N, H, W, C)$.
+
+)DOC");
+  }
+};
+
+class TransposeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad,
+            ops::TransposeOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    transpose, ops::TransposeKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    transpose_grad,
+    ops::TransposeGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc
new file mode 100644
index 0000000000..281c4468cc
--- /dev/null
+++ b/paddle/operators/transpose_op.cu.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/transpose_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    transpose,
+    ops::TransposeKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    transpose_grad,
+    ops::TransposeGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
new file mode 100644
index 0000000000..b9686a2db3
--- /dev/null
+++ b/paddle/operators/transpose_op.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+inline void TransCompute(const int dim, const DeviceContext& dev_ctx,
+                         const framework::Tensor& in, framework::Tensor* out,
+                         const std::vector<int>& axis) {
+  switch (dim) {
+    case 1:
+      math::Transpose<DeviceContext, T, 1> trans1;
+      trans1(dev_ctx, in, out, axis);
+      break;
+    case 2:
+      math::Transpose<DeviceContext, T, 2> trans2;
+      trans2(dev_ctx, in, out, axis);
+      break;
+    case 3:
+      math::Transpose<DeviceContext, T, 3> trans3;
+      trans3(dev_ctx, in, out, axis);
+      break;
+    case 4:
+      math::Transpose<DeviceContext, T, 4> trans4;
+      trans4(dev_ctx, in, out, axis);
+      break;
+    case 5:
+      math::Transpose<DeviceContext, T, 5> trans5;
+      trans5(dev_ctx, in, out, axis);
+      break;
+    case 6:
+      math::Transpose<DeviceContext, T, 6> trans6;
+      trans6(dev_ctx, in, out, axis);
+      break;
+    default:
+      PADDLE_THROW("Tensors with rank at most 6 are supported");
+  }
+}
+
+template <typename DeviceContext, typename T>
+class TransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(context.GetPlace());
+
+    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
+    int ndims = axis.size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *x, out, axis);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class TransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    if (!x_grad) return;
+
+    x_grad->mutable_data<T>(context.GetPlace());
+    std::vector<int> axis = context.Attr<std::vector<int>>("axis");
+    std::vector<int> reversed_axis(axis);
+
+    for (size_t i = 0; i < axis.size(); i++) {
+      reversed_axis[axis[i]] = i;
+    }
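+    // e.g. axis = {1, 2, 0} yields reversed_axis = {2, 0, 1}; transposing the
+    // output gradient by the inverse permutation recovers the input layout.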
+
+    int ndims = axis.size();
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    TransCompute<DeviceContext, T>(ndims, dev_ctx, *out_grad, x_grad,
+                                   reversed_axis);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
new file mode 100644
index 0000000000..3a314bdb9b
--- /dev/null
+++ b/paddle/operators/uniform_random_op.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+// It seems that Eigen::Tensor::random on the GPU will SEGFAULT.
+// Use std::random and thrust::random (thrust is a standard library shipped
+// with CUDA) to implement uniform random.
+template <typename T>
+class CPUUniformRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* tensor = ctx.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(ctx.GetPlace());
+    unsigned int seed = static_cast<unsigned int>(ctx.Attr<int>("seed"));
+    std::minstd_rand engine;
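+    // A seed of 0 requests a non-deterministic seed from the system.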
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(ctx.Attr<float>("min")),
+        static_cast<T>(ctx.Attr<float>("max")));
+    int64_t size = tensor->numel();
+    for (int64_t i = 0; i < size; ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+
+class UniformRandomOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UniformRandomOp should not be null.");
+
+    PADDLE_ENFORCE(
+        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
+        "uniform_random's min must less then max");
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    std::vector<int64_t> temp;
+    temp.reserve(shape.size());
+    for (auto dim : shape) {
+      temp.push_back(static_cast<int64_t>(dim));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(temp));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
+        ctx.GetPlace());
+  }
+};
+
+class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UniformRandomOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) The output tensor of uniform random op");
+    AddComment(R"DOC(
+Uniform random operator.
+
+This operator initializes a tensor with random values sampled from a 
+uniform distribution.
+
+)DOC");
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) The shape of the output tensor");
+    AddAttr<float>("min",
+                   "(float, default -1.0) "
+                   "Minimum value of uniform random")
+        .SetDefault(-1.0f);
+    AddAttr<float>("max",
+                   "(float, default 1.0) "
+                   "Maximun value of uniform random")
+        .SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "(int, default 0) "
+                 "Random seed used for generating samples. "
+                 "0 means use a seed generated by the system.")
+        .SetDefault(0);
+    AddAttr<int>("dtype", "(int, default 5(FP32)) Output tensor data type")
+        .SetDefault(framework::proto::DataType::FP32);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(uniform_random, paddle::operators::UniformRandomOp,
+                             paddle::operators::UniformRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(uniform_random,
+                       paddle::operators::CPUUniformRandomKernel<float>,
+                       paddle::operators::CPUUniformRandomKernel<double>);
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
new file mode 100644
index 0000000000..719d0872a7
--- /dev/null
+++ b/paddle/operators/uniform_random_op.cu
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct UniformGenerator {
+  T min_, max_;
+  unsigned int seed_;
+
+  __host__ __device__ UniformGenerator(T min, T max, int seed)
+      : min_(min), max_(max), seed_(seed) {}
+
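+  // Re-seeding and discarding n draws per element gives a deterministic,
+  // reproducible value for index n without sharing state across GPU threads.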
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+
+// It seems that Eigen::Tensor::random on the GPU will SEGFAULT.
+// Use std::random and thrust::random (thrust is a standard library shipped
+// with CUDA) to implement uniform random.
+template <typename T>
+class GPUUniformRandomKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>("Out");
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
+    }
+    T min = static_cast<T>(context.Attr<float>("min"));
+    T max = static_cast<T>(context.Attr<float>("max"));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    int64_t size = tensor->numel();
+    thrust::transform(index_sequence_begin, index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(uniform_random,
+                        paddle::operators::GPUUniformRandomKernel<float>,
+                        paddle::operators::GPUUniformRandomKernel<double>);
diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc
new file mode 100644
index 0000000000..50cee11a7a
--- /dev/null
+++ b/paddle/operators/unpool_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "(vector), the unpooling window size(height, width) "
+        "of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector defalut:{0,0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+Input shape is: $(N, C_{in}, H_{in}, W_{in})$, Output shape is:
+$(N, C_{out}, H_{out}, W_{out})$, where
+$$
+H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + ksize[0] \\
+W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+$$
+Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf
+)DOC");
+  }
+};
+
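+// Inverse of the pooling size formula:
+// output = (input - 1) * stride - 2 * padding + ksize.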
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
+  return output_size;
+}
+
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling intput must be of 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    unpool, ops::UnpoolKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnpoolKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad,
+    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc
new file mode 100644
index 0000000000..9b002e35c4
--- /dev/null
+++ b/paddle/operators/unpool_op.cu.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/unpool_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    unpool, ops::UnpoolKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnpoolKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    unpool_grad,
+    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::UnpoolGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h
new file mode 100644
index 0000000000..ee18b118c9
--- /dev/null
+++ b/paddle/operators/unpool_op.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+
+namespace paddle {
+namespace operators {
+template <typename DeviceContext, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    if (output_data) {
+      math::SetConstant<DeviceContext, T> set_zero;
+      set_zero(dev_ctx, out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<DeviceContext, T> unpool2d_max_forward;
+    unpool2d_max_forward(dev_ctx, *in_x, *in_y, out);
+  }
+};
+template <typename DeviceContext, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    auto& device_ctx = context.template device_context<DeviceContext>();
+    math::SetConstant<DeviceContext, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<DeviceContext, T> unpool2d_max_backward;
+    unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/warpctc_op.cc b/paddle/operators/warpctc_op.cc
new file mode 100644
index 0000000000..bd0c5f9957
--- /dev/null
+++ b/paddle/operators/warpctc_op.cc
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/warpctc_op.h"
+
+namespace paddle {
+namespace operators {
+
+class WarpCTCOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Label"),
+                   "Input(Label) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"),
+                   "Output(WarpCTCGrad) of WarpCTCOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
+                   "Output(Loss) of WarpCTCOp should not be null.");
+
+    auto logits_dims = ctx->GetInputDim("Logits");
+    int sequence_width =
+        static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
+    int blank = ctx->Attrs().Get<int>("blank");
+    PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
+                   "The value of Attr(blank) should be in interval [0, %d).",
+                   sequence_width);
+    // TODO(liuyiqun): it is tricky that a possibly wrong dimension is set
+    // here; the real first dimension of Loss is the number of sequences,
+    // which is only known at runtime.
+    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
+  }
+};
+
+class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "(LodTensor, default: LoDTensor<float>), the unscaled "
+             "probabilities of variable-length sequences, which is a 2-D "
+             "Tensor with LoD information. It's shape is "
+             "[Lp, num_classes + 1], where Lp is the sum of all input "
+             "sequences' length and num_classes is the true number of classes "
+             "(not including the blank label).");
+    AddInput("Label",
+             "(LodTensor, default: LoDTensor<int>), the ground truth "
+             "of variable-length sequence, which is a 2-D Tensor with LoD "
+             "information. It is of the shape [Lg, 1], where Lg is th sum of "
+             "all labels' length.");
+    AddOutput("WarpCTCGrad",
+              "(Tensor, default: Tensor<float>), a temporary "
+              "output Tensor to store the gradients of warp-ctc, which is "
+              "computed with loss together in one call. It is a 3-D Tensor of "
+              "the shape [max_sequence_length, batch_size, num_classes + 1].")
+        .AsIntermediate();
+    AddOutput("Loss",
+              "(Tensor, default: Tensor<float>), the Connectionist "
+              "Temporal Classification (CTC) loss, which is a 2-D Tensor of "
+              "the shape [batch_size, 1]");
+    AddAttr<int>("blank",
+                 "(int, default: 0), the blank label of Connectionist "
+                 "Temporal Classification (CTC) loss, which is in the "
+                 "half-opened interval [0, num_classes + 1).")
+        .SetDefault(0);
+    AddAttr<bool>("norm_by_times",
+                  "(bool, default: false), whether to "
+                  "normalize the gradients by the number of time-step, "
+                  "which is also the sequence's length.")
+        .SetDefault(false);
+    AddComment(R"DOC(
+An operator integrating the open-source
+[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
+[Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](
+https://arxiv.org/pdf/1512.02595v1.pdf),
+to compute Connectionist Temporal Classification (CTC) loss.
+It can be aliased as softmax with CTC, since a native softmax activation is
+integrated into the warp-ctc library to normalize the values in each row of the
+input tensor.
+
+More details of the CTC loss can be found by referring to
+[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with
+Recurrent Neural Networks](
+http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf).
+)DOC");
+  }
+};
+
+class WarpCTCGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"),
+                   "Input(WarpCTCGrad) of WarpCTCGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Output(Logits@GRAD) of WarpCTCGradOp should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("Logits"),
+                      ctx->GetInputDim("Logits"));
+    ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits"));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
+        ctx.device_context());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
+            ops::WarpCTCGradOp);
+REGISTER_OP_CPU_KERNEL(
+    warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
+REGISTER_OP_CPU_KERNEL(
+    warpctc_grad,
+    ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
diff --git a/paddle/operators/warpctc_op.cu.cc b/paddle/operators/warpctc_op.cu.cc
new file mode 100644
index 0000000000..7d8527ac75
--- /dev/null
+++ b/paddle/operators/warpctc_op.cu.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/warpctc_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
+REGISTER_OP_CUDA_KERNEL(
+    warpctc_grad,
+    ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
diff --git a/paddle/operators/warpctc_op.h b/paddle/operators/warpctc_op.h
new file mode 100644
index 0000000000..8aea061c00
--- /dev/null
+++ b/paddle/operators/warpctc_op.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence_padding.h"
+#include "paddle/operators/math/sequence_scale.h"
+#include "paddle/platform/dynload/warpctc.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename DeviceContext>
+class WarpCTCFunctor {
+ public:
+  /*
+   * \brief Compute the connectionist temporal classification loss,
+   *        and optionally compute the gradient with respect to the inputs.
+   *
+   * If gradient is nullptr, it only computes the CTC loss;
+   * otherwise it computes both the CTC loss and the gradient.
+   *
+   * \param ctx               execution context of this functor
+   * \param input             batch matrix of input probabilities, in
+   *                          max_sequence_length x num_sequences x
+   *                          sequence_width, (row-major) format
+   * \param gradient          batch matrix of gradient, with the same shape as
+   *                          input.
+   * \param cpu_labels        labels always in CPU memory.
+   * \param cpu_label_lengths length of all labels in CPU memory.
+   * \param cpu_input_lengths length of all sequences in CPU memory.
+   * \param sequence_width    number of possible output symbols.
+   * \param num_sequences     number of sequences.
+   * \param blank             blank label used in ctc loss function.
+   * \param cpu_loss          cost of each sequence in CPU memory.
+   */
+  void operator()(const framework::ExecutionContext& ctx, const float* input,
+                  float* gradient, const int* cpu_labels,
+                  const int* cpu_label_lengths, const int* cpu_input_lengths,
+                  const size_t sequence_width, const size_t num_sequences,
+                  const size_t blank, float* cpu_loss) {
+    // Init warp-ctc options
+    init(ctx, blank);
+
+    // Compute the required workspace size.
+    // There are no memory allocation operations within warp-ctc.
+    size_t workspace_bytes = 0;
+    ctcStatus_t status = platform::dynload::get_workspace_size(
+        cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
+        static_cast<int>(num_sequences), options_, &workspace_bytes);
+    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
+                      "warp-ctc [version %d] Error in get_workspace_size: ",
+                      warpctc_version_,
+                      platform::dynload::ctcGetStatusString(status));
+    PADDLE_ENFORCE_GT(workspace_bytes, 0UL,
+                      "Bytes of workspace got by warp-ctc function, "
+                      "get_workspace_size(), should be larger than 0.");
+
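+    // Allocate the workspace as a float tensor, rounding the byte count up
+    // to a whole number of float elements, and zero it before use.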
+    Tensor workspace;
+    size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
+    float* workspace_data = workspace.mutable_data<float>(
+        framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
+        ctx.GetPlace());
+    math::SetConstant<DeviceContext, float>()(
+        ctx.template device_context<DeviceContext>(), &workspace,
+        static_cast<float>(0));
+
+    // compute loss and gradient
+    status = platform::dynload::compute_ctc_loss(
+        input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
+        static_cast<int>(sequence_width), static_cast<int>(num_sequences),
+        cpu_loss, workspace_data, options_);
+    PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
+                      "warp-ctc [version %d] Error in compute_ctc_loss: ",
+                      warpctc_version_,
+                      platform::dynload::ctcGetStatusString(status));
+  }
+
+ protected:
+  void init(const framework::ExecutionContext& ctx, const size_t blank) {
+    warpctc_version_ = platform::dynload::get_warpctc_version();
+
+    if (platform::is_gpu_place(ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      options_.loc = CTC_GPU;
+      options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
+                            ctx.device_context())
+                            .stream();
+#else
+      PADDLE_THROW("[warpctc init] GPU is not enabled.");
+#endif
+    } else {
+      options_.loc = CTC_CPU;
+      options_.num_threads = 1;
+    }
+
+    options_.blank_label = blank;
+  }
+
+ private:
+  int warpctc_version_;
+  ctcOptions options_;
+};
+
+template <typename DeviceContext, typename T>
+class WarpCTCKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* logits = ctx.Input<LoDTensor>("Logits");
+    auto* label = ctx.Input<LoDTensor>("Label");
+    auto* warpctc_grad = ctx.Output<Tensor>("WarpCTCGrad");
+    auto* loss = ctx.Output<Tensor>("Loss");
+
+    const size_t level = 0;
+
+    auto logits_lod = framework::ToAbsOffset(logits->lod());
+    auto logits_dims = logits->dims();
+    PADDLE_ENFORCE_EQ(logits_dims[0],
+                      static_cast<int64_t>(logits_lod[level].back()),
+                      "The first dimension of Input(Logits) should be equal to "
+                      "the sum of all sequences' lengths.");
+
+    auto label_lod = framework::ToAbsOffset(label->lod());
+    auto label_dims = label->dims();
+    PADDLE_ENFORCE_EQ(
+        label_dims[0], label->numel(),
+        "The width of each timestep in Input(Label) should be 1.");
+
+    const size_t num_sequences = logits_lod[level].size() - 1;
+    PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
+                      "The number of sequences of Input(Logits) should be "
+                      "equal to that of Input(Label).");
+
+    const size_t sequence_width = logits->numel() / logits_dims[0];
+    auto loss_dims =
+        framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
+
+    // warp-ctc needs the sequence data stored in the transposed padding format
+    Tensor warpctc_logits;
+    const size_t max_sequence_length =
+        math::MaximumSequenceLength(logits_lod, level);
+    auto warpctc_logits_dims =
+        framework::make_ddim({static_cast<int64_t>(max_sequence_length),
+                              static_cast<int64_t>(num_sequences),
+                              static_cast<int64_t>(sequence_width)});
+    warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
+    math::PaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits, warpctc_logits,
+        false);
+    const T* warpctc_logits_data = warpctc_logits.data<T>();
+
+    std::vector<int> warpctc_label_lengths(num_sequences);
+    std::vector<int> warpctc_logits_lengths(num_sequences);
+
+    for (size_t i = 0; i < num_sequences; ++i) {
+      warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
+      warpctc_logits_lengths[i] =
+          logits_lod[level][i + 1] - logits_lod[level][i];
+    }
+
+    // warp-ctc computes the loss and the gradient in one call; the gradient
+    // data is also stored in the transposed padding (batch) format
+    T* warpctc_grad_data =
+        warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
+
+    math::SetConstant<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), warpctc_grad,
+        static_cast<T>(0));
+
+    // warpctc accesses labels in CPU memory
+    Tensor warpctc_label;
+    Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
+    const int* warpctc_label_data = warpctc_label.data<int>();
+    // warpctc stores loss in CPU memory
+    Tensor warpctc_loss;
+    T* warpctc_loss_data =
+        warpctc_loss.mutable_data<T>(loss_dims, platform::CPUPlace());
+
+    const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
+
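+    // One functor call computes both the per-sequence losses and the
+    // gradients w.r.t. the padded logits; warp-ctc writes the losses to the
+    // CPU buffer prepared above.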
+    WarpCTCFunctor<DeviceContext>()(
+        ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
+        warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
+        sequence_width, num_sequences, blank, warpctc_loss_data);
+
+    // Copy the loss back
+    Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
+  }
+};
+
+template <typename DeviceContext, typename T>
+class WarpCTCGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad");
+    auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
+    const Tensor* loss_grad = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+
+    logits_grad->mutable_data<T>(ctx.GetPlace());
+    bool norm_by_times = ctx.Attr<bool>("norm_by_times");
+    math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits_grad,
+        *warpctc_grad, norm_by_times);
+
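+    // Scale each sequence's logits gradient by the corresponding incoming
+    // loss gradient (the chain rule through the per-sequence loss).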
+    const T* loss_grad_data = loss_grad->data<T>();
+    math::ScaleLoDTensorFunctor<DeviceContext, T>()(
+        ctx.template device_context<DeviceContext>(), *logits_grad,
+        loss_grad_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
new file mode 100644
index 0000000000..2fdd25dbbe
--- /dev/null
+++ b/paddle/operators/while_op.cc
@@ -0,0 +1,345 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <vector>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/operators/detail/safe_ref.h"
+
+namespace paddle {
+namespace operators {
+
+using StepScopeVar = std::vector<framework::Scope *>;
+using LoDTensor = framework::LoDTensor;
+
+static constexpr char kStepBlock[] = "sub_block";
+static constexpr char kCondition[] = "Condition";
+static constexpr char kStepScopes[] = "StepScopes";
+static constexpr char kX[] = "X";
+static constexpr char kXGRAD[] = "X@GRAD";
+static constexpr char kOutputs[] = "Out";
+
+class WhileOp : public framework::OperatorBase {
+ public:
+  WhileOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    PADDLE_ENFORCE_NOT_NULL(scope.FindVar(Input(kCondition)));
+    auto &cond = scope.FindVar(Input(kCondition))->Get<LoDTensor>();
+    PADDLE_ENFORCE_EQ(cond.dims(), paddle::framework::make_ddim({1}));
+
+    framework::Executor executor(dev_place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+
+    auto *program = block->Program();
+
+    auto step_scopes =
+        scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
+
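+    // Re-evaluate Condition after every pass. Each iteration runs the step
+    // block in a fresh scope, and all scopes are recorded in StepScopes so
+    // that the backward pass can replay the steps in reverse order.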
+    while (cond.data<bool>()[0]) {
+      auto &current_scope = scope.NewScope();
+      step_scopes->push_back(&current_scope);
+
+      executor.Run(*program, &current_scope, block->ID(),
+                   false /*create_local_scope*/);
+    }
+  }
+};
+
+class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(kX,
+             "A set of variables, which are required by operators inside the "
+             "block of While Op.")
+        .AsDuplicable();
+    AddInput(
+        kCondition,
+        "(Bool) An scalar. When it's False, the While Op will be terminated.")
+        .AsDuplicable();
+    AddOutput(kOutputs,
+              "A set of variables, which will be assigned with values "
+              "generated by the operators inside the block of While Op.")
+        .AsDuplicable();
+    AddOutput(kStepScopes,
+              "(StepScopeVar) A vector of local scope, which size equals the "
+              "step number of While Op. The i'th scope storages temporary "
+              "variables generated in the i'th step.");
+    AddAttr<framework::BlockDesc *>(kStepBlock,
+                                    "The step block inside WhileOp");
+    AddComment(R"DOC(
+)DOC");
+  }
+};
+
+class WhileGradOp : public framework::OperatorBase {
+ public:
+  WhileGradOp(const std::string &type, const framework::VariableNameMap &inputs,
+              const framework::VariableNameMap &outputs,
+              const framework::AttributeMap &attrs)
+      : framework::OperatorBase(type, inputs, outputs, attrs) {}
+
+  void Run(const framework::Scope &scope,
+           const platform::Place &dev_place) const override {
+    framework::Executor executor(dev_place);
+    auto *block = Attr<framework::BlockDesc *>(kStepBlock);
+    auto *program = block->Program();
+
+    auto *step_scopes =
+        scope.FindVar(Input(kStepScopes))->GetMutable<StepScopeVar>();
+
+    auto outside_og_names = Inputs(framework::GradVarName(kOutputs));
+    auto inside_og_names =
+        Attr<std::vector<std::string>>("original_output_grad");
+
+    PADDLE_ENFORCE_EQ(outside_og_names.size(), inside_og_names.size());
+
+    for (auto cur_scope_iter = step_scopes->rbegin();
+         cur_scope_iter != step_scopes->rend(); ++cur_scope_iter) {
+      VLOG(3) << "Start backward at time_step "
+              << cur_scope_iter - step_scopes->rbegin();
+      framework::Scope &cur_scope = **cur_scope_iter;
+      // Link OG from outside to inside
+      for (size_t i = 0; i < outside_og_names.size(); ++i) {
+        auto outside_og_name = outside_og_names[i];
+        auto inside_og_name = inside_og_names[i];
+        VLOG(8) << "Linking outside " << outside_og_name << " --> inside "
+                << inside_og_name;
+        auto &og_outside =
+            detail::Ref(scope.FindVar(outside_og_name),
+                        "Cannot find Outside Gradient %s", outside_og_name);
+        auto &og_inside =
+            detail::Ref(cur_scope.Var(inside_og_name),
+                        "Cannot find inside gradient %s", inside_og_name);
+        if (og_outside.Type().hash_code() ==
+            typeid(framework::LoDTensor).hash_code()) {
+          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
+          auto &inside_tensor =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensor>());
+          inside_tensor.set_lod(outside_tensor.lod());
+          inside_tensor.ShareDataWith(outside_tensor);
+        } else if (og_outside.Type().hash_code() ==
+                   typeid(framework::LoDTensorArray).hash_code()) {
+          auto &outside_array = og_outside.Get<framework::LoDTensorArray>();
+          auto &inside_array =
+              detail::Ref(og_inside.GetMutable<framework::LoDTensorArray>());
+          VLOG(8) << outside_og_name << " size = " << outside_array.size();
+          inside_array.resize(outside_array.size());
+
+          for (size_t j = 0; j < inside_array.size(); ++j) {
+            VLOG(8) << j << " " << outside_array[j].numel();
+            if (outside_array[j].numel() != 0) {
+              inside_array[j].set_lod(outside_array[j].lod());
+              inside_array[j].ShareDataWith(outside_array[j]);
+            } else {
+              PADDLE_ENFORCE_EQ(inside_array[j].numel(), 0);
+            }
+          }
+        }
+      }
+
+      executor.Run(*program, *cur_scope_iter, block->ID(), false);
+
+      auto &pg_names = Outputs(kXGRAD);
+      auto &p_names = Inputs(kX);
+      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
+      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
+        if (pg_names[param_id] == framework::kEmptyVarName) {
+          continue;  // parameter doesn't have gradient
+        }
+        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
+
+        // TODO(tonyyang-svail): Not sure we need the following.
+        // If the gradient of a variable is not computed inside the RNN,
+        // just continue:
+        //   if (local_var_names.find(inside_grad_name) ==
+        //       local_var_names.end()) {
+        //     continue;
+        //   }
+
+        // Zero the gradient variable at the first backward step (i.e. the
+        // last time step), before any accumulation happens.
+        if (cur_scope_iter == step_scopes->rbegin()) {
+          auto *var = (*cur_scope_iter)->FindVar(inside_grad_name);
+          PADDLE_ENFORCE_NOT_NULL(var, "Can not find var %s", inside_grad_name);
+          if (var->IsType<LoDTensor>()) {
+            auto &inside_tensor = var->Get<framework::LoDTensor>();
+            framework::AttributeMap attrs;
+            attrs["dtype"] = framework::ToDataType(inside_tensor.type());
+            attrs["shape"] = framework::vectorize2int(inside_tensor.dims());
+            attrs["value"] = 0.0f;
+
+            auto var_name = pg_names[param_id];
+            auto zero_op = framework::OpRegistry::CreateOp(
+                "fill_constant", framework::VariableNameMap{},
+                {{"Out", {var_name}}}, attrs);
+            zero_op->Run(scope, dev_place);
+            scope.FindVar(var_name)
+                ->GetMutable<framework::LoDTensor>()
+                ->set_lod(inside_tensor.lod());
+          }
+        }
+
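+        // Accumulate this step's gradient into the outside gradient buffer:
+        // temporarily rename the inside gradient, sum it with the running
+        // total via a sum op, then restore the original name.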
+        auto new_inside_name = cur_scope.Rename(inside_grad_name);
+        auto sum_op = framework::OpRegistry::CreateOp(
+            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
+            {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{});
+        sum_op->Run(cur_scope, dev_place);
+        cur_scope.Rename(new_inside_name, inside_grad_name);
+      }
+    }
+  }
+};
+
+class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto *while_grad = new framework::OpDesc();
+    while_grad->SetType("while_grad");
+    while_grad->SetInput(kX, Input(kX));
+    while_grad->SetInput(kOutputs, Output(kOutputs));
+    while_grad->SetInput(kStepScopes, Output(kStepScopes));
+
+    auto *grad_block = this->grad_block_[0];
+    auto *fwd_block = grad_block->ParentBlock();
+
+    // Not all input gradients (IGs) will be generated by the inner gradient
+    // operators of the while op. Ignore IGs that are not generated by the
+    // inside block.
+    std::unordered_set<std::string> inner_op_outputs;
+    for (const auto *op : grad_block->AllOps()) {
+      for (auto &oname : op->OutputArgumentNames()) {
+        inner_op_outputs.insert(oname);
+      }
+    }
+    auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
+    for (auto &each_ig : igs) {
+      if (inner_op_outputs.find(each_ig) == inner_op_outputs.end()) {
+        VLOG(8) << "Ignore " << each_ig;
+        each_ig = framework::kEmptyVarName;
+      }
+    }
+    while_grad->SetOutput(framework::GradVarName(kX), igs);
+
+    // The output gradients (OGs) should be re-calculated by the step blocks,
+    // since gradients are not needed for many outputs of the while op.
+    std::unordered_set<std::string> block_ins;
+    block_ins.reserve(Input(kX).size() + Output(kOutputs).size());
+    for (auto &p : Input(kX)) {
+      block_ins.insert(p);
+    }
+    for (auto &o : Output(kOutputs)) {
+      block_ins.insert(o);
+    }
+    std::unordered_set<std::string> extra_inputs;
+    for (const auto *op : grad_block->AllOps()) {
+      for (auto &input_name : op->InputArgumentNames()) {
+        // If the input of Op has been recorded or is generated by the forward
+        // block, do not add it as an input again.
+        if (block_ins.find(input_name) != block_ins.end() ||
+            fwd_block->FindVar(input_name) != nullptr) {
+          continue;
+        }
+        extra_inputs.insert(input_name);
+      }
+      for (auto &output_name : op->OutputArgumentNames()) {
+        block_ins.insert(output_name);
+      }
+    }
+
+    std::vector<std::string> extra_inputs_list;
+    extra_inputs_list.resize(extra_inputs.size());
+    std::copy(extra_inputs.begin(), extra_inputs.end(),
+              extra_inputs_list.begin());
+    while_grad->SetInput(framework::GradVarName(kOutputs), extra_inputs_list);
+
+    while_grad->SetAttrMap(this->Attrs());
+    while_grad->SetBlockAttr(kStepBlock, *grad_block);
+    // Record the original output gradient names, since the gradient names
+    // inside the while operator could be renamed.
+    while_grad->SetAttr("original_output_grad", extra_inputs_list);
+
+    return std::unique_ptr<framework::OpDesc>(while_grad);
+  }
+};
+
+class WhileGradOpVarTypeInference : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc &op_desc,
+                  framework::BlockDesc *block) const override {
+    auto p_names = op_desc.Input(kX);
+    auto pg_names = op_desc.Output(framework::GradVarName(kX));
+
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
+      auto *g_var = block->FindVarRecursive(pg_names[i]);
+      if (g_var != nullptr) {  // Gradient could be @EMPTY@
+        VLOG(5) << "Setting " << pg_names[i] << " following " << p_names[i]
+                << " type: " << p_var.GetType();
+        g_var->SetType(p_var.GetType());
+        g_var->SetDataType(p_var.GetDataType());
+      }
+    }
+  }
+};
+
+class WhileGradOpShapeInference : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    ctx->HasInputs(kX);
+    ctx->HasOutputs(framework::GradVarName(kX));
+    ctx->HasInputs(kOutputs);
+    ctx->HasInputs(framework::GradVarName(kOutputs));
+
+    auto p_names = ctx->Inputs(kX);
+    auto pg_names = ctx->Outputs(kXGRAD);
+    auto var_types = ctx->GetInputsVarType(kX);
+    std::vector<std::string> names_to_set;
+    std::vector<framework::DDim> dims_to_set;
+    for (size_t i = 0; i < p_names.size(); ++i) {
+      if (pg_names[i] == framework::kEmptyVarName) {
+        continue;
+      }
+      auto dims = ctx->GetInputsElementDim(kX, i);
+      if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
+        names_to_set.push_back(pg_names[i]);
+        dims_to_set.push_back(dims);
+      } else if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR_ARRAY) {
+        // not sure how to set the dim of LOD_TENSOR_ARRAY
+        names_to_set.push_back(pg_names[i]);
+        dims_to_set.push_back(dims);
+      }
+    }
+    ctx->SetDims(names_to_set, dims_to_set);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(while, paddle::operators::WhileOp,
+                  paddle::operators::WhileOpMaker,
+                  paddle::operators::WhileGradOpDescMaker);
+REGISTER_OPERATOR(while_grad, paddle::operators::WhileGradOp,
+                  paddle::operators::WhileGradOpShapeInference,
+                  paddle::operators::WhileGradOpVarTypeInference);
diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt
new file mode 100644
index 0000000000..25fc35311f
--- /dev/null
+++ b/paddle/optimizer/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(OPTIMIZER_SRCS
+    adadelta_optimizer.cc
+    adagrad_optimizer.cc
+    adam_optimizer.cc
+    optimizer.cc
+    parameter_optimizer.cc
+    sgd_optimizer.cc
+  )
+
+cc_library(paddle_optimizer STATIC SRCS ${OPTIMIZER_SRCS} DEPS paddle_proto glog)
+cc_test(serialization_test SRCS serialization_test.cc DEPS paddle_proto)
+cc_test(parameter_optimizer_test SRCS parameter_optimizer_test.cc DEPS paddle_optimizer)
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
new file mode 100644
index 0000000000..8ca048257e
--- /dev/null
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "adadelta_optimizer.h"
+#include <algorithm>
+#include <cmath>
+
+namespace paddle {
+namespace optimizer {
+
+void AdadeltaOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor& param = *parameter_;
+  const Tensor& grad = *gradient;
+  Tensor& accum_g = *accum_gradient_;
+  Tensor& accum_d = *accum_delta_;
+  Tensor& update_d = *update_delta_;
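+  // Per-element Adadelta update (Zeiler, 2012): accum_g tracks E[g^2],
+  // accum_d tracks E[dx^2], and the step is RMS[dx] / RMS[g] * g.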
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];
+
+    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
+                  std::sqrt(accum_g[i] + epsilon_) * grad[i];
+
+    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];
+
+    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
+  }
+}
+
+std::string AdadeltaOptimizer::SerializeState() {
+  AdadeltaOptimizerState state;
+  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState();
+  state.mutable_lr_state()->ParseFromString(lr_str);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  TensorToProto(*accum_delta_, state.mutable_accum_delta());
+  TensorToProto(*update_delta_, state.mutable_update_delta());
+  return state.SerializeAsString();
+}
+
+void AdadeltaOptimizer::DeserializeState(const std::string& str) {
+  AdadeltaOptimizerState state;
+  state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+  ProtoToTensor(state.accum_delta(), accum_delta_);
+  ProtoToTensor(state.update_delta(), update_delta_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
new file mode 100644
index 0000000000..48f1ae1750
--- /dev/null
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdadeltaOptimizer : public ParameterOptimizer {
+public:
+  AdadeltaOptimizer(
+      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        accum_delta_(new Tensor(parameter->size())),
+        update_delta_(new Tensor(parameter->size())),
+        rho_(rho),
+        epsilon_(epsilon),
+        decay_(decay) {}
+
+  ~AdadeltaOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+    if (accum_delta_) delete accum_delta_;
+    if (update_delta_) delete update_delta_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *accum_gradient_;
+  Tensor *accum_delta_;
+  Tensor *update_delta_;
+  double rho_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
new file mode 100644
index 0000000000..c6d39a366a
--- /dev/null
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cmath>
+
+#include "adagrad_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+void AdagradOptimizer::Update(const Tensor* gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  Tensor& param = *parameter_;
+  Tensor& accum_g = *accum_gradient_;
+  const Tensor& grad = *gradient;
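+  // Adagrad: accumulate the squared gradients and scale the step by
+  // 1 / sqrt(accum + epsilon). The step is subtracted, following the descent
+  // convention of the sibling optimizers (sgd, adadelta, adam).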
+  for (size_t i = 0; i < param.size(); ++i) {
+    accum_g[i] += grad[i] * grad[i];
+    param[i] -= learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
+                learning_rate * decay_ * param[i];
+  }
+}
+std::string AdagradOptimizer::SerializeState() {
+  AdagradOptimizerState state;
+  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState();
+  state.mutable_lr_state()->ParseFromString(lr_str);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
+  return state.SerializeAsString();
+}
+
+void AdagradOptimizer::DeserializeState(const std::string& str) {
+  AdagradOptimizerState state;
+  state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.accum_gradient(), accum_gradient_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
new file mode 100644
index 0000000000..b0cff061f5
--- /dev/null
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdagradOptimizer : public ParameterOptimizer {
+public:
+  AdagradOptimizer(Tensor *parameter,
+                   LrPolicy *lr,
+                   double epsilon,
+                   double decay)
+      : ParameterOptimizer(parameter, lr),
+        accum_gradient_(new Tensor(parameter->size())),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdagradOptimizer() {
+    if (accum_gradient_) delete accum_gradient_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *accum_gradient_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
new file mode 100644
index 0000000000..8a384b59c4
--- /dev/null
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "adam_optimizer.h"
+#include <cmath>
+
+namespace paddle {
+namespace optimizer {
+
+void AdamOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
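+  // Fold Adam's bias corrections (1 - beta_1^t and 1 - beta_2^t) into the
+  // learning rate instead of correcting m and v individually.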
+  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
+  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
+  learning_rate *= std::sqrt(coef2) / coef1;
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  Tensor &v = *velocitys_;
+  for (size_t i = 0; i < param.size(); ++i) {
+    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
+    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
+    param[i] -=
+        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
+  }
+}
+
+std::string AdamOptimizer::SerializeState() {
+  AdamOptimizerState state;
+  std::string lr_str = this->lr_policy_->SerializeState();
+  state.mutable_lr_state()->ParseFromString(lr_str);
+  state.set_num_sample_passed(num_sample_passed_);
+
+  TensorToProto(*parameter_, state.mutable_parameter());
+  TensorToProto(*momentums_, state.mutable_momentums());
+  TensorToProto(*velocitys_, state.mutable_velocitys());
+  return state.SerializeAsString();
+}
+
+void AdamOptimizer::DeserializeState(const std::string &str) {
+  AdamOptimizerState state;
+  state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+  num_sample_passed_ = state.num_sample_passed();
+
+  ProtoToTensor(state.parameter(), parameter_);
+  ProtoToTensor(state.momentums(), momentums_);
+  ProtoToTensor(state.velocitys(), velocitys_);
+}
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
new file mode 100644
index 0000000000..7df40064df
--- /dev/null
+++ b/paddle/optimizer/adam_optimizer.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class AdamOptimizer : public ParameterOptimizer {
+public:
+  AdamOptimizer(Tensor *parameter,
+                LrPolicy *lr,
+                double beta_1,
+                double beta_2,
+                double epsilon,
+                double decay)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(new Tensor(parameter->size())),
+        velocitys_(new Tensor(parameter->size())),
+        beta_1_(beta_1),
+        beta_2_(beta_2),
+        epsilon_(epsilon),
+        decay_(decay) {}
+  ~AdamOptimizer() {
+    if (momentums_) delete momentums_;
+    if (velocitys_) delete velocitys_;
+  }
+  void Update(const Tensor *gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string &state);
+
+private:
+  Tensor *momentums_;
+  Tensor *velocitys_;
+  double beta_1_;
+  double beta_2_;
+  double epsilon_;
+  double decay_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
new file mode 100644
index 0000000000..9a44a776f2
--- /dev/null
+++ b/paddle/optimizer/lr_policy.h
@@ -0,0 +1,82 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include <algorithm>
+#include "OptimizerConfig.pb.h"
+
+namespace paddle {
+namespace optimizer {
+
+class LrPolicy {
+public:
+  virtual ~LrPolicy() {}
+  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
+  virtual std::string SerializeState() = 0;
+  virtual void DeserializeState(const std::string &state) = 0;
+};
+
+// constant learning rate policy
+class ConstLr final : public LrPolicy {
+public:
+  ConstLr(double lr) : learning_rate_(lr) {}
+  double LearningRate(const uint64_t num_sample_passed) {
+    return learning_rate_;
+  }
+  std::string SerializeState() {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    return state.SerializeAsString();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+  }
+
+private:
+  double learning_rate_;
+};
+
+class LinearLr final : public LrPolicy {
+public:
+  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
+      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
+  double LearningRate(const uint64_t num_sample_passed) {
+    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
+                    lr_decay_b_);
+  }
+  std::string SerializeState() {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    state.set_lr_decay_a(lr_decay_a_);
+    state.set_lr_decay_b(lr_decay_b_);
+    return state.SerializeAsString();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+    lr_decay_a_ = state.lr_decay_a();
+    lr_decay_b_ = state.lr_decay_b();
+  }
+
+private:
+  double learning_rate_;
+  double lr_decay_a_;
+  double lr_decay_b_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
new file mode 100644
index 0000000000..3af4448436
--- /dev/null
+++ b/paddle/optimizer/optimizer.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "optimizer.h"
+#include <glog/logging.h>
+#include <cstdlib>
+#include <cstring>
+#include <string>
+
+#include "parameter_optimizer.h"
+
+using paddle::optimizer::ParameterOptimizer;
+using paddle::optimizer::Tensor;
+
+template <paddle_element_type VALUE>
+struct EnumToType {};
+
+template <class T>
+struct TypeToEnum {};
+
+#define MATCH_ENUM_TYPE(TYPE, ENUM)                 \
+  template <>                                       \
+  struct TypeToEnum<TYPE> {                         \
+    static paddle_element_type v() { return ENUM; } \
+    static constexpr TYPE value = ENUM;             \
+  };                                                \
+  template <>                                       \
+  struct EnumToType<ENUM> {                         \
+    typedef TYPE Type;                              \
+  }
+
+MATCH_ENUM_TYPE(int32_t, PADDLE_ELEMENT_TYPE_INT32);
+MATCH_ENUM_TYPE(uint32_t, PADDLE_ELEMENT_TYPE_UINT32);
+MATCH_ENUM_TYPE(int64_t, PADDLE_ELEMENT_TYPE_INT64);
+MATCH_ENUM_TYPE(uint64_t, PADDLE_ELEMENT_TYPE_UINT64);
+MATCH_ENUM_TYPE(float, PADDLE_ELEMENT_TYPE_FLOAT32);
+MATCH_ENUM_TYPE(double, PADDLE_ELEMENT_TYPE_FLOAT64);
+
+struct paddle_optimizer {
+  paddle::optimizer::ParameterOptimizer* impl;
+};
+
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len) {
+  paddle_optimizer* optimizer = new paddle_optimizer;
+  std::string config(config_proto, config_proto + config_proto_len);
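+  // The Tensor wraps the caller-provided buffer without copying it. The
+  // optimizer takes ownership of the Tensor object (deleted in the
+  // ParameterOptimizer destructor), not of the underlying buffer.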
+  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
+                                 num_bytes / sizeof(float));
+  optimizer->impl = ParameterOptimizer::Create(config, parameter);
+  if (state != nullptr) {
+    std::string s(state, state + state_len);
+    optimizer->impl->DeserializeState(s);
+  }
+  return optimizer;
+}
+
+int paddle_release_optimizer(paddle_optimizer* o) {
+  if (o != nullptr) {
+    delete o->impl;
+    // Also free the wrapper struct itself, which was allocated in
+    // paddle_create_optimizer.
+    delete o;
+  }
+  return PADDLE_SUCCESS;
+}
+
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* grad_buffer,
+                            int num_bytes) {
+  // TODO(zhihong): data_type is not honored yet; the buffer is always
+  // interpreted as float. Need to handle the runtime data type.
+  auto grad_type = reinterpret_cast<const float*>(grad_buffer);
+  Tensor* gradient =
+      new Tensor(const_cast<float*>(grad_type), num_bytes / sizeof(float));
+  o->impl->Update(gradient);
+  return PADDLE_SUCCESS;
+}
+
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
+  int param_size = 0;
+  *param_buffer = (void*)o->impl->get_weight(&param_size);
+  return param_size;
+}
+
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
+  std::string s = o->impl->SerializeState();
+  int state_len = s.size();
+
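+  // The state buffer is heap-allocated here; the caller is responsible for
+  // freeing it.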
+  if (state_len > 0) {
+    *state = (char*)std::malloc(state_len);
+    std::memcpy((void*)*state, (const void*)s.c_str(), state_len);
+  }
+
+  return state_len;
+}
diff --git a/paddle/optimizer/optimizer.h b/paddle/optimizer/optimizer.h
new file mode 100644
index 0000000000..516e612167
--- /dev/null
+++ b/paddle/optimizer/optimizer.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stdbool.h>
+#include <stdint.h>
+
+/**
+ * @brief An optimizer library independent of the other modules,
+ * which will be used in:
+ * Case A, the gradient is optimized locally on the trainer.
+ *
+ * Case B, the gradient is optimized on the parameter server.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32 = 0,
+  PADDLE_ELEMENT_TYPE_UINT32 = 1,
+  PADDLE_ELEMENT_TYPE_INT64 = 2,
+  PADDLE_ELEMENT_TYPE_UINT64 = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+/**
+ * @brief execution status code
+ */
+const int32_t PADDLE_SUCCESS = 0;
+const int32_t PADDLE_ERROR = -1;
+
+typedef struct paddle_optimizer paddle_optimizer;
+/**
+ * This group of interfaces should be called in the following order:
+ * 1. create optimizer with config
+ * 2. set weights
+ * 3. update_parameter
+ * 4. get_weights
+ * 5. release optimizer
+ */
+
+/**
+ *  @brief create optimizer with proto_config
+ *  @param config_proto, optimizer protobuf; see OptimizerConfig.proto for details
+ *  @return the optimizer instance
+ */
+paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
+                                          const int config_proto_len,
+                                          const paddle_element_type data_type,
+                                          void* param_buffer,
+                                          int num_bytes,
+                                          const char* state,
+                                          const int state_len);
+
+/**
+ *  @brief release optimizer
+ *  @param optimizer
+ *  @return the execution status
+ */
+int paddle_release_optimizer(paddle_optimizer* o);
+
+/**
+ *  @brief update the parameters with a gradient
+ *  @param data_type, data type of the gradient and the parameter
+ *  @param gradient, calculated by the caller of the optimizer.
+ *       TODO(zhihong): just pass the loss to reduce communication overhead;
+ *                      see the Project Adam (OSDI '14) paper for details
+ *  @param num_bytes, gradient size
+ *  @return the execution status
+ */
+int paddle_update_parameter(paddle_optimizer* o,
+                            const paddle_element_type data_type,
+                            const void* gradient,
+                            int num_bytes);
+
+/**
+ *  @brief get the parameter buffer from the optimizer
+ *  @param param_buffer, receives the initialized parameter buffer
+ *  @return the number of parameter elements
+ */
+int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer);
+
+/**
+ *  @brief save the optimizer's training state
+ *  @param state, buffer that receives the serialized training state
+ *  @return the length of the state buffer
+ */
+int paddle_optimizer_get_state(paddle_optimizer* o, const char** state);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
new file mode 100644
index 0000000000..1603e5fdc8
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+#include "adadelta_optimizer.h"
+#include "adagrad_optimizer.h"
+#include "adam_optimizer.h"
+#include "lr_policy.h"
+#include "sgd_optimizer.h"
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
+                                               Tensor *parameter) {
+  paddle::OptimizerConfig config;
+  CHECK(config.ParseFromString(config_proto) == true)
+      << "failed parse optimizer config";
+  auto select_lr_policy = [=](const OptimizerConfig &config) -> LrPolicy * {
+    if (config.lr_policy() == OptimizerConfig::Const)
+      return new ConstLr(config.const_lr().learning_rate());
+    if (config.lr_policy() == OptimizerConfig::Linear)
+      return new LinearLr(config.linear_lr().learning_rate(),
+                          config.linear_lr().lr_decay_a(),
+                          config.linear_lr().lr_decay_b());
+    // default
+    LOG(WARNING) << " have not select any LrPolicy. use ConstLr in default";
+    return new ConstLr(0.1);
+  };
+
+  LrPolicy *lr = select_lr_policy(config);
+  auto select_optimizer = [=](
+      Tensor *parameter,
+      const OptimizerConfig &config) -> ParameterOptimizer * {
+    if (config.optimizer() == OptimizerConfig::SGD) {
+      LOG(INFO) << "creating SGD optimizer";
+      return new SGDOptimizer(parameter,
+                              lr,
+                              config.sgd().momentum(),
+                              config.sgd().decay(),
+                              config.sgd().nesterov());
+    }
+    if (config.optimizer() == OptimizerConfig::Adadelta) {
+      LOG(INFO) << "creating Adadelta optimizer";
+      return new AdadeltaOptimizer(parameter,
+                                   lr,
+                                   config.adadelta().rho(),
+                                   config.adadelta().epsilon(),
+                                   config.adadelta().decay());
+    }
+    if (config.optimizer() == OptimizerConfig::Adagrad) {
+      LOG(INFO) << "creating Adagrad optimizer";
+      return new AdagradOptimizer(
+          parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
+    }
+    if (config.optimizer() == OptimizerConfig::Adam) {
+      LOG(INFO) << "creating Adam optimizer";
+      return new AdamOptimizer(parameter,
+                               lr,
+                               config.adam().beta_1(),
+                               config.adam().beta_2(),
+                               config.adam().epsilon(),
+                               config.adam().decay());
+    }
+    // default
+    LOG(WARNING)
+        << "have not select any Optimizer. use SGDOptimizer in default";
+    return new SGDOptimizer(parameter, lr, 0.0, 0.0, false);
+  };
+  return select_optimizer(parameter, config);
+}
+
+float *ParameterOptimizer::get_weight(int *param_size) const {
+  *param_size = (int)parameter_->size();
+  return parameter_->get_buffer();
+}
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
new file mode 100644
index 0000000000..1f501c49e1
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <functional>
+#include <string>
+#include "OptimizerConfig.pb.h"
+#include "lr_policy.h"
+#include "serialization.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+class ParameterOptimizer {
+public:
+  /**
+   * @brief  update hook for algorithms that need to traverse the parameter
+   * more than once.
+   */
+  ParameterOptimizer(Tensor *parameter, LrPolicy *lr)
+      : parameter_(parameter), lr_policy_(lr), num_sample_passed_(0) {}
+  virtual ~ParameterOptimizer() {
+    delete parameter_;
+    delete lr_policy_;
+  }
+
+  static ParameterOptimizer *Create(const std::string &config_proto,
+                                    Tensor *parameter);
+  virtual void Update(const Tensor *gradient) = 0;
+  virtual float *get_weight(int *param_size) const;
+  virtual std::string SerializeState() = 0;
+  virtual void DeserializeState(const std::string &state) = 0;
+
+protected:
+  Tensor *parameter_;
+  // learning rate policy
+  LrPolicy *lr_policy_;
+  uint64_t num_sample_passed_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
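The class above is the abstract base every concrete optimizer implements. A minimal hypothetical subclass (illustration only, not part of the patch) showing the virtual interface together with the inherited lr_policy_/num_sample_passed_ bookkeeping:

```cpp
#include "parameter_optimizer.h"

// Hypothetical subclass: plain gradient descent, using the inherited
// parameter_/lr_policy_/num_sample_passed_ members and the
// LrPolicy::LearningRate() call seen in sgd_optimizer.cc.
class NaiveOptimizer : public paddle::optimizer::ParameterOptimizer {
public:
  NaiveOptimizer(paddle::optimizer::Tensor *p, paddle::optimizer::LrPolicy *lr)
      : ParameterOptimizer(p, lr) {}

  void Update(const paddle::optimizer::Tensor *gradient) override {
    num_sample_passed_ += 1;
    double lr = lr_policy_->LearningRate(num_sample_passed_);
    for (size_t i = 0; i < parameter_->size(); ++i)
      (*parameter_)[i] -= lr * (*gradient)[i];
  }

  std::string SerializeState() override { return std::string(); }  // stub
  void DeserializeState(const std::string &) override {}           // stub
};
```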
diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc
new file mode 100644
index 0000000000..2bcfca55cc
--- /dev/null
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -0,0 +1,127 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "parameter_optimizer.h"
+#include <cmath>
+#include <map>
+#include <vector>
+#include "gtest/gtest.h"
+#include "lr_policy.h"
+
+paddle::optimizer::Tensor* FillTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
+  for (size_t i = 0; i < p.size(); ++i) {
+    p[i] = (float)rand() / (float)RAND_MAX;
+  }
+  return param;
+}
+
+paddle::optimizer::Tensor* FixedTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
+  for (size_t i = 0; i < p.size(); ++i) {
+    p[i] = i;
+  }
+  return param;
+}
+
+class OptimizerTest : public testing::Test {
+public:
+  virtual ~OptimizerTest() {}
+  // size of the paddle::optimizer::Tensor instances used in these tests
+  const size_t kSize = 5;
+
+  virtual void SetUp() {
+    CreateSGD();
+    CreateAdam();
+  }
+  virtual void TearDown() {}
+
+  void CreateSGD() {
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::SGD);
+    config_.mutable_sgd()->set_momentum(0.0);
+    config_.mutable_sgd()->set_decay(0.0);
+    config_.mutable_sgd()->set_nesterov(false);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  void CreateAdam() {
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::Adam);
+    config_.mutable_adam()->set_beta_1(0.9);
+    config_.mutable_adam()->set_beta_2(0.1);
+    config_.mutable_adam()->set_epsilon(1e-3);
+    config_.mutable_adam()->set_decay(0.0);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
+    config_.mutable_const_lr()->set_learning_rate(0.1);
+    std::string str = config_.SerializeAsString();
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
+    opts_.push_back(opt);
+  }
+
+  void TestGetWeight() {
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+  }
+
+  void TestUpdate() {
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      opts_[i]->Update(g);
+    }
+  }
+
+  void TestCheckPoint() {
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
+    for (size_t i = 0; i < opts_.size(); ++i) {
+      auto state = opts_[i]->SerializeState();
+      opts_[i]->DeserializeState(state);
+      auto state1 = opts_[i]->SerializeState();
+      opts_[i]->DeserializeState(state);
+      EXPECT_EQ(state, state1);
+
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(static_cast<size_t>(s), kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
+    }
+  }
+
+private:
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
+  paddle::OptimizerConfig config_;
+};
+
+TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
+
+TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
+
+TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
diff --git a/paddle/optimizer/serialization.h b/paddle/optimizer/serialization.h
new file mode 100644
index 0000000000..98548ddb7a
--- /dev/null
+++ b/paddle/optimizer/serialization.h
@@ -0,0 +1,49 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include "OptimizerConfig.pb.h"
+#include "paddle/utils/Logging.h"
+#include "tensor.h"
+
+namespace paddle {
+namespace optimizer {
+
+static void TensorToProto(const Tensor& tensor, TensorProto* proto) {
+  proto->set_data_type(TensorProto::PADDLE_ELEMENT_TYPE_FLOAT32);
+  std::stringstream os;
+  for (size_t i = 0; i < tensor.size(); ++i) {
+    os << tensor[i];
+    proto->add_content(os.str());
+    os.str(std::string());
+  }
+}
+
+static void ProtoToTensor(const TensorProto& proto, Tensor* tensor) {
+  std::stringstream sin;
+  for (auto i = 0; i < proto.content_size(); ++i) {
+    sin << proto.content(i);
+    sin >> (*tensor)[i];
+    sin.str(std::string());
+    sin.clear();
+  }
+}
+
+}  // namespace optimizer
+}  // namespace paddle
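Note that TensorToProto streams each float through operator<<, which defaults to six significant digits, so a text round-trip may lose low-order bits (the tests below pass because they use small integral values). A sketch of a lossless alternative, offered as an illustration rather than part of the patch:

```cpp
#include <limits>
#include <sstream>
#include <string>

// Streaming with max_digits10 (9 for float) guarantees that
// float -> text -> float round-trips to the identical value.
std::string LosslessFloat(float v) {
  std::ostringstream os;
  os.precision(std::numeric_limits<float>::max_digits10);
  os << v;
  return os.str();
}
```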
diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc
new file mode 100644
index 0000000000..25a8f5d351
--- /dev/null
+++ b/paddle/optimizer/serialization_test.cc
@@ -0,0 +1,46 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "serialization.h"
+#include "gtest/gtest.h"
+
+TEST(TensorToProto, Case1) {
+  paddle::optimizer::Tensor t(3), t1(3);
+  for (size_t i = 0; i < t.size(); ++i) {
+    t[i] = i;
+    t1[i] = 10;
+  }
+
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
+  for (size_t i = 0; i < t1.size(); ++i) {
+    EXPECT_EQ(t1[i], t[i]);
+  }
+}
+
+TEST(TensorToProto, Case2) {
+  paddle::optimizer::Tensor t(1), t1(1);
+  for (size_t i = 0; i < t.size(); ++i) {
+    t[i] = i;
+    t1[i] = 10;
+  }
+
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
+  for (size_t i = 0; i < t1.size(); ++i) {
+    EXPECT_EQ(t1[i], t[i]);
+  }
+}
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
new file mode 100644
index 0000000000..ee80f543fc
--- /dev/null
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "sgd_optimizer.h"
+#include "serialization.h"
+
+namespace paddle {
+namespace optimizer {
+
+void SGDOptimizer::Update(const Tensor *gradient) {
+  num_sample_passed_ += 1;
+  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
+  float velocity = 0.0;
+  Tensor &param = *parameter_;
+  const Tensor &grad = *gradient;
+  Tensor &m = *momentums_;
+  for (size_t i = 0; i < param.size(); ++i) {
+    if (momentum_ == 0.0) {
+      velocity = -learning_rate * grad[i] - learning_rate * decay_ * param[i];
+    } else {
+      m[i] = momentum_ * m[i] - learning_rate * grad[i] -
+             learning_rate * decay_ * param[i];
+      velocity = m[i];
+    }
+    if (nesterov_) {
+      param[i] += momentum_ * velocity - learning_rate * grad[i];
+    } else {
+      param[i] += velocity;
+    }
+  }
+}
+
+std::string SGDOptimizer::SerializeState() {
+  SGDOptimizerState state;
+  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState();
+  state.mutable_lr_state()->ParseFromString(lr_str);
+  TensorToProto(*parameter_, state.mutable_parameter());
+  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
+  return state.SerializeAsString();
+}
+
+void SGDOptimizer::DeserializeState(const std::string &str) {
+  SGDOptimizerState state;
+  state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+  num_sample_passed_ = state.num_sample_passed();
+  ProtoToTensor(state.parameter(), parameter_);
+  if (momentum_ != 0.0) ProtoToTensor(state.momentums(), momentums_);
+}
+
+}  // namespace optimizer
+}  // namespace paddle
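For reference, one scalar step of the non-Nesterov momentum branch implemented above, worked through as a standalone sketch:

```cpp
#include <cstdio>

// One scalar step of the momentum rule above:
//   m <- mu*m - lr*g - lr*decay*w;  w <- w + m
int main() {
  double w = 2.0, m = 0.0;
  const double lr = 0.1, mu = 0.9, decay = 0.01, g = 0.5;
  m = mu * m - lr * g - lr * decay * w;  // 0 - 0.05 - 0.002 = -0.052
  w += m;                                // 2.0 - 0.052 = 1.948
  std::printf("w = %.3f, m = %.3f\n", w, m);
  return 0;
}
```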
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
new file mode 100644
index 0000000000..16a4df9973
--- /dev/null
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "parameter_optimizer.h"
+
+namespace paddle {
+namespace optimizer {
+
+class SGDOptimizer : public ParameterOptimizer {
+public:
+  SGDOptimizer(Tensor* parameter, LrPolicy* lr, double m, double d, bool n)
+      : ParameterOptimizer(parameter, lr),
+        momentums_(nullptr),
+        momentum_(m),
+        decay_(d),
+        nesterov_(n) {
+    if (momentum_ != 0.0) {
+      size_t size = parameter->size();
+      momentums_ = new Tensor(size);
+    }
+  }
+  virtual ~SGDOptimizer() {
+    if (momentums_) delete momentums_;
+  }
+  void Update(const Tensor* gradient);
+  std::string SerializeState();
+  void DeserializeState(const std::string& state);
+
+private:
+  Tensor* momentums_;
+  double momentum_;
+  double decay_;
+  bool nesterov_;
+};
+
+}  // namespace optimizer
+}  // namespace paddle
diff --git a/paddle/optimizer/tensor.h b/paddle/optimizer/tensor.h
new file mode 100644
index 0000000000..e999e9bda1
--- /dev/null
+++ b/paddle/optimizer/tensor.h
@@ -0,0 +1,68 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+/**
+ * @brief tensor used by optimizer
+ */
+
+#include <string.h>
+#include <memory>
+#include "paddle/utils/Common.h"
+#include "paddle/utils/Logging.h"
+
+namespace paddle {
+namespace optimizer {
+
+template <class T>
+class TensorT {
+public:
+  TensorT(size_t size) : height_(1), width_(size) {
+    // new T[size]() initializes all elements to zero.
+    data_ptr_ = std::shared_ptr<T>(new T[size](), std::default_delete<T[]>());
+    data_ = data_ptr_.get();
+  }
+
+  TensorT(T* data, size_t size)
+      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}
+
+  TensorT(T* data, size_t h, size_t w)
+      : height_(h), width_(w), data_ptr_(nullptr), data_(data) {}
+
+  virtual ~TensorT() {}
+
+  T* get_buffer() { return this->data_; }
+
+  T& operator[](const size_t idx) {
+    CHECK(idx < this->width_) << "index out of range";
+    return data_[idx];
+  }
+  T& operator[](const size_t idx) const {
+    CHECK(idx < this->width_) << "index out of range";
+    return data_[idx];
+  }
+  // TODO: replace with tensorshape
+  size_t size() const { return this->width_ * this->height_; }
+
+protected:
+  size_t height_;
+  size_t width_;
+  std::shared_ptr<T> data_ptr_;
+  T* data_;
+};
+
+// TODO(zhihong): design problem of dynamic datatype, need to fix it
+typedef TensorT<float> Tensor;
+
+}  // namespace optimizer
+}  // namespace paddle
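A small hypothetical usage of the TensorT wrapper above (assumes this header is on the include path):

```cpp
#include "tensor.h"  // the header above

int main() {
  // Owned, zero-initialized buffer of four floats.
  paddle::optimizer::Tensor t(4);
  t[0] = 1.5f;                  // operator[] is bounds-checked via CHECK
  float* raw = t.get_buffer();  // borrow the underlying storage
  return (t.size() == 4 && raw[0] == 1.5f) ? 0 : 1;
}
```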
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 6d9365af2d..8dbef0b22e 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -186,6 +186,7 @@ void Argument::resizeAndCopyFrom(const Argument& src,
   resizeAndCopy(strs, src.strs, useGpu, stream);
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 }
 
 int32_t Argument::resizeAndCopyFrom(const Argument& src,
@@ -206,6 +207,7 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
   dataId = src.dataId;
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
+  frameDepth = src.frameDepth;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
@@ -276,17 +278,21 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
 void Argument::concat(const std::vector<Argument>& args,
                       const std::vector<int>& selectRows,
                       const std::vector<int>& seqStartPos,
+                      const std::vector<int>& copySize,
                       bool useGpu,
                       hl_stream_t stream,
                       PassType passType) {
   CHECK(!subSequenceStartPositions)
       << "undefined behavior for subsequence positions";
 
-  size_t batchSize = selectRows.size();
+  size_t batchSize = 0;
+  for (size_t i = 0; i < copySize.size(); ++i)
+    batchSize += copySize[i] * (seqStartPos[i + 1] - seqStartPos[i]);
+
   auto copyArg = [batchSize, stream](MatrixPtr& dst,
                                      MatrixPtr src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                      int size,
                                      bool useGpu) {
     if (!src) {
@@ -300,14 +306,14 @@ void Argument::concat(const std::vector<Argument>& args,
       dst->resize(batchSize, width);
     }
 
-    MatrixPtr tmpMatrix = dst->subMatrix(startRow, size);
-    tmpMatrix->copyFrom(*src->subMatrix(pos, size), stream);
+    MatrixPtr tmpMatrix = dst->subMatrix(desStartRow, size);
+    tmpMatrix->copyFrom(*src->subMatrix(srcStartRow, size), stream);
   };
 
   auto copyIds = [batchSize, stream](IVectorPtr& dst,
                                      const IVectorPtr& src,
-                                     int startRow,
-                                     int pos,
+                                     int desStartRow,
+                                     int srcStartRow,
                                      int size,
                                      bool useGpu) {
     if (!src) {
@@ -315,13 +321,14 @@ void Argument::concat(const std::vector<Argument>& args,
       return;
     }
     IVector::resizeOrCreate(dst, batchSize, useGpu);
-    dst->subVec(startRow, size)->copyFrom(*src->subVec(pos, size), stream);
+    dst->subVec(desStartRow, size)
+        ->copyFrom(*src->subVec(srcStartRow, size), stream);
   };
 
   auto copyStrs = [batchSize, stream](SVectorPtr& dst,
                                       const SVectorPtr& src,
-                                      int startRow,
-                                      int pos,
+                                      int desStartRow,
+                                      int srcStartRow,
                                       int size,
                                       bool useGpu) {
     if (!src) {
@@ -333,30 +340,31 @@ void Argument::concat(const std::vector<Argument>& args,
     } else {
       dst->resize(batchSize);
     }
-    std::copy(
-        src->begin() + pos, src->begin() + pos + size, dst->begin() + startRow);
+    std::copy(src->begin() + srcStartRow,
+              src->begin() + srcStartRow + size,
+              dst->begin() + desStartRow);
   };
 
   dataId = args[0].dataId;
   CHECK_NE(seqStartPos.size(), 0UL);
-  size_t sampleNum = seqStartPos.size() - 1;
-  for (size_t i = 0; i < sampleNum; ++i) {
+  int desStartRow = 0;
+  for (size_t i = 0; i < copySize.size(); ++i) {
     int startPos = seqStartPos[i];
     int endPos = seqStartPos[i + 1];
     CHECK_GE(args.size(), static_cast<size_t>(endPos - startPos));
     for (int j = startPos; j < endPos; ++j) {
       const Argument& arg = args[j - startPos];
-      CHECK_EQ(arg.dataId, dataId) << "Arguments in concat should have"
-                                   << " same dataId";
-      const int copySize = 1;
-      const int rowIdx = selectRows[j];
-      copyArg(in, arg.in, j, rowIdx, copySize, useGpu);
-      copyArg(value, arg.value, j, rowIdx, copySize, useGpu);
+      CHECK_EQ(arg.dataId, dataId) << "Arguments to concatenate should have "
+                                   << "the same dataId.";
+      const int srcStartRow = selectRows[j];
+      copyArg(in, arg.in, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyArg(value, arg.value, desStartRow, srcStartRow, copySize[i], useGpu);
       if (passType != PASS_TEST) {
-        copyArg(grad, arg.grad, j, rowIdx, copySize, useGpu);
+        copyArg(grad, arg.grad, desStartRow, srcStartRow, copySize[i], useGpu);
       }
-      copyIds(ids, arg.ids, j, rowIdx, copySize, useGpu);
-      copyStrs(strs, arg.strs, j, rowIdx, copySize, useGpu);
+      copyIds(ids, arg.ids, desStartRow, srcStartRow, copySize[i], useGpu);
+      copyStrs(strs, arg.strs, desStartRow, srcStartRow, copySize[i], useGpu);
+      desStartRow += copySize[i];
     }
   }
   ICpuGpuVector::resizeOrCreate(
@@ -561,7 +569,7 @@ void Argument::degradeSequence(const Argument& input) {
 
 void Argument::poolSequenceWithStride(const Argument& input,
                                       size_t stride,
-                                      IVectorPtr* stridePostions,
+                                      ICpuGpuVectorPtr* stridePositions,
                                       bool reversed) {
   // If input.sequenceStartPositions = [0, 9, 14, 17, 30] and stride = 5,
   // then sequenceStartPositions = [0, 2, 3, 4, 7].
@@ -598,8 +606,8 @@ void Argument::poolSequenceWithStride(const Argument& input,
   stridePos.emplace_back(starts[numSequences]);
   int size = stridePos.size();
   CHECK_EQ(size - 1, tgtBuf[numSequences]);
-  IVector::resizeOrCreate(*stridePostions, size, false);
-  (*stridePostions)->copyFrom(stridePos.data(), size);
+  ICpuGpuVector::resizeOrCreate(*stridePositions, size, false);
+  (*stridePositions)->getMutableVector(false)->copyFrom(stridePos.data(), size);
 }
 
 void Argument::getValueString(
@@ -632,7 +640,7 @@ void Argument::printValueString(std::ostream& stream,
                                 const std::string& prefix) const {
   std::unordered_map<std::string, std::string> out;
   getValueString(&out);
-  for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) {
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
     auto it = out.find(field);
     if (it != out.end()) {
       stream << prefix << field << ":\n" << it->second;
@@ -666,4 +674,34 @@ void Argument::subArgFrom(const Argument& input,
   }
 }
 
+void Argument::reorganizeSeqInfo(
+    const ICpuGpuVectorPtr seqStartPos,
+    const ICpuGpuVectorPtr subSeqStartPos,
+    std::vector<std::vector<int>>& reorganizedSeqInfo) {
+  CHECK(seqStartPos);
+  reorganizedSeqInfo.clear();
+
+  int seqNum = seqStartPos->getSize() - 1;
+  int* seqStarts = seqStartPos->getMutableData(false);
+
+  if (subSeqStartPos) {
+    int* subSeqStarts = subSeqStartPos->getMutableData(false);
+    reorganizedSeqInfo.resize(seqNum, std::vector<int>());
+    int seqIdx = 0;
+    for (size_t i = 0; i < subSeqStartPos->getSize(); ++i) {
+      reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      if (subSeqStarts[i] == seqStarts[seqIdx + 1]) {
+        seqIdx++;
+        if (seqIdx == seqNum) return;
+        reorganizedSeqInfo[seqIdx].push_back(subSeqStarts[i]);
+      }
+    }
+  } else {
+    reorganizedSeqInfo.resize(1, std::vector<int>(seqNum + 1, 0));
+    memcpy(reorganizedSeqInfo[0].data(),
+           seqStarts,
+           sizeof(int) * seqStartPos->getSize());
+  }
+}
+
 }  // namespace paddle
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 91aca98e18..7b59199dde 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -35,6 +32,7 @@ struct Argument {
         strs(nullptr),
         frameHeight(0),
         frameWidth(0),
+        frameDepth(0),
         sequenceStartPositions(nullptr),
         subSequenceStartPositions(nullptr),
         cpuSequenceDims(nullptr),
@@ -64,6 +62,7 @@ struct Argument {
     allCount = argument.allCount;
     frameHeight = argument.frameHeight;
     frameWidth = argument.frameWidth;
+    frameDepth = argument.frameDepth;
     dataId = argument.dataId;
   }
 
@@ -76,6 +75,7 @@ struct Argument {
   // A dataBatch includes batchSize frames, one frame maybe not only vector
   size_t frameHeight;
   size_t frameWidth;
+  size_t frameDepth;
 
   // If NULL, each position is treated independently.
   // Otherwise, its size should be #NumberOfSequences + 1.
@@ -136,8 +136,10 @@ struct Argument {
   }
   size_t getFrameHeight() const { return frameHeight; }
   size_t getFrameWidth() const { return frameWidth; }
+  size_t getFrameDepth() const { return frameDepth; }
   void setFrameHeight(size_t h) { frameHeight = h; }
   void setFrameWidth(size_t w) { frameWidth = w; }
+  void setFrameDepth(size_t d) { frameDepth = d; }
 
   int64_t getNumSequences() const {
     return sequenceStartPositions ? sequenceStartPositions->getSize() - 1
@@ -149,6 +151,7 @@ struct Argument {
                                      : getBatchSize();
   }
 
+  bool hasSeq() const { return sequenceStartPositions != nullptr; }
   bool hasSubseq() const { return subSequenceStartPositions != nullptr; }
 
   const int* getCpuStartPositions() const {
@@ -239,6 +242,7 @@ struct Argument {
   void concat(const std::vector<Argument>& args,
               const std::vector<int>& selectRows,
               const std::vector<int>& seqStartPos,
+              const std::vector<int>& copySize,
               bool useGpu,
               hl_stream_t stream,
               PassType passType);
@@ -298,7 +302,7 @@ struct Argument {
    */
   void poolSequenceWithStride(const Argument& input,
                               size_t stride,
-                              IVectorPtr* stridePositions,
+                              ICpuGpuVectorPtr* stridePositions,
                               bool reversed = false);
   /**
    * @brief getValueString will return the argument's output in string. There
@@ -316,6 +320,30 @@ struct Argument {
    */
   void printValueString(std::ostream& stream,
                         const std::string& prefix = "") const;
+
+  /**
+   * @brief reorganizeSeqInfo will reorganize sequenceStartPositions and
+   * subSequenceStartPositions into a two-dimensional array: reorganizedSeqInfo.
+   *
+   * @param seqStartPos: sequenceStartPositions of an Argument.
+   * @param subSeqStartPos: subSequenceStartPositions of an Argument.
+   * @param reorganizedSeqInfo: the reorganized sequence start position information.
+   *
+   * Examples:
+   * seqStartPos: [0, 4, 15, 20, 28]
+   * subSeqStartPos: [0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28]
+   * reorganizedSeqInfo:
+   *   [
+   *     [0,3,4],
+   *     [4,5,7,10,15],
+   *     [15,20],
+   *     [20,22,23,25,28]
+   *   ]
+   */
+  static void reorganizeSeqInfo(
+      const ICpuGpuVectorPtr seqStartPos,
+      const ICpuGpuVectorPtr subSeqStartPos,
+      std::vector<std::vector<int>>& reorganizedSeqInfo);
 };
 
 }  // namespace paddle
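The example in the comment above can be reproduced on plain vectors; a standalone sketch of the same reorganization (illustration only, not the Argument implementation):

```cpp
#include <cassert>
#include <vector>

// Split subSeqStartPos into one row per outer sequence, repeating each
// boundary that closes one sequence and opens the next, mirroring the
// reorganizeSeqInfo example above.
std::vector<std::vector<int>> Reorganize(const std::vector<int>& seq,
                                         const std::vector<int>& sub) {
  std::vector<std::vector<int>> out(seq.size() - 1);
  size_t s = 0;
  for (int pos : sub) {
    out[s].push_back(pos);
    if (pos == seq[s + 1]) {
      if (++s == seq.size() - 1) break;
      out[s].push_back(pos);
    }
  }
  return out;
}

int main() {
  auto r = Reorganize({0, 4, 15, 20, 28},
                      {0, 3, 4, 5, 7, 10, 15, 20, 22, 23, 25, 28});
  assert(r[0] == (std::vector<int>{0, 3, 4}));
  assert(r[1] == (std::vector<int>{4, 5, 7, 10, 15}));
  assert(r[2] == (std::vector<int>{15, 20}));
  assert(r[3] == (std::vector<int>{20, 22, 23, 25, 28}));
  return 0;
}
```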
diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt
index a35e46997f..d2ae1c16c6 100644
--- a/paddle/parameter/CMakeLists.txt
+++ b/paddle/parameter/CMakeLists.txt
@@ -7,7 +7,7 @@ add_library(paddle_parameter STATIC
         ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES})
 add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS})
-add_dependencies(paddle_parameter gen_proto_cpp)
+add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index caa78acd98..5b0c52a30d 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "ParameterOptimizer.h"
+#include "ParameterUpdateFunctions.h"
 #include "Regularizer.h"
 
 namespace paddle {
@@ -37,6 +38,15 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
+#ifdef PADDLE_WITH_MKLDNN
+    sgdUpdate(learningRate_ * paraConfig.learning_rate() *
+                  (firstTime_ ? 1.0 : torch_learningRate),
+              paraConfig.momentum(),
+              applyDecay_ ? paraConfig.decay_rate() : 0,
+              vecs[PARAMETER_VALUE].get(),
+              vecs[PARAMETER_GRADIENT].get(),
+              vecs[PARAMETER_MOMENTUM].get());
+#else
     vecs[PARAMETER_VALUE]->sgdUpdate(
         *vecs[PARAMETER_GRADIENT],
         *vecs[PARAMETER_MOMENTUM],
@@ -44,6 +54,7 @@ public:
             (firstTime_ ? 1.0 : torch_learningRate),
         paraConfig.momentum(),
         applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
   }
   virtual void finishBatch() { firstTime_ = false; }
 };
@@ -254,6 +265,10 @@ public:
     addParameterType(PARAMETER_SECOND_MOMENTUM);
   }
 
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+
   virtual void finishBatch() { ++step_; }
 
   virtual void update(const VectorPtr vecs[],
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index ebe36d4937..3b0f09cea6 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -48,7 +48,8 @@ Parameter::Parameter(const ParameterConfig& config, bool useGpu, bool doInit)
       deviceId_(-1),
       sharedCount_(0),
       updateCounter_(0),
-      updated_(false) {
+      updated_(false),
+      headerFormat_(PARAM_FORMAT_ORIGINAL) {
   setID(-1); /* capture uninitialized id */
   if (useGpu_ && FLAGS_parallel_nn) {
     /* gpu environment is specified by device property */
@@ -199,7 +200,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
                                      false,
                                      useGpu_);
     }
-  } else if (matType == MAT_NORMAL_SHARED) {
+  }
+#ifndef PADDLE_MOBILE_INFERENCE
+  // NOLINTNEXTLINE
+  else if (matType == MAT_NORMAL_SHARED) {
     CHECK_EQ(height * width, bufs_[pType]->getSize());
     size_t blockNum = 0;
     CHECK(isGradShared(&blockNum));
@@ -258,7 +262,10 @@ void Parameter::setMat(ParameterType pType, int matType) {
   } else if (matType == MAT_SPARSE_ROW_AUTO_GROW) {
     CHECK(isGradSparseUpdate());
     mats_[pType] = std::make_shared<SparseAutoGrowRowCpuMatrix>(height, width);
-  } else {
+  }
+#endif
+  // NOLINTNEXTLINE
+  else {
     LOG(FATAL) << "Unsupported mat type" << matType;
   }
 }
@@ -285,7 +292,7 @@ bool Parameter::save(const std::string& filename) const {
 bool Parameter::save(std::ostream& s) const {
   CpuVector vec(*bufs_[PARAMETER_VALUE].get());
   Header header;
-  header.version = kFormatVersion;
+  header.format = headerFormat_;
   header.valueSize = sizeof(real);
   header.size = getSize();
 
@@ -344,8 +351,9 @@ bool Parameter::load(std::istream& s) {
   Header header;
   CHECK(s.read(reinterpret_cast<char*>(&header), sizeof(header)))
       << "Fail to read parameter " << getName();
-  CHECK_EQ(header.version, kFormatVersion) << "Incorrect format version: "
-                                           << header.version;
+  CHECK(isHeaderFormatSupported(header.format)) << "Incorrect format version: "
+                                                << header.format;
+  headerFormat_ = header.format;
   CHECK_EQ(header.size, getSize())
       << "The size (" << header.size << ") in the file does not match the size "
       << "(" << getSize() << ") of the parameter: " << getName();
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 0bac76f068..04f12efaac 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -34,6 +34,20 @@ limitations under the License. */
 
 namespace paddle {
 
+typedef enum {
+  /// The paddle original basic format
+  PARAM_FORMAT_ORIGINAL = 0,
+
+  /// See mkldnn_memory_format_t in
+  /// https://github.com/01org/mkl-dnn/blob/master/include/mkldnn_types.h
+  /// for a detailed description.
+  /// 2D weights tensor in the format (output channels, input channels).
+  PARAM_FORMAT_MKLDNN_OI,
+
+  /// The total number of format items.
+  PARAM_FORMAT_ITEMS,
+} PARAM_FORMAT;
+
 class SparsePrefetchRowCpuMatrix;
 
 class Parameter;
@@ -51,7 +65,10 @@ public:
   size_t getSize() const { return config_.size(); }
 
   bool isFullSize() const {
-    return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    if (bufs_[PARAMETER_VALUE]) {
+      return this->getSize() == bufs_[PARAMETER_VALUE]->getSize();
+    }
+    return false;
   }
 
   inline bool useGpu() const { return useGpu_; }
@@ -242,14 +259,34 @@ public:
   /// Initialize the value to 0
   void zeroMem();
 
-  static const int kFormatVersion = 0;
   /// file header structure
   struct Header {
-    int32_t version;     // = 0, file format version
+    int32_t format;      // = PARAM_FORMAT
     uint32_t valueSize;  // = sizeof(real)
     uint64_t size;       // = getSize()
   };
 
+  /**
+   * @brief Is the header format supported.
+   */
+  static bool isHeaderFormatSupported(int32_t fmt) {
+    return fmt < PARAM_FORMAT_ITEMS;
+  }
+
+  /**
+   * @brief Get the format in header.
+   */
+  int getHeaderFormat() { return headerFormat_; }
+
+  /**
+   * @brief Set the format in header.
+   */
+  void setHeaderFormat(int32_t fmt) {
+    CHECK(isHeaderFormatSupported(fmt)) << "Unsupported format version: "
+                                        << fmt;
+    headerFormat_ = fmt;
+  }
+
   /**
    * @brief  Parameter Update Hook.
    *
@@ -321,6 +358,9 @@ protected:
   bool updated_;
   SparseFormat format_;
 
+  /// The header format for saving or loading param
+  int32_t headerFormat_;
+
   std::vector<std::shared_ptr<IParameterUpdaterHook>> updaterHooks_;
 
 public:
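A standalone sketch of validating a saved parameter file against the Header layout above (illustration only; the real path is Parameter::load, and the explicit non-negative guard is an extra not present in isHeaderFormatSupported):

```cpp
#include <cstddef>
#include <cstdint>
#include <istream>

// Mirrors Parameter::Header above; read as raw bytes, so the layout
// must match what Parameter::save wrote.
struct Header {
  int32_t format;      // a PARAM_FORMAT value
  uint32_t valueSize;  // sizeof(real)
  uint64_t size;       // element count, = getSize()
};

bool CheckParamHeader(std::istream& s, std::size_t expected_size) {
  Header h;
  if (!s.read(reinterpret_cast<char*>(&h), sizeof(h))) return false;
  // isHeaderFormatSupported() only bounds the format from above; the
  // >= 0 guard here is a defensive extra.
  return h.format >= 0 && h.format < 2 /* PARAM_FORMAT_ITEMS */ &&
         h.size == expected_size;
}
```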
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index c8af7105c7..d60cb36383 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,6 +30,9 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
   for (size_t i = 0; i < size; ++i) {
     momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                      decayRate * value[i];
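The loop guarded by PADDLE_WITH_MKLML above is safe to parallelize because each iteration writes only index i. A self-contained sketch of the same pattern; the value-update line is an assumption about the part of sgdUpdateCpu this hunk does not show:

```cpp
// Sketch of the element-wise update parallelized above. The loop
// carries no dependency across iterations, so it is safe under
// `#pragma omp parallel for`.
void SgdUpdateSketch(float learningRate, float momentum, float decayRate,
                     int size, float* value, const float* grad,
                     float* momentumVec) {
  decayRate *= learningRate;
#pragma omp parallel for
  for (int i = 0; i < size; ++i) {
    momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                     decayRate * value[i];
    value[i] += momentumVec[i];  // assumed continuation (not shown above)
  }
}
```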
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index f826e8448c..c8b47687f5 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -14,11 +14,13 @@ limitations under the License. */
 
 #include "ParameterUpdaterHook.h"
 
+#include <algorithm>
 #include <atomic>
 #include <fstream>
 #include <mutex>
 #include <thread>
 #include <unordered_map>
+#include <vector>
 
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
@@ -29,106 +31,76 @@ namespace paddle {
 
 /**
  * The static pruning hook
- *
- * Static means user load a mask map before training started. This map will
- * define which link/weight between neural is disabled.
+ * Static means the user specifies a sparsity_ratio before training starts,
+ * and the network prunes the parameters based on that sparsity_ratio. More
+ * details can be found at https://arxiv.org/pdf/1506.02626.pdf.
  */
+
 class StaticPruningHook : public IParameterUpdaterHook {
 public:
-  /**
-   * The Mask Map Header.
-   * The map file started with this header.
-   *
-   * In Version 0, reset file will be:
-   *  contains header.size bit, each bit means such weight is enabled or not.
-   *    if bit is 1, then such weight is enabled.
-   *  at end, the file will round to byte, and the low bits of end byte will be
-   *  filled by zero.
-   *
-   */
-  struct StaticMaskHeader {
-    uint32_t version;
-    size_t size;
-  } __attribute__((__packed__));
-
-  explicit StaticPruningHook(const std::string& mask_filename) : initCount_(0) {
-    bool ok = this->loadMaskFile(mask_filename);
-    if (!ok) {
-      LOG(WARNING) << "Fail to load mask file " << mask_filename
-                   << " in current directory, searching in init_model_path";
-      std::string combineMaskFilename =
-          path::join(FLAGS_init_model_path, mask_filename);
-      CHECK(this->loadMaskFile(combineMaskFilename))
-          << "Cannot load " << mask_filename << " in ./" << mask_filename
-          << " and " << combineMaskFilename;
-    }
-    VLOG(3) << mask_filename << " mask size = " << this->mask_.size();
+  explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig)
+      : initCount_(0) {
+    sparsityRatio_ = hookConfig.sparsity_ratio();
   }
 
-  void update(Parameter* para) {
+  // Comparator for sorting by descending magnitude.
+  static bool sortPairDescend(const std::pair<real, size_t> &pair1,
+                              const std::pair<real, size_t> &pair2) {
+    return pair1.first > pair2.first;
+  }
+
+  void update(Parameter *para) {
     updateThreadChecker_.check();
-    auto& vec = para->getBuf(PARAMETER_GRADIENT);
+    auto &vec = para->getBuf(PARAMETER_GRADIENT);
     if (vec) {
       vec->dotMul(*maskVec_);
     }
   }
 
-  void init(Parameter* para) {
-    size_t initCount = this->initCount_.fetch_add(1);
-    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke "
-                                "in same ParamterUpdater";
-    VLOG(3) << "Initialize Parameter " << para;
-    SetDevice device(para->getDeviceId());
+  void generateMask(Parameter *para) {
+    VectorPtr maskTemp = Vector::create(para->getSize(), false);
+    maskTemp->zeroMem();
+    real *maskTempData = maskTemp->getData();
+    size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_);
 
-    auto maskVec = Vector::create(this->mask_.size(), false);
-    {  // Initialize maskVec with float mask vector
-      real* dataPtr = maskVec->getData();
-      size_t i = 0;
-      for (bool m : mask_) {
-        dataPtr[i++] = m ? 1.0 : 0.0;
-      }
-    }
+    VectorPtr paraVec = para->getBuf(PARAMETER_VALUE);
+    VectorPtr paraCpuCopy = Vector::create(para->getSize(), false);
+
+    paraCpuCopy->copyFrom(*paraVec);
+    std::vector<std::pair<real, size_t>> param;
+
+    for (size_t i = 0; i < para->getSize(); i++)
+      param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i));
+
+    std::partial_sort(
+        param.begin(), param.begin() + nonZeroNum, param.end(), sortPairDescend);
+    for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0;
 
     // Currently just use a mask vector for hack.
-    // @TODO(yuyang18): Implemented the mask operation in vector.
     if (para->useGpu()) {
-      maskVec_ = Vector::create(this->mask_.size(), para->useGpu());
-      maskVec_->copyFrom(*maskVec);
+      maskVec_ = Vector::create(para->getSize(), para->useGpu());
+      maskVec_->copyFrom(*maskTemp);
     } else {
-      maskVec_ = maskVec;
+      maskVec_ = maskTemp;
     }
-
-    auto& vec = para->getBuf(PARAMETER_VALUE);
-    vec->dotMul(*maskVec_);
   }
 
-private:
-  bool loadMaskFile(const std::string& mask_filename) {
-    std::ifstream fin;
-    fin.open(mask_filename);
-    if (fin.is_open()) {
-      StaticMaskHeader header;
-      fin.read(reinterpret_cast<char*>(&header), sizeof(StaticMaskHeader));
-      CHECK_EQ(header.version, 0UL);
-      mask_.resize(header.size);
-      uint8_t buf;
-      for (size_t i = 0; i < header.size; ++i, buf <<= 1) {
-        if (i % 8 == 0) {
-          fin.read(reinterpret_cast<char*>(&buf), sizeof(uint8_t));
-        }
-        mask_[i] = buf & 0x80;
-      }
-      fin.close();
-      return true;
-    } else {
-      return false;
-    }
+  void init(Parameter *para) {
+    generateMask(para);
+    size_t initCount = this->initCount_.fetch_add(1);
+    CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must be "
+                                "invoked in the same ParameterUpdater";
+    VLOG(3) << "Initialize Parameter " << para;
+    SetDevice device(para->getDeviceId());
+
+    auto &paraVec = para->getBuf(PARAMETER_VALUE);
+    paraVec->dotMul(*maskVec_);
   }
 
+private:
   SameThreadChecker updateThreadChecker_;
   std::atomic<size_t> initCount_;
   VectorPtr maskVec_;
-  std::vector<bool> mask_;
+  real sparsityRatio_;
 };
 
 IParameterUpdaterHook::IParameterUpdaterHook() {}
@@ -145,7 +117,7 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {}
  */
 class StringIntPairHasher {
 public:
-  size_t operator()(const std::pair<std::string, int>& k) const {
+  size_t operator()(const std::pair<std::string, int> &k) const {
     return intHasher_(strHasher_(k.first) + k.second);
   }
 
@@ -162,19 +134,19 @@ static WeakKVCache<std::pair<std::string, int>,
 /**
  * ParameterUpdaterHook actually factory method.
  */
-static IParameterUpdaterHook* createImpl(
-    const ParameterUpdaterHookConfig& config) {
-  auto& type = config.type();
+static IParameterUpdaterHook *createImpl(
+    const ParameterUpdaterHookConfig &config) {
+  auto &type = config.type();
   if (type == "pruning") {
-    if (config.has_purning_mask_filename()) {
-      return new StaticPruningHook(config.purning_mask_filename());
-    }
+    return new StaticPruningHook(config);
   }
+
+  LOG(FATAL) << "Unknown Hook type:  " << type;
   return nullptr;
 }
 
 std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create(
-    const ParameterConfig& paramConfig, int idx) {
+    const ParameterConfig &paramConfig, int idx) {
   std::pair<std::string, int> key = {paramConfig.name(), idx};
   return g_hookCache_.get(
       key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
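The hook above implements magnitude pruning: keep the largest (1 - sparsity_ratio) fraction of weights by absolute value and zero the rest. A self-contained sketch of that mask construction (an illustration, not the hook itself):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// Build a 0/1 mask that keeps the top (1 - sparsity_ratio) fraction of
// weights by absolute value, as generateMask() above does.
std::vector<float> MagnitudeMask(const std::vector<float>& w,
                                 float sparsity_ratio) {
  std::vector<std::pair<float, std::size_t>> by_mag;
  by_mag.reserve(w.size());
  for (std::size_t i = 0; i < w.size(); ++i)
    by_mag.push_back(std::make_pair(std::fabs(w[i]), i));

  std::size_t keep =
      static_cast<std::size_t>(w.size() * (1 - sparsity_ratio));
  std::partial_sort(by_mag.begin(), by_mag.begin() + keep, by_mag.end(),
                    [](const std::pair<float, std::size_t>& a,
                       const std::pair<float, std::size_t>& b) {
                      return a.first > b.first;  // descending magnitude
                    });

  std::vector<float> mask(w.size(), 0.0f);
  for (std::size_t i = 0; i < keep; ++i) mask[by_mag[i].second] = 1.0f;
  return mask;
}
```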
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
index 81fe4ee397..19df6ea957 100644
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -31,7 +31,7 @@ TEST(Argument, poolSequenceWithStride) {
   int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
 
   for (auto reversed : {false, true}) {
-    IVectorPtr stridePositions;
+    ICpuGpuVectorPtr stridePositions;
     output.poolSequenceWithStride(
         input, 5 /* stride */, &stridePositions, reversed);
 
@@ -42,10 +42,10 @@ TEST(Argument, poolSequenceWithStride) {
     CHECK_EQ(outStart[3], 4);
     CHECK_EQ(outStart[4], 7);
 
-    CHECK_EQ(stridePositions->getSize(), 8);
+    CHECK_EQ(stridePositions->getSize(), 8UL);
     auto result = reversed ? strideResultReversed : strideResult;
     for (int i = 0; i < 8; i++) {
-      CHECK_EQ(stridePositions->getData()[i], result[i]);
+      CHECK_EQ(stridePositions->getData(false)[i], result[i]);
     }
   }
 }
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 8bab5a6289..64d204aea1 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -172,53 +172,3 @@ TEST_F(CommonTest, syncThreadPool) {
     EXPECT_EQ((int)0, nums[i]);
   }
 }
-
-TEST_F(CommonTest, barrierStat) {
-  const int threadNum = 10;
-
-  SyncThreadPool pool(threadNum);
-
-#define TEST_BARRIER_RANDOM(statName, numConnThreads, ...)       \
-  pool.exec([&](int tid, size_t numThreads) {                    \
-    struct timeval time;                                         \
-    gettimeofday(&time, nullptr);                                \
-    uint64_t usec = timeToMicroSecond(time);                     \
-    std::srand(usec);                                            \
-    auto value = std::rand() % 100000;                           \
-    usleep(value);                                               \
-    REGISTER_SLOW_NODES_PROBE(                                   \
-        globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
-  });
-
-  for (auto i = 0; i < 10; i++) {
-    TEST_BARRIER_RANDOM("synThreadBarrier1", threadNum);
-    TEST_BARRIER_RANDOM("synThreadBarrier2", threadNum);
-  }
-
-  globalStat.printAllStatus();
-  globalStat.reset();
-
-  for (auto i = 0; i < 10; i++) {
-    TEST_BARRIER_RANDOM("synThreadBarrier3", threadNum, "tag0");
-    TEST_BARRIER_RANDOM("synThreadBarrier4", threadNum, "tag1");
-  }
-
-  globalStat.printAllStatus();
-  globalStat.reset();
-
-// use it to test accurate barrier gap
-#define TEST_BARRIER(statName, numConnThreads, ...)              \
-  pool.exec([&](int tid, size_t numThreads) {                    \
-    usleep(tid * 10000);                                         \
-    REGISTER_SLOW_NODES_PROBE(                                   \
-        globalStat, statName, numConnThreads, tid, __VA_ARGS__); \
-  });
-
-  for (auto i = 0; i < 10; i++) {
-    TEST_BARRIER("synThreadBarrier3", threadNum, "tag0");
-    TEST_BARRIER("synThreadBarrier4", threadNum, "tag1");
-  }
-
-  globalStat.printAllStatus();
-  globalStat.reset();
-}
diff --git a/paddle/platform/.clang-format b/paddle/platform/.clang-format
new file mode 100644
index 0000000000..29282dc87e
--- /dev/null
+++ b/paddle/platform/.clang-format
@@ -0,0 +1,5 @@
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
new file mode 100644
index 0000000000..d68caea997
--- /dev/null
+++ b/paddle/platform/CMakeLists.txt
@@ -0,0 +1,49 @@
+if(WITH_GPU)
+  cc_library(enforce SRCS enforce.cc DEPS nccl)
+else()
+  cc_library(enforce SRCS enforce.cc)
+endif()
+cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece enforce)
+
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog enforce)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
+
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
+
+cc_library(place SRCS place.cc DEPS enforce boost)
+cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+
+add_subdirectory(dynload)
+
+IF(WITH_GPU)
+    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
+ELSE()
+    set(GPU_CTX_DEPS)
+ENDIF()
+
+IF(WITH_MKLDNN)
+    set(MKLDNN_CTX_DEPS mkldnn)
+ELSE()
+    set(MKLDNN_CTX_DEPS)
+ENDIF()
+
+# memcpy depends on device_context; add the deps individually here to
+# avoid cyclic dependencies.
+cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
+    system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
+
+nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
+nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
+
+cc_library(profiler SRCS profiler.cc DEPS device_context)
+cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
+
+if(NOT WITH_C_API AND WITH_FLUID)
+  file(GLOB PLATFORM_HEADERS *.h)
+  file(GLOB PLATFORM_dynload_HEADERS dynload/*.h)
+  install(FILES ${PLATFORM_HEADERS} DESTINATION include/paddle/platform)
+  install(FILES ${PLATFORM_dynload_HEADERS} DESTINATION include/paddle/platform/dynload)
+  install(FILES details/device_ptr_cast.h DESTINATION include/paddle/platform/details)
+endif()
diff --git a/paddle/platform/assert.h b/paddle/platform/assert.h
new file mode 100644
index 0000000000..d813b9529b
--- /dev/null
+++ b/paddle/platform/assert.h
@@ -0,0 +1,43 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#define STRINGIFY(x) #x
+#define TOSTRING(x) STRINGIFY(x)
+
+#if defined(__APPLE__) && defined(__CUDA_ARCH__) && !defined(NDEBUG)
+#include <stdio.h>
+#define PADDLE_ASSERT(e)                                           \
+  do {                                                             \
+    if (!(e)) {                                                    \
+      printf("%s:%d Assertion `%s` failed.\n", __FILE__, __LINE__, \
+             TOSTRING(e));                                         \
+      asm("trap;");                                                \
+    }                                                              \
+  } while (0)
+
+#define PADDLE_ASSERT_MSG(e, m)                                         \
+  do {                                                                  \
+    if (!(e)) {                                                         \
+      printf("%s:%d Assertion `%s` failed (%s).\n", __FILE__, __LINE__, \
+             TOSTRING(e), m);                                           \
+      asm("trap;");                                                     \
+    }                                                                   \
+  } while (0)
+#else
+#include <assert.h>
+#define PADDLE_ASSERT(e) assert(e)
+#define PADDLE_ASSERT_MSG(e, m) assert((e) && (m))
+#endif
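On the host path these macros reduce to plain assert; PADDLE_ASSERT_MSG relies on a string literal being truthy, as in this hypothetical check:

```cpp
#include "paddle/platform/assert.h"

// On the host path this expands to
//   assert((n > 0) && ("n must be positive"))
// where the string literal is always truthy, so only `n > 0` decides.
void Resize(int n) { PADDLE_ASSERT_MSG(n > 0, "n must be positive"); }
```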
diff --git a/paddle/platform/call_once.h b/paddle/platform/call_once.h
new file mode 100644
index 0000000000..44a4d38f67
--- /dev/null
+++ b/paddle/platform/call_once.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mutex>
+
+namespace paddle {
+namespace platform {
+
+/*
+ The current implementation of std::call_once has a bug described in
+ https://stackoverflow.com/questions/41717579/stdcall-once-hangs-on-second-call-after-callable-threw-on-first-call.
+ This is likely caused by a deeper bug in pthread_once, which is discussed in
+ https://patchwork.ozlabs.org/patch/482350/
+
+ This wrapper is a hack to work around that bug.
+*/
+template <typename Callable, typename... Args>
+inline void call_once(std::once_flag& flag, Callable&& f, Args&&... args) {
+  bool good = true;
+  std::exception ex;
+  try {
+    std::call_once(flag,
+                   [&](Args&&... args) {
+                     try {
+                       f(args...);
+                     } catch (const std::exception& e) {
+                       ex = e;
+                       good = false;
+                     } catch (...) {
+                       ex = std::runtime_error("exception caught in call_once");
+                       good = false;
+                     }
+                   },
+                   args...);
+  } catch (std::system_error& x) {
+    throw std::runtime_error("call once failed");
+  }
+  if (!good) {
+    throw std::exception(ex);
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
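A hypothetical caller of the wrapper above. If the callable throws, the inner lambda swallows the exception, std::call_once completes normally, and the failure is rethrown afterwards, so the flag can never be left permanently locked:

```cpp
#include <mutex>
#include "paddle/platform/call_once.h"

static std::once_flag g_init_flag;

// Hypothetical one-time initializer; a throw from the lambda propagates
// to the caller without hanging later invocations.
void InitOnce() {
  paddle::platform::call_once(g_init_flag, [] {
    // one-time initialization that may throw
  });
}
```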
diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
new file mode 100644
index 0000000000..78e1fa9df5
--- /dev/null
+++ b/paddle/platform/cpu_info.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  int64_t pages = sysconf(_SC_PHYS_PAGES);
+  int64_t page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuMaxAllocSize() {
+  // On distributed systems the fraction of memory to use may need to be
+  // configured and limited.
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+size_t CpuMinChunkSize() {
+  // The minimum chunk size allowed to be allocated is 4 KB.
+  return 1 << 12;
+}
+
+size_t CpuMaxChunkSize() {
+  // The maximum chunk size allowed is roughly 3% of available CPU memory.
+  return CpuMaxAllocSize() / 32;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h
new file mode 100644
index 0000000000..8df7c7b4bc
--- /dev/null
+++ b/paddle/platform/cpu_info.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t CpuMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CpuMaxChunkSize();
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc
new file mode 100644
index 0000000000..1bfe62c1fb
--- /dev/null
+++ b/paddle/platform/cpu_info_test.cc
@@ -0,0 +1,34 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/platform/cpu_info.h"
+#include "paddle/string/printf.h"
+
+#include <iostream>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+TEST(CpuMemoryUsage, Print) {
+  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
+  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
+
+  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
+                                       use_percent, memory_size)
+            << std::endl;
+}
diff --git a/paddle/platform/cuda_helper.h b/paddle/platform/cuda_helper.h
new file mode 100644
index 0000000000..376bb0e688
--- /dev/null
+++ b/paddle/platform/cuda_helper.h
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda.h>
+
+namespace paddle {
+namespace platform {
+
+#define CUDA_ATOMIC_WRAPPER(op, T) \
+  __device__ __forceinline__ T CudaAtomic##op(T* address, const T val)
+
+#define USE_CUDA_ATOMIC(op, T) \
+  CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); }
+
+// Default thread count per block (i.e., the block size).
+// TODO(typhoonzero): need to benchmark against setting this value
+//                    to 1024.
+constexpr int PADDLE_CUDA_NUM_THREADS = 512;
+
+// For atomicAdd.
+USE_CUDA_ATOMIC(Add, float);
+USE_CUDA_ATOMIC(Add, int);
+USE_CUDA_ATOMIC(Add, unsigned int);
+USE_CUDA_ATOMIC(Add, unsigned long long int);
+
+CUDA_ATOMIC_WRAPPER(Add, int64_t) {
+  static_assert(sizeof(int64_t) == sizeof(long long int),
+                "long long should be int64");
+  return CudaAtomicAdd(reinterpret_cast<unsigned long long int*>(address),
+                       static_cast<unsigned long long int>(val));
+}
+
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+USE_CUDA_ATOMIC(Add, double);
+#else
+CUDA_ATOMIC_WRAPPER(Add, double) {
+  unsigned long long int* address_as_ull =
+      reinterpret_cast<unsigned long long int*>(address);
+  unsigned long long int old = *address_as_ull, assumed;
+
+  do {
+    assumed = old;
+    old = atomicCAS(address_as_ull, assumed,
+                    __double_as_longlong(val + __longlong_as_double(assumed)));
+
+    // Note: uses integer comparison to avoid hang in case of NaN
+  } while (assumed != old);
+
+  return __longlong_as_double(old);
+}
+#endif
+
+}  // namespace platform
+}  // namespace paddle
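
A minimal kernel sketch using the atomic wrappers and the default block size
(the kernel and launcher names are illustrative):

    __global__ void SumKernel(const float* in, float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        // Resolves to the hardware atomicAdd for float via
        // USE_CUDA_ATOMIC(Add, float).
        paddle::platform::CudaAtomicAdd(out, in[i]);
      }
    }

    void Sum(const float* in, float* out, int n, cudaStream_t stream) {
      int threads = paddle::platform::PADDLE_CUDA_NUM_THREADS;
      int blocks = (n + threads - 1) / threads;
      SumKernel<<<blocks, threads, 0, stream>>>(in, out, n);
    }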
diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h
new file mode 100644
index 0000000000..67d5f626d4
--- /dev/null
+++ b/paddle/platform/cuda_profiler.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <cuda_profiler_api.h>
+#include <string>
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::string config_file) {
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
+}
+
+void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+
+}  // namespace platform
+}  // namespace paddle
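
A minimal sketch bracketing a region of interest with the helpers above
(the file names and the workload are placeholders):

    paddle::platform::CudaProfilerInit("prof.csv", "csv", "prof_config.txt");
    paddle::platform::CudaProfilerStart();
    RunWorkload();  // hypothetical region to profile
    paddle::platform::CudaProfilerStop();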
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
new file mode 100644
index 0000000000..80a4c9bb4b
--- /dev/null
+++ b/paddle/platform/cudnn_helper.h
@@ -0,0 +1,286 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/macros.h"
+
+namespace paddle {
+namespace platform {
+
+inline const char* cudnnGetErrorString(cudnnStatus_t status) {
+  switch (status) {
+    case CUDNN_STATUS_SUCCESS:
+      return "CUDNN_STATUS_SUCCESS";
+    case CUDNN_STATUS_NOT_INITIALIZED:
+      return "CUDNN_STATUS_NOT_INITIALIZED";
+    case CUDNN_STATUS_ALLOC_FAILED:
+      return "CUDNN_STATUS_ALLOC_FAILED";
+    case CUDNN_STATUS_BAD_PARAM:
+      return "CUDNN_STATUS_BAD_PARAM";
+    case CUDNN_STATUS_INTERNAL_ERROR:
+      return "CUDNN_STATUS_INTERNAL_ERROR";
+    case CUDNN_STATUS_INVALID_VALUE:
+      return "CUDNN_STATUS_INVALID_VALUE";
+    case CUDNN_STATUS_ARCH_MISMATCH:
+      return "CUDNN_STATUS_ARCH_MISMATCH";
+    case CUDNN_STATUS_MAPPING_ERROR:
+      return "CUDNN_STATUS_MAPPING_ERROR";
+    case CUDNN_STATUS_EXECUTION_FAILED:
+      return "CUDNN_STATUS_EXECUTION_FAILED";
+    case CUDNN_STATUS_NOT_SUPPORTED:
+      return "CUDNN_STATUS_NOT_SUPPORTED";
+    case CUDNN_STATUS_LICENSE_ERROR:
+      return "CUDNN_STATUS_LICENSE_ERROR";
+    default:
+      return "Unknown cudnn error number";
+  }
+}
+
+#define CUDNN_VERSION_MIN(major, minor, patch) \
+  (CUDNN_VERSION >= ((major)*1000 + (minor)*100 + (patch)))
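+// For example, CUDNN_VERSION_MIN(6, 0, 0) expands to (CUDNN_VERSION >= 6000),
+// i.e. it is true from cuDNN 6.0.0 onwards.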
+
+#define CUDNN_ENFORCE(condition)                                  \
+  do {                                                            \
+    cudnnStatus_t status = condition;                             \
+    if (status != CUDNN_STATUS_SUCCESS) {                         \
+      VLOG(1) << ::paddle::platform::cudnnGetErrorString(status); \
+      PADDLE_THROW("cuDNN call failed");                          \
+    }                                                             \
+  } while (false)
+
+enum class DataLayout {  // Not used yet
+  kNHWC,
+  kNCHW,
+  kNCDHW,
+  kNCHW_VECT_C,
+};
+
+enum class PoolingMode {
+  kMaximum,
+  kAverage,
+};
+
+template <typename T>
+class CudnnDataType;
+
+template <>
+class CudnnDataType<float> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_FLOAT;
+  typedef const float ScalingParamType;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+
+template <>
+class CudnnDataType<double> {
+ public:
+  static const cudnnDataType_t type = CUDNN_DATA_DOUBLE;
+  typedef const double ScalingParamType;
+  static ScalingParamType* kOne() {
+    static ScalingParamType v = 1.0;
+    return &v;
+  }
+  static ScalingParamType* kZero() {
+    static ScalingParamType v = 0.0;
+    return &v;
+  }
+};
+
+inline cudnnTensorFormat_t GetCudnnTensorFormat(
+    const DataLayout& order) {  // Not used yet
+  switch (order) {
+    case DataLayout::kNHWC:
+      return CUDNN_TENSOR_NHWC;
+    case DataLayout::kNCHW:
+      return CUDNN_TENSOR_NCHW;
+    case DataLayout::kNCDHW:
+      return CUDNN_TENSOR_NCHW;  // NOTE: cuDNN treats Nd tensors as NCHW
+    default:
+      PADDLE_THROW("Unknown cudnn equivalent for order");
+  }
+  return CUDNN_TENSOR_NCHW;
+}
+
+class ScopedTensorDescriptor {
+ public:
+  ScopedTensorDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateTensorDescriptor(&desc_));
+  }
+  ~ScopedTensorDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroyTensorDescriptor(desc_));
+  }
+
+  inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
+                                            const cudnnDataType_t type,
+                                            const std::vector<int>& dims,
+                                            const int groups = 1) {
+    // The format argument is not used yet; support will be added later.
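+    // The strides below describe a densely packed tensor: the innermost
+    // dimension has stride 1, e.g. dims {2, 4, 6, 6} yield strides
+    // {144, 36, 6, 1}.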
+    std::vector<int> strides(dims.size());
+    strides[dims.size() - 1] = 1;
+    for (int i = dims.size() - 2; i >= 0; i--) {
+      strides[i] = dims[i + 1] * strides[i + 1];
+    }
+    // Update tensor descriptor dims setting if groups > 1
+    // NOTE: Assume using NCHW or NCDHW order
+    std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
+    PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
+        desc_, type, dims_with_group.size(), dims_with_group.data(),
+        strides.data()));
+    return desc_;
+  }
+
+  template <typename T>
+  inline cudnnTensorDescriptor_t descriptor(const DataLayout& order,
+                                            const std::vector<int>& dims,
+                                            const int groups = 1) {
+    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type, dims,
+                      groups);
+  }
+
+ private:
+  cudnnTensorDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedTensorDescriptor);
+};
+
+class ScopedFilterDescriptor {
+ public:
+  ScopedFilterDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateFilterDescriptor(&desc_));
+  }
+  ~ScopedFilterDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroyFilterDescriptor(desc_));
+  }
+
+  inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
+                                            const cudnnDataType_t type,
+                                            const std::vector<int>& kernel,
+                                            const int groups = 1) {
+    // filter layout: MCHW (MCDHW), where M is the number of
+    // output image channels, C is the number of input image channels,
+    // D is the depth of the filter, H is the height of the filter, and W is the
+    // width of the filter.
+    std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
+    if (groups > 1) {
+      kernel_with_group[0] /= groups;
+      // NOTE: the filter's input channel count is already asserted to be
+      // C/groups.
+    }
+    PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
+        desc_, type, format, kernel_with_group.size(),
+        kernel_with_group.data()));
+    return desc_;
+  }
+
+  template <typename T>
+  inline cudnnFilterDescriptor_t descriptor(const DataLayout& order,
+                                            const std::vector<int>& kernel,
+                                            const int groups = 1) {
+    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
+                      kernel, groups);
+  }
+
+ private:
+  cudnnFilterDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedFilterDescriptor);
+};
+
+class ScopedConvolutionDescriptor {
+ public:
+  ScopedConvolutionDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreateConvolutionDescriptor(&desc_));
+  }
+  ~ScopedConvolutionDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroyConvolutionDescriptor(desc_));
+  }
+
+  inline cudnnConvolutionDescriptor_t descriptor(
+      cudnnDataType_t type, const std::vector<int>& pads,
+      const std::vector<int>& strides, const std::vector<int>& dilations) {
+    PADDLE_ENFORCE_EQ(pads.size(), strides.size());
+    PADDLE_ENFORCE_EQ(pads.size(), dilations.size());
+
+#if !CUDNN_VERSION_MIN(6, 0, 0)
+    // cuDNN v5 does not support dilated convolution; the argument is called
+    // upscale instead of dilations and must be one.
+    for (size_t i = 0; i < dilations.size(); ++i) {
+      PADDLE_ENFORCE_EQ(
+          dilations[i], 1,
+          "Dilations conv is not supported in this cuDNN version(%d.%d.%d).",
+          CUDNN_VERSION / 1000, CUDNN_VERSION % 1000 / 100,
+          CUDNN_VERSION % 100);
+    }
+#endif
+
+    PADDLE_ENFORCE(dynload::cudnnSetConvolutionNdDescriptor(
+        desc_, pads.size(), pads.data(), strides.data(), dilations.data(),
+        CUDNN_CROSS_CORRELATION, type));
+    return desc_;
+  }
+
+  template <typename T>
+  inline cudnnConvolutionDescriptor_t descriptor(
+      const std::vector<int>& pads, const std::vector<int>& strides,
+      const std::vector<int>& dilations) {
+    return descriptor(CudnnDataType<T>::type, pads, strides, dilations);
+  }
+
+ private:
+  cudnnConvolutionDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
+};
+
+class ScopedPoolingDescriptor {
+ public:
+  ScopedPoolingDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnCreatePoolingDescriptor(&desc_));
+  }
+  ~ScopedPoolingDescriptor() {
+    PADDLE_ENFORCE(dynload::cudnnDestroyPoolingDescriptor(desc_));
+  }
+
+  inline cudnnPoolingDescriptor_t descriptor(const PoolingMode& mode,
+                                             const std::vector<int>& kernel,
+                                             const std::vector<int>& pads,
+                                             const std::vector<int>& strides) {
+    PADDLE_ENFORCE_EQ(kernel.size(), pads.size());
+    PADDLE_ENFORCE_EQ(kernel.size(), strides.size());
+    PADDLE_ENFORCE(dynload::cudnnSetPoolingNdDescriptor(
+        desc_, (mode == PoolingMode::kMaximum
+                    ? CUDNN_POOLING_MAX
+                    : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
+        CUDNN_PROPAGATE_NAN,  // Always propagate nans.
+        kernel.size(), kernel.data(), pads.data(), strides.data()));
+    return desc_;
+  }
+
+ private:
+  cudnnPoolingDescriptor_t desc_;
+  DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cudnn_helper_test.cc b/paddle/platform/cudnn_helper_test.cc
new file mode 100644
index 0000000000..427359f697
--- /dev/null
+++ b/paddle/platform/cudnn_helper_test.cc
@@ -0,0 +1,154 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cudnn_helper.h"
+#include <gtest/gtest.h>
+
+TEST(CudnnHelper, ScopedTensorDescriptor) {
+  using paddle::platform::ScopedTensorDescriptor;
+  using paddle::platform::DataLayout;
+
+  ScopedTensorDescriptor tensor_desc;
+  std::vector<int> shape = {2, 4, 6, 6};
+  auto desc = tensor_desc.descriptor<float>(DataLayout::kNCHW, shape);
+
+  cudnnDataType_t type;
+  int nd;
+  std::vector<int> dims(4);
+  std::vector<int> strides(4);
+  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
+      desc, 4, &type, &nd, dims.data(), strides.data());
+
+  EXPECT_EQ(nd, 4);
+  for (size_t i = 0; i < dims.size(); ++i) {
+    EXPECT_EQ(dims[i], shape[i]);
+  }
+  EXPECT_EQ(strides[3], 1);
+  EXPECT_EQ(strides[2], 6);
+  EXPECT_EQ(strides[1], 36);
+  EXPECT_EQ(strides[0], 144);
+
+  // test tensor5d: ScopedTensorDescriptor
+  ScopedTensorDescriptor tensor5d_desc;
+  std::vector<int> shape_5d = {2, 4, 6, 6, 6};
+  auto desc_5d = tensor5d_desc.descriptor<float>(DataLayout::kNCDHW, shape_5d);
+
+  std::vector<int> dims_5d(5);
+  std::vector<int> strides_5d(5);
+  paddle::platform::dynload::cudnnGetTensorNdDescriptor(
+      desc_5d, 5, &type, &nd, dims_5d.data(), strides_5d.data());
+
+  EXPECT_EQ(nd, 5);
+  for (size_t i = 0; i < dims_5d.size(); ++i) {
+    EXPECT_EQ(dims_5d[i], shape_5d[i]);
+  }
+  EXPECT_EQ(strides_5d[4], 1);
+  EXPECT_EQ(strides_5d[3], 6);
+  EXPECT_EQ(strides_5d[2], 36);
+  EXPECT_EQ(strides_5d[1], 216);
+  EXPECT_EQ(strides_5d[0], 864);
+}
+
+TEST(CudnnHelper, ScopedFilterDescriptor) {
+  using paddle::platform::ScopedFilterDescriptor;
+  using paddle::platform::DataLayout;
+
+  ScopedFilterDescriptor filter_desc;
+  std::vector<int> shape = {2, 3, 3};
+  auto desc = filter_desc.descriptor<float>(DataLayout::kNCHW, shape);
+
+  cudnnDataType_t type;
+  int nd;
+  cudnnTensorFormat_t format;
+  std::vector<int> kernel(3);
+  paddle::platform::dynload::cudnnGetFilterNdDescriptor(desc, 3, &type, &format,
+                                                        &nd, kernel.data());
+
+  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < shape.size(); ++i) {
+    EXPECT_EQ(kernel[i], shape[i]);
+  }
+
+  ScopedFilterDescriptor filter_desc_4d;
+  std::vector<int> shape_4d = {2, 3, 3, 3};
+  auto desc_4d =
+      filter_desc_4d.descriptor<float>(DataLayout::kNCDHW, shape_4d);
+
+  std::vector<int> kernel_4d(4);
+  paddle::platform::dynload::cudnnGetFilterNdDescriptor(
+      desc_4d, 4, &type, &format, &nd, kernel_4d.data());
+
+  EXPECT_EQ(GetCudnnTensorFormat(DataLayout::kNCHW), format);
+  EXPECT_EQ(nd, 4);
+  for (size_t i = 0; i < shape_4d.size(); ++i) {
+    EXPECT_EQ(kernel_4d[i], shape_4d[i]);
+  }
+}
+
+TEST(CudnnHelper, ScopedConvolutionDescriptor) {
+  using paddle::platform::ScopedConvolutionDescriptor;
+
+  ScopedConvolutionDescriptor conv_desc;
+  std::vector<int> src_pads = {2, 2, 2};
+  std::vector<int> src_strides = {1, 1, 1};
+  std::vector<int> src_dilations = {1, 1, 1};
+  auto desc = conv_desc.descriptor<float>(src_pads, src_strides, src_dilations);
+
+  cudnnDataType_t type;
+  cudnnConvolutionMode_t mode;
+  int nd;
+  std::vector<int> pads(3);
+  std::vector<int> strides(3);
+  std::vector<int> dilations(3);
+  paddle::platform::dynload::cudnnGetConvolutionNdDescriptor(
+      desc, 3, &nd, pads.data(), strides.data(), dilations.data(), &mode,
+      &type);
+
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < src_pads.size(); ++i) {
+    EXPECT_EQ(pads[i], src_pads[i]);
+    EXPECT_EQ(strides[i], src_strides[i]);
+    EXPECT_EQ(dilations[i], src_dilations[i]);
+  }
+  EXPECT_EQ(mode, CUDNN_CROSS_CORRELATION);
+}
+
+TEST(CudnnHelper, ScopedPoolingDescriptor) {
+  using paddle::platform::ScopedPoolingDescriptor;
+  using paddle::platform::PoolingMode;
+
+  ScopedPoolingDescriptor pool_desc;
+  std::vector<int> src_kernel = {2, 2, 5};
+  std::vector<int> src_pads = {1, 1, 2};
+  std::vector<int> src_strides = {2, 2, 3};
+  auto desc = pool_desc.descriptor(PoolingMode::kMaximum, src_kernel, src_pads,
+                                   src_strides);
+
+  cudnnPoolingMode_t mode;
+  cudnnNanPropagation_t nan_t = CUDNN_PROPAGATE_NAN;
+  int nd;
+  std::vector<int> kernel(3);
+  std::vector<int> pads(3);
+  std::vector<int> strides(3);
+  paddle::platform::dynload::cudnnGetPoolingNdDescriptor(
+      desc, 3, &mode, &nan_t, &nd, kernel.data(), pads.data(), strides.data());
+
+  EXPECT_EQ(nd, 3);
+  for (size_t i = 0; i < src_pads.size(); ++i) {
+    EXPECT_EQ(kernel[i], src_kernel[i]);
+    EXPECT_EQ(pads[i], src_pads[i]);
+    EXPECT_EQ(strides[i], src_strides[i]);
+  }
+  EXPECT_EQ(mode, CUDNN_POOLING_MAX);
+}
diff --git a/paddle/platform/details/device_ptr_cast.h b/paddle/platform/details/device_ptr_cast.h
new file mode 100644
index 0000000000..4015491fcd
--- /dev/null
+++ b/paddle/platform/details/device_ptr_cast.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef __NVCC__
+#error device_ptr_cast must be included by a .cu file
+#endif
+
+#include <thrust/device_ptr.h>
+#include <type_traits>
+
+namespace paddle {
+namespace platform {
+namespace details {
+template <typename T, bool is_ptr>
+struct DevicePtrCast;
+
+template <typename T>
+struct DevicePtrCast<T, true> {
+  using ELEM = typename std::remove_pointer<T>::type;
+  using RTYPE = thrust::device_ptr<ELEM>;
+
+  inline thrust::device_ptr<ELEM> operator()(ELEM* ele) const {
+    return thrust::device_pointer_cast(ele);
+  }
+};
+
+template <typename T>
+struct DevicePtrCast<T, false> {
+  using RTYPE = T;
+  inline RTYPE operator()(RTYPE it) const { return it; }
+};
+
+// Cast T to thrust::device_ptr if T is a pointer.
+// Otherwise, e.g. when T is an iterator, return T itself.
+template <typename T>
+auto DevPtrCast(T t) ->
+    typename DevicePtrCast<T, std::is_pointer<T>::value>::RTYPE {
+  DevicePtrCast<T, std::is_pointer<T>::value> cast;
+  return cast(t);
+}
+
+}  // namespace details
+}  // namespace platform
+}  // namespace paddle
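
A minimal sketch of the cast in a .cu translation unit (the raw device
pointer and the thrust::fill usage are illustrative):

    #include <thrust/fill.h>
    #include "paddle/platform/details/device_ptr_cast.h"

    void FillOnes(float* dev_buf, int n) {
      // A raw device pointer becomes a thrust::device_ptr<float>; an
      // iterator argument would be passed through unchanged.
      auto first = paddle::platform::details::DevPtrCast(dev_buf);
      thrust::fill(first, first + n, 1.0f);
    }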
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
new file mode 100644
index 0000000000..9d9348079a
--- /dev/null
+++ b/paddle/platform/device_context.cc
@@ -0,0 +1,236 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
+#include "paddle/memory/memory.h"
+
+namespace paddle {
+namespace platform {
+
+DeviceContextPool* DeviceContextPool::pool = nullptr;
+
+const platform::DeviceContext* DeviceContextPool::Get(
+    const platform::Place& place) {
+  auto it = device_contexts_.find(place);
+  if (it == device_contexts_.end()) {
+    PADDLE_THROW(
+        "'Place' is not supported, Please re-compile with WITH_GPU "
+        "option");
+  }
+  return it->second;
+}
+
+DeviceContextPool::DeviceContextPool(
+    const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_.emplace(places[i],
+                               new platform::CPUDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_.emplace(places[i],
+                               new platform::CUDADeviceContext(
+                                   boost::get<platform::CUDAPlace>(places[i])));
+#else
+      PADDLE_THROW(
+          "'CUDAPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
+#endif
+    }
+  }
+}
+
+CPUDeviceContext::CPUDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) : place_(place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CPUDeviceContext::GetPlace() const { return place_; }
+
+#ifdef PADDLE_WITH_CUDA
+
+class EigenCudaStreamDevice : public Eigen::StreamInterface {
+ public:
+  EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) {
+    Eigen::initializeDeviceProp();
+  }
+  ~EigenCudaStreamDevice() override {}
+
+  void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) {
+    stream_ = cuda_stream;
+    place_ = place;
+    device_prop_ = &Eigen::m_deviceProperties[place.device];
+  }
+
+  const cudaStream_t& stream() const override { return *stream_; }
+
+  const cudaDeviceProp& deviceProperties() const override {
+    return *device_prop_;
+  }
+
+  void* allocate(size_t num_bytes) const override {
+    return paddle::memory::Alloc(place_, num_bytes);
+  }
+
+  void deallocate(void* buffer) const override {
+    paddle::memory::Free(place_, buffer);
+  }
+
+  void* scratchpad() const override {
+    if (scratch_ == NULL) {
+      scratch_ = allocate(Eigen::kCudaScratchSize + sizeof(unsigned int));
+    }
+    return scratch_;
+  }
+
+  unsigned int* semaphore() const override {
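+    // The semaphore lives in the extra word allocated past the Eigen
+    // scratch area by scratchpad(), and is zeroed once on first use.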
+    if (semaphore_ == NULL) {
+      char* scratch =
+          static_cast<char*>(scratchpad()) + Eigen::kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      PADDLE_ENFORCE(
+          cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_));
+    }
+    return semaphore_;
+  }
+
+ private:
+  CUDAPlace place_;
+  const cudaStream_t* stream_;         // not owned;
+  const cudaDeviceProp* device_prop_;  // not owned;
+  mutable void* scratch_;
+  mutable unsigned int* semaphore_;
+};
+
+CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
+  SetDeviceId(place_.device);
+  PADDLE_ENFORCE(cudaStreamCreate(&stream_));
+  eigen_stream_.reset(new EigenCudaStreamDevice());
+  eigen_stream_->Reinitialize(&stream_, place);
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+  PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
+  if (dynload::HasCUDNN()) {
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+    PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
+  } else {
+    cudnn_handle_ = nullptr;
+  }
+}
+
+CUDADeviceContext::~CUDADeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  if (cudnn_handle_ != nullptr) {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+  }
+  eigen_stream_.reset();
+  eigen_device_.reset();
+  PADDLE_ENFORCE(cudaStreamDestroy(stream_));
+}
+
+Place CUDADeviceContext::GetPlace() const { return place_; }
+
+void CUDADeviceContext::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(stream_));
+  PADDLE_ENFORCE(cudaGetLastError());
+}
+
+Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+cublasHandle_t CUDADeviceContext::cublas_handle() const {
+  return cublas_handle_;
+}
+
+cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
+
+cudaStream_t CUDADeviceContext::stream() const { return stream_; }
+
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
+    : CPUDeviceContext(place), ready_(false) {
+  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+}
+
+template <typename T>
+void MKLDNNDeviceContext::AddElement(const std::string& op_key,
+                                     const T& value) {
+  if (GetElement<T>(op_key)) {
+    return;
+  }
+  GetElementPool<T>().emplace(op_key, value);
+}
+
+template <typename T>
+const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
+  // Return a reference to a static empty value on a miss; the previous
+  // `? nullptr : it->second` would bind the reference to a temporary.
+  static const T kEmpty = nullptr;
+  auto it = GetElementPool<T>().find(op_key);
+  return it == GetElementPool<T>().end() ? kEmpty : it->second;
+}
+
+template <>
+const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
+                         std::hash<std::string>>&
+MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
+  return memory_pool_;
+}
+
+template <>
+const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
+                         std::hash<std::string>>&
+MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
+  return primitive_pool_;
+}
+
+template <>
+const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
+                         std::hash<std::string>>&
+MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
+  return primitive_desc_pool_;
+}
+
+void MKLDNNDeviceContext::Execute(bool block) {
+  if (pipeline_.empty()) {
+    return;
+  }
+  ResetStream();
+  stream_->submit(pipeline_).wait(block);
+  ready_ = false;
+  pipeline_.clear();
+}
+
+void MKLDNNDeviceContext::ResetStream() {
+  if (ready_) {
+    return;
+  }
+  // TODO(TJ): change me when mkldnn have specific method to reset this state
+  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+  ready_ = true;
+}
+
+#endif
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
new file mode 100644
index 0000000000..9826a64276
--- /dev/null
+++ b/paddle/platform/device_context.h
@@ -0,0 +1,210 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/gpu_info.h"
+#define EIGEN_USE_GPU
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+#include "paddle/platform/mkldnn_helper.h"
+#endif
+
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+#include "glog/logging.h"
+
+namespace paddle {
+namespace platform {
+
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  virtual void Wait() const {}
+};
+
+class CPUDeviceContext : public DeviceContext {
+ public:
+  CPUDeviceContext();
+  explicit CPUDeviceContext(CPUPlace place);
+
+  Eigen::DefaultDevice* eigen_device() const;
+
+  Place GetPlace() const override;
+
+ private:
+  CPUPlace place_;
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
+
+template <typename Place>
+struct DefaultDeviceContextType;
+
+template <>
+struct DefaultDeviceContextType<platform::CPUPlace> {
+  using TYPE = CPUDeviceContext;
+};
+
+#ifdef PADDLE_WITH_CUDA
+
+class EigenCudaStreamDevice;
+
+class CUDADeviceContext : public DeviceContext {
+ public:
+  explicit CUDADeviceContext(CUDAPlace place);
+  virtual ~CUDADeviceContext();
+
+  /*! \brief  Wait for the completion of all operations in the stream. */
+  void Wait() const override;
+
+  /*! \brief  Return place in the device context. */
+  Place GetPlace() const override;
+
+  /*! \brief  Return eigen device in the device context. */
+  Eigen::GpuDevice* eigen_device() const;
+
+  /*! \brief  Return cublas handle in the device context. */
+  cublasHandle_t cublas_handle() const;
+
+  /*! \brief  Return cudnn handle in the device context. */
+  cudnnHandle_t cudnn_handle() const;
+
+  /*! \brief  Return cuda stream in the device context. */
+  cudaStream_t stream() const;
+
+ private:
+  CUDAPlace place_;
+
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
+  std::unique_ptr<EigenCudaStreamDevice> eigen_stream_;
+
+  cudaStream_t stream_;
+  cudnnHandle_t cudnn_handle_;
+  cublasHandle_t cublas_handle_;
+};
+
+template <>
+struct DefaultDeviceContextType<platform::CUDAPlace> {
+  using TYPE = CUDADeviceContext;
+};
+
+#endif
+
+#ifdef PADDLE_WITH_MKLDNN
+class MKLDNNDeviceContext : public CPUDeviceContext {
+ public:
+  explicit MKLDNNDeviceContext(CPUPlace place);
+
+  /* \brief  Add new element: memory, primitive or primitive desc */
+  template <typename T>
+  void AddElement(const std::string& op_key, const T& value);
+
+  /* \brief  Get existed element: memory, primitive or primitive desc */
+  template <typename T>
+  const T& GetElement(const std::string& op_key) const;
+
+  /* \brief  Get element pool: memory, primitive or primitive desc pool */
+  template <typename T>
+  const std::unordered_map<const std::string, const T, std::hash<std::string>>&
+  GetElementPool() const;
+
+  /* \brief  Get the active engine */
+  const MKLDNNEngine& engine() const { return *engine_; }
+
+  /* \brief  Submit primitive to pipeline */
+  void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
+
+  /*! \brief  Execute all submitted primitives in pipeline */
+  void Execute(bool block = true);
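+  //
+  // Typical flow (the primitives themselves are built elsewhere):
+  //   ctx.Submit(p1);
+  //   ctx.Submit(p2);
+  //   ctx.Execute();  // runs the whole pipeline, then clears it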
+
+ protected:
+  /*! \brief  Reset the stream to prepare for the next execute */
+  void ResetStream();
+
+ private:
+  std::unordered_map<const std::string, const MKLDNNMemoryPtr,
+                     std::hash<std::string>>
+      memory_pool_;
+  std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
+                     std::hash<std::string>>
+      primitive_pool_;
+  std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
+                     std::hash<std::string>>
+      primitive_desc_pool_;
+  std::vector<MKLDNNPrimitive> pipeline_;
+  MKLDNNStreamPtr stream_;
+  MKLDNNEnginePtr engine_;
+  bool ready_;
+};
+#endif
+
+/*! \brief device context pool singleton */
+class DeviceContextPool {
+ public:
+  explicit DeviceContextPool(const std::vector<platform::Place>& places);
+
+  static DeviceContextPool& Instance() {
+    PADDLE_ENFORCE_NOT_NULL(pool, "Need to Create DeviceContextPool first!");
+    return *pool;
+  }
+
+  /*! \brief  Create should only be called by the Init function */
+  static DeviceContextPool& Init(const std::vector<platform::Place>& places) {
+    if (pool == nullptr) {
+      pool = new DeviceContextPool(places);
+    }
+    return *pool;
+  }
+
+  /*! \brief  Return handle of single device context. */
+  const platform::DeviceContext* Get(const platform::Place& place);
+
+  template <typename Place>
+  const typename DefaultDeviceContextType<Place>::TYPE* GetByPlace(
+      const Place& place) {
+    return reinterpret_cast<
+        const typename DefaultDeviceContextType<Place>::TYPE*>(Get(place));
+  }
+
+  size_t size() const { return device_contexts_.size(); }
+
+ private:
+  static DeviceContextPool* pool;
+  constexpr static int LEFT_SHIFT = 8;
+  struct Hash {
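+    // Combine the variant index of the Place (CPU vs. GPU) with the device
+    // id: shifting the index left by LEFT_SHIFT keeps up to 2^8 devices
+    // distinct before hashing.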
+    std::hash<int> hash_;
+    size_t operator()(const platform::Place& place) const {
+      int pre_hash = place.which() << LEFT_SHIFT;
+      if (platform::is_gpu_place(place)) {
+        pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
+      }
+      return hash_(pre_hash);
+    }
+  };
+  std::unordered_map<const platform::Place, const platform::DeviceContext*,
+                     Hash>
+      device_contexts_;
+  DISABLE_COPY_AND_ASSIGN(DeviceContextPool);
+};
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu
new file mode 100644
index 0000000000..767fe9b24a
--- /dev/null
+++ b/paddle/platform/device_context_test.cu
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+
+#include "glog/logging.h"
+
+TEST(Device, Init) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device);
+    delete device_context;
+  }
+}
+
+TEST(Device, CUDADeviceContext) {
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i));
+    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device);
+    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+    ASSERT_NE(nullptr, cudnn_handle);
+    cublasHandle_t cublas_handle = device_context->cublas_handle();
+    ASSERT_NE(nullptr, cublas_handle);
+    ASSERT_NE(nullptr, device_context->stream());
+    delete device_context;
+  }
+}
+
+TEST(Device, DeviceContextPool) {
+  using paddle::platform::DeviceContextPool;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::Place;
+  using paddle::platform::CPUPlace;
+  using paddle::platform::CUDAPlace;
+
+  DeviceContextPool& pool = DeviceContextPool::Instance();
+  auto cpu_dev_ctx1 = pool.Get(CPUPlace());
+  auto cpu_dev_ctx2 = pool.Get(CPUPlace());
+  ASSERT_EQ(cpu_dev_ctx2, cpu_dev_ctx1);
+
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    auto dev_ctx = pool.Get(CUDAPlace(i));
+    ASSERT_NE(dev_ctx, nullptr);
+  }
+}
+
+int main(int argc, char** argv) {
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
new file mode 100644
index 0000000000..cf2081b434
--- /dev/null
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -0,0 +1,4 @@
+cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
+        DEPS dynamic_loader nccl)
+cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc
new file mode 100644
index 0000000000..6aca716657
--- /dev/null
+++ b/paddle/platform/dynload/cublas.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/cublas.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cublas_dso_flag;
+void *cublas_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
new file mode 100644
index 0000000000..61a22d9db3
--- /dev/null
+++ b/paddle/platform/dynload/cublas.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cublas_v2.h>
+#include <dlfcn.h>
+#include <mutex>
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag cublas_dso_flag;
+extern void *cublas_dso_handle;
+
+/**
+ * The macro below generates, for each cuBLAS routine, a struct that
+ * loads the routine from the shared library on first use and forwards
+ * calls to it via an overloaded operator().
+ *
+ * Note: without PADDLE_USE_DSO, calls forward directly to the
+ * dynamically linked default libs instead.
+ */
+#ifdef PADDLE_USE_DSO
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
+  struct DynLoad__##__name {                                        \
+    template <typename... Args>                                     \
+    inline cublasStatus_t operator()(Args... args) {                \
+      typedef cublasStatus_t (*cublasFunc)(Args...);                \
+      std::call_once(cublas_dso_flag,                               \
+                     paddle::platform::dynload::GetCublasDsoHandle, \
+                     &cublas_dso_handle);                           \
+      void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
+      return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
+    }                                                               \
+  };                                                                \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
+  struct DynLoad__##__name {                         \
+    template <typename... Args>                      \
+    inline cublasStatus_t operator()(Args... args) { \
+      return __name(args...);                        \
+    }                                                \
+  };                                                 \
+  extern DynLoad__##__name __name
+#endif
+
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
+  DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+
+#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+  __macro(cublasSaxpy_v2);                \
+  __macro(cublasDaxpy_v2);                \
+  __macro(cublasSgemv_v2);                \
+  __macro(cublasDgemv_v2);                \
+  __macro(cublasSgemm_v2);                \
+  __macro(cublasDgemm_v2);                \
+  __macro(cublasSgeam_v2);                \
+  __macro(cublasDgeam_v2);                \
+  __macro(cublasCreate_v2);               \
+  __macro(cublasDestroy_v2);              \
+  __macro(cublasSetStream_v2);            \
+  __macro(cublasSetPointerMode_v2);       \
+  __macro(cublasGetPointerMode_v2);       \
+  __macro(cublasSgemmBatched);            \
+  __macro(cublasDgemmBatched);            \
+  __macro(cublasCgemmBatched);            \
+  __macro(cublasZgemmBatched);            \
+  __macro(cublasSgemmStridedBatched);     \
+  __macro(cublasDgemmStridedBatched);     \
+  __macro(cublasCgemmStridedBatched);     \
+  __macro(cublasZgemmStridedBatched);     \
+  __macro(cublasSgetrfBatched);           \
+  __macro(cublasSgetriBatched);           \
+  __macro(cublasDgetrfBatched);           \
+  __macro(cublasDgetriBatched)
+
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
+
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
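
A minimal call-site sketch (mirroring device_context.cc above): the
generated functor is invoked exactly like the cuBLAS routine it wraps,
loading the shared library once on first use under PADDLE_USE_DSO:

    cublasHandle_t handle;
    PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&handle));
    // ... cublasSgemm_v2 etc. through the same dynload:: wrappers ...
    PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(handle));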
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
new file mode 100644
index 0000000000..701f6240fe
--- /dev/null
+++ b/paddle/platform/dynload/cudnn.cc
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R5
+CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
+
+#ifdef PADDLE_USE_DSO
+bool HasCUDNN() {
+  std::call_once(cudnn_dso_flag, GetCUDNNDsoHandle, &cudnn_dso_handle);
+  return cudnn_dso_handle != nullptr;
+}
+
+void EnforceCUDNNLoaded(const char* fn_name) {
+  PADDLE_ENFORCE(cudnn_dso_handle != nullptr,
+                 "Cannot load cudnn shared library. Cannot invoke method %s",
+                 fn_name);
+}
+#else
+bool HasCUDNN() { return true; }
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
new file mode 100644
index 0000000000..b926347949
--- /dev/null
+++ b/paddle/platform/dynload/cudnn.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <cudnn.h>
+#include <dlfcn.h>
+#include <mutex>
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag cudnn_dso_flag;
+extern void* cudnn_dso_handle;
+extern bool HasCUDNN();
+
+#ifdef PADDLE_USE_DSO
+
+extern void EnforceCUDNNLoaded(const char* fn_name);
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
+  struct DynLoad__##__name {                                       \
+    template <typename... Args>                                    \
+    auto operator()(Args... args) -> decltype(__name(args...)) {   \
+      using cudnn_func = decltype(__name(args...)) (*)(Args...);   \
+      std::call_once(cudnn_dso_flag,                               \
+                     paddle::platform::dynload::GetCUDNNDsoHandle, \
+                     &cudnn_dso_handle);                           \
+      EnforceCUDNNLoaded(#__name);                                 \
+      void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
+      return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
+    }                                                              \
+  };                                                               \
+  extern struct DynLoad__##__name __name
+
+#else
+
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                  \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      return __name(args...);                                    \
+    }                                                            \
+  };                                                             \
+  extern DynLoad__##__name __name
+
+#endif
+
+/**
+ * Declare wrappers for all cuDNN functions needed in HPPL.
+ * Different cuDNN versions expose different interfaces, hence the
+ * version-gated groups below.
+ **/
+#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
+  __macro(cudnnSetTensor4dDescriptor);              \
+  __macro(cudnnSetTensor4dDescriptorEx);            \
+  __macro(cudnnSetTensorNdDescriptor);              \
+  __macro(cudnnGetTensorNdDescriptor);              \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
+  __macro(cudnnGetConvolutionForwardAlgorithm);     \
+  __macro(cudnnCreateTensorDescriptor);             \
+  __macro(cudnnDestroyTensorDescriptor);            \
+  __macro(cudnnCreateFilterDescriptor);             \
+  __macro(cudnnSetFilter4dDescriptor);              \
+  __macro(cudnnSetFilterNdDescriptor);              \
+  __macro(cudnnGetFilterNdDescriptor);              \
+  __macro(cudnnSetPooling2dDescriptor);             \
+  __macro(cudnnSetPoolingNdDescriptor);             \
+  __macro(cudnnGetPoolingNdDescriptor);             \
+  __macro(cudnnDestroyFilterDescriptor);            \
+  __macro(cudnnCreateConvolutionDescriptor);        \
+  __macro(cudnnCreatePoolingDescriptor);            \
+  __macro(cudnnDestroyPoolingDescriptor);           \
+  __macro(cudnnSetConvolution2dDescriptor);         \
+  __macro(cudnnDestroyConvolutionDescriptor);       \
+  __macro(cudnnSetConvolutionNdDescriptor);         \
+  __macro(cudnnGetConvolutionNdDescriptor);         \
+  __macro(cudnnDeriveBNTensorDescriptor);           \
+  __macro(cudnnCreate);                             \
+  __macro(cudnnDestroy);                            \
+  __macro(cudnnSetStream);                          \
+  __macro(cudnnActivationForward);                  \
+  __macro(cudnnConvolutionForward);                 \
+  __macro(cudnnConvolutionBackwardBias);            \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
+  __macro(cudnnTransformTensor);                    \
+  __macro(cudnnPoolingForward);                     \
+  __macro(cudnnPoolingBackward);                    \
+  __macro(cudnnSoftmaxBackward);                    \
+  __macro(cudnnSoftmaxForward);                     \
+  __macro(cudnnGetVersion);                         \
+  __macro(cudnnGetErrorString);
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
+  __macro(cudnnAddTensor);                 \
+  __macro(cudnnConvolutionBackwardData);   \
+  __macro(cudnnConvolutionBackwardFilter);
+CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+// APIs available after R3:
+#if CUDNN_VERSION >= 3000
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+// APIs available after R4:
+#if CUDNN_VERSION >= 4007
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
+  __macro(cudnnBatchNormalizationForwardTraining);  \
+  __macro(cudnnBatchNormalizationForwardInference); \
+  __macro(cudnnBatchNormalizationBackward);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+// APIs in R5
+#if CUDNN_VERSION >= 5000
+#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)  \
+  __macro(cudnnCreateActivationDescriptor); \
+  __macro(cudnnSetActivationDescriptor);    \
+  __macro(cudnnGetActivationDescriptor);    \
+  __macro(cudnnDestroyActivationDescriptor);
+CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(cudnnSetConvolutionGroupCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc
new file mode 100644
index 0000000000..d05dd88126
--- /dev/null
+++ b/paddle/platform/dynload/curand.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/curand.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag curand_dso_flag;
+void *curand_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h
new file mode 100644
index 0000000000..7bfe0778c7
--- /dev/null
+++ b/paddle/platform/dynload/curand.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <curand.h>
+#include <dlfcn.h>
+#include <mutex>
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+extern std::once_flag curand_dso_flag;
+extern void *curand_dso_handle;
+#ifdef PADDLE_USE_DSO
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
+  struct DynLoad__##__name {                                        \
+    template <typename... Args>                                     \
+    curandStatus_t operator()(Args... args) {                       \
+      typedef curandStatus_t (*curandFunc)(Args...);                \
+      std::call_once(curand_dso_flag,                               \
+                     paddle::platform::dynload::GetCurandDsoHandle, \
+                     &curand_dso_handle);                           \
+      void *p_##__name = dlsym(curand_dso_handle, #__name);         \
+      return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
+    }                                                               \
+  };                                                                \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
+  struct DynLoad__##__name {                     \
+    template <typename... Args>                  \
+    curandStatus_t operator()(Args... args) {    \
+      return __name(args...);                    \
+    }                                            \
+  };                                             \
+  extern DynLoad__##__name __name
+#endif
+
+#define CURAND_RAND_ROUTINE_EACH(__macro)      \
+  __macro(curandCreateGenerator);              \
+  __macro(curandSetStream);                    \
+  __macro(curandSetPseudoRandomGeneratorSeed); \
+  __macro(curandGenerateUniform);              \
+  __macro(curandGenerateUniformDouble);        \
+  __macro(curandGenerateNormal);               \
+  __macro(curandDestroyGenerator);
+
+CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
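+
+// Usage sketch (illustrative, not part of this change): the wrappers are
+// called like the native cuRAND API, e.g.
+//   curandGenerator_t gen;
+//   paddle::platform::dynload::curandCreateGenerator(
+//       &gen, CURAND_RNG_PSEUDO_DEFAULT);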
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
new file mode 100644
index 0000000000..c8c09ae608
--- /dev/null
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -0,0 +1,180 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/dynamic_loader.h"
+#include <dlfcn.h>
+#include <memory>
+#include <mutex>
+#include <string>
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "paddle/platform/enforce.h"
+
+DEFINE_string(cudnn_dir, "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir, "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. If empty "
+              "[default], dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
+
+DEFINE_string(nccl_dir, "",
+              "Specify path for loading nccl library, such as libnccl.so. "
+              "For instance, /usr/local/cuda/lib64. If empty [default], "
+              "dlopen will search nccl from LD_LIBRARY_PATH");
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
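+// join concatenates two path components with a single '/', e.g.
+// join("/usr/local/cuda", "lib64") -> "/usr/local/cuda/lib64"; an
+// absolute part2 (leading '/') is returned unchanged.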
+static inline std::string join(const std::string& part1,
+                               const std::string& part2) {
+  // directory separator
+  const char sep = '/';
+  if (!part2.empty() && part2.front() == sep) {
+    return part2;
+  }
+  std::string ret;
+  ret.reserve(part1.size() + part2.size() + 1);
+  ret = part1;
+  if (!ret.empty() && ret.back() != sep) {
+    ret += sep;
+  }
+  ret += part2;
+  return ret;
+}
+
+static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
+                                               void** dso_handle,
+                                               int dynload_flags) {
+  VLOG(3) << "Try to find library: " << dso_path
+          << " from default system path.";
+  // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
+  *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+
+// DYLD_LIBRARY_PATH is disabled since Mac OS 10.11 introduced
+// System Integrity Protection (SIP); if dso_handle is null,
+// search the default package path on Mac OS.
+#if defined(__APPLE__) || defined(__OSX__)
+  if (nullptr == *dso_handle) {
+    dso_path = join("/usr/local/cuda/lib/", dso_path);
+    *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+    if (nullptr == *dso_handle) {
+      if (dso_path == "libcudnn.dylib") {
+        LOG(WARNING) << "Note: [Recommended] copy cudnn into /usr/local/cuda/ \n "
+                        "For instance, sudo tar -xzf "
+                        "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo "
+                        "chmod a+r /usr/local/cuda/include/cudnn.h "
+                        "/usr/local/cuda/lib/libcudnn*";
+      }
+    }
+  }
+#endif
+}
+
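+// Searches <search_root>/<dso_name> first and falls back to the system
+// default path (LD_LIBRARY_PATH / DYLD_LIBRARY_PATH), e.g.
+//   GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", &handle);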
+static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
+                                              const std::string& dso_name,
+                                              void** dso_handle,
+                                              bool throw_on_error = true) {
+  int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
+  *dso_handle = nullptr;
+
+  std::string dlPath = dso_name;
+  if (search_root.empty()) {
+    GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+  } else {
+    // search xxx.so from custom path
+    dlPath = join(search_root, dso_name);
+    *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
+    // if not found, search from default path
+    if (nullptr == *dso_handle) {
+      LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
+                   << dlerror() << ")";
+      dlPath = dso_name;
+      GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
+    }
+  }
+  auto error_msg =
+      "Failed to find dynamic library: %s ( %s ) \n Please specify "
+      "its path correctly using one of the following ways: \n Method: set "
+      "environment variable LD_LIBRARY_PATH on Linux or "
+      "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: "
+      "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
+      "using DYLD_LIBRARY_PATH is impossible unless System "
+      "Integrity Protection (SIP) is disabled.";
+  if (throw_on_error) {
+    PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror());
+  } else if (nullptr == *dso_handle) {
+    LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror());
+  }
+}
+
+void GetCublasDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
+#endif
+}
+
+void GetCUDNNDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle,
+                             false);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false);
+#endif
+}
+
+void GetCurandDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
+#endif
+}
+
+void GetWarpCTCDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
+#endif
+}
+
+void GetLapackDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
+#endif
+}
+
+void GetNCCLDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
+#endif
+}
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h
new file mode 100644
index 0000000000..7b0c8c16d7
--- /dev/null
+++ b/paddle/platform/dynload/dynamic_loader.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+/**
+ * @brief    load the DSO of CUBLAS
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetCublasDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of CUDNN
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetCUDNNDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of CURAND
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetCurandDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of warp-ctc
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetWarpCTCDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of lapack
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetLapackDsoHandle(void** dso_handle);
+
+/**
+ * @brief    load the DSO of NVIDIA nccl
+ *
+ * @param    **dso_handle   dso handle
+ *
+ */
+void GetNCCLDsoHandle(void** dso_handle);
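+
+// Usage sketch (mirrors the call pattern used by the dynload wrappers):
+//   static std::once_flag flag;
+//   static void* handle = nullptr;
+//   std::call_once(flag, [] { GetCublasDsoHandle(&handle); });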
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc
new file mode 100644
index 0000000000..4cec829a8a
--- /dev/null
+++ b/paddle/platform/dynload/nccl.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/nccl.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nccl_dso_flag;
+void *nccl_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+void LoadNCCLDSO() {
+  platform::call_once(nccl_dso_flag,
+                      [] { GetNCCLDsoHandle(&nccl_dso_handle); });
+}
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
new file mode 100644
index 0000000000..6c776afc97
--- /dev/null
+++ b/paddle/platform/dynload/nccl.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <nccl.h>
+#include <mutex>
+#include "paddle/platform/call_once.h"
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag nccl_dso_flag;
+extern void* nccl_dso_handle;
+
+#ifdef PADDLE_USE_DSO
+extern void LoadNCCLDSO();
+
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                   \
+  struct DynLoad__##__name {                                     \
+    template <typename... Args>                                  \
+    auto operator()(Args... args) -> decltype(__name(args...)) { \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);  \
+      paddle::platform::dynload::LoadNCCLDSO();                  \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);        \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);   \
+    }                                                            \
+  };                                                             \
+  extern DynLoad__##__name __name
+#else
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name) \
+  struct DynLoad__##__name {                   \
+    template <typename... Args>                \
+    ncclResult_t operator()(Args... args) {    \
+      return __name(args...);                  \
+    }                                          \
+  };                                           \
+  extern DynLoad__##__name __name
+#endif
+
+#define NCCL_RAND_ROUTINE_EACH(__macro) \
+  __macro(ncclCommInitAll);             \
+  __macro(ncclGetUniqueId);             \
+  __macro(ncclCommInitRank);            \
+  __macro(ncclCommDestroy);             \
+  __macro(ncclCommCount);               \
+  __macro(ncclCommCuDevice);            \
+  __macro(ncclCommUserRank);            \
+  __macro(ncclAllReduce);               \
+  __macro(ncclBcast);                   \
+  __macro(ncclAllGather);               \
+  __macro(ncclGroupStart);              \
+  __macro(ncclGroupEnd);                \
+  __macro(ncclReduce);                  \
+  __macro(ncclGetErrorString);
+
+NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
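+
+// The generated wrappers are drop-in replacements for the NCCL API, e.g.
+//   platform::dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);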
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/warpctc.cc b/paddle/platform/dynload/warpctc.cc
new file mode 100644
index 0000000000..9b7d01a6e8
--- /dev/null
+++ b/paddle/platform/dynload/warpctc.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/dynload/warpctc.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag warpctc_dso_flag;
+void* warpctc_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/warpctc.h b/paddle/platform/dynload/warpctc.h
new file mode 100644
index 0000000000..acafcaff2c
--- /dev/null
+++ b/paddle/platform/dynload/warpctc.h
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <mutex>
+#include "ctc.h"
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+extern std::once_flag warpctc_dso_flag;
+extern void* warpctc_dso_handle;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load warpctc routine
+ * via operator overloading.
+ */
+#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                            \
+  struct DynLoad__##__name {                                         \
+    template <typename... Args>                                      \
+    auto operator()(Args... args) -> decltype(__name(args...)) {     \
+      using warpctcFunc = decltype(__name(args...)) (*)(Args...);    \
+      std::call_once(warpctc_dso_flag,                               \
+                     paddle::platform::dynload::GetWarpCTCDsoHandle, \
+                     &warpctc_dso_handle);                           \
+      void* p_##__name = dlsym(warpctc_dso_handle, #__name);         \
+      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);     \
+    }                                                                \
+  };                                                                 \
+  extern DynLoad__##__name __name
+
+#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
+  DYNAMIC_LOAD_WARPCTC_WRAP(__name)
+
+#define WARPCTC_ROUTINE_EACH(__macro) \
+  __macro(get_warpctc_version);       \
+  __macro(ctcGetStatusString);        \
+  __macro(compute_ctc_loss);          \
+  __macro(get_workspace_size)
+
+WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
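+
+// Usage sketch (assuming warp-ctc's ctc.h API):
+//   int version = platform::dynload::get_warpctc_version();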
+
+#undef DYNAMIC_LOAD_WARPCTC_WRAP
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce.cc b/paddle/platform/enforce.cc
new file mode 100644
index 0000000000..e8d31bc782
--- /dev/null
+++ b/paddle/platform/enforce.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace platform {}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
new file mode 100644
index 0000000000..d1c7be0790
--- /dev/null
+++ b/paddle/platform/enforce.h
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>     // for dladdr
+#include <execinfo.h>  // for backtrace
+#include <iomanip>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#include "paddle/platform/macros.h"
+#include "paddle/string/printf.h"
+#include "paddle/string/to_string.h"
+
+#ifdef __GNUC__
+#include <cxxabi.h>  // for __cxa_demangle
+#endif
+
+#include <glog/logging.h>
+
+#ifdef PADDLE_WITH_CUDA
+
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/dynload/nccl.h"
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif
+
+namespace paddle {
+namespace platform {
+
+#ifdef __GNUC__
+inline std::string demangle(std::string name) {
+  int status = -4;  // some arbitrary value to eliminate the compiler warning
+  std::unique_ptr<char, void (*)(void*)> res{
+      abi::__cxa_demangle(name.c_str(), NULL, NULL, &status), std::free};
+  return (status == 0) ? res.get() : name;
+}
+#else
+inline std::string demangle(std::string name) { return name; }
+#endif
+
+struct EnforceNotMet : public std::exception {
+  std::exception_ptr exp_;
+  std::string err_str_;
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    try {
+      std::rethrow_exception(exp_);
+    } catch (const std::exception& exp) {
+      std::ostringstream sout;
+
+      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
+      sout << "PaddlePaddle Call Stacks: " << std::endl;
+
+      void* call_stack[TRACE_STACK_LIMIT];
+      auto size = backtrace(call_stack, TRACE_STACK_LIMIT);
+      auto symbols = backtrace_symbols(call_stack, size);
+
+      Dl_info info;
+      for (int i = 0; i < size; ++i) {
+        if (dladdr(call_stack[i], &info) && info.dli_sname) {
+          auto demangled = demangle(info.dli_sname);
+          auto addr_offset = static_cast<char*>(call_stack[i]) -
+                             static_cast<char*>(info.dli_saddr);
+          sout << string::Sprintf("%-3d %*0p %s + %zd\n", i,
+                                  2 + sizeof(void*) * 2, call_stack[i],
+                                  demangled, addr_offset);
+        } else {
+          sout << string::Sprintf("%-3d %*0p\n", i, 2 + sizeof(void*) * 2,
+                                  call_stack[i]);
+        }
+      }
+      free(symbols);
+      err_str_ = sout.str();
+    }
+  }
+
+  const char* what() const noexcept { return err_str_.c_str(); }
+};
+
+// Because most enforce conditions would evaluate to true, we can use
+// __builtin_expect to instruct the C++ compiler to generate code that
+// always forces branch prediction of true.
+// This generates faster binary code. __builtin_expect is a GCC/Clang
+// builtin; for more details, please check
+// https://stackoverflow.com/a/43870188/724872.
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
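+// For example:
+//   if (UNLIKELY(ptr == nullptr)) { /* rare error path */ }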
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    bool stat, const Args&... args) {
+  if (UNLIKELY(!(stat))) {
+    throw std::runtime_error(string::Sprintf(args...));
+  }
+}
+
+#ifdef PADDLE_WITH_CUDA
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudaError_t e, const Args&... args) {
+  if (UNLIKELY(e)) {
+    throw thrust::system_error(e, thrust::cuda_category(),
+                               string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    curandStatus_t stat, const Args&... args) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudnnStatus_t stat, const Args&... args) {
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
+                             string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cublasStatus_t stat, const Args&... args) {
+  std::string err;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    err = "CUBLAS: not initialized, ";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    err = "CUBLAS: alloc failed, ";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    err = "CUBLAS: invalid value, ";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    err = "CUBLAS: arch mismatch, ";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    err = "CUBLAS: mapping error, ";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    err = "CUBLAS: execution failed, ";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    err = "CUBLAS: internal error, ";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    err = "CUBLAS: not supported, ";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    err = "CUBLAS: license error, ";
+  }
+  throw std::runtime_error(err + string::Sprintf(args...));
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    ncclResult_t stat, const Args&... args) {
+  if (stat == ncclSuccess) {
+    return;
+  } else {
+    throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
+                             string::Sprintf(args...));
+  }
+}
+
+#endif  // PADDLE_WITH_CUDA
+
+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
+#define PADDLE_THROW(...)                                              \
+  do {                                                                 \
+    throw ::paddle::platform::EnforceNotMet(                           \
+        std::make_exception_ptr(                                       \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
+        __FILE__, __LINE__);                                           \
+  } while (false)
+
+#define PADDLE_ENFORCE(...)                                             \
+  do {                                                                  \
+    try {                                                               \
+      ::paddle::platform::throw_on_error(__VA_ARGS__);                  \
+    } catch (...) {                                                     \
+      throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
+                                              __FILE__, __LINE__);      \
+    }                                                                   \
+  } while (false)
+
+/*
+ * Some enforce helpers here, usage:
+ *    int a = 1;
+ *    int b = 2;
+ *    PADDLE_ENFORCE_EQ(a, b);
+ *
+ *    will raise an exception described as follows:
+ *    "enforce a == b failed, 1 != 2" with detailed stack information.
+ *
+ *    extra messages are also supported, for example:
+ *    PADDLE_ENFORCE_EQ(a, b, "some simple enforce failed between %d numbers", 2)
+ */
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
+#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
+  do {                                                       \
+    if (UNLIKELY(nullptr == (__VAL))) {                      \
+      PADDLE_THROW(#__VAL " should not be null\n%s",         \
+                   paddle::string::Sprintf("" __VA_ARGS__)); \
+    }                                                        \
+  } while (0)
+
+#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
+  do {                                                                  \
+    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+      PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP    \
+                   " %s\n%s",                                           \
+                   #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
+                   paddle::string::to_string(__VAL1),                   \
+                   paddle::string::Sprintf("" __VA_ARGS__));            \
+    }                                                                   \
+  } while (0)
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
new file mode 100644
index 0000000000..8206a055ea
--- /dev/null
+++ b/paddle/platform/enforce_test.cc
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <array>
+#include <iostream>
+#include <memory>
+
+#include "gtest/gtest.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+
+using StringPiece = paddle::string::Piece;
+using paddle::string::HasPrefix;
+
+TEST(ENFORCE, OK) {
+  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
+  size_t val = 1;
+  const size_t limit = 10;
+  PADDLE_ENFORCE(val < limit, "Enforce is OK too");
+}
+
+TEST(ENFORCE, FAILED) {
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE, NO_ARG_OK) {
+  int a = 2;
+  int b = 2;
+  PADDLE_ENFORCE_EQ(a, b);
+  // test enforce with extra message.
+  PADDLE_ENFORCE_EQ(a, b, "some thing wrong %s", "info");
+}
+
+TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
+  int a = 2;
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_EQ(a, 1 + 3);
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce a == 1 + 3 failed, 2 != 4"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
+  int a = 2;
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their");
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()),
+                  "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_NE, OK) {
+  PADDLE_ENFORCE_NE(1, 2);
+  PADDLE_ENFORCE_NE(1.0, 2UL);
+}
+TEST(ENFORCE_NE, FAIL) {
+  bool caught_exception = false;
+
+  try {
+    // 1UL here to check data type compatibility
+    PADDLE_ENFORCE_NE(1.0, 1UL);
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce 1.0 != 1UL failed, 1 == 1"))
+        << error.what() << " does not have expected prefix";
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
+TEST(ENFORCE_GT, FAIL) {
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_GT(1, 2UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_GE, OK) {
+  PADDLE_ENFORCE_GE(2, 2UL);
+  PADDLE_ENFORCE_GE(3, 2UL);
+  PADDLE_ENFORCE_GE(3, 2);
+  PADDLE_ENFORCE_GE(3.21, 2UL);
+}
+TEST(ENFORCE_GE, FAIL) {
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_GE(1, 2UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_LE, OK) {
+  PADDLE_ENFORCE_LE(1, 1);
+  PADDLE_ENFORCE_LE(1, 1UL);
+  PADDLE_ENFORCE_LE(2, 3UL);
+  PADDLE_ENFORCE_LE(2UL, 3);
+  PADDLE_ENFORCE_LE(2UL, 3.2);
+}
+TEST(ENFORCE_LE, FAIL) {
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_LE(2, 1UL);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 2 <= 1UL failed, 2 > 1"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_LT, OK) {
+  PADDLE_ENFORCE_LT(3, 10);
+  PADDLE_ENFORCE_LT(2, 3UL);
+  PADDLE_ENFORCE_LT(2UL, 3);
+}
+TEST(ENFORCE_LT, FAIL) {
+  bool caught_exception = false;
+  try {
+    PADDLE_ENFORCE_LT(1UL, 0.12);
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce 1UL < 0.12 failed, 1 >= 0.12"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+TEST(ENFORCE_NOT_NULL, OK) {
+  int* a = new int;
+  PADDLE_ENFORCE_NOT_NULL(a);
+  delete a;
+}
+TEST(ENFORCE_NOT_NULL, FAIL) {
+  bool caught_exception = false;
+  try {
+    int* a = nullptr;
+    PADDLE_ENFORCE_NOT_NULL(a);
+
+  } catch (paddle::platform::EnforceNotMet error) {
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+struct Dims {
+  size_t dims_[4];
+
+  bool operator==(const Dims& o) const {
+    for (size_t i = 0; i < 4; ++i) {
+      if (dims_[i] != o.dims_[i]) return false;
+    }
+    return true;
+  }
+};
+
+std::ostream& operator<<(std::ostream& os, const Dims& d) {
+  for (size_t i = 0; i < 4; ++i) {
+    if (i == 0) {
+      os << "[";
+    }
+    os << d.dims_[i];
+    if (i == 4 - 1) {
+      os << "]";
+    } else {
+      os << ", ";
+    }
+  }
+  return os;
+}
+
+TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
+  Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
+  PADDLE_ENFORCE_EQ(a, b);
+}
+
+TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
+  Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
+  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+}
diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h
new file mode 100644
index 0000000000..694a66d9ac
--- /dev/null
+++ b/paddle/platform/for_range.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+template <typename DeviceContext>
+struct ForRange {
+  ForRange(const DeviceContext& dev_ctx, size_t limit);
+
+  template <typename Function>
+  void operator()(Function func) const;
+};
+
+template <>
+struct ForRange<CPUDeviceContext> {
+  ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {}
+
+  template <typename Function>
+  void operator()(Function func) const {
+    for (size_t i = 0; i < limit_; ++i) {
+      func(i);
+    }
+  }
+
+  size_t limit_;
+};
+
+#ifdef __NVCC__
+template <typename Function>
+__global__ static void ForRangeElemwiseOpGridIsOne(Function func) {
+  size_t idx = static_cast<size_t>(threadIdx.x);
+  func(idx);
+}
+
+template <typename Function>
+__global__ static void ForRangeElemwiseOp(Function func, int limit) {
+  size_t idx = static_cast<size_t>(blockIdx.x * blockDim.x + threadIdx.x);
+  if (idx < limit) {
+    func(idx);
+  }
+}
+
+template <>
+struct ForRange<CUDADeviceContext> {
+  ForRange(const CUDADeviceContext& dev_ctx, size_t limit)
+      : dev_ctx_(dev_ctx), limit_(static_cast<int>(limit)) {}
+
+  template <typename Function>
+  inline void operator()(Function func) const {
+    constexpr int num_threads = 1024;
+    int block_size = limit_ <= num_threads ? limit_ : num_threads;
+    int grid_size = (limit_ + num_threads - 1) / num_threads;
+
+    if (grid_size == 1) {
+      ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>(
+          func);
+    } else {
+      ForRangeElemwiseOp<<<grid_size, block_size, 0, dev_ctx_.stream()>>>(
+          func, limit_);
+    }
+  }
+
+  const CUDADeviceContext& dev_ctx_;
+  int limit_;
+};
+
+#endif
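+
+// Usage sketch (hypothetical functor; HOSTDEVICE comes from hostdevice.h):
+//   struct Scale {
+//     HOSTDEVICE void operator()(size_t i) const { data_[i] *= 2; }
+//     float* data_;
+//   };
+//   ForRange<CPUDeviceContext> for_range(dev_ctx, n);
+//   for_range(Scale{data});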
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000..7037551d75
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+
+#include "gflags/gflags.h"
+
+#include "paddle/platform/enforce.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
+              "By default, use 92% of the GPU memory for PaddlePaddle and "
+              "reserve the rest for page tables, etc.");
+
+namespace paddle {
+namespace platform {
+
+int GetCUDADeviceCount() {
+  int count;
+  PADDLE_ENFORCE(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount");
+  return count;
+}
+
+int GetCurrentDeviceId() {
+  int device_id;
+  PADDLE_ENFORCE(
+      cudaGetDevice(&device_id),
+      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
+  return device_id;
+}
+
+void SetDeviceId(int id) {
+  // TODO(qijun): find a better way to cache the cuda device count
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(),
+                    "id must be less than GPU count");
+  PADDLE_ENFORCE(cudaSetDevice(id),
+                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
+}
+
+void GpuMemoryUsage(size_t &available, size_t &total) {
+  PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest for page tables, etc.
+  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuMinChunkSize() {
+  // The minimum chunk size allowed to be allocated is 256 bytes.
+  return 1 << 8;
+}
+
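+// Worked example (assuming a 12 GiB card and the default 0.92 fraction):
+// reserving = 0.05 * 12 GiB = 0.6 GiB and allocating = 0.92 * 11.4 GiB
+// ~= 10.5 GiB; PADDLE_ENFORCE_LE then checks that allocating fits in the
+// usable available memory.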
+size_t GpuMaxChunkSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+  VLOG(10) << "GPU Usage " << available / 1024 / 1024 << "M/"
+           << total / 1024 / 1024 << "M";
+  size_t reserving = static_cast<size_t>(0.05 * total);
+  // If available is below the minimum chunk size, no usable memory exists.
+  available =
+      std::min(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               total - reserving);
+
+  // Reserve the rest of the memory for page tables, etc.
+
+  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
+                                          (total - reserving));
+
+  PADDLE_ENFORCE_LE(allocating, available);
+
+  return allocating;
+}
+
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+}
+
+void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemsetAsync(dst, value, count, stream),
+                 "cudaMemsetAsync failed in paddle::platform::GpuMemsetAsync");
+}
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
new file mode 100644
index 0000000000..d05131fa41
--- /dev/null
+++ b/paddle/platform/gpu_info.h
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_CUDA
+
+#include <cuda_runtime.h>
+#include <stddef.h>
+#include <string>
+
+namespace paddle {
+namespace platform {
+
+//! Environment variable: fraction of GPU memory to use on each device.
+const std::string kEnvFractionGpuMemoryToUse =
+    "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
+
+//! Get the total number of GPU devices in system.
+int GetCUDADeviceCount();
+
+//! Get the current GPU device id in system.
+int GetCurrentDeviceId();
+
+//! Set the GPU device id for next execution.
+void SetDeviceId(int device_id);
+
+//! Get the memory usage of current GPU device.
+void GpuMemoryUsage(size_t &available, size_t &total);
+
+//! Get the maximum allocation size of current GPU device.
+size_t GpuMaxAllocSize();
+
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
+
+//! Get the maximum chunk size for GPU buddy allocator.
+size_t GpuMaxChunkSize();
+
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
+//! Set the first count bytes of dst to value asynchronously.
+void GpuMemsetAsync(void *dst, int value, size_t count, cudaStream_t stream);
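+
+// Usage sketch:
+//   size_t available = 0, total = 0;
+//   paddle::platform::GpuMemoryUsage(available, total);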
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif
diff --git a/paddle/platform/hostdevice.h b/paddle/platform/hostdevice.h
new file mode 100644
index 0000000000..fa4659ed29
--- /dev/null
+++ b/paddle/platform/hostdevice.h
@@ -0,0 +1,24 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#ifdef __CUDACC__
+#define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
+#define HOST __host__
+#else
+#define HOSTDEVICE
+#define DEVICE
+#define HOST
+#endif
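+
+// Usage sketch: mark a function callable from both host and device, e.g.
+//   HOSTDEVICE inline float square(float x) { return x * x; }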
diff --git a/paddle/platform/macros.h b/paddle/platform/macros.h
new file mode 100644
index 0000000000..feae7bdd77
--- /dev/null
+++ b/paddle/platform/macros.h
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+// Disable the copy and assignment operator for a class.
+#ifndef DISABLE_COPY_AND_ASSIGN
+#define DISABLE_COPY_AND_ASSIGN(classname)         \
+ private:                                          \
+  classname(const classname&) = delete;            \
+  classname(const classname&&) = delete;           \
+  classname& operator=(const classname&) = delete; \
+  classname& operator=(const classname&&) = delete
+#endif
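+
+// Usage sketch (hypothetical class):
+//   class Widget {
+//    public:
+//     Widget() = default;
+//     DISABLE_COPY_AND_ASSIGN(Widget);
+//   };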
diff --git a/paddle/platform/mkldnn_helper.h b/paddle/platform/mkldnn_helper.h
new file mode 100644
index 0000000000..cd52a8b4c4
--- /dev/null
+++ b/paddle/platform/mkldnn_helper.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <mkldnn.hpp>
+
+namespace paddle {
+namespace platform {
+
+using MKLDNNStream = mkldnn::stream;
+using MKLDNNEngine = mkldnn::engine;
+using MKLDNNMemory = mkldnn::memory;
+using MKLDNNPrimitive = mkldnn::primitive;
+using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
+
+typedef std::unique_ptr<MKLDNNStream> MKLDNNStreamPtr;
+typedef std::unique_ptr<MKLDNNEngine> MKLDNNEnginePtr;
+typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
+typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
+typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
new file mode 100644
index 0000000000..ef6d845874
--- /dev/null
+++ b/paddle/platform/nccl_test.cu
@@ -0,0 +1,151 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/dynload/nccl.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/gpu_info.h"
+
+static int dev_count = 0;
+
+namespace paddle {
+namespace platform {
+
+TEST(NCCL, init) {
+  std::vector<ncclComm_t> comms;
+  comms.resize(dev_count);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+
+  for (int i = 0; i < dev_count; ++i) {
+    dynload::ncclCommDestroy(comms[i]);
+  }
+}
+
+template <typename T>
+struct PerThreadData {
+  thrust::device_vector<T> send_buff;
+  thrust::device_vector<T> recv_buff;
+  CUDADeviceContext dev_ctx;
+
+  T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }
+
+  T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
+
+  PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) {
+    send_buff.resize(size);
+    for (size_t i = 0; i < size; ++i) {
+      send_buff[i] = static_cast<T>(i);
+    }
+    recv_buff.resize(size);
+  }
+};
+
+static constexpr int ELEM_COUNT = 10000;
+
+TEST(NCCL, all_reduce) {
+  std::vector<ncclComm_t> comms;
+  comms.resize(dev_count);
+  VLOG(1) << "Initializing ncclComm";
+  dynload::ncclCommInitAll(comms.data(), dev_count, nullptr);
+  VLOG(1) << "ncclComm initialized";
+  VLOG(1) << "Creating thread data";
+  std::vector<std::unique_ptr<PerThreadData<double>>> data;
+  data.reserve(dev_count);
+  for (int i = 0; i < dev_count; ++i) {
+    VLOG(1) << "Creating thread data for device " << i;
+    SetDeviceId(i);
+    data.emplace_back(new PerThreadData<double>(i, ELEM_COUNT));
+  }
+  VLOG(1) << "Thread data created";
+
+  VLOG(1) << "Check send_buf data";
+  for (int i = 0; i < dev_count; ++i) {
+    VLOG(1) << "Check on device " << i;
+    SetDeviceId(i);
+    thrust::host_vector<double> tmp = data[i]->send_buff;
+    for (size_t j = 0; j < tmp.size(); ++j) {
+      ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
+    }
+  }
+
+  VLOG(1) << "Invoking ncclAllReduce";
+
+  for (int i = 0; i < dev_count; ++i) {
+    VLOG(1) << "Invoking ncclAllReduce with device " << i;
+    SetDeviceId(i);
+    PADDLE_ENFORCE(dynload::ncclAllReduce(
+        data[i]->SendBuff(), data[i]->RecvBuff(), ELEM_COUNT, ncclDouble,
+        ncclSum, comms[i], data[i]->dev_ctx.stream()));
+    VLOG(1) << "Invoked ncclAllReduce for device " << i;
+  }
+
+  VLOG(1) << "Invoked ncclAllReduce";
+
+  VLOG(1) << "Sync devices";
+  for (int i = 0; i < dev_count; ++i) {
+    VLOG(1) << "Sync device " << i;
+    SetDeviceId(i);
+    data[i]->dev_ctx.Wait();
+  }
+  VLOG(1) << "device synced";
+
+  for (int i = 0; i < dev_count; ++i) {
+    SetDeviceId(i);
+    VLOG(1) << "Checking vector on device " << i;
+    thrust::host_vector<double> tmp = data[i]->recv_buff;
+    for (size_t j = 0; j < tmp.size(); ++j) {
+      auto elem = static_cast<double>(j);
+      elem *= dev_count;
+      ASSERT_NEAR(tmp[j], elem, 1e-4);
+    }
+  }
+
+  for (int i = 0; i < dev_count; ++i) {
+    dynload::ncclCommDestroy(comms[i]);
+  }
+}
+}  // namespace platform
+}  // namespace paddle
+
+int main(int argc, char** argv) {
+  dev_count = paddle::platform::GetCUDADeviceCount();
+  if (dev_count <= 1) {
+    LOG(WARNING)
+        << "Cannot test multi-gpu nccl, because the CUDA device count is "
+        << dev_count;
+    return 0;
+  }
+
+  std::vector<paddle::platform::Place> places;
+
+  places.emplace_back(paddle::platform::CPUPlace());
+  int count = paddle::platform::GetCUDADeviceCount();
+  for (int i = 0; i < count; ++i) {
+    places.emplace_back(paddle::platform::CUDAPlace(i));
+  }
+
+  VLOG(0) << " DeviceCount " << count;
+  paddle::platform::DeviceContextPool::Init(places);
+
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
new file mode 100644
index 0000000000..f05260ccac
--- /dev/null
+++ b/paddle/platform/place.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace platform {
+
+namespace detail {
+
+class PlacePrinter : public boost::static_visitor<> {
+ public:
+  explicit PlacePrinter(std::ostream &os) : os_(os) {}
+  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
+  void operator()(const CUDAPlace &p) {
+    os_ << "CUDAPlace(" << p.device << ")";
+  }
+
+ private:
+  std::ostream &os_;
+};
+
+}  // namespace detail
+
+static Place the_default_place;
+
+void set_place(const Place &place) { the_default_place = place; }
+const Place &get_place() { return the_default_place; }
+
+const CUDAPlace default_gpu() { return CUDAPlace(0); }
+const CPUPlace default_cpu() { return CPUPlace(); }
+
+bool is_gpu_place(const Place &p) {
+  return boost::apply_visitor(IsCUDAPlace(), p);
+}
+
+bool is_cpu_place(const Place &p) { return !is_gpu_place(p); }
+
+bool places_are_same_class(const Place &p1, const Place &p2) {
+  return p1.which() == p2.which();
+}
+
+bool is_same_place(const Place &p1, const Place &p2) {
+  if (places_are_same_class(p1, p2)) {
+    if (is_cpu_place(p1)) {
+      return true;
+    } else {
+      return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
+    }
+  } else {
+    return false;
+  }
+}
+
+std::ostream &operator<<(std::ostream &os, const Place &p) {
+  detail::PlacePrinter printer(os);
+  boost::apply_visitor(printer, p);
+  return os;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
new file mode 100644
index 0000000000..fbb43fa043
--- /dev/null
+++ b/paddle/platform/place.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <iostream>
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace platform {
+
+struct CPUPlace {
+  // WORKAROUND: for some reason, omitting this constructor
+  // causes errors with boost 1.59 and OSX
+  CPUPlace() {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const CPUPlace &) const { return true; }
+  inline bool operator!=(const CPUPlace &) const { return false; }
+};
+
+struct CUDAPlace {
+  CUDAPlace() : CUDAPlace(0) {}
+  explicit CUDAPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const CUDAPlace &o) const {
+    return device == o.device;
+  }
+  inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); }
+
+  int device;
+};
+
+struct IsCUDAPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &gpu) const { return true; }
+};
+
+typedef boost::variant<CUDAPlace, CPUPlace> Place;
+
+using PlaceList = std::vector<Place>;
+
+void set_place(const Place &);
+const Place &get_place();
+
+const CUDAPlace default_gpu();
+const CPUPlace default_cpu();
+
+bool is_gpu_place(const Place &);
+bool is_cpu_place(const Place &);
+bool places_are_same_class(const Place &, const Place &);
+bool is_same_place(const Place &, const Place &);
+
+std::ostream &operator<<(std::ostream &, const Place &);
+
+template <typename Visitor>
+struct PlaceVisitorWrapper
+    : public boost::static_visitor<typename Visitor::result_type> {
+  const Visitor &visitor_;
+  explicit PlaceVisitorWrapper(const Visitor &visitor) : visitor_(visitor) {}
+
+  typename Visitor::result_type operator()(const CPUPlace &cpu) const {
+    return visitor_(cpu);
+  }
+
+  typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
+#ifdef PADDLE_WITH_CUDA
+    return visitor_(cuda);
+#else
+    PADDLE_THROW("Paddle is not compiled with CUDA. Cannot visit cuda device");
+    return typename Visitor::result_type();
+#endif
+  }
+};
+
+template <typename Visitor>
+typename Visitor::result_type VisitPlace(const Place &place,
+                                         const Visitor &visitor) {
+  return boost::apply_visitor(PlaceVisitorWrapper<Visitor>(visitor), place);
+}
+
+}  // namespace platform
+}  // namespace paddle
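VisitPlace() wraps a user-supplied visitor in PlaceVisitorWrapper so that
visiting a CUDAPlace in a CPU-only build throws instead of silently
misbehaving. A minimal sketch of a custom visitor (the PlaceNameVisitor name is
hypothetical, not part of the patch):

  struct PlaceNameVisitor : public boost::static_visitor<std::string> {
    std::string operator()(const paddle::platform::CPUPlace &) const {
      return "cpu";
    }
    std::string operator()(const paddle::platform::CUDAPlace &p) const {
      return "gpu:" + std::to_string(p.device);
    }
  };

  // Dispatches on whichever alternative the variant currently holds.
  paddle::platform::Place place = paddle::platform::CUDAPlace(0);
  std::string name = paddle::platform::VisitPlace(place, PlaceNameVisitor());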
diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc
new file mode 100644
index 0000000000..150b2d3b1f
--- /dev/null
+++ b/paddle/platform/place_test.cc
@@ -0,0 +1,54 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/platform/place.h"
+#include <sstream>
+#include "gtest/gtest.h"
+
+TEST(Place, Equality) {
+  paddle::platform::CPUPlace cpu;
+  paddle::platform::CUDAPlace g0(0), g1(1), gg0(0);
+
+  EXPECT_EQ(cpu, cpu);
+  EXPECT_EQ(g0, g0);
+  EXPECT_EQ(g1, g1);
+  EXPECT_EQ(g0, gg0);
+
+  EXPECT_NE(g0, g1);
+
+  EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0));
+  EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu));
+}
+
+TEST(Place, Default) {
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place()));
+  EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu()));
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu()));
+
+  EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+  paddle::platform::set_place(paddle::platform::CPUPlace());
+  EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place()));
+}
+
+TEST(Place, Print) {
+  {
+    std::stringstream ss;
+    ss << paddle::platform::CUDAPlace(1);
+    EXPECT_EQ("CUDAPlace(1)", ss.str());
+  }
+  {
+    std::stringstream ss;
+    ss << paddle::platform::CPUPlace();
+    EXPECT_EQ("CPUPlace", ss.str());
+  }
+}
diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
new file mode 100644
index 0000000000..2a8afc9403
--- /dev/null
+++ b/paddle/platform/profiler.cc
@@ -0,0 +1,346 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+#include <iomanip>
+#include <map>
+#include "glog/logging.h"
+
+namespace paddle {
+namespace platform {
+
+// The profiler state; the initial value is ProfilerState::kDisabled.
+static ProfilerState g_state = ProfilerState::kDisabled;
+// Records which timer the profiler uses: CUDA or CPU.
+static std::string g_profiler_place = "";
+// The index of the current thread; each thread that records events
+// receives its own id.
+static thread_local int32_t g_thread_id;
+// g_next_thread_id is a global counter for threads; together with g_thread_id
+// it tells how many threads have created an EventList.
+static uint32_t g_next_thread_id = 0;
+// The global mutex guarding g_all_event_lists.
+static std::mutex g_all_event_lists_mutex;
+// The event lists of all threads.
+static std::list<std::shared_ptr<EventList>> g_all_event_lists;
+// The thread-local event list; it can only be accessed by its owning thread.
+static thread_local std::shared_ptr<EventList> g_event_list;
+
+inline uint64_t GetTimeInNsec() {
+  using clock = std::conditional<std::chrono::high_resolution_clock::is_steady,
+                                 std::chrono::high_resolution_clock,
+                                 std::chrono::steady_clock>::type;
+  return std::chrono::duration_cast<std::chrono::nanoseconds>(
+             clock::now().time_since_epoch())
+      .count();
+}
+
+Event::Event(EventKind kind, std::string name, uint32_t thread_id,
+             const DeviceContext* dev_ctx)
+    : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
+#ifdef PADDLE_WITH_CUDA
+  has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
+  if (has_cuda_) {
+    auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
+    PADDLE_ENFORCE(cudaGetDevice(&device_));
+    PADDLE_ENFORCE(cudaEventCreate(&event_));
+    auto stream = cuda_dev_ctx->stream();
+    PADDLE_ENFORCE(cudaEventRecord(event_, stream));
+  }
+#endif
+  cpu_ns_ = GetTimeInNsec();
+}
+
+std::string Event::kind() const {
+  switch (kind_) {
+    case EventKind::kMark:
+      return "mark";
+    case EventKind::kPushRange:
+      return "push";
+    case EventKind::kPopRange:
+      return "pop";
+  }
+  PADDLE_THROW("Unknown EventKind.");
+}
+
+double Event::CpuElapsedMs(const Event& e) const {
+  return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
+}
+
+double Event::CudaElapsedMs(const Event& e) const {
+#ifdef PADDLE_WITH_CUDA
+  PADDLE_ENFORCE(e.has_cuda() && has_cuda());
+  PADDLE_ENFORCE(e.device() == device());
+  PADDLE_ENFORCE(cudaEventSynchronize(event_));
+  PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
+  float ms;
+  PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
+  return ms;
+#else
+  PADDLE_THROW("CUDA is not enabled");
+#endif
+}
+
+#ifdef PADDLE_WITH_CUDA
+static void ForEachDevice(std::function<void(int)> func) {
+  auto original_device = GetCurrentDeviceId();
+  int count = GetCUDADeviceCount();
+  for (int i = 0; i < count; i++) {
+    SetDeviceId(i);
+    func(i);
+  }
+  SetDeviceId(original_device);
+}
+#endif
+
+inline EventList& GetEventList() {
+  if (!g_event_list) {
+    std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+    g_event_list = std::make_shared<EventList>();
+    g_thread_id = g_next_thread_id++;
+    g_all_event_lists.emplace_front(g_event_list);
+  }
+  return *g_event_list;
+}
+
+void Mark(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
+}
+
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
+}
+
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
+  GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
+}
+
+RecordEvent::RecordEvent(const std::string& name,
+                         const DeviceContext* dev_ctx) {
+  if (g_state == ProfilerState::kDisabled) return;
+  dev_ctx_ = dev_ctx;
+  name_ = name;
+  PushEvent(name_, dev_ctx_);
+}
+
+RecordEvent::~RecordEvent() {
+  if (g_state == ProfilerState::kDisabled) return;
+  PopEvent(name_, dev_ctx_);
+}
+
+void EnableProfiler(ProfilerState state) {
+  PADDLE_ENFORCE(state != ProfilerState::kDisabled,
+                 "Can't enbale profling, since the input state is ",
+                 "ProfilerState::kDisabled");
+  PADDLE_ENFORCE(g_state == ProfilerState::kDisabled,
+                 "The profiling state should be disabled when calling ",
+                 "EnableProfiler.");
+  g_state = state;
+  g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
+#ifdef PADDLE_WITH_CUDA
+  if (g_state == ProfilerState::kCUDA) {
+    // Generate some dummy events first to reduce the startup overhead.
+    for (int i = 0; i < 5; i++) {
+      ForEachDevice([](int d) {
+        DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
+        Mark("_cuda_startup_", dev_ctx);
+        dev_ctx->Wait();
+        delete dev_ctx;
+      });
+    }
+  }
+#endif
+  // Mark the profiling start.
+  Mark("_start_profiler_", nullptr);
+}
+
+void ResetProfiler() {
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    (*it)->Clear();
+  }
+}
+
+std::vector<std::vector<Event>> GetAllEvents() {
+  std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
+  std::vector<std::vector<Event>> result;
+  for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
+       ++it) {
+    result.emplace_back((*it)->Reduce());
+  }
+  return result;
+}
+
+void DisableProfiler(EventSortingKey sorted_key) {
+  PADDLE_ENFORCE(g_state != ProfilerState::kDisabled,
+                 "Can't disable profiling, since it's not starting.");
+  // Mark the profiling stop.
+  Mark("_stop_profiler_", nullptr);
+  g_state = ProfilerState::kDisabled;
+
+  std::vector<std::vector<Event>> all_events = GetAllEvents();
+  ParseEvents(all_events, sorted_key);
+  ResetProfiler();
+}
+
+void ParseEvents(std::vector<std::vector<Event>>& events,
+                 EventSortingKey sorted_by) {
+  if (g_profiler_place == "") return;
+
+  std::string sorted_domain;
+  std::function<bool(const EventItem&, const EventItem&)> sorted_func;
+  switch (sorted_by) {
+    case EventSortingKey::kCalls:
+      sorted_domain = "number of calls";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.calls > b.calls;
+      };
+      break;
+    case EventSortingKey::kTotal:
+      sorted_domain = "total time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.total_time > b.total_time;
+      };
+      break;
+    case EventSortingKey::kMin:
+      sorted_domain = "minimum time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.min_time > b.min_time;
+      };
+      break;
+    case EventSortingKey::kMax:
+      sorted_domain = "maximum time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.max_time > b.max_time;
+      };
+      break;
+    case EventSortingKey::kAve:
+      sorted_domain = "average time";
+      sorted_func = [](const EventItem& a, const EventItem& b) {
+        return a.ave_time > b.ave_time;
+      };
+      break;
+    default:
+      sorted_domain = "event end time";
+  }
+
+  std::vector<std::vector<EventItem>> events_table;
+  size_t max_name_width = 0;
+  for (size_t i = 0; i < events.size(); i++) {
+    std::list<Event> pushed_events;
+    std::vector<EventItem> event_items;
+    std::unordered_map<std::string, int> event_idx;
+
+    for (size_t j = 0; j < events[i].size(); j++) {
+      if (events[i][j].kind() == "push") {
+        pushed_events.push_back(events[i][j]);
+      } else if (events[i][j].kind() == "pop") {
+        std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+        while (rit != pushed_events.rend() &&
+               rit->name() != events[i][j].name()) {
+          ++rit;
+        }
+
+        if (rit != pushed_events.rend()) {
+          double event_time = (g_profiler_place == "CUDA")
+                                  ? rit->CudaElapsedMs(events[i][j])
+                                  : rit->CpuElapsedMs(events[i][j]);
+          std::string event_name =
+              "thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
+          max_name_width = std::max(max_name_width, event_name.size());
+
+          if (event_idx.find(event_name) == event_idx.end()) {
+            event_idx[event_name] = event_items.size();
+            EventItem event_item = {event_name, 1,          event_time,
+                                    event_time, event_time, event_time};
+            event_items.push_back(event_item);
+          } else {
+            int index = event_idx[event_name];
+            event_items[index].calls += 1;
+            // total time
+            event_items[index].total_time += event_time;
+            // min time
+            event_items[index].min_time =
+                std::min(event_time, event_items[index].min_time);
+            // max time
+            event_items[index].max_time =
+                std::max(event_time, event_items[index].max_time);
+          }
+
+          // remove the push marker from the list
+          pushed_events.erase((++rit).base());
+        } else {
+          LOG(WARNING) << "Cannot find the push marker of event \'"
+                       << events[i][j].name()
+                       << "\', which will be ignored in profiling report.";
+        }
+      }
+    }
+    // average time
+    for (auto& item : event_items) {
+      item.ave_time = item.total_time / item.calls;
+    }
+    // sort
+    if (sorted_by != EventSortingKey::kDefault) {
+      std::sort(event_items.begin(), event_items.end(), sorted_func);
+    }
+
+    events_table.push_back(event_items);
+    // log warning if there are events with `push` but without `pop`
+    std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
+    while (rit != pushed_events.rend()) {
+      LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
+                   << "\', which will be ignored in profiling report.";
+      ++rit;
+    }
+  }
+
+  // Print report
+  PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12);
+}
+
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width) {
+  // Output header information
+  std::cout << "\n------------------------->"
+            << "     Profiling Report     "
+            << "<-------------------------\n\n";
+  std::cout << "Place: " << g_profiler_place << std::endl;
+  std::cout << "Time unit: ms" << std::endl;
+  std::cout << "Sorted by " << sorted_domain
+            << " in descending order in the same thread\n\n";
+  // Output events table
+  std::cout.setf(std::ios::left);
+  std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
+            << "Calls" << std::setw(data_width) << "Total"
+            << std::setw(data_width) << "Min." << std::setw(data_width)
+            << "Max." << std::setw(data_width) << "Ave." << std::endl;
+  for (size_t i = 0; i < events_table.size(); ++i) {
+    for (size_t j = 0; j < events_table[i].size(); ++j) {
+      EventItem& event_item = events_table[i][j];
+      std::cout << std::setw(name_width) << event_item.name
+                << std::setw(data_width) << event_item.calls
+                << std::setw(data_width) << event_item.total_time
+                << std::setw(data_width) << event_item.min_time
+                << std::setw(data_width) << event_item.max_time
+                << std::setw(data_width) << event_item.ave_time << std::endl;
+    }
+  }
+  std::cout << std::endl;
+}
+
+}  // namespace platform
+}  // namespace paddle
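For orientation, PrintProfiler() renders a fixed-width table following the
header row built above; an illustrative report might look like this (all
numbers invented):

  ------------------------->     Profiling Report     <-------------------------

  Place: CPU
  Time unit: ms
  Sorted by total time in descending order in the same thread

  Event               Calls       Total       Min.        Max.        Ave.
  thread0::op_4       3           12.3        3.9         4.3         4.1
  thread0::op_3       3           9.0         2.8         3.2         3.0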
diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h
new file mode 100644
index 0000000000..8de1e6ad29
--- /dev/null
+++ b/paddle/platform/profiler.h
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <forward_list>
+#include <list>
+#include <mutex>
+#include <vector>
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+enum EventKind { kMark, kPushRange, kPopRange };
+
+class Event {
+ public:
+  // The DeviceContext is used to get the CUDA stream.
+  // In CPU profiling mode, nullptr can be passed.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const DeviceContext* dev_ctx);
+
+  std::string kind() const;
+  std::string name() const { return name_; }
+  uint32_t thread_id() const { return thread_id_; }
+  bool has_cuda() const { return has_cuda_; }
+
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event() const { return event_; }
+  int device() const { return device_; }
+#endif
+
+  double CpuElapsedMs(const Event& e) const;
+  double CudaElapsedMs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+  bool has_cuda_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  constexpr static size_t kMB = 1024 * 1024;
+  constexpr static size_t kEventBlockSize = 16 * kMB;
+  constexpr static size_t kEventSize = sizeof(Event);
+  constexpr static size_t kEventAlign = alignof(Event);
+  constexpr static size_t kNumBlock =
+      kEventBlockSize /
+      ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
+
+  template <typename... Args>
+  void Record(Args&&... args) {
+    if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
+      event_blocks.emplace_front();
+      event_blocks.front().reserve(kNumBlock);
+    }
+    event_blocks.front().emplace_back(std::forward<Args>(args)...);
+  }
+
+  std::vector<Event> Reduce() {
+    std::vector<Event> result;
+    for (auto& block : event_blocks) {
+      result.insert(result.begin(), std::make_move_iterator(block.begin()),
+                    std::make_move_iterator(block.end()));
+    }
+    event_blocks.clear();
+    return result;
+  }
+
+  void Clear() { event_blocks.clear(); }
+
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+
+enum ProfilerState {
+  kDisabled,  // disabled state
+  kCPU,       // CPU profiling state
+  kCUDA,      // GPU profiling state
+};
+
+void Mark(const std::string& name, const DeviceContext* dev_ctx);
+
+void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+struct RecordEvent {
+  explicit RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
+
+  ~RecordEvent();
+
+  // The device context is used by Event to get the current CUDA stream.
+  const DeviceContext* dev_ctx_;
+  // Event name
+  std::string name_;
+};
+
+// Return the event lists of all threads. Assuming the returned value is named
+// event_lists, event_lists[i][j] represents the j-th Event of the i-th thread.
+std::vector<std::vector<Event>> GetAllEvents();
+
+// The information of each event given in the profiling report
+struct EventItem {
+  std::string name;
+  int calls;
+  double total_time;
+  double min_time;
+  double max_time;
+  double ave_time;
+};
+
+// Candidate keys to sort the profiling report
+enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
+
+// Enable the profiling function.
+void EnableProfiler(ProfilerState state);
+
+// Clear g_all_event_lists, which holds the event lists of all threads.
+void ResetProfiler();
+
+void DisableProfiler(EventSortingKey sorted_key);
+
+// Parse the event list and output the profiling report
+void ParseEvents(std::vector<std::vector<Event>>&,
+                 EventSortingKey sorted_by = EventSortingKey::kDefault);
+
+// Print results
+void PrintProfiler(std::vector<std::vector<EventItem>>& events_table,
+                   std::string& sorted_domain, const size_t name_width,
+                   const size_t data_width);
+
+}  // namespace platform
+}  // namespace paddle
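As a sanity check on the EventList sizing above: kNumBlock divides the 16 MB
block budget by the aligned event size, so each block holds a fixed number of
events before a new one is pushed. A worked example under an assumed event
layout (the 64-byte size and 8-byte alignment are illustrative, not measured):

  constexpr size_t kMB = 1024 * 1024;
  constexpr size_t kEventBlockSize = 16 * kMB;
  constexpr size_t kEventSize = 64;  // assumed sizeof(Event)
  constexpr size_t kEventAlign = 8;  // assumed alignof(Event)
  constexpr size_t kAligned =
      (kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign;  // 64
  static_assert(kEventBlockSize / kAligned == 262144,
                "each block holds 256K events under these assumptions");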
diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc
new file mode 100644
index 0000000000..81f10c9134
--- /dev/null
+++ b/paddle/platform/profiler_test.cc
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/profiler.h"
+#include "gtest/gtest.h"
+
+TEST(Event, CpuElapsedTime) {
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  Event start_event(EventKind::kPushRange, "test", 0, nullptr);
+  EXPECT_FALSE(start_event.has_cuda());
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
+  EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
+}
+
+#ifdef PADDLE_WITH_CUDA
+TEST(Event, CudaElapsedTime) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+
+  DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
+  Event start_event(EventKind::kPushRange, "test", 0, dev_ctx);
+  EXPECT_TRUE(start_event.has_cuda());
+  int counter = 0;
+  while (counter != 1000) {
+    counter++;
+  }
+  Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
+  EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
+}
+#endif
+
+TEST(RecordEvent, RecordEvent) {
+  using paddle::platform::DeviceContext;
+  using paddle::platform::Event;
+  using paddle::platform::EventKind;
+  using paddle::platform::RecordEvent;
+  using paddle::platform::ProfilerState;
+  using paddle::platform::EventSortingKey;
+
+  ProfilerState state = ProfilerState::kCPU;
+  DeviceContext* dev_ctx = nullptr;
+#ifdef PADDLE_WITH_CUDA
+  using paddle::platform::CUDADeviceContext;
+  using paddle::platform::CUDAPlace;
+  state = ProfilerState::kCUDA;
+  dev_ctx =
+      new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
+#endif
+  EnableProfiler(state);
+
+  /* Usage 1:
+   *  PushEvent(evt_name, dev_ctx);
+   *  ...
+   *  code to be analyzed
+   *  ...
+   *  PopEvent(evt_name, dev_ctx);
+   */
+  for (int loop = 0; loop < 3; ++loop) {
+    for (int i = 1; i < 5; ++i) {
+      std::string name = "op_" + std::to_string(i);
+      PushEvent(name, dev_ctx);
+      int counter = 1;
+      while (counter != i * 1000) counter++;
+      PopEvent(name, dev_ctx);
+    }
+  }
+
+  /* Usage 2:
+   * {
+   *   RecordEvent record_event(name, dev_ctx);
+   *   ...
+   *   code to be analyzed
+   *   ...
+   * }
+   */
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "evs_op_" + std::to_string(i);
+    RecordEvent record_event(name, dev_ctx);
+    int counter = 1;
+    while (counter != i * 1000) counter++;
+  }
+
+  // Bad Usage:
+  PushEvent("event_without_pop", dev_ctx);
+  PopEvent("event_without_push", dev_ctx);
+  std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
+
+  int cuda_startup_count = 0;
+  int start_profiler_count = 0;
+  for (size_t i = 0; i < events.size(); ++i) {
+    for (size_t j = 0; j < events[i].size(); ++j) {
+      if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count;
+      if (events[i][j].name() == "_start_profiler_") ++start_profiler_count;
+      if (events[i][j].name() == "push") {
+        EXPECT_EQ(events[i][j + 1].name(), "pop");
+#ifdef PADDLE_WITH_CUDA
+        EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
+#else
+        EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
+#endif
+      }
+    }
+  }
+  EXPECT_EQ(cuda_startup_count % 5, 0);
+  EXPECT_EQ(start_profiler_count, 1);
+
+  // Will remove parsing-related code from test later
+  DisableProfiler(EventSortingKey::kTotal);
+}
diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h
new file mode 100644
index 0000000000..a88902b164
--- /dev/null
+++ b/paddle/platform/transform.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/hostdevice.h"
+#include "paddle/platform/place.h"
+
+#include <algorithm>
+#include <type_traits>
+#ifdef __NVCC__
+#include <thrust/execution_policy.h>
+#include <thrust/transform.h>
+#include "paddle/platform/details/device_ptr_cast.h"
+#endif
+
+namespace paddle {
+namespace platform {
+
+// Transform on host or device; it provides the same API as the std library.
+template <typename DeviceContext>
+struct Transform {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const DeviceContext& context, InputIter first, InputIter last,
+                  OutputIter result, UnaryOperation op);
+
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const DeviceContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op);
+};
+
+template <>
+struct Transform<platform::CPUDeviceContext> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const platform::CPUDeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
+    std::transform(first, last, result, op);
+  }
+
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const platform::CPUDeviceContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op) {
+    std::transform(first1, last1, first2, result, op);
+  }
+};
+
+#ifdef __NVCC__
+template <>
+struct Transform<platform::CUDADeviceContext> {
+  template <typename InputIter, typename OutputIter, typename UnaryOperation>
+  void operator()(const platform::CUDADeviceContext& context, InputIter first,
+                  InputIter last, OutputIter result, UnaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::DevPtrCast(first), details::DevPtrCast(last),
+                      details::DevPtrCast(result), op);
+  }
+
+  template <typename InputIter1, typename InputIter2, typename OutputIter,
+            typename BinaryOperation>
+  void operator()(const platform::CUDADeviceContext& context, InputIter1 first1,
+                  InputIter1 last1, InputIter2 first2, OutputIter result,
+                  BinaryOperation op) {
+    auto place = context.GetPlace();
+    PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place.");
+    thrust::transform(thrust::cuda::par.on(context.stream()),
+                      details::DevPtrCast(first1), details::DevPtrCast(last1),
+                      details::DevPtrCast(first2), details::DevPtrCast(result),
+                      op);
+  }
+};
+#endif
+
+}  // namespace platform
+}  // namespace paddle
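Since the same functor must compile for both specializations, user-defined
operations should be marked HOSTDEVICE. A minimal CPU-side sketch (the AddBias
functor is hypothetical, not part of the patch):

  template <typename T>
  struct AddBias {
    explicit AddBias(T bias) : bias_(bias) {}
    HOSTDEVICE T operator()(const T &x) const { return x + bias_; }
    T bias_;
  };

  paddle::platform::CPUDeviceContext ctx;
  float buf[3] = {1.f, 2.f, 3.f};
  paddle::platform::Transform<paddle::platform::CPUDeviceContext> trans;
  trans(ctx, buf, buf + 3, buf, AddBias<float>(0.5f));
  // The identical call shape works with CUDADeviceContext inside .cu files.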
diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu
new file mode 100644
index 0000000000..af9204a0a7
--- /dev/null
+++ b/paddle/platform/transform_test.cu
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+#include "paddle/platform/hostdevice.h"
+#include "paddle/platform/transform.h"
+
+template <typename T>
+class Scale {
+ public:
+  explicit Scale(const T& scale) : scale_(scale) {}
+
+  HOSTDEVICE T operator()(const T& a) const { return a * scale_; }
+
+ private:
+  T scale_;
+};
+
+template <typename T>
+class Multiply {
+ public:
+  HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; }
+};
+
+TEST(Transform, CPUUnary) {
+  using namespace paddle::platform;
+  CPUDeviceContext ctx;
+  float buf[4] = {0.1, 0.2, 0.3, 0.4};
+  Transform<paddle::platform::CPUDeviceContext> trans;
+  trans(ctx, buf, buf + 4, buf, Scale<float>(10));
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
+  }
+}
+
+TEST(Transform, GPUUnary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  CUDAPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
+  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
+  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
+  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf), ctx.stream());
+  Transform<paddle::platform::CUDADeviceContext> trans;
+  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  ctx.Wait();
+  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf), ctx.stream());
+  Free(gpu0, gpu_buf);
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_NEAR(cpu_buf[i], static_cast<float>(i + 1), 1e-5);
+  }
+}
+
+TEST(Transform, CPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  int buf[4] = {1, 2, 3, 4};
+  Transform<paddle::platform::CPUDeviceContext> trans;
+  CPUDeviceContext ctx;
+  trans(ctx, buf, buf + 4, buf, buf, Multiply<int>());
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
+  }
+}
+
+TEST(Transform, GPUBinary) {
+  using namespace paddle::platform;
+  using namespace paddle::memory;
+  int buf[4] = {1, 2, 3, 4};
+  CUDAPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
+  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
+  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream());
+  Transform<paddle::platform::CUDADeviceContext> trans;
+  trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  ctx.Wait();
+  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf), ctx.stream());
+  Free(gpu0, gpu_buf);
+  for (int i = 0; i < 4; ++i) {
+    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
+  }
+}
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
new file mode 100644
index 0000000000..ea6ef8fddf
--- /dev/null
+++ b/paddle/platform/variant.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifdef __CUDACC__
+#ifdef __CUDACC_VER_MAJOR__
+// CUDA 9 defines `__CUDACC_VER__` as a warning message, so manually define
+// __CUDACC_VER__ instead.
+#undef __CUDACC_VER__
+
+#define __CUDACC_VER__                                         \
+  (__CUDACC_VER_MAJOR__ * 10000 + __CUDACC_VER_MINOR__ * 100 + \
+   __CUDACC_VER_BUILD__)
+#endif
+
+#endif
+
+#include <boost/config.hpp>
+
+#ifdef PADDLE_WITH_CUDA
+
+// Because boost's variadic templates are buggy under nvcc, boost disables
+// variadic template support when the GPU is enabled on nvcc.
+// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate the same
+// function symbols.
+//
+// https://github.com/PaddlePaddle/Paddle/issues/3386
+#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#endif
+#endif
+
+#include <boost/mpl/comparison.hpp>
+#include <boost/mpl/less_equal.hpp>
+#include <boost/variant.hpp>
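For reference, the reconstructed macro reproduces the old single-number
encoding: with an assumed nvcc 9.0.176, __CUDACC_VER_MAJOR__ = 9,
__CUDACC_VER_MINOR__ = 0 and __CUDACC_VER_BUILD__ = 176 yield
9 * 10000 + 0 * 100 + 176 = 90176.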
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index b7f85ea1a6..f75475a88f 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_network STATIC
 add_style_check_target(paddle_network ${NETWORK_SOURCES})
 add_style_check_target(paddle_network ${NETWORK_HEADERS})
 
-add_dependencies(paddle_network gen_proto_cpp)
+add_dependencies(paddle_network paddle_proto ${external_project_dependencies})
 
 ################### paddle_pserver ######################
 set(PSERVER_SOURCES
@@ -40,19 +40,23 @@ add_library(paddle_pserver STATIC
 add_style_check_target(paddle_pserver ${PSERVER_SOURCES})
 add_style_check_target(paddle_pserver ${PSERVER_HEADERS})
 
-add_dependencies(paddle_pserver gen_proto_cpp)
+add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies})
 
 set(PSERVER_MAIN_SOURCES
     ParameterServer2Main.cpp)
 
-add_executable(paddle_pserver_main
-    ${PSERVER_MAIN_SOURCES})
-link_paddle_exe(paddle_pserver_main)
 if(WITH_TESTING)
   add_subdirectory(test)
 endif()
-install(TARGETS paddle_pserver_main
-    RUNTIME DESTINATION opt/paddle/bin
-    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-        GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
-set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+
+if(NOT MOBILE_INFERENCE)
+  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
+  link_paddle_exe(paddle_pserver_main)
+
+  install(TARGETS paddle_pserver_main
+          RUNTIME DESTINATION opt/paddle/bin
+          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
+
+  set_target_properties(paddle_pserver_main PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+endif()
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 922f25734d..0e8e5a83a4 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -22,7 +22,6 @@ limitations under the License. */
 
 #include <arpa/inet.h>
 #include <net/if.h>
-#include <net/if_arp.h>
 #include <sys/ioctl.h>
 #include <sstream>
 
@@ -50,6 +49,11 @@ DEFINE_int32(sock_recv_buf_size,
              1024 * 1024 * 40,
              "restrict sock recv buff size");
 
+/// A reasonable sock_listen_queue_size controls the maximum number of pending connections.
+DEFINE_int32(sock_listen_queue_size,
+             1024,
+             "listen queue size when pserver listen a TCP port");
+
 namespace paddle {
 
 /**
@@ -130,7 +134,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
   if (rdmaCpu == -1) {
     tcpRdma_ = F_TCP;
     socket_ = 0;
-    maxPendingConnections_ = 100;
+    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
   } else {
     tcpRdma_ = F_RDMA;
     rdmaCpu_ = rdmaCpu;
@@ -142,7 +146,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
   }
 
   /// trigger to initialize RDMA lib
-  PCHECK(RdmaClientDaemons::get()) << "initilizate RDMA failed\n";
+  CHECK(RdmaClientDaemons::get()) << "initialize RDMA failed\n";
 }
 
 SocketServer::~SocketServer() {
@@ -168,7 +172,7 @@ void SocketServer::tcpServer() {
 
   /// First call to socket() function
   socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(socket_ >= 0) << "ERROR opening socket";
+  CHECK(socket_ >= 0) << "ERROR opening socket";
 
   /// Initialize socket structure
   bzero((char *)&serv_addr, sizeof(serv_addr));
@@ -176,7 +180,7 @@ void SocketServer::tcpServer() {
   serv_addr.sin_port = htons(port_);
   if (!addr_.empty()) {
     server = gethostbyname(addr_.c_str());
-    PCHECK(server) << "ERROR, no such host: " << addr_;
+    CHECK(server) << "ERROR, no such host: " << addr_;
     bcopy((char *)server->h_addr,
           (char *)&serv_addr.sin_addr.s_addr,
           server->h_length);
@@ -187,7 +191,7 @@ void SocketServer::tcpServer() {
   setOption(socket_);
 
   /// Now bind the host address using bind() call.
-  PCHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(bind(socket_, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) >= 0)
       << "ERROR on binding " << addr_;
 
   /// Now start listening for the clients, here process will
@@ -201,7 +205,7 @@ void SocketServer::tcpServer() {
     if (stopping_) {
       break;
     }
-    PCHECK(newsockfd >= 0) << "ERROR on accept";
+    CHECK(newsockfd >= 0) << "ERROR on accept";
     constexpr int kPeerNameLen = 128;
     char peerName[kPeerNameLen];
     CHECK(inet_ntop(AF_INET, &cli_addr.sin_addr, peerName, kPeerNameLen));
@@ -227,14 +231,14 @@ void SocketServer::rdmaServer() {
 
   /// First call to socket() function
   rdmaSocket_ = rdma::ssocket(rdmaCpu_);
-  PCHECK(rdmaSocket_) << "ERROR opening RDMA socket";
+  CHECK(rdmaSocket_) << "ERROR opening RDMA socket";
 
-  PCHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
+  CHECK(rdma::bind(rdmaSocket_, rdmaUri_.c_str()) == 0)
       << "ERROR bind RDMA socket";
 
   /// Now start listening for the clients, here process will
   /// go in sleep mode and will wait for the incoming connection
-  PCHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
+  CHECK(rdma::listen(rdmaSocket_) == 0) << "ERROR listen RDMA socket";
 
   while (true) {
     /// Accept actual connection from the client
@@ -242,7 +246,7 @@ void SocketServer::rdmaServer() {
     if (stopping_) {
       break;
     }
-    PCHECK(newsock) << "ERROR on accept";
+    CHECK(newsock) << "ERROR on accept";
 
     constexpr int kPeerNameLen = 128;
     char peerName[kPeerNameLen];
@@ -290,7 +294,7 @@ RdmaClientDaemons::RdmaClientDaemons() {
     onlineCpus_ = rdma::numCpus();
     for (auto i = 0; i < onlineCpus_; i++) {
       socket = rdma::csocket(i);
-      PCHECK(socket) << "ERROR open client socket daemon";
+      CHECK(socket) << "ERROR open client socket daemon";
 
       rdmaClientSocket_.push_back(socket);
     }
@@ -355,7 +359,7 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
 
   /// Create a socket point
   int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(sockfd >= 0) << "ERROR opening socket";
+  CHECK(sockfd >= 0) << "ERROR opening socket";
 
 #if defined(__OSX__) || defined(__APPLE__)
   server = getipnodebyname(serverAddr.c_str(), AF_INET, AI_DEFAULT, &errRet);
@@ -396,8 +400,8 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
       }
       std::this_thread::sleep_for(std::chrono::seconds(1));
     } else {
-      PCHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
-                         << serverPort << "errorno: " << errno;
+      CHECK(errno != 0) << "ERROR connecting to " << serverAddr << ":"
+                        << serverPort << " errno: " << errno;
     }
   } while (errno == ECONNREFUSED);
 
@@ -426,7 +430,7 @@ void SocketClient::RdmaClient(const std::string &serverAddr, int serverPort) {
 
   /// connect to server with socket daemon
   sock = rdma::connect(socketDaemon_, rdmaUri.c_str());
-  PCHECK(sock) << "ERROR connect to server" << rdmaUri;
+  CHECK(sock) << "ERROR connect to server" << rdmaUri;
 
   std::vector<std::string> seg;
   str::split(rdmaUri, '/', &seg);
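For context, maxPendingConnections_ ends up as the backlog argument of
listen(2), which caps the kernel's queue of not-yet-accepted connections. A
standalone POSIX sketch of the mechanism (plain system calls, not Paddle code):

  #include <sys/socket.h>

  int fd = socket(AF_INET, SOCK_STREAM, 0);
  // ... bind(fd, ...) as in SocketServer::tcpServer() ...
  // With this patch the backlog comes from FLAGS_sock_listen_queue_size
  // (default 1024) instead of the hard-coded 100.
  listen(fd, /*backlog=*/1024);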
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index f7e391f763..9562c64986 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -65,7 +65,6 @@ void ParameterClient2::initThreads() {
     LOG(INFO) << "parallel_thread_num dosent need to set";
   }
   syncThreadPool_.reset(new SyncThreadPool(threadNum_));
-
   startThreads();
 }
 
@@ -187,6 +186,7 @@ void ParameterClient2::sendParallel(int tid,
             parameter->getMat(recvParameterType).get());
         CHECK(recvMat);
         size_t width = parameter->getConfig().dims(1);
+        // TODO(wuyi): do we need a lock here? It may also cause a resize.
         buf = recvMat->getLocalRow(block.begin_pos() / width);
       }
       /// sparse_id is not useful while receiving data since sparse data
@@ -224,6 +224,14 @@ void ParameterClient2::prepareSendData(
     request.set_cost(cost);
     request.set_batch_status(batchStatus);
     CHECK_EQ(request.blocks_size(), 0);
+    VLOG(10) << "request: trainer_id: " << request.trainer_id()
+             << " update_mode" << request.update_mode()
+             << " send_back_parameter: " << request.send_back_parameter()
+             << " send_back_parameter_type: "
+             << request.send_back_parameter_type()
+             << " num_samples: " << request.num_samples()
+             << " cost: " << request.cost()
+             << " batch_status: " << request.batch_status();
   }
   for (const auto& segments : parameterSegments) {
     const auto it = parameterMap_.find(segments.id);
@@ -251,11 +259,17 @@ void ParameterClient2::prepareSendData(
       CHECK(sendMat != nullptr) << "sendMat is nullptr";
 
       syncThreadPool_->exec([&](int tid, size_t numThreads) {
+        std::lock_guard<std::mutex> guard(sparseAutoGrowthMutex_);
         const auto& localIndices = prefetchMat->getLocalIndices();
         /// num of sparse rows
         size_t nLocalBlocks = localIndices.size();
         uint64_t beginDim = 0;
         uint64_t endDim = 0;
+
+        // HACK(typhoonzero): let it resize first
+        prefetchMat->getLocalRow(nLocalBlocks);
+        sendMat->getLocalRow(nLocalBlocks);
+
         for (size_t row = 0; row < nLocalBlocks; ++row) {
           int64_t blockId = localIndices[row];  // local row -> sparse row
           int serverId = std::abs((blockId + nameHash) % serviceNum_);
@@ -275,7 +289,6 @@ void ParameterClient2::prepareSendData(
           block->set_begin_pos(row * blockSize);
           /// block len
           block->set_block_size(endDim - beginDim);
-
           if (sendingPara) {
             sendJob->parallelInputIovs[serverId].push_back(
                 {sendMat->getLocalRow(row), sizeof(real) * (size_t)blockSize});
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 89b3ddd502..29b9eeacdd 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -583,6 +583,7 @@ protected:
 #ifndef PADDLE_DISABLE_TIMER
   uint64_t forwardbackwordTime_;
 #endif
+  std::mutex sparseAutoGrowthMutex_;
 
   /// map id to parameter used for decoding protobuf data
   std::unordered_map<size_t, ParameterPtr> parameterMap_;
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 41ac15336d..54f5c4c0fb 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -217,10 +217,6 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
 
   SetConfigResponse response;
   callback(response);
-
-  /// always defined, barrier slowest node function need it.
-  statSet_.reset(new StatSet("ParameterServer" +
-                             str::to_string(static_cast<int>(serverId_))));
 }
 
 real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
@@ -369,50 +365,7 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
                                    std::vector<Buffer>* outputBuffers) {
   VLOG(1) << "pserver: addGradient";
 
-  // forwardbackward delta from all trainers
-  // indicate the fluctuation caused by forwardbackward.
-  if (!numPassFinishClients_) {
-    REGISTER_BARRIER_DELTA_SERVER_SET(
-        *statSet_,
-        "forwardbackwardDelta",
-        FLAGS_num_gradient_servers,
-        request.trainer_id(),
-        request.forwardbackward_time(),
-        isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-  }
-
   {
-    /// approximately pure network overhead
-    REGISTER_TIMER_DYNAMIC_SET(
-        "pushRecv", timeToMicroSecond(*handleRequestBegin_), -1, *statSet_);
-  }
-
-#ifndef PADDLE_DISABLE_TIMER
-  gettimeofday(&(*addGradBegin_), nullptr);
-#endif
-
-  /// barrier fluctuation caused by network and previous forwardbackward
-  if (!numPassFinishClients_) {
-    REGISTER_BARRIER_TIMER_SERVER_SET(
-        *statSet_,
-        "handleReqBegin",
-        FLAGS_num_gradient_servers,
-        request.trainer_id(),
-        (*handleRequestBegin_),
-        isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-  }
-
-  if (!numPassFinishClients_) {
-    REGISTER_BARRIER_TIMER_SERVER(
-        *statSet_,
-        "addGradBegin",
-        FLAGS_num_gradient_servers,
-        request.trainer_id(),
-        isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-  }
-
-  {
-    REGISTER_TIMER_DYNAMIC("addGradCore", -1, *statSet_);
     ReadLockGuard guard(parameterMutex_);
     int bufferIndex = 0;
     for (const auto& block : request.blocks()) {
@@ -444,15 +397,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
       std::lock_guard<std::mutex> guard(*info.lock);
       simd::addTo(gradientSumBuffer, gradientBuffer, size);
     }
-
-    if (!numPassFinishClients_) {
-      REGISTER_BARRIER_TIMER_SERVER(
-          *statSet_,
-          "addGradCoreFinish",
-          FLAGS_num_gradient_servers,
-          request.trainer_id(),
-          isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-    }
   }
   if (request.batch_status() == BATCH_FINISH ||
       request.batch_status() == BATCH_START_AND_FINISH) {
@@ -461,47 +405,12 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
     VLOG(1) << "num samples: " << numSamplesProcessed_
             << ", new cost:" << cost_;
 
-    /// numPassFinishClients_ means some trainer has entered finishPass
-    if (!numPassFinishClients_) {
-      REGISTER_SLOW_NODES_PROBE(
-          *statSet_,
-          "SLOW_NODES",
-          FLAGS_num_gradient_servers,
-          request.trainer_id(),
-          isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-    }
-
     /// notify doOperation gradient ready
     gradientReadyBarrier_.wait();
 
-    /// if wait pass finish does not start, do check
-    if (!numPassFinishClients_) {
-      CHECK_BARRIER_TIMER(*statSet_,
-                          "SLOW_NODES",
-                          FLAGS_num_gradient_servers,
-                          isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-    }
-
-    /// barrier performance while all parameter add is finished
-    /// can indicate the fluctation caused by computation at pserver.
-    if (!numPassFinishClients_) {
-      REGISTER_BARRIER_TIMER_SERVER(
-          *statSet_,
-          "paraReady",
-          FLAGS_num_gradient_servers,
-          request.trainer_id(),
-          isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-    }
     /// wait doOperation finish
     parameterReadyBarrier_.wait();
     VLOG(1) << "start send back";
-    {
-      /// total time except overhead of network.
-      REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecvNoSend",
-                                 timeToMicroSecond(*addGradBegin_),
-                                 -1,
-                                 *statSet_);
-    }
   }
 }
 
@@ -543,57 +452,6 @@ bool ParameterServer2::asyncGrdientCommitCheckAndStat(
   return commitGradient;
 }
 
-void ParameterServer2::printAsyncGradientCommitStatAndReset() {
-  std::stringstream statFormat;
-  if (asyncUpdateSteps_) {
-    statFormat << "async discard gradients stat: " << std::endl;
-    statFormat << "serverId: " << serverId_
-               << " serverType: " << isSparseServer_
-               << " total updates: " << asyncUpdateSteps_
-               << " discard updates: " << asyncLaggedGradientsNum_
-               << " discard ratio: "
-               << (real)asyncLaggedGradientsNum_ / (real)asyncUpdateSteps_;
-    statFormat << std::endl;
-    statFormat << std::endl;
-
-    statFormat << "Async Gradient Update Steps distribution: " << std::endl
-               << "Sample: 1:1912(0.00284449) means "
-               << "the updates step=1 count 1912 times "
-               << "and account for 0.284449% of total updates" << std::endl;
-    size_t index = 0;
-    for (const auto& stat : asyncUpdateStat_) {
-      statFormat << index << ":" << stat << "("
-                 << (real)stat / (real)asyncUpdateSteps_ << ") ";
-      index++;
-    }
-    statFormat << std::endl;
-    statFormat << std::endl;
-
-    statFormat << "Async Gradient Discard based on trainer_id: " << std::endl
-               << "Sample: 2:22(0.0016363) means "
-               << "total discarded updates from trainer_id=2 count 22 "
-               << "and account for 0.16363% of all updates from trainer_id=2"
-               << std::endl;
-    for (auto i = 0; i < FLAGS_num_gradient_servers; i++) {
-      real ratio =
-          (real)asyncTrainerDiscardStat_[i] /
-          (real)(asyncTrainerCommitStat_[i] + asyncTrainerDiscardStat_[i]);
-      statFormat << i << ":" << asyncTrainerDiscardStat_[i] << "(" << ratio
-                 << ")"
-                 << " ";
-    }
-    LOG(INFO) << statFormat.str();
-
-    /// reset stat
-    asyncUpdateSteps_ = 0;
-    asyncTrainerSteps_.assign(asyncTrainerSteps_.size(), 0);
-    asyncLaggedGradientsNum_ = 0;
-    asyncUpdateStat_.assign(asyncUpdateStat_.size(), 0);
-    asyncTrainerDiscardStat_.assign(asyncTrainerDiscardStat_.size(), 0);
-    asyncTrainerCommitStat_.assign(asyncTrainerCommitStat_.size(), 0);
-  }
-}
-
 static ThreadLocal<std::vector<bool>> localBlockBitset_;
 
 void ParameterServer2::asyncSGD(const SendParameterRequest& request,
@@ -695,7 +553,6 @@ void ParameterServer2::asyncSGD(const SendParameterRequest& request,
   if (request.trainer_id() == 0) {
     /// batchId_ is approximately equal to "real batchId_"
     batchId_++;
-    tuningAsyncsgdMidOutput();
   }
 }
 
@@ -881,34 +738,6 @@ void ParameterServer2::sendParameter(const SendParameterRequest& request,
         }
         (*requestVec_).clear();
         (*callbackVec_).clear();
-
-        /// barrier perfromance while all data are send finished.
-        /// indicates network flucatuation for big message.
-        if (!numPassFinishClients_) {
-          REGISTER_BARRIER_TIMER_SERVER(
-              *statSet_,
-              "sendParamFinish",
-              FLAGS_num_gradient_servers,
-              request.trainer_id(),
-              isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
-        }
-        /// all time exhausted in parameterServer for big message.
-        /// it contains network and computation at pserver.
-        {
-          /// total time including overhead of network.
-          REGISTER_TIMER_DYNAMIC_SET("sendParaTotal",
-                                     timeToMicroSecond(*handleRequestBegin_),
-                                     -1,
-                                     *statSet_);
-        }
-        /// all time exhausted in pserverServer except recieve network.
-        {
-          /// total time except overhead of network receive
-          REGISTER_TIMER_DYNAMIC_SET("sendParaNoRecv",
-                                     timeToMicroSecond(*addGradBegin_),
-                                     -1,
-                                     *statSet_);
-        }
       }
       break;
     case PSERVER_UPDATE_MODE_SET_PARAM:
@@ -1088,8 +917,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
   }
 
   {
-    REGISTER_TIMER_DYNAMIC("op_SGD", -1, *statSet_);
-
     parallelExecForEachBlock([&](int64_t blockId, const VectorPtr vecs[]) {
       BlockInfo& info = blockInfos_[blockId];
       const ParameterConfig& config = getParameterConfig(blockId);
@@ -1113,7 +940,6 @@ void ParameterServer2::op_SGD(const Operation& operation,
   }
 
   batchId_++;
-  tuningSgdMidOutput();
 }
 
 void ParameterServer2::op_start_pass(const Operation& operation,
@@ -1146,8 +972,6 @@ void ParameterServer2::op_finish_pass(const Operation& operation,
     /// finish pass
     info.optimizer->finishPass();
   });
-
-  tuningSgdFinished();
   batchId_ = 0;
 }
 
@@ -1208,8 +1032,8 @@ void ParameterServer2::loadValueVector(const LoadValueRequest& request,
   Parameter::Header header;
   CHECK(fs.read(reinterpret_cast<char*>(&header), sizeof(header)))
       << "Fail to read parameters in pserver";
-  CHECK_EQ(header.version, Parameter::kFormatVersion)
-      << "Incorrect format version: " << header.version;
+  CHECK(Parameter::isHeaderFormatSupported(header.format))
+      << "Incorrect format version: " << header.format;
   CHECK_EQ(header.size, (size_t)size_)
       << "The size (" << header.size << ") in the file does not match the size "
       << "(" << size_ << ") of the pserver: " << serverId_;
@@ -1239,7 +1063,8 @@ void ParameterServer2::saveValueVector(const SaveValueRequest& request,
   CpuVector& vec = vectors_[PARAMETER_APPLY] ? *vectors_[PARAMETER_APPLY]
                                              : *vectors_[PARAMETER_VALUE];
   Parameter::Header header;
-  header.version = Parameter::kFormatVersion;
+  // TODO(TJ): save param headerFormat_
+  header.format = PARAM_FORMAT_ORIGINAL;
   header.valueSize = sizeof(real);
   header.size = size_;
 
@@ -1515,7 +1340,6 @@ void ParameterServer2::asyncFinishPass(const SynchronizeRequest& request,
   callback(SynchronizeResponse());
 
   if (request.trainer_id() == 0) {
-    tuningAsyncsgdFinished();
     batchId_ = 0;
   }
 }
@@ -1574,42 +1398,4 @@ void ParameterServer2::releaseMatrix(const ReleaseMatrixRequest& request,
   callback(response);
 }
 
-void ParameterServer2::tuningSgdMidOutput() {
-  if (batchId_ && batchId_ % FLAGS_log_period_server == 0) {
-    LOG(INFO) << "======== Batch=" << batchId_ << "=======";
-    statSet_->setThreadInfo(true);
-    statSet_->printAllStatus();
-    /// not reset raw data for reducing the overhead of performance tuning
-    statSet_->reset(false);
-  }
-}
-
-void ParameterServer2::tuningSgdFinished() {
-  LOG(INFO) << "======== Batch=" << batchId_ << " pass END"
-            << "=======";
-  statSet_->setThreadInfo(true);
-  statSet_->printAllStatus();
-  /**
-   * reset raw data at end of pass since some raw data could be not
-   * complete. Otherwise the raw data will pollute next pass performance
-   * tuning
-   */
-  statSet_->reset();
-}
-
-void ParameterServer2::tuningAsyncsgdMidOutput() {
-#ifndef PADDLE_DISABLE_TIMER
-  if (batchId_ && batchId_ % FLAGS_log_period_server == 0) {
-    LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << "=======";
-    printAsyncGradientCommitStatAndReset();
-  }
-#endif
-}
-
-void ParameterServer2::tuningAsyncsgdFinished() {
-  LOG(INFO) << "======== [not accurate] Batch=" << batchId_ << " pass END"
-            << "=======";
-  printAsyncGradientCommitStatAndReset();
-}
-
 }  // namespace paddle
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 0f5a589590..f7d3587b88 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -298,24 +298,6 @@ protected:
   /// barrier performance tuning sync-sgd required
   std::atomic<int64_t> batchId_;
 
-  /// the beginning of addGradient without network overhead
-  ThreadLocal<struct timeval> addGradBegin_;
-
-  /**
-   * tuning barrier performance
-   * to better control log for sparse and dense parameter,
-   * we use different log entities for different parameterServer
-   * objects.
-   * it will output lots of performance stats to perceive the
-   * overhead of network, fluctuation of computation from
-   * forwardbackward and network, computation from optimization
-   * at pserver end, barrier overhead, etc. to understand tuning
-   * data, focus on the synchronization between addGradient and
-   * doOperation which indirectly call op_SGD operation controlled
-   * by remote updater controller
-   */
-  std::unique_ptr<StatSet> statSet_;
-
 public:
   struct Buffer {
     real* base;
@@ -325,7 +307,6 @@ public:
 protected:
   /// async gradient commit control
   bool asyncGrdientCommitCheckAndStat(const SendParameterRequest& request);
-  void printAsyncGradientCommitStatAndReset();
 
 public:
   /// disable default parameter for overloading
@@ -710,36 +691,6 @@ public:
 
   void op_load(const Operation& operation, OperationResult* result);
   void op_save(const Operation& operation, OperationResult* result);
-
-  /**
-   * @brief output log in at the middle stage of training
-   *
-   * @note  flush log histroy and state at the end for sgd
-   */
-  void tuningSgdMidOutput();
-
-  /**
-   * @brief output log in at the end stage of training
-   *
-   * @note  flush log histroy and state at the end for sgd. it will also
-   *        flush some stateful stat for next pass.
-   */
-  void tuningSgdFinished();
-
-  /**
-   * @brief output log in at the middle stage of training
-   *
-   * @note  flush log histroy and state at the end for async-sgd.
-   *        it will log some performance log if some lagged node are found
-   */
-  void tuningAsyncsgdMidOutput();
-
-  /**
-   * @brief output log in at the end stage of training
-   *
-   * @note  flush log histroy and state at the end for async-sgd.
-   */
-  void tuningAsyncsgdFinished();
 };
 
 }  // namespace paddle
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
index 0599889164..12e3bc6552 100644
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -51,7 +51,7 @@ size_t SocketChannel::read(void* buf, size_t size) {
     else
       len = rdma::read(rdmaSocket_, (char*)buf + total, size - total);
 
-    PCHECK(len >= 0) << " peer=" << peerName_;
+    CHECK(len >= 0) << " peer=" << peerName_;
     if (len <= 0) {
       return total;
     }
@@ -69,7 +69,7 @@ size_t SocketChannel::write(const void* buf, size_t size) {
     else
       len = rdma::write(rdmaSocket_, (char*)buf + total, size - total);
 
-    PCHECK(len >= 0) << " peer=" << peerName_;
+    CHECK(len >= 0) << " peer=" << peerName_;
     if (len <= 0) {
       return total;
     }
@@ -98,10 +98,10 @@ static size_t readwritev(IOFunc iofunc,
   while (size < total) {
     ssize_t len =
         iofunc(socket, &iovs[curIov], std::min(iovcnt - curIov, maxiovs));
-    PCHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
-                    << " iovCnt=" << iovcnt
-                    << " iovs[curIov].base=" << iovs[curIov].iov_base
-                    << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
+    CHECK(len > 0) << " peer=" << peerName << " curIov=" << curIov
+                   << " iovCnt=" << iovcnt
+                   << " iovs[curIov].base=" << iovs[curIov].iov_base
+                   << " iovs[curIov].iov_len=" << iovs[curIov].iov_len;
     size += len;
 
     /// restore iovs[curIov] to the original value
@@ -183,7 +183,7 @@ void SocketChannel::writeMessage(const std::vector<struct iovec>& userIovs) {
     header.totalLength += iov.iov_len;
   }
 
-  PCHECK(writev(iovs) == (size_t)header.totalLength);
+  CHECK(writev(iovs) == (size_t)header.totalLength);
 }
 
 std::unique_ptr<MsgReader> SocketChannel::readMessage() {
@@ -194,7 +194,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
     return nullptr;
   }
 
-  PCHECK(len == sizeof(header));
+  CHECK(len == sizeof(header));
 
   std::unique_ptr<MsgReader> msgReader(new MsgReader(this, header.numIovs));
 
@@ -209,7 +209,7 @@ std::unique_ptr<MsgReader> SocketChannel::readMessage() {
 MsgReader::MsgReader(SocketChannel* channel, size_t numBlocks)
     : channel_(channel), blockLengths_(numBlocks), currentBlockIndex_(0) {
   size_t size = numBlocks * sizeof(blockLengths_[0]);
-  PCHECK(channel_->read(&blockLengths_[0], size) == size);
+  CHECK(channel_->read(&blockLengths_[0], size) == size);
 }
 
 void MsgReader::readBlocks(const std::vector<void*>& bufs) {
@@ -223,12 +223,12 @@ void MsgReader::readBlocks(const std::vector<void*>& bufs) {
     ++currentBlockIndex_;
   }
 
-  PCHECK(channel_->readv(&iovs) == totalLength);
+  CHECK(channel_->readv(&iovs) == totalLength);
 }
 
 void MsgReader::readNextBlock(void* buf) {
   CHECK_LT(currentBlockIndex_, blockLengths_.size());
-  PCHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
+  CHECK(channel_->read(buf, getNextBlockLength()) == getNextBlockLength());
   ++currentBlockIndex_;
 }
 
diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt
index 6e8f9c37f6..b66a00ba06 100644
--- a/paddle/pserver/test/CMakeLists.txt
+++ b/paddle/pserver/test/CMakeLists.txt
@@ -3,7 +3,7 @@ add_unittest_without_exec(socket_test
     SocketTest.cpp)
 
 add_test(NAME socket_test
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
         ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10)
 
 ####################### test_ProtoServer ####################
@@ -12,7 +12,7 @@ add_unittest_without_exec(test_ProtoServer
 
 IF(NOT ON_TRAVIS)
     add_test(NAME test_ProtoServer
-        COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
             ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
 ENDIF(NOT ON_TRAVIS)
 
@@ -24,5 +24,5 @@ ENDIF(NOT ON_TRAVIS)
 add_unittest_without_exec(test_ParameterServer2
     test_ParameterServer2.cpp)
 add_test(NAME test_ParameterServer2
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port -n 4
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4
         ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2)
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 066a6c0293..b43461d61b 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -113,7 +113,7 @@ void SocketServer::run() {
 
   /* First call to socket() function */
   socket_ = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(socket_ >= 0) << "ERROR opening socket";
+  CHECK(socket_ >= 0) << "ERROR opening socket";
 
   /* Initialize socket structure */
   bzero((char*)&serv_addr, sizeof(serv_addr));
@@ -122,7 +122,7 @@ void SocketServer::run() {
   serv_addr.sin_port = htons(port_);
 
   /* Now bind the host address using bind() call.*/
-  PCHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(bind(socket_, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
       << "ERROR on binding";
 
   /* Now start listening for the clients, here process will
@@ -134,7 +134,7 @@ void SocketServer::run() {
   while (true) {
     /* Accept actual connection from the client */
     newsockfd = accept(socket_, (struct sockaddr*)&cli_addr, &clilen);
-    PCHECK(newsockfd >= 0) << "ERROR on accept";
+    CHECK(newsockfd >= 0) << "ERROR on accept";
 
     SocketWorker* worker = new SocketWorker(newsockfd);
     worker->start();
@@ -146,17 +146,17 @@ void SocketWorker::run() {
 
   while (true) {
     int64_t n = channel_.readAll(&header, sizeof(header));
-    PCHECK(n == sizeof(header)) << "ERROR reading from socket";
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
 
     buffer_.resize(header.dataLength);
     n = channel_.readAll(&buffer_[0], header.dataLength);
-    PCHECK(n == header.dataLength) << "ERROR reading from socket";
+    CHECK(n == header.dataLength) << "ERROR reading from socket";
 
     /* Write a response to the client */
     n = channel_.writeAll(&header, sizeof(header));
-    PCHECK(n == sizeof(header)) << "ERROR reading from socket";
+    CHECK(n == sizeof(header)) << "ERROR reading from socket";
     n = channel_.writeAll(buffer_.data(), buffer_.size());
-    PCHECK(n == header.dataLength) << "ERROR writing to socket";
+    CHECK(n == header.dataLength) << "ERROR writing to socket";
   }
 }
 
@@ -177,9 +177,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
 
   /* Create a socket point */
   int sockfd = socket(AF_INET, SOCK_STREAM, 0);
-  PCHECK(sockfd >= 0) << "ERROR opening socket";
+  CHECK(sockfd >= 0) << "ERROR opening socket";
   server = gethostbyname(serverAddr.c_str());
-  PCHECK(server) << "ERROR, no such host: " << serverAddr;
+  CHECK(server) << "ERROR, no such host: " << serverAddr;
 
   bzero((char*)&serv_addr, sizeof(serv_addr));
   serv_addr.sin_family = AF_INET;
@@ -189,7 +189,7 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
   serv_addr.sin_port = htons(serverPort);
 
   /* Now connect to the server */
-  PCHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
+  CHECK(connect(sockfd, (sockaddr*)&serv_addr, sizeof(serv_addr)) >= 0)
       << "ERROR connecting";
 
   channel_.reset(new SocketChannel(sockfd));
@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
 
   uint64_t dataSize = FLAGS_dim * sizeof(real);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuVector gpuParam(FLAGS_dim);
   GpuVector gpuGrad(FLAGS_dim);
 #else
@@ -234,18 +234,18 @@ int main(int argc, char** argv) {
     cpuGrad.copyFrom(gpuGrad);
 
     header.dataLength = dataSize;
-    PCHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
+    CHECK(channel->writeAll(&header, sizeof(header)) == sizeof(header))
         << "Client write header error";
 
-    PCHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
+    CHECK(channel->writeAll(cpuGrad.getData(), dataSize) == dataSize)
         << "Client write data error";
 
     /* Now read server response */
-    PCHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
+    CHECK(channel->readAll(&header, sizeof(header)) == sizeof(header))
         << "Client read header error";
 
     CHECK_EQ((uint64_t)header.dataLength, dataSize);
-    PCHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
+    CHECK(channel->readAll(cpuParam.getData(), dataSize) == dataSize)
         << "Client read data error";
 
     gpuParam.copyFrom(cpuParam);
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 04236fda2f..ad8ffed9c1 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
 }
 
 TEST(ProtoServer, extended) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   ProtoClient* client;
   if (FLAGS_rdma_tcp == "rdma")
     client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index edc2e02923..43614b9779 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -109,6 +109,10 @@ class DenseScanner(IScanner):
             if len(self.__shape__) > 3:
                 raise ValueError(
                     "The dimension of input cannot be greater than 3.")
+            if len(self.__shape__) == 0:
+                raise ValueError(
+                    "The input should be a vector; please check your input data."
+                )
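+            # e.g. numpy.array(3.0) has shape (), and without this guard the
+            # reduce() below would raise a confusing TypeError on an empty
+            # shape tuple.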
             self.__dim__ = reduce(lambda x, y: x * y, self.__shape__)
             if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim:
                 raise ValueError(
@@ -140,7 +144,7 @@ class DenseScanner(IScanner):
         if len(self.__shape__) > 1:
             # The last-two dimenstions are the frame height and width.
             # For example, the layout is CHW for 3-D feature of image.
-            # The H and W are the fram height and width.
+            # The H and W are the frame height and width.
             h, w = self.__shape__[-2:]
             argument.setSlotFrameHeight(self.pos, h)
             argument.setSlotFrameWidth(self.pos, w)
diff --git a/paddle/pybind/.clang-format b/paddle/pybind/.clang-format
new file mode 120000
index 0000000000..7d28cb3924
--- /dev/null
+++ b/paddle/pybind/.clang-format
@@ -0,0 +1 @@
+../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
new file mode 100644
index 0000000000..de53fea0dd
--- /dev/null
+++ b/paddle/pybind/CMakeLists.txt
@@ -0,0 +1,13 @@
+if(WITH_PYTHON)
+  cc_library(paddle_pybind SHARED
+    SRCS pybind.cc exception.cc protobuf.cc const_value.cc
+    DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
+    ${GLOB_OP_LIB})
+  if(NOT APPLE AND NOT ANDROID)
+    target_link_libraries(paddle_pybind rt)
+  endif(NOT APPLE AND NOT ANDROID)
+endif(WITH_PYTHON)
+
+if(WITH_DOC)
+  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+endif(WITH_DOC)
diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc
new file mode 100644
index 0000000000..b13ad42ea2
--- /dev/null
+++ b/paddle/pybind/const_value.cc
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "const_value.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindConstValue(pybind11::module& m) {
+  m.def("kEmptyVarName", [] { return framework::kEmptyVarName; });
+  m.def("kTempVarName", [] { return framework::kTempVarName; });
+  m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
+  m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
+}
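+
+// Illustrative Python-side usage, assuming the module is imported as
+// `core` (function names mirror the .def() strings above):
+//   core.kEmptyVarName()   # the reserved empty-variable name constant
+//   core.kGradVarSuffix()  # suffix appended to gradient variable names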
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/const_value.h b/paddle/pybind/const_value.h
new file mode 100644
index 0000000000..3d57c972a9
--- /dev/null
+++ b/paddle/pybind/const_value.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <Python.h>
+#include "paddle/platform/enforce.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+extern void BindConstValue(pybind11::module& m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/exception.cc b/paddle/pybind/exception.cc
new file mode 100644
index 0000000000..e29ac3ebab
--- /dev/null
+++ b/paddle/pybind/exception.cc
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pybind/exception.h"
+
+namespace paddle {
+namespace pybind {
+
+void BindException(pybind11::module& m) {
+  static pybind11::exception<platform::EnforceNotMet> exc(m, "EnforceNotMet");
+  pybind11::register_exception_translator([](std::exception_ptr p) {
+    try {
+      if (p) std::rethrow_exception(p);
+    } catch (const platform::EnforceNotMet& e) {
+      exc(e.what());
+    }
+  });
+
+  m.def("__unittest_throw_exception__", [] { PADDLE_THROW("test exception"); });
+}
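+
+// Illustrative only: once BindException has run, the translated C++
+// exception can be caught from Python, e.g.
+//   try:
+//       core.__unittest_throw_exception__()
+//   except core.EnforceNotMet as e:
+//       print(str(e))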
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/exception.h b/paddle/pybind/exception.h
new file mode 100644
index 0000000000..436ddd5707
--- /dev/null
+++ b/paddle/pybind/exception.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <Python.h>
+#include "paddle/platform/enforce.h"
+#include "pybind11/pybind11.h"
+namespace paddle {
+namespace pybind {
+
+extern void BindException(pybind11::module& m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc
new file mode 100644
index 0000000000..b55ddee176
--- /dev/null
+++ b/paddle/pybind/print_operators_doc.cc
@@ -0,0 +1,148 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <iostream>
+#include <sstream>  // std::stringstream
+#include <string>
+
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/pybind/pybind.h"
+
+std::string Escape(const std::string& s) {
+  std::string r;
+  for (size_t i = 0; i < s.size(); i++) {
+    switch (s[i]) {
+      case '\"':
+        r += "\\\"";
+        break;
+      case '\\':
+        r += "\\\\";
+        break;
+      case '\n':
+        r += "\\n";
+        break;
+      case '\t':
+        r += "\\t";
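+        // intentional fall-through: the '\r' case's break also ends this
+        // case, and bare carriage returns are dropped from the output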
+      case '\r':
+        break;
+      default:
+        r += s[i];
+        break;
+    }
+  }
+  return r;
+}
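+
+// For example, a double quote in the input becomes \" and a newline
+// becomes \n in the output, so the result can be embedded in a JSON
+// string literal; '\r' is the only character removed outright.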
+
+std::string AttrType(paddle::framework::proto::AttrType at) {
+  switch (at) {
+    case paddle::framework::proto::INT:
+      return "int";
+    case paddle::framework::proto::FLOAT:
+      return "float";
+    case paddle::framework::proto::STRING:
+      return "string";
+    case paddle::framework::proto::BOOLEAN:
+      return "bool";
+    case paddle::framework::proto::INTS:
+      return "int array";
+    case paddle::framework::proto::FLOATS:
+      return "float array";
+    case paddle::framework::proto::STRINGS:
+      return "string array";
+    case paddle::framework::proto::BOOLEANS:
+      return "bool array";
+    case paddle::framework::proto::BLOCK:
+      return "block id";
+    case paddle::framework::proto::LONG:
+      return "long";
+  }
+  return "UNKNOWN";  // not possible
+}
+
+void PrintVar(const paddle::framework::proto::OpProto::Var& v,
+              std::stringstream& ss) {
+  ss << " { "
+     << "\n"
+     << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
+     << "   \"comment\" : \"" << Escape(v.comment()) << "\",\n"
+     << "   \"duplicable\" : " << v.duplicable() << ",\n"
+     << "   \"intermediate\" : " << v.intermediate() << "\n"
+     << " },";
+}
+
+void PrintAttr(const paddle::framework::proto::OpProto::Attr& a,
+               std::stringstream& ss) {
+  ss << " { "
+     << "\n"
+     << "   \"name\" : \"" << Escape(a.name()) << "\",\n"
+     << "   \"type\" : \"" << AttrType(a.type()) << "\",\n"
+     << "   \"comment\" : \"" << Escape(a.comment()) << "\",\n"
+     << "   \"generated\" : " << a.generated() << "\n"
+     << " },";
+}
+
+void PrintOpProto(const std::string& type,
+                  const paddle::framework::OpInfo& opinfo,
+                  std::stringstream& ss) {
+  std::cerr << "Processing " << type << "\n";
+
+  const paddle::framework::proto::OpProto* p = opinfo.proto_;
+  if (p == nullptr) {
+    return;  // It is possible that an operator doesn't have OpProto.
+  }
+
+  ss << "{\n"
+     << " \"type\" : \"" << Escape(p->type()) << "\",\n"
+     << " \"comment\" : \"" << Escape(p->comment()) << "\",\n";
+
+  ss << " \"inputs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->inputs_size(); i++) {
+    PrintVar(p->inputs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ], "
+     << "\n";
+
+  ss << " \"outputs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->outputs_size(); i++) {
+    PrintVar(p->outputs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ], "
+     << "\n";
+
+  ss << " \"attrs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->attrs_size(); i++) {
+    PrintAttr(p->attrs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ] "
+     << "\n";
+
+  ss << "},";
+}
+
+int main() {
+  std::stringstream ss;
+  ss << "[\n";
+  for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) {
+    PrintOpProto(iter.first, iter.second, ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << "]\n";
+  std::cout << ss.str();
+}
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
new file mode 100644
index 0000000000..371d6119d4
--- /dev/null
+++ b/paddle/pybind/protobuf.cc
@@ -0,0 +1,289 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pybind/protobuf.h"
+#include <deque>
+#include <iostream>
+#include "paddle/framework/backward.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/program_desc.h"
+#include "paddle/framework/var_desc.h"
+
+// Cast boost::variant for PyBind.
+// Copy from
+// https://github.com/pybind/pybind11/issues/576#issuecomment-269563199
+namespace pybind11 {
+namespace detail {
+
+// Can be replaced by a generic lambda in C++14
+struct variant_caster_visitor : public boost::static_visitor<handle> {
+  return_value_policy policy;
+  handle parent;
+
+  variant_caster_visitor(return_value_policy policy, handle parent)
+      : policy(policy), parent(parent) {}
+
+  template <class T>
+  handle operator()(T const &src) const {
+    return make_caster<T>::cast(src, policy, parent);
+  }
+};
+
+template <class Variant>
+struct variant_caster;
+
+template <template <class...> class V, class... Ts>
+struct variant_caster<V<Ts...>> {
+  using Type = V<Ts...>;
+
+  template <typename T>
+  typename std::enable_if<
+      !std::is_same<T, boost::detail::variant::void_>::value, bool>::type
+  try_load(handle src, bool convert) {
+    auto caster = make_caster<T>();
+    if (!load_success_ && caster.load(src, convert)) {
+      load_success_ = true;
+      value = cast_op<T>(caster);
+      return true;
+    }
+    return false;
+  }
+
+  template <typename T>
+  typename std::enable_if<std::is_same<T, boost::detail::variant::void_>::value,
+                          bool>::type
+  try_load(handle src, bool convert) {
+    return false;
+  }
+
+  bool load(handle src, bool convert) {
+    auto unused = {false, try_load<Ts>(src, convert)...};
+    (void)(unused);
+    return load_success_;
+  }
+
+  static handle cast(Type const &src, return_value_policy policy,
+                     handle parent) {
+    variant_caster_visitor visitor(policy, parent);
+    return boost::apply_visitor(visitor, src);
+  }
+
+  PYBIND11_TYPE_CASTER(Type, _("Variant"));
+  bool load_success_{false};
+};
+
+// Add specialization for concrete variant type
+template <class... Args>
+struct type_caster<boost::variant<Args...>>
+    : variant_caster<boost::variant<Args...>> {};
+
+}  // namespace detail
+}  // namespace pybind11
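+
+// Illustrative sketch (not part of this file) of what the caster above
+// enables: a bound function may return a boost::variant directly, e.g.
+//   m.def("one_of", []() -> boost::variant<int, std::string> {
+//     return std::string("hi");  // arrives in Python as a str
+//   });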
+
+namespace paddle {
+namespace pybind {
+
+using namespace paddle::framework;  // NOLINT
+
+template <typename T>
+static py::bytes SerializeMessage(T &self) {
+  // Check IsInitialized in Python
+  std::string retv;
+  PADDLE_ENFORCE(self.Proto()->SerializePartialToString(&retv),
+                 "Cannot serialize message");
+  return retv;
+}
+
+// Bind Methods
+void BindProgramDesc(py::module &m) {
+  py::class_<ProgramDesc>(m, "ProgramDesc", "")
+      .def(py::init<>())
+      .def("__init__",
+           [](ProgramDesc &self, const ProgramDesc &other) {
+             new (&self) ProgramDesc(other);
+           })
+      .def("__init__",
+           [](ProgramDesc &self, const py::bytes &binary_str) {
+             std::string str(binary_str);
+             new (&self) ProgramDesc(str);
+           })
+      .def("append_block", &ProgramDesc::AppendBlock,
+           py::return_value_policy::reference)
+      .def("append_backward",
+           [](ProgramDesc &program_desc, const VarDesc &target,
+              const std::unordered_set<std::string> &no_grad_vars) {
+             ParamGradInfoMap param_grad_map =
+                 AppendBackward(program_desc, target, no_grad_vars);
+             std::unordered_map<
+                 std::string, std::tuple<std::string /* grad_var_name */,
+                                         int /* block_idx */, int /* op_idx */>>
+                 retv;
+             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
+                  ++it) {
+               const auto &grad_info = it->second;
+               retv[it->first] = std::make_tuple(
+                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
+             }
+             return retv;
+           })
+      .def("block", &ProgramDesc::MutableBlock,
+           py::return_value_policy::reference)
+      .def("num_blocks", &ProgramDesc::Size)
+      .def("serialize_to_string", SerializeMessage<ProgramDesc>)
+      .def("parse_from_string",
+           [](ProgramDesc &program_desc, const std::string &data) {
+             proto::ProgramDesc *desc = program_desc.Proto();
+             PADDLE_ENFORCE(desc->ParseFromString(data),
+                            "Failed to parse ProgramDesc from string. This "
+                            "could be a bug in Paddle.");
+           });
+}
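+
+// Hypothetical Python-side round trip for the bindings above (method
+// names mirror the .def() strings; the `core` module name is assumed):
+//   prog = core.ProgramDesc()
+//   block = prog.block(0)
+//   binary = prog.serialize_to_string()
+//   clone = core.ProgramDesc(binary)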
+
+void BindBlockDesc(py::module &m) {
+  py::class_<BlockDesc>(m, "BlockDesc", "")
+      .def_property_readonly("id", &BlockDesc::ID)
+      .def_property_readonly("parent", &BlockDesc::Parent)
+      .def("append_op", &BlockDesc::AppendOp,
+           py::return_value_policy::reference)
+      .def("prepend_op", &BlockDesc::PrependOp,
+           py::return_value_policy::reference)
+      .def("remove_op", &BlockDesc::RemoveOp)
+      .def("var",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.Var(name);
+           },
+           py::return_value_policy::reference)
+      .def("has_var",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVar(name);
+           })
+      .def("has_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.HasVarRecursive(name);
+           })
+      .def("find_var",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVar(name);
+           },
+           py::return_value_policy::reference)
+      .def("find_var_recursive",
+           [](BlockDesc &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVarRecursive(name);
+           },
+           py::return_value_policy::reference)
+      .def("all_vars", &BlockDesc::AllVars, py::return_value_policy::reference)
+      .def("op_size", &BlockDesc::OpSize)
+      .def("op", &BlockDesc::Op, py::return_value_policy::reference)
+      .def("serialize_to_string", SerializeMessage<BlockDesc>);
+}
+
+void BindVarDsec(py::module &m) {
+  py::enum_<proto::DataType>(m, "DataType", "")
+      .value("BOOL", proto::DataType::BOOL)
+      .value("INT16", proto::DataType::INT16)
+      .value("INT32", proto::DataType::INT32)
+      .value("INT64", proto::DataType::INT64)
+      .value("FP16", proto::DataType::FP16)
+      .value("FP32", proto::DataType::FP32)
+      .value("FP64", proto::DataType::FP64);
+
+  py::class_<VarDesc> var_desc(m, "VarDesc", "");
+  var_desc
+      .def("name",
+           [](const VarDesc &self) {
+             py::bytes name = self.Name();
+             return name;
+           },
+           py::return_value_policy::reference)
+      .def("set_name", &VarDesc::SetName)
+      .def("set_shape", &VarDesc::SetShape)
+      .def("set_dtype", &VarDesc::SetDataType)
+      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("lod_level", &VarDesc::GetLoDLevel)
+      .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("type", &VarDesc::GetType)
+      .def("set_type", &VarDesc::SetType)
+      .def("serialize_to_string", SerializeMessage<VarDesc>)
+      .def("persistable", &VarDesc::Persistable)
+      .def("set_persistable", &VarDesc::SetPersistable);
+
+  py::enum_<proto::VarDesc::VarType>(var_desc, "VarType", "")
+      .value("LOD_TENSOR", proto::VarDesc::LOD_TENSOR)
+      .value("SELECTED_ROWS", proto::VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", proto::VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", proto::VarDesc::FETCH_LIST)
+      .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
+      .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
+      .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
+      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
+}
+
+void BindOpDesc(py::module &m) {
+  py::enum_<proto::AttrType>(m, "AttrType", "")
+      .value("INT", proto::AttrType::INT)
+      .value("INTS", proto::AttrType::INTS)
+      .value("FLOAT", proto::AttrType::FLOAT)
+      .value("FLOATS", proto::AttrType::FLOATS)
+      .value("STRING", proto::AttrType::STRING)
+      .value("STRINGS", proto::AttrType::STRINGS)
+      .value("BOOL", proto::AttrType::BOOLEAN)
+      .value("BOOLS", proto::AttrType::BOOLEANS)
+      .value("BLOCK", proto::AttrType::BLOCK);
+
+  py::class_<OpDesc> op_desc(m, "OpDesc", "");
+  op_desc
+      .def("__init__", [](OpDesc &self) { new (&self) OpDesc(); },
+           py::return_value_policy::reference)
+      .def("copy_from", &OpDesc::CopyFrom)
+      .def("type", &OpDesc::Type)
+      .def("set_type", &OpDesc::SetType)
+      .def("input", &OpDesc::Input)
+      .def("input_names", &OpDesc::InputNames)
+      .def("output", &OpDesc::Output)
+      .def("output_names", &OpDesc::OutputNames)
+      .def("set_input", &OpDesc::SetInput)
+      .def("set_output", &OpDesc::SetOutput)
+      .def("input_arg_names", &OpDesc::InputArgumentNames)
+      .def("output_arg_names", &OpDesc::OutputArgumentNames)
+      .def("rename_input", &OpDesc::RenameInput)
+      .def("rename_output", &OpDesc::RenameOutput)
+      .def("has_attr", &OpDesc::HasAttr)
+      .def("attr_type", &OpDesc::GetAttrType)
+      .def("attr_names", &OpDesc::AttrNames)
+      .def("set_attr", &OpDesc::SetAttr)
+      .def("attr", &OpDesc::GetAttr)
+      .def("set_block_attr", &OpDesc::SetBlockAttr)
+      .def("set_serialized_attr",
+           [](OpDesc &self, const std::string &name,
+              const py::bytes &serialized) {
+             std::string ser(serialized);
+             self.SetAttr(name, ser);
+           })
+      .def("block_attr", &OpDesc::GetBlockAttr)
+      .def("check_attrs", &OpDesc::CheckAttrs)
+      .def("infer_shape", &OpDesc::InferShape)
+      .def("infer_var_type", &OpDesc::InferVarType)
+      .def("serialize_to_string", SerializeMessage<OpDesc>)
+      .def("block", &OpDesc::Block, py::return_value_policy::reference);
+}
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/protobuf.h b/paddle/pybind/protobuf.h
new file mode 100644
index 0000000000..9e747e9ea6
--- /dev/null
+++ b/paddle/pybind/protobuf.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <Python.h>
+#include <fstream>
+#include <vector>
+#include "paddle/platform/variant.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+namespace pybind {
+
+void BindProgramDesc(py::module& m);
+void BindBlockDesc(py::module& m);
+void BindVarDsec(py::module& m);
+void BindOpDesc(py::module& m);
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
new file mode 100644
index 0000000000..490397afdd
--- /dev/null
+++ b/paddle/pybind/pybind.cc
@@ -0,0 +1,503 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/pybind/protobuf.h"
+
+#include <mutex>  // for call_once
+#include <unordered_map>
+#include "paddle/framework/backward.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/init.h"
+#include "paddle/framework/lod_rank_table.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/prune.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/cond_op.h"
+#include "paddle/operators/net_op.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+#include "paddle/platform/profiler.h"
+#include "paddle/pybind/const_value.h"
+#include "paddle/pybind/exception.h"
+#include "paddle/pybind/pybind.h"
+#include "paddle/pybind/tensor_py.h"
+#include "paddle/string/to_string.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
+#include "paddle/platform/gpu_info.h"
+#endif
+
+// disable auto conversion to list in Python
+PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
+
+namespace paddle {
+namespace pybind {
+static size_t UniqueIntegerGenerator(const std::string &prefix) {
+  static std::unordered_map<std::string, std::atomic<size_t>> generators;
+  return generators[prefix].fetch_add(1);
+}
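+
+// Note: a process-wide counter per prefix; the increment itself is atomic
+// (fetch_add), so e.g. repeated core.unique_integer("tmp") calls from
+// Python yield 0, 1, 2, ...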
+
+bool IsCompiledWithCUDA() {
+#ifndef PADDLE_WITH_CUDA
+  return false;
+#else
+  return true;
+#endif
+}
+
+PYBIND11_PLUGIN(core) {
+  py::module m("core", "C++ core of PaddlePaddle");
+
+  // using framework in this function. Since it is inside a function, it will
+  // not cause namespace pollution.
+  using namespace paddle::framework;  // NOLINT
+
+  BindException(m);
+
+  py::class_<Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer(
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
+      .def("get_dims",
+           [](const Tensor &self) { return vectorize(self.dims()); })
+      .def("set_dims",
+           [](Tensor &self, const std::vector<int64_t> &dim) {
+             self.Resize(make_ddim(dim));
+           })
+      .def("set_layout",
+           [](Tensor &self, const std::string &layout) {
+             self.set_layout(StringToDataLayout(layout));
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::CPUPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("alloc_int",
+           [](Tensor &self, paddle::platform::CUDAPlace &place) {
+             self.mutable_data<int>(place);
+           })
+      .def("set", PyCPUTensorSetFromArray<float>)
+      .def("set", PyCPUTensorSetFromArray<int>)
+      .def("set", PyCPUTensorSetFromArray<double>)
+      .def("set", PyCPUTensorSetFromArray<int64_t>)
+      .def("set", PyCPUTensorSetFromArray<bool>)
+#ifdef PADDLE_WITH_CUDA
+      .def("set", PyCUDATensorSetFromArray<float>)
+      .def("set", PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<double>)
+      .def("set", PyCUDATensorSetFromArray<int64_t>)
+      .def("set", PyCUDATensorSetFromArray<bool>)
+#endif
+      .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
+      .def("set_float_element", TensorSetElement<float>)
+      .def("get_float_element", TensorGetElement<float>)
+      .def("set_double_element", TensorSetElement<double>)
+      .def("get_double_element", TensorGetElement<double>)
+      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
+
+  py::class_<LoDTensor, Tensor>(m, "LoDTensor")
+      .def_buffer(
+          [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); })
+      .def(
+          "__init__",
+          [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
+#ifndef PADDLE_WITH_CUDA
+            new (&instance) LoDTensor(lod);
+#else
+             LoD new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             new (&instance) LoDTensor(new_lod);
+#endif
+          })
+      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
+      .def("set_lod",
+           [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
+#ifndef PADDLE_WITH_CUDA
+             self.set_lod(lod);
+#else
+             LoD new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             self.set_lod(new_lod);
+#endif
+           })
+      .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
+#ifndef PADDLE_WITH_CUDA
+        return self.lod();
+#else
+           auto lod = self.lod();
+           std::vector<std::vector<size_t>> new_lod;
+           new_lod.reserve(lod.size());
+           std::transform(lod.begin(), lod.end(), std::back_inserter(new_lod),
+               [](Vector<size_t> item) ->
+                   std::vector<size_t> {
+                 std::vector<size_t> v;
+                 v.reserve(item.size());
+                 std::copy(item.begin(), item.end(), std::back_inserter(v));
+                 return v;
+               });
+           return new_lod;
+#endif
+      });
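+
+  // Illustrative Python-side usage of the LoDTensor binding above
+  // (assuming the module is imported as `core`):
+  //   t = core.LoDTensor()
+  //   t.set_dims([3, 2])
+  //   t.alloc_float(core.CPUPlace())
+  //   t.set_lod([[0, 2, 3]])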
+
+  py::class_<SelectedRows>(m, "SelectedRows")
+      .def("__init__",
+           [](SelectedRows &instance) { new (&instance) SelectedRows(); })
+      .def("__init__",
+           [](SelectedRows &instance, const std::vector<int64_t> rows,
+              const int64_t &height) {
+             new (&instance) SelectedRows(rows, height);
+           })
+      .def("get_tensor",
+           [](SelectedRows &self) { return self.mutable_value(); },
+           py::return_value_policy::reference)
+      .def("set_height", &SelectedRows::set_height)
+      .def("height", &SelectedRows::height)
+      .def("set_rows",
+           [](SelectedRows &self, std::vector<int64_t> rows) {
+#ifndef PADDLE_WITH_CUDA
+             self.set_rows(rows);
+#else
+        Vector<int64_t> new_rows(rows);
+        self.set_rows(new_rows);
+#endif
+           })
+      .def("rows", [](SelectedRows &self) {
+#ifndef PADDLE_WITH_CUDA
+        return self.rows();
+#else
+         auto rows = self.rows();
+         std::vector<int64_t> new_rows;
+         new_rows.reserve(rows.size());
+         std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+         return new_rows;
+#endif
+      });
+
+  py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
+
+All parameters, weights, and gradients are variables in Paddle.
+)DOC")
+      .def("is_int", [](const Variable &var) { return var.IsType<int>(); })
+      .def("set_int",
+           [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
+      .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
+      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
+      .def("set_float",
+           [](Variable &var, float val) -> void {
+             *var.GetMutable<float>() = val;
+           })
+      .def("get_float",
+           [](const Variable &var) -> float { return var.Get<float>(); })
+      .def("get_tensor",
+           [](Variable &self) -> LoDTensor * {
+             return self.GetMutable<LoDTensor>();
+           },
+           py::return_value_policy::reference)
+      .def("get_lod_rank_table",
+           [](Variable &self) { return self.GetMutable<LoDRankTable>(); },
+           py::return_value_policy::reference)
+      .def("get_selected_rows",
+           [](Variable &self) -> SelectedRows * {
+             return self.GetMutable<SelectedRows>();
+           },
+           py::return_value_policy::reference)
+      .def("get_lod_tensor_array",
+           [](Variable &self) { return self.GetMutable<LoDTensorArray>(); },
+           py::return_value_policy::reference)
+#ifdef PADDLE_WITH_CUDA
+      .def("get_communicator",
+           [](Variable &self) -> platform::Communicator * {
+             return self.GetMutable<platform::Communicator>();
+           },
+           py::return_value_policy::reference)
+#endif
+      .def("get_net",
+           [](Variable &self) -> operators::NetOp * {
+             return self.GetMutable<operators::NetOp>();
+           },
+           py::return_value_policy::reference);
+
+  py::class_<Scope>(m, "Scope", "")
+      .def("var",
+           [](Scope &self, const std::string &name) -> Variable * {
+             return self.Var(name);
+           },
+           py::return_value_policy::reference)
+      .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("new_scope", [](Scope &self) -> Scope * { return &self.NewScope(); },
+           py::return_value_policy::reference)
+      .def("drop_kids", &Scope::DropKids);
+
+  //! @note: Be careful! PyBind will return std::string as a unicode, not
+  //! a Python str. If you want a str object, cast it in Python.
+  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
+    std::vector<py::bytes> ret_values;
+    for (auto &iter : OpInfoMap::Instance().map()) {
+      auto &info = iter.second;
+      if (info.HasOpProtoAndChecker()) {
+        std::string str;
+        PADDLE_ENFORCE(
+            info.Proto().SerializeToString(&str),
+            "Failed to serialize OpProto. This could be a bug in Paddle.");
+        ret_values.emplace_back(str);
+      }
+    }
+    return ret_values;
+  });
+  m.def(
+      "get_grad_op_desc", [](const OpDesc &op_desc,
+                             const std::unordered_set<std::string> &no_grad_set,
+                             const std::vector<BlockDesc *> &grad_sub_block) {
+        std::unordered_map<std::string, std::string> grad_to_var;
+        std::vector<std::unique_ptr<OpDesc>> grad_op_descs =
+            framework::OpInfoMap::Instance()
+                .Get(op_desc.Type())
+                .GradOpMaker()(op_desc, no_grad_set, &grad_to_var,
+                               grad_sub_block);
+        std::vector<OpDesc *> grad_op_desc_ptrs(grad_op_descs.size());
+        std::transform(grad_op_descs.begin(), grad_op_descs.end(),
+                       grad_op_desc_ptrs.begin(),
+                       [](std::unique_ptr<OpDesc> &p) { return p.release(); });
+        return std::make_pair(grad_op_desc_ptrs, grad_to_var);
+      });
+  m.def("prune", [](const ProgramDesc &origin,
+                    const std::vector<std::array<size_t, 2>> &targets) {
+    ProgramDesc prog_with_targets(origin);
+    for (const auto &t : targets) {
+      prog_with_targets.MutableBlock(t[0])->Op(t[1])->MarkAsTarget();
+    }
+    proto::ProgramDesc pruned_desc;
+    Prune(*prog_with_targets.Proto(), &pruned_desc);
+    return new ProgramDesc(pruned_desc);
+  });
+  m.def("inference_optimize", [](ProgramDesc &origin) {
+    proto::ProgramDesc pruned_desc;
+    InferenceOptimize(*(origin.Proto()), &pruned_desc);
+    return new ProgramDesc(pruned_desc);
+  });
+  m.def("empty_var_name", []() { return framework::kEmptyVarName; });
+  m.def("grad_var_suffix", []() { return framework::kGradVarSuffix; });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable names in Paddle")
+      .def("empty", []() { return kEmptyVarName; })
+      .def("temp", []() { return kTempVarName; });
+  // clang-format off
+  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::CUDAPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifndef PADDLE_WITH_CUDA
+                    PADDLE_THROW("CUDAPlace is not supported in a CPU-only device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
+                  });
+// clang-format on
+
+#ifdef PADDLE_WITH_CUDA
+  py::class_<platform::Communicator>(m, "Communicator").def(py::init<>());
+#endif
+  py::class_<platform::CUDAPlace>(m, "CUDAPlace")
+      .def(py::init<int>())
+      .def("__str__", string::to_string<const platform::CUDAPlace &>);
+
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CPUPlace &>);
+
+  py::class_<platform::Place>(m, "Place")
+      .def(py::init<>())
+      .def("set_place",
+           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
+             self = cpu_place;
+           })
+      .def("set_place",
+           [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
+             self = gpu_place;
+           });
+
+  py::class_<OperatorBase>(m, "Operator")
+      .def_static("create",
+                  [](py::bytes protobin) {
+                    proto::OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    return OpRegistry::CreateOp(desc);
+                  })
+      .def("backward",
+           [](const OperatorBase &forwardOp,
+              const std::unordered_set<std::string> &no_grad_vars) {
+             return Backward(forwardOp, no_grad_vars).release();
+           })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::CPUPlace &place) { self.Run(scope, place); })
+      .def("run",
+           [](OperatorBase &self, const Scope &scope,
+              const platform::CUDAPlace &place) { self.Run(scope, place); })
+      .def("type",
+           [](const OperatorBase &op) -> std::string { return op.Type(); })
+      .def("outputs",
+           [](const OperatorBase &op)
+               -> std::map<std::string, std::vector<std::string>> {
+                 return op.Outputs();
+               })
+      .def("output_vars",
+           [](const OperatorBase &op) { return op.OutputVars(true); })
+      .def("inputs", [](const OperatorBase &op) { return op.Inputs(); })
+      .def("input_vars", [](const OperatorBase &op) { return op.InputVars(); })
+      .def("__str__", &OperatorBase::DebugString)
+      .def("no_intermediate_outputs",
+           [](const OperatorBase &op) { return op.OutputVars(false); })
+      .def("support_gpu", &OperatorBase::SupportGPU);
+
+  py::class_<operators::NetOp, OperatorBase>(m, "Net")
+      .def_static("create",
+                  []() -> operators::NetOp * {
+                    auto *retv = new operators::NetOp;
+                    retv->SetType("plain_net");
+                    return retv;
+                  })
+      .def("append_op", [](operators::NetOp &self,
+                           const OperatorBase &op) { self.AppendOp(op); })
+      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
+      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
+        self->CompleteAddOp();
+      });
+
+  // cond_op
+  py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
+      .def_static("create",
+                  [](py::bytes protobin) -> operators::CondOp * {
+                    proto::OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    auto cond_op = OpRegistry::CreateOp(desc);
+                    return static_cast<operators::CondOp *>(cond_op.release());
+                  })
+      .def("set_truenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_truenet(net.Clone());
+           })
+      .def("set_falsenet",
+           [](operators::CondOp &self, const operators::NetOp &net) -> void {
+             self.set_falsenet(net.Clone());
+           });
+
+  py::class_<framework::Executor>(m, "Executor")
+      .def(py::init<const platform::Place &>())
+      .def("run",
+           (void (Executor::*)(const ProgramDesc &, Scope *, int, bool, bool)) &
+               Executor::Run);
+
+  m.def("unique_integer", UniqueIntegerGenerator);
+  m.def("init_gflags", framework::InitGflags);
+  m.def("init_glog", framework::InitGLOG);
+  m.def("init_devices", &framework::InitDevices);
+
+  m.def("is_compiled_with_cuda", IsCompiledWithCUDA);
+
+  m.def("set_feed_variable", framework::SetFeedVariable);
+  m.def("get_fetch_variable", framework::GetFetchVariable);
+
+  BindProgramDesc(m);
+  BindBlockDesc(m);
+  BindVarDsec(m);
+  BindOpDesc(m);
+  BindConstValue(m);
+
+  py::class_<framework::LoDRankTable>(m, "LodRankTable")
+      .def("items", [](framework::LoDRankTable &table) {
+        std::vector<std::pair<size_t, size_t>> res;
+        for (auto &item : table.items()) {
+          res.push_back({item.index, item.length});
+        }
+        return res;
+      });
+
+  py::class_<LoDTensorArray>(m, "LoDTensorArray")
+      .def("__getitem__",
+           [](LoDTensorArray &self, size_t i) { return &self.at(i); },
+           py::return_value_policy::reference)
+      .def("__len__", [](LoDTensorArray &self) { return self.size(); })
+      .def("__setitem__",
+           [](LoDTensorArray &self, size_t i, const LoDTensor &t) {
+             PADDLE_ENFORCE_LT(i, self.size());
+             self[i].ShareDataWith(t);
+             self[i].set_lod(t.lod());
+           })
+      .def("append", [](LoDTensorArray &self, const LoDTensor &t) {
+        self.emplace_back();
+        self.back().ShareDataWith(t);
+        self.back().set_lod(t.lod());
+      });
+
+  m.def("op_support_gpu", OpSupportGPU);
+#ifdef PADDLE_WITH_CUDA
+  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
+#endif
+
+  py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
+      .value("kDisabled", platform::ProfilerState::kDisabled)
+      .value("kCPU", platform::ProfilerState::kCPU)
+      .value("kCUDA", platform::ProfilerState::kCUDA)
+      .export_values();
+
+  py::enum_<platform::EventSortingKey>(m, "EventSortingKey", py::arithmetic())
+      .value("kDefault", platform::EventSortingKey::kDefault)
+      .value("kCalls", platform::EventSortingKey::kCalls)
+      .value("kTotal", platform::EventSortingKey::kTotal)
+      .value("kMin", platform::EventSortingKey::kMin)
+      .value("kMax", platform::EventSortingKey::kMax)
+      .value("kAve", platform::EventSortingKey::kAve)
+      .export_values();
+
+  m.def("enable_profiler", platform::EnableProfiler);
+  m.def("disable_profiler", platform::DisableProfiler);
+  m.def("reset_profiler", platform::ResetProfiler);
+  return m.ptr();
+}
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
new file mode 100644
index 0000000000..3b5210e2b9
--- /dev/null
+++ b/paddle/pybind/tensor_py.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/memory/memcpy.h"
+#include "paddle/platform/device_context.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+
+namespace pybind {
+
+namespace details {
+
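+// CastToPyBufferImpl walks the compile-time type list ARGS... by index I.
+// The <false, ...> specialization below terminates the recursion when no
+// listed element type matches the tensor's runtime type.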
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_THROW("This type of tensor cannot be exposed to Python");
+    return py::buffer_info();
+  }
+};
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
+      auto dim_vec = framework::vectorize(tensor.dims());
+      std::vector<size_t> dims_outside;
+      std::vector<size_t> strides;
+      dims_outside.resize(dim_vec.size());
+      strides.resize(dim_vec.size());
+
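+      // Build numpy-style row-major strides in bytes, walking from the
+      // innermost dimension outwards.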
+      size_t prod = 1;
+      for (size_t i = dim_vec.size(); i != 0; --i) {
+        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+        strides[i - 1] = sizeof(CUR_TYPE) * prod;
+        prod *= dims_outside[i - 1];
+      }
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
+        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
+            tensor.dims(), platform::CPUPlace()));
+
+        platform::DeviceContextPool &pool =
+            platform::DeviceContextPool::Instance();
+        auto dev_ctx = static_cast<const platform::CUDADeviceContext *>(
+            pool.Get(tensor.place()));
+
+        paddle::platform::GpuMemcpyAsync(
+            dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(),
+            cudaMemcpyDeviceToHost, dev_ctx->stream());
+#else
+        PADDLE_THROW("'CUDAPlace' is not supported in a CPU-only build.");
+#endif
+      } else if (paddle::platform::is_cpu_place(tensor.place())) {
+        dst_tensor = tensor;
+      }
+      return py::buffer_info(dst_tensor.data<CUR_TYPE>(), sizeof(CUR_TYPE),
+                             py::format_descriptor<CUR_TYPE>::format(),
+                             (size_t)framework::arity(dst_tensor.dims()),
+                             dims_outside, strides);
+    } else {
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+}  // namespace details
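+// Dispatch over the element types that can be exposed to numpy; extend
+// the type list below to support additional dtypes.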
+inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+  auto buffer_info =
+      details::CastToPyBufferImpl<true, 0, float, int, double, int64_t, bool>()(
+          tensor);
+  return buffer_info;
+}
+
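+// Read a single element; a GPU tensor is first copied to a temporary CPU
+// tensor, since device memory cannot be dereferenced from the host.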
+template <typename T>
+T TensorGetElement(framework::Tensor &self, size_t offset) {
+  if (platform::is_cpu_place(self.place())) {
+    return self.data<T>()[offset];
+  } else {
+    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
+    framework::Copy(self, platform::CPUPlace(), dst.get());
+    return dst->data<T>()[offset];
+  }
+}
+
+// TODO(dzhwinter): fix the redundant Tensor allocation and free
+template <typename T>
+void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
+  if (platform::is_gpu_place(self.place())) {
+    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
+    framework::Copy(self, platform::CPUPlace(), dst.get());
+    dst->data<T>()[offset] = elem;
+    framework::Copy(*dst.get(), self.place(), &self);
+
+  } else if (platform::is_cpu_place(self.place())) {
+    self.data<T>()[offset] = elem;
+  }
+}
+
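+// Copy a numpy array into the tensor. The c_style | forcecast flags
+// guarantee a C-contiguous buffer of the requested dtype, so one flat
+// memcpy (or GPU memcpy in the CUDA variant below) suffices.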
+template <typename T>
+void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CUDAPlace &place) {
+  std::vector<int64_t> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(place));
+  paddle::platform::GpuMemcpyAsync(dst, array.data(), sizeof(T) * array.size(),
+                                   cudaMemcpyHostToDevice, dev_ctx->stream());
+}
+#endif
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index 66a46e1883..68cb5a19f9 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -1,17 +1,7 @@
 configure_file(submit_local.sh.in
-    submit_local.sh
+    paddle
     @ONLY)
 
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/paddle DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
-        RENAME paddle)
-
-configure_file(tools/usage_stat/usage.sh
-    usage.sh
-    @ONLY)
-
-install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin
-        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
-        RENAME paddle_usage)
+            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh
new file mode 100755
index 0000000000..af16b84ca8
--- /dev/null
+++ b/paddle/scripts/check_env.sh
@@ -0,0 +1,261 @@
+#!/bin/bash
+
+if [ "`uname -s`" != "Linux" ]; then
+  echo "Current scenario only support in Linux yet!"
+  exit 0
+fi
+
+echo "========================= Hardware Information ========================="
+sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l`
+cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l`
+ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs`
+physical_cores=$((sockets * cores_per_socket))
+virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
+numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs`
+echo "CPU Name               : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`"
+echo "CPU Family             : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`"
+echo "Socket Number          : $sockets"
+echo "Cores Per Socket       : $cores_per_socket"
+echo "Total Physical Cores   : $physical_cores"
+echo "Total Virtual Cores    : $virtual_cores"
+if [ $ht -eq 1 ]; then
+  echo "Hyper Threading        : OFF"
+  if [ $physical_cores -ne $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+else
+  echo "Hyper Threading        : ON"
+  if [ $physical_cores -ge $virtual_cores ]; then
+    echo "Error: HT logical error"
+  fi
+fi
+echo "NUMA Nodes             : $numa_nodes"
+if [ $numa_nodes -lt $sockets ]; then
+  echo "Warning: NUMA node is not enough for the best performance,\
+ at least $sockets"
+fi
+
+echo "-------------------------- Memory Information --------------------------"
+# dmidecode is supported starting from version 2.11
+dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs`
+if [ $dmi_ver -lt 2 ]; then
+  echo "Error: dmidecode unknown or version is too old"
+  exit 0
+fi
+if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then
+  echo "Error: need root to run dmidecode"
+  exit 0
+fi
+max_dimms=0
+num_dimms_installed=0
+for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do
+  num_refered=`dmidecode |grep -wc "$dimm_id"`
+  # the actual dimm id should be referred to only once
+  if [ $num_refered -eq 1 ]; then
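+    # print 1 if any field between this DIMM's record and its Manufacturer
+    # line reads "Unknown", i.e. the slot exists but is not populated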
+    num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0};
+      /Unknown/ {f=1};
+      /Manufacturer/ {if (s==1) {print f; exit 0;}};'`
+    if [ $num_unknown -eq 0 ]; then
+      dimms_installed="$dimms_installed \n $dimm_id"
+      ((num_dimms_installed++))
+    else
+      dimms_uninstalled="$dimms_uninstalled \n $dimm_id"
+    fi
+    ((max_dimms++))
+  fi
+done
+echo "Installed DIMM number  : $num_dimms_installed"
+num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l`
+if [ $num_dimms_installed -ne $num_dimms_mapped ]; then
+  echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped"
+fi
+num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"`
+if [ $num_dimms_installed -ne $num_clock_configed ]; then
+  echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed"
+fi
+echo -e "Installed DIMMs Locator: $dimms_installed"
+echo -e "Not installed DIMMs    : $dimms_uninstalled"
+max_dimm_slots=`dmidecode | grep -c "Bank Locator"`
+echo "DIMMs max slots        : $max_dimm_slots"
+if [ $max_dimms -ne $max_dimm_slots ]; then
+  echo "Error: The max dimm slots do not match the max dimms: $max_dimms"
+fi
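+# older versions of free lack the -h (human readable) flag,
+# so fall back to converting the raw KiB values manually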
+free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'`
+free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $2}'`
+if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then
+  mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+  mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" 
+  swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB"
+  total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB"
+else
+  mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`
+  swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`
+  total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`
+fi
+echo "Memory Size            : $mem_sz"
+echo "Swap Memory Size       : $swap_sz"
+echo "Total Memory Size      : $total_sz"
+echo "Max Memory Capacity    : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`"
+# DIMM frequency
+clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs`
+echo "Configured Clock Speed : $clock_speeds"
+num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l`
+if [ $num_clock_type -ne 1 ]; then
+  echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds"
+fi
+
+echo "-------------------------- Turbo Information  --------------------------"
+scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver`
+echo "Scaling Driver         : $scaling_drive"
+if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then
+  turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo`
+  if [ $turbo -eq 1 ]; then
+    echo "Turbo Status           : OFF"
+  else
+    echo "Turbo Status           : ON"
+  fi
+else
+  echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS"
+  echo "Turbo Status           : Unknown"
+fi
+# cpu frequency
+num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l`
+num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l`
+if [ $num_max_freq -ne 1 ]; then
+  echo "Error: the max_frequency of all CPU should be equal"
+fi
+if [ $num_min_freq -ne 1 ]; then
+  echo "Error: the min_frequency of all CPU should be equal"
+fi
+max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz
+max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz
+min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz
+min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz
+echo "CPU Max Frequency      : $max_freq GHz"
+echo "CPU Min Frequency      : $min_freq GHz"
+# cpu governor
+num_governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l`
+if [ $num_governor -ne 1 ]; then
+  echo "Error: the governor of all CPU should be the same"
+fi
+governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq`
+echo "CPU Freq Governor      : $governor"
+
+
+echo "========================= Software Information ========================="
+echo "BIOS Release Date      : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`"
+echo "OS Version             : `cat /etc/redhat-release`"
+echo "Kernel Release Version : `uname -r`"
+echo "Kernel Patch Version   : `uname -v`"
+echo "GCC Version            :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`"
+if command -v cmake >/dev/null 2>&1; then 
+  cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`
+else
+  cmake_ver=" Not installed"
+fi
+echo "CMake Version          :$cmake_ver"
+echo "------------------ Environment Variables Information -------------------"
+kmp_affinity=`env | grep KMP_AFFINITY`
+omp_dynamic=`env | grep OMP_DYNAMIC`
+omp_nested=`env | grep OMP_NESTED`
+omp_num_threads=`env | grep OMP_NUM_THREADS`
+mkl_num_threads=`env | grep MKL_NUM_THREADS`
+mkl_dynamic=`env | grep MKL_DYNAMIC`
+if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi
+if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi
+if [ ! $omp_nested ]; then omp_nested="unset"; fi
+if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi
+if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi
+if [ ! $mkl_dynamic ]; then mkl_dynamic="unset"; fi
+echo "KMP_AFFINITY           : $kmp_affinity"
+echo "OMP_DYNAMIC            : $omp_dynamic"
+echo "OMP_NESTED             : $omp_nested"
+echo "OMP_NUM_THREADS        : $omp_num_threads"
+echo "MKL_NUM_THREADS        : $mkl_num_threads"
+echo "MKL_DYNAMIC            : $mkl_dynamic"
+# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH
+for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do
+  mkldnn_found=`find $path -name "libmkldnn.so"`
+  if [ "$mkldnn_found" ]; then
+    echo "Found MKL-DNN          : $mkldnn_found"
+  fi
+  mklml_found=`find $path -name "libmklml_intel.so"`
+  if [ "$mklml_found" ]; then
+    echo "Found MKLML            : $mklml_found"
+  fi
+  iomp_found=`find $path -name "libiomp5.so"`
+  if [ "$iomp_found" ]; then
+    echo "Found IOMP             : $iomp_found"
+  fi
+done
+
+# dump all details for fully check
+lscpu > lscpu.dump
+dmidecode > dmidecode.dump
+
+# The expected result would be like:
+# ========================= Hardware Information =========================
+# CPU Name               : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz
+# CPU Family             : 6
+# Socket Number          : 2
+# Cores Per Socket       : 20
+# Total Physical Cores   : 40
+# Total Virtual Cores    : 40
+# Hyper Threading        : OFF
+# NUMA Nodes             : 2
+# -------------------------- Memory Information --------------------------
+# Installed DIMM number  : 12
+# Installed DIMMs Locator:
+#  CPU1_DIMM_A1
+#  CPU1_DIMM_B1
+#  CPU1_DIMM_C1
+#  CPU1_DIMM_D1
+#  CPU1_DIMM_E1
+#  CPU1_DIMM_F1
+#  CPU2_DIMM_A1
+#  CPU2_DIMM_B1
+#  CPU2_DIMM_C1
+#  CPU2_DIMM_D1
+#  CPU2_DIMM_E1
+#  CPU2_DIMM_F1
+# Not installed DIMMs    :
+#  CPU1_DIMM_A2
+#  CPU1_DIMM_B2
+#  CPU1_DIMM_C2
+#  CPU1_DIMM_D2
+#  CPU1_DIMM_E2
+#  CPU1_DIMM_F2
+#  CPU2_DIMM_A2
+#  CPU2_DIMM_B2
+#  CPU2_DIMM_C2
+#  CPU2_DIMM_D2
+#  CPU2_DIMM_E2
+#  CPU2_DIMM_F2
+# DIMMs max slots        : 24
+# Memory Size            : 376G
+# Swap Memory Size       : 4.0G
+# Total Memory Size      : 380G
+# Max Memory Capacity    : 2304 GB
+# Configured Clock Speed : 2666 MHz
+# -------------------------- Turbo Information  --------------------------
+# Scaling Driver         : intel_pstate
+# Turbo Status           : ON
+# CPU Max Frequency      : 3.70 GHz
+# CPU Min Frequency      : 1.00 GHz
+# CPU Freq Governor      : performance
+# ========================= Software Information =========================
+# BIOS Release Date      : 03/10/2017
+# OS Version             : CentOS Linux release 7.3.1611 (Core)
+# Kernel Release Version : 3.10.0-514.el7.x86_64
+# Kernel Patch Version   : #1 SMP Tue Nov 22 16:42:41 UTC 2016
+# GCC Version            : 4.8.5 20150623 (Red Hat 4.8.5-11)
+# CMake Version          : 3.5.2
+# ------------------ Environment Variables Information -------------------
+# KMP_AFFINITY           : unset
+# OMP_DYNAMIC            : unset
+# OMP_NESTED             : unset
+# OMP_NUM_THREADS        : unset
+# MKL_NUM_THREADS        : unset
+# MKL_DYNAMIC            : unset
diff --git a/paddle/scripts/cluster_train/paddle.py b/paddle/scripts/cluster_train/paddle.py
index 9b03ed1d8f..ba313ac6a1 100644
--- a/paddle/scripts/cluster_train/paddle.py
+++ b/paddle/scripts/cluster_train/paddle.py
@@ -80,168 +80,3 @@ def job_prepare(jobdir, data=None):
             #create job dir
             run('rm ' + jobdir + ' -fr && ' + 'mkdir -p ' + jobdir)
             #push data and paddle bin
-            put(data + "/*", jobdir)
-            run("mkdir -p " + log)
-        run('rm -fr ' + log + "/*")
-
-    def set_nodefile(nodeid):
-        '''
-        create nodefile for later usage
-        '''
-        run('echo ' + str(nodeid) + ' > ' + jobdir + '/nodefile')
-
-    execute(job_create_workspace, jobdir, data, hosts=conf.HOSTS)
-    for i in xrange(len(conf.HOSTS)):
-        execute(set_nodefile, i, hosts=conf.HOSTS[i])
-    #clean rubbish caused by exception 
-    with settings(warn_only=True):
-        execute(kill_process, hosts=conf.HOSTS)
-
-
-def job_pserver(jobdir, pids=None):
-    '''
-    start all pservers
-    '''
-    pargs = " --num_gradient_servers=" + str(len(conf.HOSTS))
-    pargs += (" --nics=" + conf.PADDLE_NIC)
-    pargs += " --port=" + str(conf.PADDLE_PORT)
-    pargs += " --ports_num=" + str(conf.PADDLE_PORTS_NUM)
-    #always start sparse pserver by default
-    pargs += " --ports_num_for_sparse=" + str(conf.PADDLE_PORTS_NUM_FOR_SPARSE)
-    pargs += " --comment=" + "paddle_process_by_paddle"
-
-    def start_pserver(jobdir, pargs):
-        '''
-        start pserver process with fabric executor
-        '''
-        with prefix('export LD_LIBRARY_PATH=' + \
-                conf.LD_LIBRARY_PATH + \
-                ':$LD_LIBRARY_PATH'):
-            program = 'paddle pserver'
-            run('cd ' + jobdir + '; '  + \
-                'GLOG_logtostderr=0 GLOG_log_dir="./log" ' + \
-                'nohup ' + \
-                program + " " + pargs + ' > ./log/server.log 2>&1 < /dev/null & ',
-                pty=False)
-
-    execute(start_pserver, jobdir, pargs, hosts=conf.HOSTS)
-
-
-def job_trainer(jobdir, train_args_dict, pids=None):
-    '''
-    start paddle trainer
-    '''
-    args = " --num_gradient_servers=" + str(len(conf.HOSTS))
-    args += " --nics=" + conf.PADDLE_NIC
-    args += " --port=" + str(conf.PADDLE_PORT)
-    args += " --ports_num=" + str(conf.PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for i in xrange(len(conf.HOSTS)):
-        host = conf.HOSTS[i]
-        left = host.find("@")
-        right = host.find(':')
-        left = 0 if left == -1 else left + 1
-        right = len(host) if right == -1 else right
-        ip_string += (socket.gethostbyname(host[left:right]) + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    args += " " + args_ext
-
-    def start_trainer(jobdir, args):
-        '''
-        start trainer process with fabric executor
-        '''
-        with prefix('export LD_LIBRARY_PATH=' + \
-                conf.LD_LIBRARY_PATH + \
-                ':$LD_LIBRARY_PATH'):
-            program = 'paddle train'
-            run('cd ' + jobdir + '; '  + \
-                'GLOG_logtostderr=0 '
-                'GLOG_log_dir="./log" '
-                'nohup ' + \
-                program + " " + args + " > ./log/train.log 2>&1 < /dev/null & ",
-                pty=False)
-
-    for i in xrange(len(conf.HOSTS)):
-        train_args = copy.deepcopy(args)
-        train_args += " --trainer_id=" + str(i)
-        execute(start_trainer, jobdir, train_args, hosts=conf.HOSTS[i])
-
-
-def job_all(job_package, jobdir=None, train_args_dict=None):
-    '''
-    param job_package
-    param train_args_dict
-    '''
-    if jobdir is None:
-        timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
-        jobdir = conf.ROOT_DIR + "/JOB" + timestamp
-    job_prepare(jobdir, job_package)
-    job_pserver(jobdir)
-    time.sleep(5)  #wait until pservers completely start
-    job_trainer(jobdir, train_args_dict)
-    job_clean()
-
-
-def job_clean():
-    '''
-    if starting job failed from paddle internal, the framework always
-    is launched successfully since these process are daemon processes.
-    so this job_clean can alway clean job rubbish process with ctrl+c.
-    '''
-
-    def signal_handler(signal, frame):
-        '''
-        SIGINT handler
-        '''
-
-        def kill_process():
-            run("ps aux \
-                  | grep paddle_process_by_paddle \
-                  | grep -v grep  \
-                  | awk '{print $2}' \
-                  | xargs kill > /dev/null 2>&1")
-
-        with settings(warn_only=True):
-            execute(kill_process, hosts=conf.HOSTS)
-
-    signal.signal(signal.SIGINT, signal_handler)
-    signal.pause()
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        prog="paddle.py", description='simple tool for cluster training')
-    parser.add_argument(
-        '-j',
-        '--job_workspace',
-        required=False,
-        default=None,
-        help='job workspace')
-    parser.add_argument(
-        '-p',
-        '--job_dispatch_package',
-        required=False,
-        default=None,
-        help='job package for dispatching to all other nodes')
-
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-
-    if args.job_workspace is not None:
-        #if assigned workspace, do not need to dispatch data,
-        #so job_local_package should be None
-        assert args.job_dispatch_package is None
-        job_all(None, args.job_workspace, train_args_dict)
-    elif args.job_dispatch_package is not None:
-        assert args.job_workspace is None
-        assert os.path.isdir(args.job_dispatch_package)
-        job_all(args.job_dispatch_package, None, train_args_dict)
-    else:
-        print "--job_workspace or --job_dispatch_package should be set"
diff --git a/v1_api_demo/mnist/train.sh b/paddle/scripts/cluster_train_v2/fabric/conf.py
old mode 100755
new mode 100644
similarity index 52%
rename from v1_api_demo/mnist/train.sh
rename to paddle/scripts/cluster_train_v2/fabric/conf.py
index ca2b1ad9eb..e96503d093
--- a/v1_api_demo/mnist/train.sh
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
@@ -1,4 +1,3 @@
-#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,21 +11,29 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
-config=vgg_16_mnist.py
-output=./mnist_vgg_model
-log=train.log
 
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=0 \
---trainer_count=1 \
---num_passes=100 \
---save_dir=$output \
-2>&1 | tee $log
-paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
-
-python -m paddle.utils.plotcurve -i $log > plot.png
+HOSTS = [
+    "root@10.1.9.7",
+    "root@10.1.18.7",
+    "root@10.1.32.9",
+]
+'''
+workspace configuration
+'''
+#root dir for the workspace; can be set to any directory under a real user account
+ROOT_DIR = "/root"
+'''
+network configuration
+'''
+#pserver nics
+PADDLE_NIC = "eth0"
+#pserver port
+PADDLE_PORT = 7164
+#pserver ports num
+PADDLE_PORTS_NUM = 1
+#pserver sparse ports num
+PADDLE_PORTS_NUM_FOR_SPARSE = 1
+#whether the trainer uses gpu
+PADDLE_USE_GPU = "False"
+#environment settings for all processes in the cluster job
+LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
new file mode 100644
index 0000000000..6606c01265
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
@@ -0,0 +1,11 @@
+FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
+RUN apt-get update && apt-get install -y openssh-server
+RUN mkdir /var/run/sshd
+
+RUN echo 'root:root' |chpasswd
+
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
new file mode 100644
index 0000000000..0784b2d1b8
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
@@ -0,0 +1,23 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: ssh-servers
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: ssh-servers
+    spec:
+      containers:
+      - name: ssh-servers
+        image: docker.paddlepaddlehub.com/paddlessh
+        resources:
+          limits:
+            cpu: 500m
+            memory: 1Gi
+          requests:
+            cpu: 500m
+            memory: 1Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh
new file mode 100644
index 0000000000..f6324bcb13
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+python paddle.py \
+  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
+  --dot_period=10 \
+  --ports_num_for_sparse=1 \
+  --log_period=50 \
+  --num_passes=5 \
+  --trainer_count=2 \
+  --saving_period=1 \
+  --local=0 \
+  --config=./trainer_config.py \
+  --save_dir=./output \
+  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
new file mode 100644
index 0000000000..c2f631bdf4
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
@@ -0,0 +1,43 @@
+# Build this image:  docker build -t mpi .
+#
+
+FROM paddlepaddle/paddle:0.10.0rc3
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update -y && \
+    apt-get upgrade -y && \
+    apt-get install -y openssh-server zip unzip vim sudo \
+gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
+pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
+mkdir /var/run/sshd && \
+echo 'root:tutorial' | chpasswd && \
+sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+# SSH login fix. Otherwise user is kicked off after login
+sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
+echo "export VISIBLE=now" >> /etc/profile && \
+adduser --disabled-password --gecos "" tutorial && \
+echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
+mkdir /home/tutorial/.ssh/
+
+ENV HOME /home/tutorial
+ENV NOTVISIBLE "in users profile"
+
+# ------------------------------------------------------------
+# Set-Up SSH with our Github deploy key
+# ------------------------------------------------------------
+
+ADD ssh/config /home/tutorial/.ssh/config
+ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
+
+#---------------------------------------------------------------
+#LD_LIBRARY_PATH
+#---------------------------------------------------------------
+
+ENV LD_LIBRARY_PATH /usr/lib/openmpi/lib/
+
+WORKDIR /home/tutorial
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
new file mode 100644
index 0000000000..34835e5eb8
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
@@ -0,0 +1,25 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-header
+  labels:
+    app: mpi-header
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: mpi-header
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-header
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
new file mode 100644
index 0000000000..2fd5cb4d44
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
@@ -0,0 +1,26 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-nodes
+  labels:
+    app: mpi-nodes
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: mpi-nodes
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-nodes
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
+        imagePullPolicy: Always
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
new file mode 100644
index 0000000000..a9ecad07c3
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
@@ -0,0 +1 @@
+StrictHostKeyChecking no
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
new file mode 100644
index 0000000000..23768343ed
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
+1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
+O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
+36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
+mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
+bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
+OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
+TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
+79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
+YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
+mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
+lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
+rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
+DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
+44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
+fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
+cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
+g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
+yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
+PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
+v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
+hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
+sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
+zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
+yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
+-----END RSA PRIVATE KEY-----
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
new file mode 100644
index 0000000000..015f2b42e7
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
new file mode 100644
index 0000000000..c645495448
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# General training configurations
+
+NICS=eth0
+PADDLE_INIT_PORT=7164
+PADDLE_INIT_PORTS_NUM=1
+PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
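+# join the host names listed in ./machines into a comma-separated string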
+PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
+PADDLE_INIT_USE_GPU=False
+
+PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
+PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
+PADDLE_CLUSTER_TRAIN=True
+
+env
+
+# start pserver
+stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
+  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
+  --comment=paddle_cluster_pserver \
+  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &
+
+# start trainer
+# NOTE: train.py will use the above environment variables as configuration
+python train.py &> logs/train.log
+
+# kill background pservers when training finishes
+ps -ef | grep pserver | grep -v grep | awk '{print $2}' | xargs kill
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
deleted file mode 100644
index 91620b1ee7..0000000000
--- a/paddle/scripts/deb/postinst
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-echo "Post install paddle debian package."
-echo "Install some python package used for paddle. You can run "
-echo "  pip install /usr/opt/paddle/share/wheels/*.whl to install them."
-find /usr/ -name '*paddle*.whl' | xargs pip install
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
index 76bc30e59b..f0620498cf 100644
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -2,178 +2,197 @@
 
 ## Goals
 
-We want the building procedure generates Docker images so that we can run PaddlePaddle applications on Kubernetes clusters.
+We want to make the building procedures:
 
-We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker.
+1. Static and easily reproducible.
+1. Generate python `whl` packages that can be widely used across many distributions.
+1. Build different binaries per release to satisfy different environments:
+    - Binaries for different CUDA and CUDNN versions, like CUDA 7.5, 8.0, 9.0
+    - Binaries containing only the capi
+    - Binaries for Python with or without wide unicode support.
+1. Build docker images with PaddlePaddle pre-installed, so that we can run
+PaddlePaddle applications directly in docker or on Kubernetes clusters.
 
-We want to minimize the size of generated Docker images and .deb packages so to reduce the download time.
+To achieve this, we created a repo: https://github.com/PaddlePaddle/buildtools
+which provides several docker images that satisfy the `manylinux1`
+requirements. We can then build PaddlePaddle using these images to generate
+the corresponding `whl` binaries.
 
-We want to encapsulate building tools and dependencies in a *development* Docker image so to ease the tools installation for developers.
+## Run The Build
 
-Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container.
+### Build Environments
 
-We want the procedure and tools also work with testing, continuous integration, and releasing.
+The pre-built build environment images are:
 
+| Image | Tag |
+| ----- | --- |
+| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn5 |
+| paddlepaddle/paddle_manylinux_devel | cuda8.0_cudnn5 |
+| paddlepaddle/paddle_manylinux_devel | cuda7.5_cudnn7 |
+| paddlepaddle/paddle_manylinux_devel | cuda9.0_cudnn7 |
 
-## Docker Images
-
-So we need two Docker images for each version of PaddlePaddle:
-
-1. `paddle:<version>-dev`
-
-   This a development image contains only the development tools and standardizes the building procedure.  Users include:
+### Start Build
 
-   - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
-   - release engineers -- use this to build the official release from certain branch/tag on Github.com.
-   - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
+Choose the docker image that suits your environment and run the following
+command to start a build:
 
-   Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
-
-  The development image should include the following tools:
-
-   - gcc/clang
-   - nvcc
-   - Python
-   - sphinx
-   - woboq
-   - sshd
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" -e "PYTHON_ABI=cp27-cp27mu" paddlepaddle/paddle_manylinux_devel /paddle/paddle/scripts/docker/build.sh
+```
 
-   Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
+After the build finishes, you can find the output `whl` package under
+`build/python/dist`.
 
-1. `paddle:<version>`
+This command mounts the source directory on the host into `/paddle` in the container, then runs the build script `/paddle/paddle/scripts/docker/build.sh`
+in the container. When the script writes to `/paddle/build` in the container, it is in fact writing to `$PWD/build` on the host.
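+
+Once built, the package can be installed with pip. A minimal sketch (the exact
+file name depends on your build options and version):
+
+```bash
+pip install build/python/dist/paddlepaddle-*.whl
+```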
 
-   This is the production image, generated using the development image. This image might have multiple variants:
+### Build Options
 
-   - GPU/AVX   `paddle:<version>-gpu`
-   - GPU/no-AVX  `paddle:<version>-gpu-noavx`
-   - no-GPU/AVX  `paddle:<version>`
-   - no-GPU/no-AVX  `paddle:<version>-noavx`
+Users can specify the following Docker build arguments with either an "ON" or "OFF" value:
 
-   We allow users to choose between GPU and no-GPU because the GPU version image is much larger than then the no-GPU version.
+| Option | Default | Description |
+| ------ | -------- | ----------- |
+| `WITH_GPU` | OFF | Generates NVIDIA CUDA GPU code and relies on CUDA libraries. |
+| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
+| `WITH_TESTING` | ON | Build unit tests binaries. |
+| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
+| `WITH_GOLANG` | ON | Build fault-tolerant parameter server written in go. |
+| `WITH_SWIG_PY` | ON | Build with SWIG python API support. |
+| `WITH_C_API` | OFF | Build capi libraries for inference. |
+| `WITH_PYTHON` | ON | Build with python support. Turn this off if the build is only for the capi. |
+| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
+| `PYTHON_ABI` | "" | Build for a specific python ABI; can be cp27-cp27m or cp27-cp27mu. |
+| `RUN_TEST` | OFF | Run unit tests immediately after the build. |
+| `WITH_DOC` | OFF | Build documentation after building the binaries. |
+| `WOBOQ` | OFF | Generate the WOBOQ code viewer under `build/woboq_out`. |
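+
+For example, a minimal sketch (the image tag and option values are up to you)
+that builds a CPU-only package with MKL and testing turned off:
+
+```bash
+docker run --rm -v $PWD:/paddle \
+    -e "WITH_GPU=OFF" -e "WITH_MKL=OFF" -e "WITH_TESTING=OFF" \
+    paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 \
+    /paddle/paddle/scripts/docker/build.sh
+```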
 
-   We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
 
+## Docker Images
 
-## Development Environment
+You can get the latest PaddlePaddle docker images by running
+`docker pull paddlepaddle/paddle:<version>`, or build one yourself.
 
-Here we describe how to use above two images.  We start from considering our daily development environment.
+### Official Docker Releases
 
-Developers work on a computer, which is usually a laptop or desktop:
+Official docker images are available
+[here](https://hub.docker.com/r/paddlepaddle/paddle/tags/);
+you can choose either `latest` or an image with a release tag like `0.10.0`.
+Currently available tags are:
 
-<img src="doc/paddle-development-environment.png" width=500 />
+|   Tag  | Description |
+| ------ | --------------------- |
+| latest | latest CPU only image |
+| latest-gpu | latest binary with GPU support |
+| 0.10.0 | release 0.10.0 CPU only binary image |
+| 0.10.0-gpu | release 0.10.0 with GPU support |
 
-or, they might rely on a more sophisticated box (like with GPUs):
+### Build Your Own Image
 
-<img src="doc/paddle-development-environment-gpu.png" width=500 />
+Building a PaddlePaddle docker image is quite simple since PaddlePaddle can
+be installed by just running `pip install`. A sample `Dockerfile` is:
 
-A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
+```dockerfile
+FROM nvidia/cuda:7.5-cudnn5-runtime-centos6
+RUN yum install -y centos-release-SCL
+RUN yum install -y python27
+# This whl package is generated by previous build steps.
+ADD python/dist/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl /
+RUN pip install /paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl && rm -f /*.whl
+```
 
+Then build the image by running `docker build -t [REPO]/paddle:[TAG] .` under
+the directory containing your own `Dockerfile`.
 
-## Usages
+- NOTE: you can choose a different base image for your environment; you can find all available versions [here](https://hub.docker.com/r/nvidia/cuda/).
 
-### Build the Development Docker Image
+### Use Docker Images
 
-The following commands check out the source code to the host and build the development image `paddle:dev`:
+Suppose that you have written an application program `train.py` using
+PaddlePaddle; you can test and run it using docker:
 
 ```bash
-git clone https://github.com/PaddlePaddle/Paddle paddle
-cd paddle
-docker build -t paddle:dev .
+docker run --rm -it -v $PWD:/work paddlepaddle/paddle /work/a.py
 ```
 
-The `docker build` command assumes that `Dockerfile` is in the root source tree.  Note that in this design, this `Dockerfile` is this only one in our repo.
-
-Users can specify a Ubuntu mirror server for faster downloading:
-
-```bash
-docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt .
-```
+But this works only if all dependencies of `train.py` are in the production image. If this is not the case, we need to build a new Docker image based on the production image, with the additional dependencies installed.
 
-### Build PaddlePaddle from Source Code
+### Run PaddlePaddle Book In Docker
 
-Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
+Our [book repo](https://github.com/paddlepaddle/book) also provides a docker
+image that starts a Jupyter notebook inside docker, so that you can run this
+book using docker:
 
 ```bash
-docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" paddle:dev
+docker run -d -p 8888:8888 paddlepaddle/book
 ```
 
-This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
-
-`build.sh` builds the following:
-
-- PaddlePaddle binaries,
-- `$PWD/build/paddle-<version>.deb` for production installation, and
-- `$PWD/build/Dockerfile`, which builds the production Docker image.
+Please refer to https://github.com/paddlepaddle/book if you want to build this
+docker image yourself.
 
-Users can specify the following Docker build arguments with either "ON" or "OFF" value:
-- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
-- `WITH_AVX`: ***Required***. Set to "OFF" prevents from generating AVX instructions. If you don't know what is AVX, you might want to set "ON".
-- `WITH_TEST`: ***Optional, default OFF***. Build unit tests binaries. Once you've built the unit tests, you can run these test manually by the following command:
-  ```bash
-    docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
-  ```
-- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building it.
+### Run Distributed Applications
 
-### Build the Production Docker Image
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
 
-The following command builds the production image:
+Of course, we can manually build an application image and launch the job using the kubectl tool:
 
 ```bash
-docker build -t paddle -f build/Dockerfile ./build
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push
+kubectl ...
 ```
 
-This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime.
+## Docker Images for Developers
 
-### Run PaddlePaddle Applications
+We have a special docker image for developers:
+`paddlepaddle/paddle:<version>-dev`. This image is also generated from
+https://github.com/PaddlePaddle/buildtools.
 
-Again the development happens on the host.  Suppose that we have a simple application program in `a.py`, we can test and run it using the production image:
+This development image contains only the
+development tools and standardizes the building procedure.  Users include:
 
-```bash
-docker run --rm -it -v $PWD:/work paddle /work/a.py
-```
+- developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
+- release engineers -- use this to build the official release from certain branch/tag on Github.com.
+- document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
 
-But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image and with more dependencies installs.
+Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require a different set or version of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
 
-### Build and Run PaddlePaddle Applications
+The development image contains the following tools:
 
-We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, basing on the PaddlePaddle production image:
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
 
-```
-FROM paddlepaddle/paddle:<version>
-RUN pip install -U matplotlib jupyter ...
-COPY . /book
-EXPOSE 8080
-CMD ["jupyter"]
-```
+Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
 
-The book image is an example of PaddlePaddle application image.  We can build it
 
-```bash
-git clone https://github.com/paddlepaddle/book
-cd book
-docker build -t book .
-```
+### Development Workflow
 
-### Build and Run Distributed Applications
+Here we describe how the development workflow goes.  We start by considering our daily development environment.
 
-In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API need to build a PaddlePaddle application into a Docker image as above and calls kubectl to run it on the cluster.  This API might need to generate a Dockerfile look like above and call `docker build`.
+Developers work on a computer, which is usually a laptop or desktop:
 
-Of course, we can manually build an application image and launch the job using the kubectl tool:
+<img src="doc/paddle-development-environment.png" width=500 />
 
-```bash
-docker build -f some/Dockerfile -t myapp .
-docker tag myapp me/myapp
-docker push
-kubectl ...
-```
+or, they might rely on a more sophisticated box (like with GPUs):
+
+<img src="doc/paddle-development-environment-gpu.png" width=500 />
+
+A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
 
 ### Reading source code with woboq codebrowser
+
 For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
 
 - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
 
 ```bash
-docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddle:dev
+docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev
 ```
 
 - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2b48e4dc0f..fbae37b2ca 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -1,159 +1,220 @@
 #!/bin/bash
 
-set -xe
-
-# Set BASE_IMAGE according to env variables
-if [ ${WITH_GPU} == "ON" ]; then
-  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
-else
-  BASE_IMAGE="ubuntu:16.04"
-fi
+function cmake_gen() {
+    mkdir -p /paddle/build
+    cd /paddle/build
+
+    # build script will not fail if *.deb does not exist
+    rm *.deb 2>/dev/null || true
+    # delete previous built whl packages
+    rm -rf /paddle/paddle/dist 2>/dev/null || true
+
+    # Support build for all python versions, currently
+    # including cp27-cp27m and cp27-cp27mu.
+    PYTHON_FLAGS=""
+    if [ "$1" != "" ]; then
+        echo "using python abi: $1"
+        if [ "$1" == "cp27-cp27m" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:}
+            export PATH=/opt/python/cp27-cp27m/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so"
+        elif [ "$1" == "cp27-cp27mu" ]; then
+            export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:}
+            export PATH=/opt/python/cp27-cp27mu/bin/:${PATH}
+            PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python
+        -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7
+        -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so"
+        fi
+    fi
 
-DOCKERFILE_GPU_ENV=""
-DOCKERFILE_CUDNN_DSO=""
-if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
-fi
-
-mkdir -p /paddle/build
-cd /paddle/build
-
-# build script will not fail if *.deb does not exist
-rm *.deb 2>/dev/null || true
-# delete previous built whl packages
-rm -rf /paddle/paddle/dist 2>/dev/null || true
-
-cat <<EOF
-========================================
-Configuring cmake in /paddle/build ...
-      -DCMAKE_BUILD_TYPE=Release
-      -DWITH_DOC=OFF
-      -DWITH_GPU=${WITH_GPU:-OFF}
-      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_SWIG_PY=ON
-      -DCUDNN_ROOT=/usr/
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-      -DWITH_TESTING=${WITH_TESTING:-OFF}
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-========================================
-EOF
-cmake .. \
-      -DCMAKE_BUILD_TYPE=Release \
-      -DWITH_DOC=OFF \
-      -DWITH_GPU=${WITH_GPU:-OFF} \
-      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_SWIG_PY=ON \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
-      -DWITH_TESTING=${WITH_TESTING:-OFF} \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-
-cat <<EOF
-========================================
-Building in /paddle/build ...
-   Build unit tests: ${WITH_TESTING:-OFF}
-========================================
+    cat <<EOF
+    ========================================
+    Configuring cmake in /paddle/build ...
+        -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release}
+        ${PYTHON_FLAGS}
+        -DWITH_DOC=OFF
+        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
+        -DWITH_MKL=${WITH_MKL:-ON}
+        -DWITH_AVX=${WITH_AVX:-OFF}
+        -DWITH_GOLANG=${WITH_GOLANG:-ON}
+        -DWITH_SWIG_PY=ON
+        -DWITH_C_API=${WITH_C_API:-OFF}
+        -DWITH_PYTHON=${WITH_PYTHON:-ON}
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+        -DCUDNN_ROOT=/usr/
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
+        -DWITH_TESTING=${WITH_TESTING:-ON}
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    ========================================
 EOF
-make -j `nproc`
-if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
-    pip uninstall -y py-paddle paddle || true
-    ctest --output-on-failure
-fi
-
-
-cat <<EOF
-========================================
-Installing ...
-========================================
+    # Disable UNITTEST_USE_VIRTUALENV in docker because
+    # docker environment is fully controlled by this script.
+    # See /Paddle/CMakeLists.txt, UNITTEST_USE_VIRTUALENV option.
+    cmake .. \
+        -DCMAKE_BUILD_TYPE=${BUILD_TYPE:-Release} \
+        ${PYTHON_FLAGS} \
+        -DWITH_DOC=OFF \
+        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
+        -DWITH_MKL=${WITH_MKL:-ON} \
+        -DWITH_AVX=${WITH_AVX:-OFF} \
+        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
+        -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
+        -DWITH_C_API=${WITH_C_API:-OFF} \
+        -DWITH_PYTHON=${WITH_PYTHON:-ON} \
+        -DCUDNN_ROOT=/usr/ \
+        -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
+        -DWITH_TESTING=${WITH_TESTING:-ON} \
+        -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+}
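+
+# A minimal usage sketch (illustrative): the driver at the bottom of this
+# script calls `cmake_gen ${PYTHON_ABI:-""}`, so selecting the wide-unicode
+# CPython ABI amounts to
+#
+#   PYTHON_ABI=cp27-cp27mu ./paddle/scripts/docker/build.sh
+#
+# which exports the cp27-cp27mu PATH/LD_LIBRARY_PATH and passes the
+# matching -DPYTHON_* flags to cmake.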
+
+function run_build() {
+    cat <<EOF
+    ============================================
+    Building in /paddle/build ...
+    ============================================
 EOF
-make install
-pip install /usr/local/opt/paddle/share/wheels/*.whl
-paddle version
-
+    make -j `nproc`
+}
 
-# To build documentation, we need to run cmake again after installing
-# PaddlePaddle.  This awkwardness is due to
-# https://github.com/PaddlePaddle/Paddle/issues/1854.  It also
-# describes a solution.
-if [ ${WITH_DOC} == "ON" ]; then
+function run_test() {
+    if [ ${WITH_TESTING:-ON} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
     cat <<EOF
-========================================
-Building documentation ...
-   In /paddle/build_doc
-========================================
+    ========================================
+    Running unit tests ...
+    ========================================
 EOF
-    mkdir -p /paddle/build_doc
-    pushd /paddle/build_doc
-    cmake .. \
-          -DWITH_DOC=ON \
-          -DWITH_GPU=OFF \
-          -DWITH_AVX=${WITH_AVX:-ON} \
-          -DWITH_SWIG_PY=ON \
-          -DWITH_STYLE_CHECK=OFF
-    make paddle_docs paddle_docs_cn
-    popd
-fi
-
+        ctest --output-on-failure
+        # `make install` should also be tested when running the unit tests
+        make install -j `nproc`
+        pip install /usr/local/opt/paddle/share/wheels/*.whl
+        paddle version
+    fi
+}
+
+
+function gen_docs() {
+    if [[ ${WITH_DOC:-OFF} == "ON" ]]; then
+        cat <<EOF
+    ========================================
+    Building documentation ...
+    In /paddle/build_doc
+    ========================================
+EOF
+        mkdir -p /paddle/build_doc
+        pushd /paddle/build_doc
+        cmake .. \
+            -DWITH_DOC=ON \
+            -DWITH_GPU=OFF \
+            -DWITH_AVX=${WITH_AVX:-ON} \
+            -DWITH_SWIG_PY=ON \
+            -DWITH_STYLE_CHECK=OFF
+        make -j `nproc` gen_proto_py
+        make -j `nproc` paddle_python
+        make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` print_operators_doc
+        paddle/pybind/print_operators_doc > doc/en/html/operators.json
+        popd
+    fi
+
+
+    if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
+        cat <<EOF
+    ========================================
+    Converting C++ source code into HTML ...
+    ========================================
+EOF
+        export WOBOQ_OUT=/paddle/build/woboq_out
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+            -b /paddle/build \
+            -a \
+            -o $WOBOQ_OUT \
+            -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    fi
+}
+
+
+function gen_dockerfile() {
+    # Set BASE_IMAGE according to env variables
+    if [[ ${WITH_GPU} == "ON" ]]; then
+    BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+    else
+    BASE_IMAGE="ubuntu:16.04"
+    fi
+
+    DOCKERFILE_GPU_ENV=""
+    DOCKERFILE_CUDNN_DSO=""
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+        DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
+        DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
+    fi
 
-if [[ ${WOBOQ:-OFF} == 'ON' ]]; then
     cat <<EOF
-========================================
-Converting C++ source code into HTML ...
-========================================
+    ========================================
+    Generate /paddle/build/Dockerfile ...
+    ========================================
 EOF
-    export WOBOQ_OUT=/paddle/build/woboq_out
-    mkdir -p $WOBOQ_OUT
-    cp -rv /woboq/data $WOBOQ_OUT/../data
-    /woboq/generator/codebrowser_generator \
-        -b /paddle/build \
-        -a \
-        -o $WOBOQ_OUT \
-        -p paddle:/paddle
-    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-fi
 
-# generate deb package for current build
-# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-cat <<EOF
-========================================
-Generating .deb package ...
-========================================
+    cat > /paddle/build/Dockerfile <<EOF
+    FROM ${BASE_IMAGE}
+    MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+    ENV HOME /root
 EOF
-cpack -D CPACK_GENERATOR='DEB' ..
-
 
-cat <<EOF
-========================================
-Generate /paddle/build/Dockerfile ...
-========================================
+    if [[ ${WITH_GPU} == "ON"  ]]; then
+        NCCL_DEPS="apt-get install -y libnccl-dev &&"
+    else
+        NCCL_DEPS=""
+    fi
+
+    cat >> /paddle/build/Dockerfile <<EOF
+    ADD python/dist/*.whl /
+    # run paddle version to install python packages first
+    RUN apt-get update &&\
+        ${NCCL_DEPS}\
+        apt-get install -y wget python-pip dmidecode python-tk && pip install -U pip && \
+        pip install /*.whl; apt-get install -f -y && \
+        apt-get clean -y && \
+        rm -f /*.whl && \
+        paddle version && \
+        ldconfig
+    ${DOCKERFILE_CUDNN_DSO}
+    ${DOCKERFILE_GPU_ENV}
+    ADD go/cmd/pserver/pserver /usr/bin/
+    ADD go/cmd/master/master /usr/bin/
+    # default command shows the paddle version and exits
+    CMD ["paddle", "version"]
 EOF
+}
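+
+# For the default CPU build (WITH_GPU unset) the generated file starts with
+# the following instructions (reconstructed from the heredocs above; note
+# that the heredoc indentation is carried into the file verbatim):
+#
+#   FROM ubuntu:16.04
+#   MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+#   ENV HOME /root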
+
+function gen_capi_package() {
+  if [[ ${WITH_C_API} == "ON" ]]; then
+    install_prefix="/paddle/build/capi_output"
+    rm -rf $install_prefix
+    make DESTDIR="$install_prefix" install
+    cd $install_prefix/usr/local
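+    # Pack everything under usr/local into /paddle/build/paddle.tgz; the
+    # egrep guards against decorated `ls` output ("Found ... item" lines),
+    # which is an assumption about the filter's intent.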
+    ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
+  fi
+}
 
-cat > /paddle/build/Dockerfile <<EOF
-FROM ${BASE_IMAGE}
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ENV HOME /root
-EOF
+set -xe
 
-if [[ -n ${APT_MIRROR} ]]; then
-cat >> /paddle/build/Dockerfile <<EOF
-RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
-EOF
-fi
+cmake_gen ${PYTHON_ABI:-""}
+run_build
+run_test
+gen_docs
+gen_dockerfile
+gen_capi_package
 
-cat >> /paddle/build/Dockerfile <<EOF
-# Use different deb file when building different type of images
-ADD *.deb /
-# run paddle version to install python packages first
-RUN apt-get update &&\
-    apt-get install -y python-pip && pip install -U pip && \
-    dpkg -i /*.deb ; apt-get install -f -y && \
-    apt-get clean -y && \
-    rm -f /*.deb && \
-    paddle version
-${DOCKERFILE_CUDNN_DSO}
-${DOCKERFILE_GPU_ENV}
-
-# default command shows the paddle version and exit
-CMD ["paddle", "version"]
-EOF
+if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
+  printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" 
+else
+  printf "If you need to install PaddlePaddle in develop docker image,"
+  printf "please make install or pip install build/python/dist/*.whl.\n"
+fi
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index bfa10c9155..cd13073a0c 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,25 +2,89 @@
 
 set -xe
 
-mkdir -p /paddle/build
-cd /paddle/build
-rm -f /paddle/install 2>/dev/null || true
-cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
-      -DANDROID_ABI=armeabi-v7a \
-      -DANDROID_ARM_NEON=ON \
-      -DANDROID_ARM_MODE=ON \
-      -DHOST_C_COMPILER=/usr/bin/gcc \
-      -DHOST_CXX_COMPILER=/usr/bin/g++ \
-      -DCMAKE_INSTALL_PREFIX=/paddle/install \
-      -DCMAKE_BUILD_TYPE=RelWithDebInfo \
-      -DCMAKE_C_FLAGS_RELWITHDEBINFO="-O3" \
-      -DCMAKE_CXX_FLAGS_RELWITHDEBINFO="-O3" \
-      -DWITH_C_API=ON \
-      -DWITH_SWIG_PY=OFF \
-      ..
-make -j `nproc`
-make install
+if [ $ANDROID_ABI == "arm64-v8a" ]; then
+  ANDROID_ARCH=arm64
+  if [ $ANDROID_API -lt 21 ]; then
+    echo "Warning: arm64-v8a requires ANDROID_API >= 21."
+    ANDROID_API=21
+  fi
+else # armeabi, armeabi-v7a
+  ANDROID_ARCH=arm
+fi
+
+ANDROID_STANDALONE_TOOLCHAIN=$ANDROID_TOOLCHAINS_DIR/$ANDROID_ARCH-android-$ANDROID_API
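+# e.g. (illustrative) ANDROID_ABI=arm64-v8a with ANDROID_API=21 yields
+# $ANDROID_TOOLCHAINS_DIR/arm64-android-21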
+
+cat <<EOF
+============================================
+Generating the standalone toolchain ...
+${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh
+      --arch=$ANDROID_ARCH
+      --platform=android-$ANDROID_API
+      --install-dir=${ANDROID_STANDALONE_TOOLCHAIN}
+============================================
+EOF
+${ANDROID_NDK_HOME}/build/tools/make-standalone-toolchain.sh \
+      --arch=$ANDROID_ARCH \
+      --platform=android-$ANDROID_API \
+      --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
+
+BUILD_ROOT=/paddle/build_android
+DEST_ROOT=/paddle/install_android
 
-export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
-paddle version
+rm -rf $BUILD_ROOT 2>/dev/null || true
+mkdir -p $BUILD_ROOT
+cd $BUILD_ROOT
+
+if [ $ANDROID_ABI == "armeabi-v7a" ]; then
+  cmake -DCMAKE_SYSTEM_NAME=Android \
+        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+        -DANDROID_ABI=$ANDROID_ABI \
+        -DANDROID_ARM_NEON=ON \
+        -DANDROID_ARM_MODE=ON \
+        -DHOST_C_COMPILER=/usr/bin/gcc \
+        -DHOST_CXX_COMPILER=/usr/bin/g++ \
+        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
+        -DUSE_EIGEN_FOR_BLAS=ON \
+        -DWITH_C_API=ON \
+        -DWITH_SWIG_PY=OFF \
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+elif [ $ANDROID_ABI == "arm64-v8a" ]; then
+  cmake -DCMAKE_SYSTEM_NAME=Android \
+        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+        -DANDROID_ABI=$ANDROID_ABI \
+        -DANDROID_ARM_MODE=ON \
+        -DHOST_C_COMPILER=/usr/bin/gcc \
+        -DHOST_CXX_COMPILER=/usr/bin/g++ \
+        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
+        -DUSE_EIGEN_FOR_BLAS=OFF \
+        -DWITH_C_API=ON \
+        -DWITH_SWIG_PY=OFF \
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+elif [ $ANDROID_ABI == "armeabi" ]; then
+  cmake -DCMAKE_SYSTEM_NAME=Android \
+        -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+        -DANDROID_ABI=$ANDROID_ABI \
+        -DANDROID_ARM_MODE=ON \
+        -DHOST_C_COMPILER=/usr/bin/gcc \
+        -DHOST_CXX_COMPILER=/usr/bin/g++ \
+        -DCMAKE_INSTALL_PREFIX=$DEST_ROOT \
+        -DCMAKE_BUILD_TYPE=MinSizeRel \
+        -DWITH_C_API=ON \
+        -DWITH_SWIG_PY=OFF \
+        -DWITH_STYLE_CHECK=OFF \
+        ..
+else
+  echo "Invalid ANDROID_ABI: $ANDROID_ABI"
+fi
+
+cat <<EOF
+============================================
+Building in $BUILD_ROOT ...
+============================================
+EOF
+make -j `nproc`
+make install -j `nproc`
diff --git a/paddle/scripts/run_python_tests.sh b/paddle/scripts/run_python_tests.sh
deleted file mode 100755
index 1ed497aaec..0000000000
--- a/paddle/scripts/run_python_tests.sh
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-pushd `dirname $0` > /dev/null
-SCRIPTPATH=$PWD
-popd > /dev/null
-
-USE_VIRTUALENV_FOR_TEST=$1; shift
-PYTHON=$1; shift
-
-if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
-   rm -rf .test_env
-   virtualenv .test_env
-   unset PYTHONHOME
-   unset PYTHONPATH
-   source .test_env/bin/activate
-   PYTHON=python
-fi
-
-$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl
-
-if [ "X${PADDLE_PACKAGE_DIR}" != "X" ]; then
-   $PYTHON -m pip install ${PADDLE_PACKAGE_DIR}/*.whl
-else
-   export PYTHONPATH=$SCRIPTPATH/../../python/
-fi
-
-$PYTHON -m pip install ipython==5.3
-
-for fn in "$@"
-do
-  echo "test $fn"
-  $PYTHON $fn
-  if [ $? -ne 0 ]; then
-    exit 1
-  fi
-done
-
-if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
-    deactivate
-    rm -rf .test_env
-fi
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
old mode 100644
new mode 100755
index 12bf629ea9..80fa0c72af
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,6 +18,8 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
+        echo "    with_mkl: @WITH_MKL@"
+        echo "    with_mkldnn: @WITH_MKLDNN@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
@@ -41,6 +43,81 @@ function ver2num() {
   set +e
 }
 
+function cpu_config() {
+  # Automatically set KMP_AFFINITY and OMP_DYNAMIC based on the
+  # Hyper-Threading status, only when MKL is enabled.
+  if [ "@WITH_MKL@" == "OFF" ]; then
+    return 0
+  fi
+  platform="`uname -s`"
+  ht=0
+  if [ $platform == "Linux" ]; then
+    ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+  elif [ $platform == "Darwin" ]; then
+    if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
+      # HT is OFF
+      ht=1
+    fi
+  else
+    return 0
+  fi
+  if [ $ht -eq 1 ]; then # HT is OFF
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,0,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="FALSE"
+    fi
+  else # HT is ON
+    if [ -z "$KMP_AFFINITY" ]; then
+      export KMP_AFFINITY="granularity=fine,compact,1,0"
+    fi
+    if [ -z "$OMP_DYNAMIC" ]; then
+      export OMP_DYNAMIC="True"
+    fi
+  fi
+}
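+
+# Worked example (illustrative): on a hyper-threaded Linux host, lscpu
+# reports "Thread(s) per core: 2", so ht=2 and the HT-ON branch above
+# exports KMP_AFFINITY="granularity=fine,compact,1,0" and
+# OMP_DYNAMIC="True" unless the caller has already set them.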
+
+function threads_config() {
+  # Automatically set OMP_NUM_THREADS and MKL_NUM_THREADS according to
+  # trainer_count and the total number of processors when MKL is
+  # enabled; set OPENBLAS_NUM_THREADS instead when MKL is not used.
+  platform="`uname -s`"
+  processors=0
+  if [ $platform == "Linux" ]; then
+    processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
+  elif [ $platform == "Darwin" ]; then
+    processors=`sysctl -n hw.logicalcpu`
+  else
+    return 0
+  fi
+  trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
+  if [ -z $trainers ]; then
+    trainers=1
+  fi
+  threads=$((processors / trainers))
+  if [ $threads -eq 0 ]; then
+    threads=1
+  fi
+  if [ "@WITH_MKL@" == "ON" ]; then
+    if [ -z "$OMP_NUM_THREADS" ]; then
+      export OMP_NUM_THREADS=$threads
+    fi
+    if [ -z "$MKL_NUM_THREADS" ]; then
+      export MKL_NUM_THREADS=$threads
+    fi
+  else
+    if [ -z "$OPENBLAS_NUM_THREADS" ]; then
+      export OPENBLAS_NUM_THREADS=$threads
+    fi
+    if [ $threads -gt 1 ] && [ -z "$OPENBLAS_MAIN_FREE" ]; then
+      export OPENBLAS_MAIN_FREE=1
+    fi
+  fi
+}
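+
+# Worked example (illustrative): with 8 logical processors and
+# "--trainer_count=4" among the arguments, threads = 8 / 4 = 2, so the
+# MKL build exports OMP_NUM_THREADS=2 and MKL_NUM_THREADS=2 (or
+# OPENBLAS_NUM_THREADS=2 plus OPENBLAS_MAIN_FREE=1 without MKL).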
+
 PADDLE_CONF_HOME="$HOME/.config/paddle"
 mkdir -p ${PADDLE_CONF_HOME}
 
@@ -54,8 +131,7 @@ if [ -z "${PADDLE_NO_STAT+x}" ]; then
     fi
 fi
 
-
-MYDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+PADDLE_BIN_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 
 if [ ! -z "${DEBUGGER}" ]; then
     echo "Using debug command ${DEBUGGER}"
@@ -91,34 +167,20 @@ else:
   sys.exit(0)
 EOF
 
-if [ $? -eq 1 ]; then  # Older version installed, or not installed at all
-    echo "First time run paddle, need to install some python dependencies."
-    # setuptools normalizes package version, so we need to use normalized
-    # package version for paddle python package
-    PYTHON_PADDLE_VERSION=$(python -c 'import packaging.version
-import setuptools
-print str(packaging.version.Version("@PADDLE_VERSION@"))
-' 2>/dev/null)
-    BASEDIR=$(dirname "$0")
-    pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl
-    if [ $? -ne 0 ]; then
-	echo "pip install wheels failed. "
-	echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
-	echo "PaddlePaddle will install some python dependencies automatically."
-	exit 1
-    fi
-    echo "Python dependencies are installed."
-fi
+cpu_config
+# echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
     "train")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_trainer ${@:2}
+        threads_config $@
+        # echo $OMP_NUM_THREADS $MKL_NUM_THREADS $OPENBLAS_NUM_THREADS
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_trainer ${@:2}
         ;;
     "merge_model")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_merge_model ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_merge_model ${@:2}
         ;;
     "pserver")
-        ${DEBUGGER} $MYDIR/../opt/paddle/bin/paddle_pserver_main ${@:2}
+        ${DEBUGGER} $PADDLE_BIN_PATH/paddle_pserver_main ${@:2}
         ;;
     "dump_config")
         python -m paddle.utils.dump_config ${@:2}
@@ -126,9 +188,6 @@ case "$1" in
     "make_diagram")
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
-    "usage")
-        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
-        ;;
     "version")
         version
         ;;
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index c6cbbc4eef..f9bc8bf63a 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -5,4 +5,4 @@ docker run --rm \
        -e "WITH_AVX=ON" \
        -e "WITH_DOC=ON" \
        -e "WOBOQ=ON" \
-       ${1:-"paddledev/paddle:dev"}
+       ${1:-"paddlepaddle/paddle:latest-dev"}
diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh
deleted file mode 100755
index 7dbd1f5884..0000000000
--- a/paddle/scripts/tools/usage_stat/usage.sh
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/bin/bash
-
-ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
-KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
-# paddle config home dir, same as paddle
-PADDLE_CONF_HOME="$HOME/.config/paddle"
-# api url, mirror url(s) will be append later
-PD_URLS="http://api.paddlepaddle.org/version"
-
-usage()
-{
-    echo "Usage: `basename $0` [options]"
-    echo "Options:"
-    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
-    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
-    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
-    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
-    echo "  -v, -i                            Verbose output and interact with user when necessary"
-    echo " --help                             display this help message"
-}
-
-eval set -- "${ARGPARSE}"
-while true; do
-    case "$1" in
-        -l|--log-file)
-            log_file=$2
-            shift 2
-            ;;
-        -e|--exit-code)
-            exit_code=$2
-            shift 2
-            ;;
-        -u|--git-user)
-            github_user=$2
-            shift 2
-            ;;
-        -n|--task-name)
-            task=$2
-            shift 2
-            ;;
-        -v|-i)
-            v=1
-            shift
-            ;;
-        --dry-run)
-            dry_run=1
-            shift
-            ;;
-        --)
-            shift
-            break
-            ;;
-        --help)
-            usage
-            exit 0
-            ;;
-        *)
-            echo "Invalid option $1"
-            usage
-            exit 1
-            ;;
-    esac
-done
-
-# parse the log_file to get the time costs
-if [ -s "${log_file}" ]; then
-    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
-    {if(index($2,":")==3){
-        t=substr($2,1,8);
-        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
-        if(sec<last_sec-600){day+=1;sec+=86400;}
-        last_sec=sec;
-        if(min_sec==0 || min_sec>sec){min_sec=sec;}
-        if(max_sec==0 || max_sec<sec){max_sec=sec;}
-    }}
-    END{print max_sec-min_sec}' ${log_file}`
-else
-    duration=-1
-fi
-if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
-
-# try find the user/email if not given
-if [ -z "${github_user}" ]; then
-    # search for cached username
-    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
-        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
-        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
-    else
-        # search the github-user from git config
-        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
-        git_username=`git config --get user.name 2>/dev/null`
-        git_url=`git config --get remote.origin.url 2>/dev/null`
-        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
-            # under a git url, like https://github.com/user_xxx/proj_yyy.git
-            if [ "${v}" = "1" ]; then echo " from github url..."; fi
-            github_user=`echo ${git_url} | cut -d "/" -f 4`
-            if [ "${github_user}" = "PaddlePaddle" ]; then
-                github_user=
-            fi
-        fi
-        if [ -n "${git_username}" -a -z "${github_user}" ]; then
-            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
-            github_user=${git_username}
-        fi
-    fi
-fi
-# allow user to set the user name, if it's not found
-if [ -z "${github_user}" -a "${v}" = "1" ]; then
-    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
-    github_user=${REPLY}
-    if [ -z "${github_user}" ]; then
-        # empty input, consider as one anonymous user
-        github_user="${KEEP_ANONYMOUS}"
-    fi
-fi
-if [ -n "${github_user}" -a -z "${dry_run}" ]; then
-    # valid user and not in dry-run mode, then save to cache
-    mkdir -p ${PADDLE_CONF_HOME}
-    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
-fi
-if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
-if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
-    # anonymous user should keep the var empty.
-    github_user=
-fi
-
-# read local paddle version
-paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
-if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
-
-# read local system time
-system_time=`date "+%Y%m%d%H%M%S"`
-if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
-
-# make empty job_name as default value.
-if [ -z "${task}" ]; then
-    task="(unknown_task)"
-fi
-if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
-
-# concat the curl command
-params="content={\"data_type\":\"usage\",\
-\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
-\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
-\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
-}&type=1"
-curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
- -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
-
-if [ "${dry_run}" = "1" ]; then
-    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
-    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
-    exit 0
-else
-    for u in ${PD_URLS}; do
-        curl_cmd="${curl_cmd_prefix} ${u}"
-        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
-        ${curl_cmd} >/dev/null 2>&1
-        if [ $? -eq 0 ]; then
-            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
-            exit 0
-        else
-            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
-        fi
-    done
-    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
-    exit 1
-fi
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
deleted file mode 100755
index f2cbc56165..0000000000
--- a/paddle/scripts/travis/build_and_test.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-source ./common.sh
-
-NPROC=1
-export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
-export PYTHONHOME=/opt/python/2.7.12
-export PATH=/opt/python/2.7.12/bin:${PATH}
-cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-NRPOC=`nproc`
-make -j $NPROC
-make coveralls
-sudo make install
diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh
new file mode 100755
index 0000000000..9da71d1e8c
--- /dev/null
+++ b/paddle/scripts/travis/build_android.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+set -e
+
+ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
+TMP_DIR=$HOME/$JOB/tmp
+mkdir -p $TMP_DIR
+cd $TMP_DIR
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
+$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
+cd $HOME
+rm -rf $TMP_DIR
+
+# Create the build directory for CMake.
+mkdir -p $TRAVIS_BUILD_DIR/build_android
+cd $TRAVIS_BUILD_DIR/build_android
+
+# Compile paddle binaries
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_STYLE_CHECK=OFF \
+      ..
+
+make -j `nproc`
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/build_doc.sh
similarity index 76%
rename from paddle/scripts/travis/docs.sh
rename to paddle/scripts/travis/build_doc.sh
index c784293695..0db8d33bbc 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -1,15 +1,17 @@
 #!/bin/bash
+set -e
+
+# Create the build directory for CMake.
+mkdir -p $TRAVIS_BUILD_DIR/build
+cd $TRAVIS_BUILD_DIR/build
 
-# Add set -e, cd to directory.
-source ./common.sh
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
-mkdir output
-make -j `nproc`
-find .. -name '*whl' | xargs pip install  # install all wheels.
-rm -rf *
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
-make paddle_docs paddle_docs_cn
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
+make -j `nproc` gen_proto_py
+make -j `nproc` paddle_python
+make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` print_operators_doc
+paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
@@ -21,7 +23,7 @@ SSH_REPO=${REPO/https:\/\/github.com\//git@github.com:}
 SHA=`git rev-parse --verify HEAD`
 
 # Documentation branch name
-# gh-pages branch is used for PaddlePaddle.org. The English version of 
+# gh-pages branch is used for PaddlePaddle.org. The English version of
 # documentation in `doc` directory, and the chinese version in `doc_cn`
 # directory.
 TARGET_BRANCH="gh-pages"
@@ -30,6 +32,7 @@ TARGET_BRANCH="gh-pages"
 SOURCE_BRANCH="master"
 
 # Clone the repo to output directory
+mkdir output
 git clone $REPO output
 cd output
 
@@ -47,18 +50,18 @@ function deploy_docs() {
 
   # checkout github page branch
   git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
-  
+
   mkdir -p ${DIR}
   # remove old docs. mv new docs.
   set +e
   rm -rf ${DIR}/doc ${DIR}/doc_cn
   set -e
-  mv ../doc/cn/html ${DIR}/doc_cn
-  mv ../doc/en/html ${DIR}/doc
+  cp -r ../doc/cn/html ${DIR}/doc_cn
+  cp -r ../doc/en/html ${DIR}/doc
   git add .
 }
 
-deploy_docs "master" "." 
+deploy_docs "master" "."
 deploy_docs "develop" "./develop/"
 
 # Check is there anything changed.
diff --git a/paddle/scripts/travis/build_ios.sh b/paddle/scripts/travis/build_ios.sh
new file mode 100755
index 0000000000..dee7cf7cbb
--- /dev/null
+++ b/paddle/scripts/travis/build_ios.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -e
+
+# Create the build directory for CMake.
+mkdir -p $TRAVIS_BUILD_DIR/build_ios
+cd $TRAVIS_BUILD_DIR/build_ios
+
+# Compile paddle binaries
+cmake -DCMAKE_SYSTEM_NAME=iOS \
+      -DIOS_PLATFORM=OS \
+      -DCMAKE_OSX_ARCHITECTURES="arm64" \
+      -DWITH_C_API=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
+      -DWITH_TESTING=OFF \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_STYLE_CHECK=OFF \
+      -DCMAKE_BUILD_TYPE=Release \
+      ..
+
+make -j 2
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
new file mode 100755
index 0000000000..e71d243efa
--- /dev/null
+++ b/paddle/scripts/travis/check_style.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+function abort(){
+    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+
+# install glide
+curl https://glide.sh/get | bash
+eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+
+# set up go environment for running gometalinter
+mkdir -p $GOPATH/src/github.com/PaddlePaddle/
+ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
+cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
+
+go get github.com/alecthomas/gometalinter
+gometalinter --install
+
+cd $TRAVIS_BUILD_DIR
+export PATH=/usr/bin:$PATH
+pre-commit install
+clang-format --version
+
+if ! pre-commit run -a ; then
+    git diff
+    exit 1
+fi
+
+trap : 0
diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh
deleted file mode 100755
index f05c7530a3..0000000000
--- a/paddle/scripts/travis/common.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-mkdir -p ../../../build
-cd ../../../build
-mkdir -p $HOME/third_party
-EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"
diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh
deleted file mode 100755
index 13f2552d29..0000000000
--- a/paddle/scripts/travis/main.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-cd `dirname $0`
-
-if [ ${JOB} == "BUILD_AND_TEST" ]; then
-  ./build_and_test.sh
-elif [ ${JOB} == "DOCS" ]; then
-  ./docs.sh
-elif [ ${JOB} == "PRE_COMMIT" ]; then
-  ./precommit.sh
-else
-  echo Unknown job ${JOB}
-  exit 1
-fi
diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/precommit.sh
deleted file mode 100755
index 7a59b1131d..0000000000
--- a/paddle/scripts/travis/precommit.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-function abort(){
-    echo "Your commit not fit PaddlePaddle code style" 1>&2
-    echo "Please use pre-commit scripts to auto-format your code" 1>&2
-    exit 1
-}
-
-trap 'abort' 0
-set -e
-source common.sh
-cd ..
-export PATH=/usr/bin:$PATH
-pre-commit install
-clang-format --version
-
-if ! pre-commit run -a ; then
-  git diff  --exit-code
-fi
-
-trap : 0
diff --git a/paddle/string/.clang-format b/paddle/string/.clang-format
new file mode 120000
index 0000000000..7d28cb3924
--- /dev/null
+++ b/paddle/string/.clang-format
@@ -0,0 +1 @@
+../framework/.clang-format
\ No newline at end of file
diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt
new file mode 100644
index 0000000000..751776dbb5
--- /dev/null
+++ b/paddle/string/CMakeLists.txt
@@ -0,0 +1,10 @@
+cc_library(stringpiece SRCS piece.cc)
+cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)
+cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
+cc_test(to_string_test SRCS to_string_test.cc)
+
+if(NOT WITH_C_API AND WITH_FLUID)
+  file(GLOB STRING_HEADERS *.h)
+  install(FILES ${STRING_HEADERS} DESTINATION include/paddle/string)
+  install(FILES tinyformat/tinyformat.h DESTINATION include/paddle/string/tinyformat)
+endif()
diff --git a/paddle/string/piece.cc b/paddle/string/piece.cc
new file mode 100644
index 0000000000..330ca5f015
--- /dev/null
+++ b/paddle/string/piece.cc
@@ -0,0 +1,136 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/string/piece.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <iosfwd>
+#include <stdexcept>
+
+namespace paddle {
+namespace string {
+
+Piece::Piece() : data_(NULL), size_(0) {}
+
+Piece::Piece(const char* d, size_t n) : data_(d), size_(n) {
+  if (d == NULL && n != 0)
+    throw std::invalid_argument("Piece requires len to be 0 for NULL data");
+}
+
+Piece::Piece(const char* s) : data_(s) { size_ = (s == NULL) ? 0 : strlen(s); }
+
+Piece::Piece(const std::string& s) : data_(s.data()), size_(s.size()) {}
+
+char Piece::operator[](size_t n) const {
+  if (n >= len()) throw std::invalid_argument("index out of Piece length");
+  return data_[n];
+}
+
+int Compare(Piece a, Piece b) {
+  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
+  int r = memcmp(a.data(), b.data(), min_len);
+  if (r == 0) {
+    if (a.len() < b.len())
+      return -1;
+    else if (a.len() > b.len())
+      return 1;
+  }
+  return r;
+}
+
+bool operator==(Piece x, Piece y) {
+  return ((x.len() == y.len()) &&
+          (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0));
+}
+
+bool operator!=(Piece x, Piece y) { return !(x == y); }
+
+bool operator<(Piece x, Piece y) { return Compare(x, y) < 0; }
+bool operator>(Piece x, Piece y) { return Compare(x, y) > 0; }
+
+bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; }
+bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; }
+
+bool HasPrefix(Piece s, Piece x) {
+  return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0));
+}
+
+bool HasSuffix(Piece s, Piece x) {
+  return ((s.len() >= x.len()) &&
+          (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0));
+}
+
+Piece SkipPrefix(Piece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than Piece length");
+  return Piece(s.data() + n, s.len() - n);
+}
+
+Piece SkipSuffix(Piece s, size_t n) {
+  if (n > s.len())
+    throw std::invalid_argument("Skip distance larger than Piece length");
+  return Piece(s.data(), s.len() - n);
+}
+
+Piece TrimPrefix(Piece s, Piece x) {
+  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
+}
+
+Piece TrimSuffix(Piece s, Piece x) {
+  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
+}
+
+bool Contains(Piece s, Piece sub) {
+  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
+}
+
+size_t Index(Piece s, Piece sub) {
+  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
+  return e != s.end() ? e - s.data() : Piece::npos;
+}
+
+size_t Find(Piece s, char c, size_t pos) {
+  if (pos >= s.len()) {
+    return Piece::npos;
+  }
+  const char* result =
+      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
+  return result != nullptr ? result - s.data() : Piece::npos;
+}
+
+size_t RFind(Piece s, char c, size_t pos) {
+  if (s.len() == 0) return Piece::npos;
+  for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data();
+       p--) {
+    if (*p == c) {
+      return p - s.data();
+    }
+  }
+  return Piece::npos;
+}
+
+Piece SubStr(Piece s, size_t pos, size_t n) {
+  if (pos > s.len()) pos = s.len();
+  if (n > s.len() - pos) n = s.len() - pos;
+  return Piece(s.data() + pos, n);
+}
+
+std::ostream& operator<<(std::ostream& o, Piece piece) {
+  return o << piece.ToString();
+}
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/string/piece.h b/paddle/string/piece.h
new file mode 100644
index 0000000000..f2bb6b2c76
--- /dev/null
+++ b/paddle/string/piece.h
@@ -0,0 +1,105 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+namespace paddle {
+namespace string {
+
+// Piece points into a std::string object but doesn't own the
+// string.  It is for efficient access to strings, like Go's string
+// type.  Note that Piece doesn't mutate the underlying string,
+// so it is thread-safe as long as the underlying string doesn't
+// change.  Because Piece holds only a couple of data members and
+// doesn't own/manage the string, it is cheap to construct Pieces
+// and pass them around.
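+//
+// A usage sketch (illustrative; the helper functions are declared below):
+//
+//   paddle::string::Piece p("hello world");
+//   if (paddle::string::HasPrefix(p, "hello")) {
+//     std::cout << paddle::string::SkipPrefix(p, 6);  // prints "world"
+//   }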
+class Piece {
+ public:
+  static const size_t npos = static_cast<size_t>(-1);
+
+  // We provide non-explicit singleton constructors so users can
+  // pass in a "const char*" or a "string" wherever a "Piece"
+  // is expected.  These constructors ensure that if data_ is NULL,
+  // size_ is 0.
+  Piece();
+  Piece(const char* d, size_t n);
+  Piece(const char* d);         // NOLINT: accept C string into Piece.
+  Piece(const std::string& s);  // NOLINT: accept C++ string into Piece.
+
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+
+  char operator[](size_t n) const;
+
+  // Piece doesn't own the string, so both iterator and
+  // const_iterator are in fact const char*.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+
+  // Return a string that contains a copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+ private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+int Compare(Piece a, Piece b);
+
+bool operator==(Piece x, Piece y);
+bool operator!=(Piece x, Piece y);
+bool operator<(Piece x, Piece y);
+bool operator>(Piece x, Piece y);
+bool operator<=(Piece x, Piece y);
+bool operator>=(Piece x, Piece y);
+
+bool HasPrefix(Piece s, Piece prefix);
+bool HasSuffix(Piece s, Piece suffix);
+
+Piece SkipPrefix(Piece s, size_t n);
+Piece SkipSuffix(Piece s, size_t n);
+
+// Skip the prefix (or suffix) if it matches with the string.
+Piece TrimPrefix(Piece s, Piece prefix);
+Piece TrimSuffix(Piece s, Piece suffix);
+
+// Returns whether s contains sub.  Any s, except an empty s, contains
+// the empty sub.
+bool Contains(Piece s, Piece sub);
+
+// Return the first occurrence of sub in s, or npos.  If both s and
+// sub are empty, it returns npos; otherwise, if only sub is empty, it
+// returns 0.
+size_t Index(Piece s, Piece sub);
+
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(Piece s, char c, size_t pos);
+
+// Search range is [0..pos] inclusive.  If pos == npos, search everything.
+size_t RFind(Piece s, char c, size_t pos);
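+// e.g. (mirroring the unit tests) for Piece s("app"):
+//   Find(s, 'p', 0) == 1,  RFind(s, 'p', 2) == 2.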
+
+Piece SubStr(Piece s, size_t pos, size_t n);
+
+// allow Piece to be logged
+std::ostream& operator<<(std::ostream& o, Piece piece);
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/string/piece_test.cc b/paddle/string/piece_test.cc
new file mode 100644
index 0000000000..250f26d61f
--- /dev/null
+++ b/paddle/string/piece_test.cc
@@ -0,0 +1,293 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/string/piece.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+TEST(StringPiece, Construct) {
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ(NULL, s.data());
+    EXPECT_EQ(0U, s.len());
+  }
+  {
+    EXPECT_THROW(paddle::string::Piece s(NULL, 10000U), std::invalid_argument);
+  }
+  {
+    paddle::string::Piece s(NULL);
+    EXPECT_EQ(0U, s.len());
+  }
+  {
+    std::string a;
+    EXPECT_EQ(0U, a.size());
+    paddle::string::Piece s(a);
+    EXPECT_EQ(0U, s.len());
+  }
+}
+
+TEST(StringPiece, CopyAndAssign) {
+  paddle::string::Piece empty;
+  EXPECT_EQ(0U, empty.len());
+
+  paddle::string::Piece a("hello");
+  paddle::string::Piece b = a;
+  EXPECT_EQ(b.len(), strlen("hello"));
+  EXPECT_EQ(a, b);
+
+  std::string storage("hello");
+  paddle::string::Piece c(storage);
+  EXPECT_EQ(a, c);
+  EXPECT_NE(a.data(), c.data());
+}
+
+TEST(StringPiece, Compare) {
+  {
+    paddle::string::Piece a("hello");
+    paddle::string::Piece b("world");
+    EXPECT_TRUE(a != b);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a < b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_FALSE(a > b);
+    EXPECT_FALSE(a >= b);
+    EXPECT_LT(Compare(a, b), 0);
+    EXPECT_GT(Compare(b, a), 0);
+  }
+  {
+    paddle::string::Piece a, b;
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_FALSE(a < b);
+    EXPECT_FALSE(a > b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_TRUE(a >= b);
+    EXPECT_EQ(0, Compare(a, b));
+    EXPECT_EQ(0, Compare(b, a));
+  }
+}
+
+TEST(StringPiece, ToString) {
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::string::Piece s(NULL);
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::string::Piece s("hello");
+    EXPECT_EQ(std::string("hello"), s.ToString());
+  }
+}
+
+TEST(StringPiece, HasPrefixSuffix) {
+  using paddle::string::HasPrefix;
+  using paddle::string::HasSuffix;
+  {
+    paddle::string::Piece s;
+    EXPECT_FALSE(HasPrefix(s, "something"));
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_FALSE(HasSuffix(s, "something"));
+    EXPECT_TRUE(HasSuffix(s, ""));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_TRUE(HasPrefix(s, "a"));
+    EXPECT_TRUE(HasPrefix(s, "ap"));
+    EXPECT_TRUE(HasPrefix(s, "app"));
+
+    EXPECT_TRUE(HasSuffix(s, ""));
+    EXPECT_TRUE(HasSuffix(s, "p"));
+    EXPECT_TRUE(HasSuffix(s, "pp"));
+    EXPECT_TRUE(HasSuffix(s, "app"));
+  }
+}
+
+TEST(StringPiece, SkipPrefixSuffix) {
+  using paddle::string::SkipPrefix;
+  using paddle::string::SkipSuffix;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ("", SkipPrefix(s, 0));
+    EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
+
+    EXPECT_EQ("", SkipSuffix(s, 0));
+    EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ("app", SkipPrefix(s, 0));
+    EXPECT_EQ("pp", SkipPrefix(s, 1));
+    EXPECT_EQ("p", SkipPrefix(s, 2));
+    EXPECT_EQ("", SkipPrefix(s, 3));
+    EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument);
+
+    EXPECT_EQ("app", SkipSuffix(s, 0));
+    EXPECT_EQ("ap", SkipSuffix(s, 1));
+    EXPECT_EQ("a", SkipSuffix(s, 2));
+    EXPECT_EQ("", SkipSuffix(s, 3));
+    EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument);
+  }
+}
+
+TEST(StringPiece, TrimPrefixSuffix) {
+  using paddle::string::TrimPrefix;
+  using paddle::string::TrimSuffix;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ("", TrimPrefix(s, ""));
+    EXPECT_EQ("", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("", TrimSuffix(s, ""));
+    EXPECT_EQ("", TrimSuffix(s, "something"));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ("app", TrimPrefix(s, ""));
+    EXPECT_EQ("pp", TrimPrefix(s, "a"));
+    EXPECT_EQ("p", TrimPrefix(s, "ap"));
+    EXPECT_EQ("", TrimPrefix(s, "app"));
+    EXPECT_EQ("app", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("app", TrimSuffix(s, ""));
+    EXPECT_EQ("ap", TrimSuffix(s, "p"));
+    EXPECT_EQ("a", TrimSuffix(s, "pp"));
+    EXPECT_EQ("", TrimSuffix(s, "app"));
+    EXPECT_EQ("app", TrimSuffix(s, "something"));
+  }
+}
+
+TEST(StringPiece, Contains) {
+  using paddle::string::Contains;
+  {
+    paddle::string::Piece s;
+    EXPECT_FALSE(Contains(s, ""));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_TRUE(Contains(s, ""));
+    EXPECT_TRUE(Contains(s, "a"));
+    EXPECT_TRUE(Contains(s, "p"));
+    EXPECT_TRUE(Contains(s, "ap"));
+    EXPECT_TRUE(Contains(s, "pp"));
+    EXPECT_TRUE(Contains(s, "app"));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+}
+
+TEST(StringPiece, Index) {
+  using paddle::string::Index;
+  auto npos = paddle::string::Piece::npos;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ(npos, Index(s, ""));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ(0U, Index(s, ""));
+    EXPECT_EQ(0U, Index(s, "a"));
+    EXPECT_EQ(1U, Index(s, "p"));
+    EXPECT_EQ(0U, Index(s, "ap"));
+    EXPECT_EQ(1U, Index(s, "pp"));
+    EXPECT_EQ(0U, Index(s, "app"));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+}
+
+TEST(StringPiece, Find) {
+  using paddle::string::Find;
+  auto npos = paddle::string::Piece::npos;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ(npos, Find(s, 'a', 0U));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ(0U, Find(s, 'a', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 1U));
+    EXPECT_EQ(2U, Find(s, 'p', 2U));
+    EXPECT_EQ(npos, Find(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, RFind) {
+  using paddle::string::RFind;
+  auto npos = paddle::string::Piece::npos;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ(npos, RFind(s, 'a', 0U));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ(2U, RFind(s, 'p', 2U));
+    EXPECT_EQ(0U, RFind(s, 'a', 2U));
+    EXPECT_EQ(1U, RFind(s, 'p', 1U));
+    EXPECT_EQ(0U, RFind(s, 'a', 0));
+    EXPECT_EQ(npos, RFind(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, SubStr) {
+  using paddle::string::SubStr;
+  {
+    paddle::string::Piece s;
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 0, 1));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+  }
+  {
+    paddle::string::Piece s("app");
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+    EXPECT_EQ("", SubStr(s, 2, 0));
+    EXPECT_EQ("", SubStr(s, 3, 0));
+
+    EXPECT_EQ("a", SubStr(s, 0, 1));
+    EXPECT_EQ("p", SubStr(s, 1, 1));
+    EXPECT_EQ("p", SubStr(s, 2, 1));
+    EXPECT_EQ("", SubStr(s, 3, 1));
+
+    EXPECT_EQ("ap", SubStr(s, 0, 2));
+    EXPECT_EQ("pp", SubStr(s, 1, 2));
+    EXPECT_EQ("p", SubStr(s, 2, 2));
+    EXPECT_EQ("", SubStr(s, 3, 2));
+
+    EXPECT_EQ("app", SubStr(s, 0, 3));
+    EXPECT_EQ("pp", SubStr(s, 1, 3));
+    EXPECT_EQ("p", SubStr(s, 2, 3));
+    EXPECT_EQ("", SubStr(s, 3, 3));
+  }
+}
+
+TEST(StringPiece, StreamOutput) {
+  using paddle::string::Piece;
+
+  std::stringstream o;
+  o << paddle::string::Piece();
+  EXPECT_EQ("", o.str());
+
+  o << paddle::string::Piece("hello");
+  EXPECT_EQ("hello", o.str());
+
+  o << paddle::string::Piece();
+  EXPECT_EQ("hello", o.str());
+}
diff --git a/paddle/string/printf.h b/paddle/string/printf.h
new file mode 100644
index 0000000000..03809d2209
--- /dev/null
+++ b/paddle/string/printf.h
@@ -0,0 +1,97 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Compared with std::stringstream, string::Printf has two primary
+// purposes:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementation includes
+//
+//    https://github.com/c42f/tinyformat
+//    boost::format
+//    std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases.  For example:
+//
+//      std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one.  We can have
+//
+//      std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//      format fmter("%2% %1%");
+//      fmter % 36; fmter % 77;
+//      std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkill, and it would be
+//    more efficient if it could write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C printf style,
+//    and it can write to a stream or return a std::string:
+//
+//      std::cout << tfm::printf(
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+//    or
+//
+//      tfm::format(std::cout,
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+// 2. High performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation.  Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//    https://developers.google.com/optimization/reference/base/stringprintf/
+//    https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//    https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slow and results in large executable binary
+// files.  So here we port tinyformat.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+#include "paddle/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+template <typename... Args>
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, fmt, args...);
+  return oss.str();
+}
+
+template <typename... Args>
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
+}
+
+}  // namespace string
+}  // namespace paddle
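+
+// A usage sketch (illustrative) of the three interfaces above:
+//
+//   paddle::string::Printf("%s has %d items\n", "cache", 3);
+//   std::string s = paddle::string::Sprintf("%.2f", 3.14159);  // "3.14"
+//   paddle::string::Fprintf(std::cerr, "error code %d\n", 42);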
diff --git a/paddle/string/printf_test.cc b/paddle/string/printf_test.cc
new file mode 100644
index 0000000000..b5ad35513b
--- /dev/null
+++ b/paddle/string/printf_test.cc
@@ -0,0 +1,29 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/string/printf.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringPrintf, StringPrintf) {
+  std::string weekday = "Wednesday";
+  const char* month = "July";
+  size_t day = 27;
+  long hour = 14;
+  int min = 44;
+  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
+            paddle::string::Sprintf("%s, %s %d, %.2d:%.2d", weekday, month, day,
+                                    hour, min));
+}
diff --git a/paddle/string/tinyformat/tinyformat.h b/paddle/string/tinyformat/tinyformat.h
new file mode 100644
index 0000000000..d1a2c47f1a
--- /dev/null
+++ b/paddle/string/tinyformat/tinyformat.h
@@ -0,0 +1,892 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// tinyformat.h
+// Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com]
+//
+// Boost Software License - Version 1.0
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+//------------------------------------------------------------------------------
+// Tinyformat: A minimal type safe printf replacement
+//
+// tinyformat.h is a type safe printf replacement library in a single C++
+// header file.  Design goals include:
+//
+// * Type safety and extensibility for user defined types.
+// * C99 printf() compatibility, to the extent possible using std::ostream
+// * Simplicity and minimalism.  A single header file to include and distribute
+//   with your projects.
+// * Augment rather than replace the standard stream formatting mechanism
+// * C++98 support, with optional C++11 niceties
+//
+//
+// Main interface example usage
+// ----------------------------
+//
+// To print a date to std::cout:
+//
+//   std::string weekday = "Wednesday";
+//   const char* month = "July";
+//   size_t day = 27;
+//   long hour = 14;
+//   int min = 44;
+//
+//   tfm::printf("%s, %s %d, %.2d:%.2d\n", weekday, month, day, hour, min);
+//
+// The strange types here emphasize the type safety of the interface; it is
+// possible to print a std::string using the "%s" conversion, and a
+// size_t using the "%d" conversion.  A similar result could be achieved
+// using either of the tfm::format() functions.  One prints on a user provided
+// stream:
+//
+//   tfm::format(std::cerr, "%s, %s %d, %.2d:%.2d\n",
+//               weekday, month, day, hour, min);
+//
+// The other returns a std::string:
+//
+//   std::string date = tfm::format("%s, %s %d, %.2d:%.2d\n",
+//                                  weekday, month, day, hour, min);
+//   std::cout << date;
+//
+// These are the three primary interface functions.  There is also a
+// convenience function printfln() which appends a newline to the usual result
+// of printf() for super simple logging.
+//
+//
+// User defined format functions
+// -----------------------------
+//
+// Simulating variadic templates in C++98 is pretty painful since it requires
+// writing out the same function for each desired number of arguments.  To make
+// this bearable tinyformat comes with a set of macros which are used
+// internally to generate the API, but which may also be used in user code.
+//
+// The three macros TINYFORMAT_ARGTYPES(n), TINYFORMAT_VARARGS(n) and
+// TINYFORMAT_PASSARGS(n) will generate a list of n argument types,
+// type/name pairs and argument names respectively when called with an integer
+// n between 1 and 16.  We can use these to define a macro which generates the
+// desired user defined function with n arguments.  To generate all 16 user
+// defined function bodies, use the macro TINYFORMAT_FOREACH_ARGNUM.  For an
+// example, see the implementation of printf() at the end of the source file.
+//
+// Sometimes it's useful to be able to pass a list of format arguments through
+// to a non-template function.  The FormatList class is provided as a way to do
+// this by storing the argument list in a type-opaque way.  Continuing the
+// example from above, we construct a FormatList using makeFormatList():
+//
+//   FormatListRef formatList =
+//       tfm::makeFormatList(weekday, month, day, hour, min);
+//
+// The format list can now be passed into any non-template function and used
+// via a call to the vformat() function:
+//
+//   tfm::vformat(std::cout, "%s, %s %d, %.2d:%.2d\n", formatList);
+//
+//
+// Additional API information
+// --------------------------
+//
+// Error handling: Define TINYFORMAT_ERROR to customize the error handling for
+// format strings which are unsupported or have the wrong number of format
+// specifiers (calls assert() by default).
+//
+// User defined types: Uses operator<< for user defined types by default.
+// Overload formatValue() for more control.
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <sstream>
+
+namespace paddle {
+namespace string {
+namespace tinyformat {
+
+#ifndef TINYFORMAT_ERROR
+#define TINYFORMAT_ERROR(reason) assert(0 && reason)
+#endif
+
+//------------------------------------------------------------------------------
+namespace detail {
+
+// Test whether type T1 is convertible to type T2
+template <typename T1, typename T2>
+struct is_convertible {
+ private:
+  // two types of different size
+  struct fail {
+    char dummy[2];
+  };
+  struct succeed {
+    char dummy;
+  };
+  // Try to convert a T1 to a T2 by plugging into tryConvert
+  static fail tryConvert(...);
+  static succeed tryConvert(const T2 &);
+  static const T1 &makeT1();
+
+ public:
+  // Standard trick: the (...) version of tryConvert will be chosen from
+  // the overload set only if the version taking a T2 doesn't match.
+  // Then we compare the sizes of the return types to check which
+  // function matched.  Very neat, in a disgusting kind of way :)
+  static const bool value = sizeof(tryConvert(makeT1())) == sizeof(succeed);
+};
+
+// Format the value by casting to type fmtT.  This default implementation
+// should never be called.
+template <typename T, typename fmtT,
+          bool convertible = is_convertible<T, fmtT>::value>
+struct formatValueAsType {
+  static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
+};
+// Specialized version for types that can actually be converted to fmtT, as
+// indicated by the "convertible" template parameter.
+template <typename T, typename fmtT>
+struct formatValueAsType<T, fmtT, true> {
+  static void invoke(std::ostream &out, const T &value) {
+    out << static_cast<fmtT>(value);
+  }
+};
+
+// Convert an arbitrary type to integer.  The version with convertible=false
+// throws an error.
+template <typename T, bool convertible = is_convertible<T, int>::value>
+struct convertToInt {
+  static int invoke(const T & /*value*/) {
+    TINYFORMAT_ERROR(
+        "tinyformat: Cannot convert from argument type to "
+        "integer for use as variable width or precision");
+    return 0;
+  }
+};
+// Specialization for convertToInt when conversion is possible
+template <typename T>
+struct convertToInt<T, true> {
+  static int invoke(const T &value) { return static_cast<int>(value); }
+};
+
+// Format at most ntrunc characters to the given stream.
+template <typename T>
+inline void formatTruncated(std::ostream &out, const T &value, int ntrunc) {
+  std::ostringstream tmp;
+  tmp << value;
+  std::string result = tmp.str();
+  out.write(result.c_str(),
+            (std::min)(ntrunc, static_cast<int>(result.size())));
+}
+#define TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(type)                       \
+  inline void formatTruncated(std::ostream &out, type *value, int ntrunc) { \
+    std::streamsize len = 0;                                                \
+    while (len < ntrunc && value[len] != 0) ++len;                          \
+    out.write(value, len);                                                  \
+  }
+// Overload for const char* and char*.  Could overload for signed & unsigned
+// char too, but these are technically unneeded for printf compatibility.
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(const char)
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
+#undef TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Variable formatting functions.  May be overridden for user-defined types if
+// desired.
+
+/// Format a value into a stream, delegating to operator<< by default.
+///
+/// Users may override this for their own types.  When this function is called,
+/// the stream flags will have been modified according to the format string.
+/// The format specification is provided in the range [fmtBegin, fmtEnd).  For
+/// truncating conversions, ntrunc is set to the desired maximum number of
+/// characters, for example "%.7s" calls formatValue with ntrunc = 7.
+///
+/// By default, formatValue() uses the usual stream insertion operator
+/// operator<< to format the type T, with special cases for the %c and %p
+/// conversions.
+template <typename T>
+inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,
+                        const char *fmtEnd, int ntrunc, const T &value) {
+  // The mess here is to support the %c and %p conversions: if these
+  // conversions are active we try to convert the type to a char or const
+  // void* respectively and format that instead of the value itself.  For the
+  // %p conversion it's important to avoid dereferencing the pointer, which
+  // could otherwise lead to a crash when printing a dangling (const char*).
+  const bool canConvertToChar = detail::is_convertible<T, char>::value;
+  const bool canConvertToVoidPtr =
+      detail::is_convertible<T, const void *>::value;
+  if (canConvertToChar && *(fmtEnd - 1) == 'c')
+    detail::formatValueAsType<T, char>::invoke(out, value);
+  else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p')
+    detail::formatValueAsType<T, const void *>::invoke(out, value);
+  else if (ntrunc >= 0) {
+    // Take care not to overread C strings in truncating conversions like
+    // "%.4s" where at most 4 characters may be read.
+    detail::formatTruncated(out, value, ntrunc);
+  } else
+    out << value;
+}
+
+// Overloaded version for char types to support printing as an integer
+#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType)                      \
+  inline void formatValue(std::ostream &out, const char * /*fmtBegin*/,   \
+                          const char *fmtEnd, int /**/, charType value) { \
+    switch (*(fmtEnd - 1)) {                                              \
+      case 'u':                                                           \
+      case 'd':                                                           \
+      case 'i':                                                           \
+      case 'o':                                                           \
+      case 'X':                                                           \
+      case 'x':                                                           \
+        out << static_cast<int>(value);                                   \
+        break;                                                            \
+      default:                                                            \
+        out << value;                                                     \
+        break;                                                            \
+    }                                                                     \
+  }
+// per 3.9.1: char, signed char and unsigned char are all distinct types
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char)
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(signed char)
+TINYFORMAT_DEFINE_FORMATVALUE_CHAR(unsigned char)
+#undef TINYFORMAT_DEFINE_FORMATVALUE_CHAR
+
+//------------------------------------------------------------------------------
+// Tools for emulating variadic templates in C++98.  The basic idea here is
+// stolen from the boost preprocessor metaprogramming library and cut down to
+// be just general enough for what we need.
+
+#define TINYFORMAT_ARGTYPES(n) TINYFORMAT_ARGTYPES_##n
+#define TINYFORMAT_VARARGS(n) TINYFORMAT_VARARGS_##n
+#define TINYFORMAT_PASSARGS(n) TINYFORMAT_PASSARGS_##n
+#define TINYFORMAT_PASSARGS_TAIL(n) TINYFORMAT_PASSARGS_TAIL_##n
+
+// To keep it as transparent as possible, the macros below have been generated
+// using python via the excellent cog.py code generation script.  This avoids
+// the need for a bunch of complex (but more general) preprocessor tricks as
+// used in boost.preprocessor.
+//
+// To rerun the code generation in place, use `cog.py -r tinyformat.h`
+// (see http://nedbatchelder.com/code/cog).  Alternatively you can just create
+// extra versions by hand.
+
+/*[[[cog
+maxParams = 16
+
+def makeCommaSepLists(lineTemplate, elemTemplate, startInd=1):
+    for j in range(startInd,maxParams+1):
+        list = ', '.join([elemTemplate % {'i':i} for i in range(startInd,j+1)])
+        cog.outl(lineTemplate % {'j':j, 'list':list})
+
+makeCommaSepLists('#define TINYFORMAT_ARGTYPES_%(j)d %(list)s',
+                  'class T%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_VARARGS_%(j)d %(list)s',
+                  'const T%(i)d& v%(i)d')
+
+cog.outl()
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_%(j)d %(list)s', 'v%(i)d')
+
+cog.outl()
+cog.outl('#define TINYFORMAT_PASSARGS_TAIL_1')
+makeCommaSepLists('#define TINYFORMAT_PASSARGS_TAIL_%(j)d , %(list)s',
+                  'v%(i)d', startInd = 2)
+
+cog.outl()
+cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n    ' +
+         ' '.join(['m(%d)' % (j,) for j in range(1,maxParams+1)]))
+]]]*/
+#define TINYFORMAT_ARGTYPES_1 class T1
+#define TINYFORMAT_ARGTYPES_2 class T1, class T2
+#define TINYFORMAT_ARGTYPES_3 class T1, class T2, class T3
+#define TINYFORMAT_ARGTYPES_4 class T1, class T2, class T3, class T4
+#define TINYFORMAT_ARGTYPES_5 class T1, class T2, class T3, class T4, class T5
+#define TINYFORMAT_ARGTYPES_6 \
+  class T1, class T2, class T3, class T4, class T5, class T6
+#define TINYFORMAT_ARGTYPES_7 \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7
+#define TINYFORMAT_ARGTYPES_8 \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8
+#define TINYFORMAT_ARGTYPES_9                                           \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9
+#define TINYFORMAT_ARGTYPES_10                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10
+#define TINYFORMAT_ARGTYPES_11                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11
+#define TINYFORMAT_ARGTYPES_12                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12
+#define TINYFORMAT_ARGTYPES_13                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13
+#define TINYFORMAT_ARGTYPES_14                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14
+#define TINYFORMAT_ARGTYPES_15                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14, class T15
+#define TINYFORMAT_ARGTYPES_16                                          \
+  class T1, class T2, class T3, class T4, class T5, class T6, class T7, \
+      class T8, class T9, class T10, class T11, class T12, class T13,   \
+      class T14, class T15, class T16
+
+#define TINYFORMAT_VARARGS_1 const T1 &v1
+#define TINYFORMAT_VARARGS_2 const T1 &v1, const T2 &v2
+#define TINYFORMAT_VARARGS_3 const T1 &v1, const T2 &v2, const T3 &v3
+#define TINYFORMAT_VARARGS_4 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4
+#define TINYFORMAT_VARARGS_5 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5
+#define TINYFORMAT_VARARGS_6                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6
+#define TINYFORMAT_VARARGS_7                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7
+#define TINYFORMAT_VARARGS_8                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8
+#define TINYFORMAT_VARARGS_9                                            \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9
+#define TINYFORMAT_VARARGS_10                                           \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10
+#define TINYFORMAT_VARARGS_11                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11
+#define TINYFORMAT_VARARGS_12                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12
+#define TINYFORMAT_VARARGS_13                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13
+#define TINYFORMAT_VARARGS_14                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14
+#define TINYFORMAT_VARARGS_15                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
+      const T15 &v15
+#define TINYFORMAT_VARARGS_16                                                 \
+  const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5,       \
+      const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \
+      const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14,         \
+      const T15 &v15, const T16 &v16
+
+#define TINYFORMAT_PASSARGS_1 v1
+#define TINYFORMAT_PASSARGS_2 v1, v2
+#define TINYFORMAT_PASSARGS_3 v1, v2, v3
+#define TINYFORMAT_PASSARGS_4 v1, v2, v3, v4
+#define TINYFORMAT_PASSARGS_5 v1, v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_6 v1, v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_7 v1, v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_8 v1, v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_9 v1, v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_10 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_11 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_12 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_13 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_14 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_15 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_16 \
+  v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_PASSARGS_TAIL_1
+#define TINYFORMAT_PASSARGS_TAIL_2 , v2
+#define TINYFORMAT_PASSARGS_TAIL_3 , v2, v3
+#define TINYFORMAT_PASSARGS_TAIL_4 , v2, v3, v4
+#define TINYFORMAT_PASSARGS_TAIL_5 , v2, v3, v4, v5
+#define TINYFORMAT_PASSARGS_TAIL_6 , v2, v3, v4, v5, v6
+#define TINYFORMAT_PASSARGS_TAIL_7 , v2, v3, v4, v5, v6, v7
+#define TINYFORMAT_PASSARGS_TAIL_8 , v2, v3, v4, v5, v6, v7, v8
+#define TINYFORMAT_PASSARGS_TAIL_9 , v2, v3, v4, v5, v6, v7, v8, v9
+#define TINYFORMAT_PASSARGS_TAIL_10 , v2, v3, v4, v5, v6, v7, v8, v9, v10
+#define TINYFORMAT_PASSARGS_TAIL_11 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11
+#define TINYFORMAT_PASSARGS_TAIL_12 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_TAIL_13 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_TAIL_14 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_TAIL_15 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_TAIL_16 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_FOREACH_ARGNUM(m)                                         \
+  m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \
+      m(15) m(16)
+//[[[end]]]
+
+namespace detail {
+
+// Type-opaque holder for an argument to format(), with associated actions on
+// the type held as explicit function pointers.  This allows FormatArgs for
+// each argument to be allocated as a homogeneous array inside FormatList,
+// whereas a naive implementation based on inheritance would not.
+class FormatArg {
+ public:
+  FormatArg() {}
+
+  template <typename T>
+  FormatArg(const T &value)
+      : m_value(static_cast<const void *>(&value)),
+        m_formatImpl(&formatImpl<T>),
+        m_toIntImpl(&toIntImpl<T>) {}
+
+  void format(std::ostream &out, const char *fmtBegin, const char *fmtEnd,
+              int ntrunc) const {
+    m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
+  }
+
+  int toInt() const { return m_toIntImpl(m_value); }
+
+ private:
+  template <typename T>
+  static void formatImpl(std::ostream &out, const char *fmtBegin,
+                         const char *fmtEnd, int ntrunc, const void *value) {
+    formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
+  }
+
+  template <typename T>
+  static int toIntImpl(const void *value) {
+    return convertToInt<T>::invoke(*static_cast<const T *>(value));
+  }
+
+  const void *m_value;
+  void (*m_formatImpl)(std::ostream &out, const char *fmtBegin,
+                       const char *fmtEnd, int ntrunc, const void *value);
+  int (*m_toIntImpl)(const void *value);
+};
+
+// Parse and return an integer from the string c, as atoi()
+// On return, c is set to one past the end of the integer.
+inline int parseIntAndAdvance(const char *&c) {
+  int i = 0;
+  for (; *c >= '0' && *c <= '9'; ++c) i = 10 * i + (*c - '0');
+  return i;
+}
+
+// Print literal part of format string and return next format spec
+// position.
+//
+// Skips over any occurrences of '%%', printing a literal '%' to the
+// output.  The position of the first % character of the next
+// nontrivial format spec is returned, or the end of string.
+inline const char *printFormatStringLiteral(std::ostream &out,
+                                            const char *fmt) {
+  const char *c = fmt;
+  for (;; ++c) {
+    switch (*c) {
+      case '\0':
+        out.write(fmt, c - fmt);
+        return c;
+      case '%':
+        out.write(fmt, c - fmt);
+        if (*(c + 1) != '%') return c;
+        // for "%%", tack trailing % onto next literal section.
+        fmt = ++c;
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+// Parse a format string and set the stream state accordingly.
+//
+// The format mini-language recognized here is meant to be the one from C99,
+// with the form "%[flags][width][.precision][length]type".
+//
+// Formatting options which can't be natively represented using the ostream
+// state are returned in spacePadPositive (for space padded positive numbers)
+// and ntrunc (for truncating conversions).  argIndex is incremented if
+// necessary to pull out variable width and precision.  The function returns a
+// pointer to the character after the end of the current format spec.
+inline const char *streamStateFromFormat(std::ostream &out,
+                                         bool &spacePadPositive, int &ntrunc,
+                                         const char *fmtStart,
+                                         const detail::FormatArg *formatters,
+                                         int &argIndex, int numFormatters) {
+  if (*fmtStart != '%') {
+    TINYFORMAT_ERROR(
+        "tinyformat: Not enough conversion specifiers in format string");
+    return fmtStart;
+  }
+  // Reset stream state to defaults.
+  out.width(0);
+  out.precision(6);
+  out.fill(' ');
+  // Reset most flags; ignore irrelevant unitbuf & skipws.
+  out.unsetf(std::ios::adjustfield | std::ios::basefield |
+             std::ios::floatfield | std::ios::showbase | std::ios::boolalpha |
+             std::ios::showpoint | std::ios::showpos | std::ios::uppercase);
+  bool precisionSet = false;
+  bool widthSet = false;
+  int widthExtra = 0;
+  const char *c = fmtStart + 1;
+  // 1) Parse flags
+  for (;; ++c) {
+    switch (*c) {
+      case '#':
+        out.setf(std::ios::showpoint | std::ios::showbase);
+        continue;
+      case '0':
+        // overridden by left alignment ('-' flag)
+        if (!(out.flags() & std::ios::left)) {
+          // Use internal padding so that numeric values are
+          // formatted correctly, eg -00010 rather than 000-10
+          out.fill('0');
+          out.setf(std::ios::internal, std::ios::adjustfield);
+        }
+        continue;
+      case '-':
+        out.fill(' ');
+        out.setf(std::ios::left, std::ios::adjustfield);
+        continue;
+      case ' ':
+        // overridden by show positive sign, '+' flag.
+        if (!(out.flags() & std::ios::showpos)) spacePadPositive = true;
+        continue;
+      case '+':
+        out.setf(std::ios::showpos);
+        spacePadPositive = false;
+        widthExtra = 1;
+        continue;
+      default:
+        break;
+    }
+    break;
+  }
+  // 2) Parse width
+  if (*c >= '0' && *c <= '9') {
+    widthSet = true;
+    out.width(parseIntAndAdvance(c));
+  }
+  if (*c == '*') {
+    widthSet = true;
+    int width = 0;
+    if (argIndex < numFormatters)
+      width = formatters[argIndex++].toInt();
+    else
+      TINYFORMAT_ERROR(
+          "tinyformat: Not enough arguments to read variable width");
+    if (width < 0) {
+      // negative widths correspond to '-' flag set
+      out.fill(' ');
+      out.setf(std::ios::left, std::ios::adjustfield);
+      width = -width;
+    }
+    out.width(width);
+    ++c;
+  }
+  // 3) Parse precision
+  if (*c == '.') {
+    ++c;
+    int precision = 0;
+    if (*c == '*') {
+      ++c;
+      if (argIndex < numFormatters)
+        precision = formatters[argIndex++].toInt();
+      else
+        TINYFORMAT_ERROR(
+            "tinyformat: Not enough arguments to read variable precision");
+    } else {
+      if (*c >= '0' && *c <= '9')
+        precision = parseIntAndAdvance(c);
+      else if (*c == '-')  // negative precisions ignored, treated as zero.
+        parseIntAndAdvance(++c);
+    }
+    out.precision(precision);
+    precisionSet = true;
+  }
+  // 4) Ignore any C99 length modifier
+  while (*c == 'l' || *c == 'h' || *c == 'L' || *c == 'j' || *c == 'z' ||
+         *c == 't')
+    ++c;
+  // 5) We're up to the conversion specifier character.
+  // Set stream flags based on conversion specifier (thanks to the
+  // boost::format class for forging the way here).
+  bool intConversion = false;
+  switch (*c) {
+    case 'u':
+    case 'd':
+    case 'i':
+      out.setf(std::ios::dec, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'o':
+      out.setf(std::ios::oct, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'X':
+      out.setf(std::ios::uppercase);
+    case 'x':
+    case 'p':
+      out.setf(std::ios::hex, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'E':
+      out.setf(std::ios::uppercase);
+    case 'e':
+      out.setf(std::ios::scientific, std::ios::floatfield);
+      out.setf(std::ios::dec, std::ios::basefield);
+      break;
+    case 'F':
+      out.setf(std::ios::uppercase);
+    case 'f':
+      out.setf(std::ios::fixed, std::ios::floatfield);
+      break;
+    case 'G':
+      out.setf(std::ios::uppercase);
+    case 'g':
+      out.setf(std::ios::dec, std::ios::basefield);
+      // As in boost::format, let stream decide float format.
+      out.flags(out.flags() & ~std::ios::floatfield);
+      break;
+    case 'a':
+    case 'A':
+      TINYFORMAT_ERROR(
+          "tinyformat: the %a and %A conversion specs "
+          "are not supported");
+      break;
+    case 'c':
+      // Handled as special case inside formatValue()
+      break;
+    case 's':
+      if (precisionSet) ntrunc = static_cast<int>(out.precision());
+      // Make %s print booleans as "true" and "false"
+      out.setf(std::ios::boolalpha);
+      break;
+    case 'n':
+      // Not supported - will cause problems!
+      TINYFORMAT_ERROR("tinyformat: %n conversion spec not supported");
+      break;
+    case '\0':
+      TINYFORMAT_ERROR(
+          "tinyformat: Conversion spec incorrectly "
+          "terminated by end of string");
+      return c;
+    default:
+      break;
+  }
+  if (intConversion && precisionSet && !widthSet) {
+    // "precision" for integers gives the minimum number of digits (to be
+    // padded with zeros on the left).  This isn't really supported by the
+    // iostreams, but we can approximately simulate it with the width if
+    // the width isn't otherwise used.
+    out.width(out.precision() + widthExtra);
+    out.setf(std::ios::internal, std::ios::adjustfield);
+    out.fill('0');
+  }
+  return c + 1;
+}
+
+//------------------------------------------------------------------------------
+inline void formatImpl(std::ostream &out, const char *fmt,
+                       const detail::FormatArg *formatters, int numFormatters) {
+  // Saved stream state
+  std::streamsize origWidth = out.width();
+  std::streamsize origPrecision = out.precision();
+  std::ios::fmtflags origFlags = out.flags();
+  char origFill = out.fill();
+
+  for (int argIndex = 0; argIndex < numFormatters; ++argIndex) {
+    // Parse the format string
+    fmt = printFormatStringLiteral(out, fmt);
+    bool spacePadPositive = false;
+    int ntrunc = -1;
+    const char *fmtEnd =
+        streamStateFromFormat(out, spacePadPositive, ntrunc, fmt, formatters,
+                              argIndex, numFormatters);
+    if (argIndex >= numFormatters) {
+      // Check args remain after reading any variable width/precision
+      TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
+      return;
+    }
+    const FormatArg &arg = formatters[argIndex];
+    // Format the arg into the stream.
+    if (!spacePadPositive)
+      arg.format(out, fmt, fmtEnd, ntrunc);
+    else {
+      // The following is a special case with no direct correspondence
+      // between stream formatting and the printf() behaviour.  Simulate
+      // it crudely by formatting into a temporary string stream and
+      // munging the resulting string.
+      std::ostringstream tmpStream;
+      tmpStream.copyfmt(out);
+      tmpStream.setf(std::ios::showpos);
+      arg.format(tmpStream, fmt, fmtEnd, ntrunc);
+      std::string result = tmpStream.str();  // allocates... yuck.
+      for (size_t i = 0, iend = result.size(); i < iend; ++i)
+        if (result[i] == '+') result[i] = ' ';
+      out << result;
+    }
+    fmt = fmtEnd;
+  }
+
+  // Print remaining part of format string.
+  fmt = printFormatStringLiteral(out, fmt);
+  if (*fmt != '\0')
+    TINYFORMAT_ERROR(
+        "tinyformat: Too many conversion specifiers in format string");
+
+  // Restore stream state
+  out.width(origWidth);
+  out.precision(origPrecision);
+  out.flags(origFlags);
+  out.fill(origFill);
+}
+
+}  // namespace detail
+
+/// List of template arguments to format(), held in a type-opaque way.
+///
+/// A const reference to FormatList (typedef'd as FormatListRef) may be
+/// conveniently used to pass arguments to non-template functions: All type
+/// information has been stripped from the arguments, leaving just enough of a
+/// common interface to perform formatting as required.
+class FormatList {
+ public:
+  FormatList(detail::FormatArg *formatters, int N)
+      : m_formatters(formatters), m_N(N) {}
+
+  friend void vformat(std::ostream &out, const char *fmt,
+                      const FormatList &list);
+
+ private:
+  const detail::FormatArg *m_formatters;
+  int m_N;
+};
+
+/// Reference to type-opaque format list for passing to vformat()
+typedef const FormatList &FormatListRef;
+
+namespace detail {
+
+// Format list subclass with fixed storage to avoid dynamic allocation
+template <int N>
+class FormatListN : public FormatList {
+ public:
+  template <typename... Args>
+  FormatListN(const Args &... args)
+      : FormatList(&m_formatterStore[0], N),
+        m_formatterStore{FormatArg(args)...} {
+    static_assert(sizeof...(args) == N, "Number of args must be N");
+  }
+
+ private:
+  FormatArg m_formatterStore[N];
+};
+
+// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
+template <>
+class FormatListN<0> : public FormatList {
+ public:
+  FormatListN() : FormatList(0, 0) {}
+};
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Primary API functions
+
+/// Make type-agnostic format list from list of template arguments.
+///
+/// The exact return type of this function is an implementation detail and
+/// shouldn't be relied upon.  Instead it should be stored as a FormatListRef:
+///
+///   FormatListRef formatList = makeFormatList( /*...*/ );
+template <typename... Args>
+detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &... args) {
+  return detail::FormatListN<sizeof...(args)>(args...);
+}
+
+/// Format list of arguments to the stream according to the given format string.
+///
+/// The name vformat() is chosen for the semantic similarity to vprintf(): the
+/// list of format arguments is held in a single function argument.
+inline void vformat(std::ostream &out, const char *fmt, FormatListRef list) {
+  detail::formatImpl(out, fmt, list.m_formatters, list.m_N);
+}
+
+/// Format list of arguments to the stream according to the given format string.
+template <typename... Args>
+void format(std::ostream &out, const char *fmt, const Args &... args) {
+  vformat(out, fmt, makeFormatList(args...));
+}
+
+/// Format list of arguments according to the given format string and return
+/// the result as a string.
+template <typename... Args>
+std::string format(const char *fmt, const Args &... args) {
+  std::ostringstream oss;
+  format(oss, fmt, args...);
+  return oss.str();
+}
+
+/// Format list of arguments to std::cout, according to the given format string
+template <typename... Args>
+void printf(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+}
+
+template <typename... Args>
+void printfln(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+  std::cout << '\n';
+}
+
+}  // namespace tinyformat
+}  // namespace string
+}  // namespace paddle
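
The macro machinery documented at the top of this header (TINYFORMAT_ARGTYPES, TINYFORMAT_VARARGS, TINYFORMAT_PASSARGS, TINYFORMAT_FOREACH_ARGNUM) can also be used from client code. Below is a minimal sketch of a user-defined wrapper generated for 1..16 arguments; logError is an illustrative name, not part of the Paddle API, and a zero-argument overload would have to be written separately.

    #include <iostream>
    #include "paddle/string/tinyformat/tinyformat.h"

    // Generate logError(fmt, v1), logError(fmt, v1, v2), ... up to 16 args.
    #define MAKE_LOG_ERROR(n)                                       \
      template <TINYFORMAT_ARGTYPES(n)>                             \
      void logError(const char *fmt, TINYFORMAT_VARARGS(n)) {       \
        std::cerr << "ERROR: ";                                     \
        paddle::string::tinyformat::format(std::cerr, fmt,          \
                                           TINYFORMAT_PASSARGS(n)); \
      }
    TINYFORMAT_FOREACH_ARGNUM(MAKE_LOG_ERROR)
    #undef MAKE_LOG_ERROR

    int main() {
      logError("bad dimension %d for tensor %s\n", 42, "fc_weight");
      return 0;
    }
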
diff --git a/paddle/string/to_string.h b/paddle/string/to_string.h
new file mode 100644
index 0000000000..178edc1895
--- /dev/null
+++ b/paddle/string/to_string.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <sstream>
+#include <string>
+#include <typeindex>
+
+namespace paddle {
+namespace string {
+inline std::ostream& operator<<(std::ostream& s, const std::type_index& t) {
+  s << t.name();
+  return s;
+}
+
+template <typename T>
+inline std::string to_string(T v) {
+  std::ostringstream sout;
+  sout << v;
+  return sout.str();
+}
+
+template <>
+inline std::string to_string(std::type_index t) {
+  return t.name();
+}
+
+// Faster std::string/const char* type
+template <>
+inline std::string to_string(std::string v) {
+  return v;
+}
+
+template <>
+inline std::string to_string(const char* v) {
+  return std::string(v);
+}
+
+}  // namespace string
+}  // namespace paddle
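
A short usage sketch for the helpers above, using only the overloads this header defines (the string produced for a std::type_index is the compiler-specific name() and is shown here only for illustration):

    #include <typeindex>
    #include <typeinfo>
    #include "paddle/string/to_string.h"

    int main() {
      using paddle::string::to_string;
      std::string a = to_string(10);     // "10", via the generic ostream path
      std::string b = to_string("abc");  // const char* fast path, no stream
      std::string c = to_string(std::type_index(typeid(int)));  // mangled name
      return (a == "10" && b == "abc" && !c.empty()) ? 0 : 1;
    }
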
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
new file mode 100644
index 0000000000..4956bd96fa
--- /dev/null
+++ b/paddle/string/to_string_test.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/string/to_string.h"
+#include <gtest/gtest.h>
+
+constexpr char kOutputString[] = "User Defined Output";
+class UserDefinedClass {
+ public:
+};
+
+std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
+  s << kOutputString;
+  return s;
+}
+
+TEST(to_string, normal) {
+  using namespace paddle::string;
+  ASSERT_EQ("10", to_string(10));
+  ASSERT_EQ("abc", to_string("abc"));
+  ASSERT_EQ("1.2", to_string(1.2));
+}
+
+TEST(to_string, user_defined) {
+  using namespace paddle::string;
+  UserDefinedClass instance;
+  ASSERT_EQ(kOutputString, to_string(instance));
+}
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index c47add04b0..77f84cd43b 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -2,7 +2,10 @@
 
 if(WITH_TESTING)
   add_library(paddle_test_main STATIC TestMain.cpp)
-  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
   add_library(paddle_test_util STATIC TestUtil.cpp)
-  add_dependencies(paddle_test_util gen_proto_cpp)
+  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  if(NOT MOBILE_INFERENCE)
+    cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init paddle_memory gtest gflags)
+  endif()
 endif()
diff --git a/paddle/testing/TestUtil.cpp b/paddle/testing/TestUtil.cpp
index c691fe2625..cfb8c713d9 100644
--- a/paddle/testing/TestUtil.cpp
+++ b/paddle/testing/TestUtil.cpp
@@ -33,6 +33,7 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
                                  bool withValue,
                                  bool useGpu,
                                  bool equalNnzPerSample) {
+#ifndef PADDLE_MOBILE_INFERENCE
   std::vector<int64_t> ids(height);
   std::vector<int64_t> indices(height + 1);
   indices[0] = 0;
@@ -84,6 +85,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
     }
     return mat;
   }
+#endif
+  return nullptr;
 }
 
 void generateSequenceStartPositions(size_t batchSize,
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
new file mode 100644
index 0000000000..a2f21e37e4
--- /dev/null
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstring>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/init.h"
+#include "paddle/memory/memory.h"
+
+int main(int argc, char** argv) {
+  std::vector<char*> new_argv;
+  std::string gflags_env;
+  for (int i = 0; i < argc; ++i) {
+    new_argv.push_back(argv[i]);
+  }
+#ifdef PADDLE_WITH_CUDA
+  new_argv.push_back(
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+#else
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+#endif
+  int new_argc = static_cast<int>(new_argv.size());
+  char** new_argv_address = new_argv.data();
+  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
+  testing::InitGoogleTest(&argc, argv);
+  paddle::memory::Used(paddle::platform::CPUPlace());
+
+#ifdef PADDLE_WITH_CUDA
+  paddle::memory::Used(paddle::platform::CUDAPlace(0));
+#endif
+
+  paddle::framework::InitDevices();
+  return RUN_ALL_TESTS();
+}
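
For context, --tryfromenv is the gflags mechanism that reads each listed flag from an environment variable named FLAGS_<flag>, silently skipping any that are unset (unlike --fromenv, which treats a missing variable as an error). A minimal sketch of the behavior the main() above relies on; the flag is one of those listed there, and the exit-code check is illustrative.

    #include <cstdlib>  // setenv (POSIX)
    #include <vector>
    #include "gflags/gflags.h"

    DEFINE_bool(use_pinned_memory, true, "whether to use pinned CPU memory");

    int main() {
      // Simulate the environment a test job would export.
      setenv("FLAGS_use_pinned_memory", "false", /*overwrite=*/1);
      char arg0[] = "demo";
      char arg1[] = "--tryfromenv=use_pinned_memory";
      std::vector<char *> args = {arg0, arg1};
      int argc = static_cast<int>(args.size());
      char **argv = args.data();
      google::ParseCommandLineFlags(&argc, &argv, false);
      return FLAGS_use_pinned_memory ? 1 : 0;  // now false, read from the env
    }
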
diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt
index 06c019f0a9..72911695bd 100644
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TRAINER_SOURCES
         ParameterUpdater.cpp
         ParamUtil.cpp
         RemoteParameterUpdater.cpp
+        NewRemoteParameterUpdater.cpp
         Tester.cpp
         Trainer.cpp
         TrainerInternal.cpp
@@ -16,6 +17,7 @@ set(TRAINER_HEADERS
         ParameterUpdater.h
         ParamUtil.h
         RemoteParameterUpdater.h
+        NewRemoteParameterUpdater.h
         Tester.h
         TesterConfig.h
         Trainer.h
@@ -24,6 +26,13 @@ set(TRAINER_HEADERS
         ThreadParameterUpdater.h
         TrainerConfigHelper.h)
 
+if(NOT WITH_GOLANG)
+  list(REMOVE_ITEM TRAINER_SOURCES
+          NewRemoteParameterUpdater.cpp)
+  list(REMOVE_ITEM TRAINER_HEADERS
+          NewRemoteParameterUpdater.h)
+endif()
+
 add_library(paddle_trainer_lib STATIC
     ${TRAINER_SOURCES})
 
@@ -32,7 +41,8 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
     ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp)
+    paddle_proto
+    ${external_project_dependencies})
 
 macro(add_paddle_exe TARGET_NAME)
   add_executable(${TARGET_NAME} ${ARGN})
@@ -40,19 +50,29 @@ macro(add_paddle_exe TARGET_NAME)
   link_paddle_exe(${TARGET_NAME})
 endmacro()
 
-add_paddle_exe(paddle_trainer
-    TrainerMain.cpp)
+if(WITH_TESTING)
+  add_subdirectory(tests)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+  add_paddle_exe(paddle_trainer TrainerMain.cpp)
+  add_paddle_exe(paddle_merge_model MergeModel.cpp)
 
-add_paddle_exe(paddle_merge_model
-    MergeModel.cpp)
+  install(TARGETS paddle_trainer paddle_merge_model
+          RUNTIME DESTINATION opt/paddle/bin
+          PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+          GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
 
-if(WITH_TESTING)
-    add_subdirectory(tests)
+  set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+  set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+endif()
+
+if(APPLE)
+  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
 endif()
-install(TARGETS paddle_trainer paddle_merge_model
-    RUNTIME DESTINATION opt/paddle/bin
-    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
-        GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
 
-set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
-set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+if(WITH_GOLANG)
+  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
+  target_link_libraries(paddle_trainer paddle_pserver_cclient)
+endif(WITH_GOLANG)
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 91d89b61a3..56c38015fb 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 
 DEFINE_string(model_dir, "", "Directory for separated model files");
+DEFINE_string(config_file, "", "Config file for the model");
 DEFINE_string(model_file, "", "File for merged model file");
 
 using namespace paddle;  // NOLINT
@@ -28,8 +29,16 @@ using namespace std;     // NOLINT
 int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
-  string confFile = TrainerConfigHelper::getConfigNameFromPath(FLAGS_model_dir);
-#ifdef PADDLE_ONLY_CPU
+
+  if (FLAGS_model_dir.empty() || FLAGS_config_file.empty() ||
+      FLAGS_model_file.empty()) {
+    LOG(INFO) << "Usage: ./paddle_merge_model --model_dir=pass-00000 "
+                 "--config_file=config.py --model_file=out.paddle";
+    return 0;
+  }
+
+  string confFile = FLAGS_config_file;
+#ifndef PADDLE_WITH_CUDA
   FLAGS_use_gpu = false;
 #endif
   auto config = std::make_shared<TrainerConfigHelper>(confFile);
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
new file mode 100644
index 0000000000..410ac6d95c
--- /dev/null
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "NewRemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/utils/Stat.h"
+
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+
+namespace paddle {
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config, const std::string pserverSpec)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec) {}
+
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
+
+void NewRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr> &parameters) {
+  ParameterUpdater::init(parameters);
+
+  // create parameter server client.
+  if (useEtcd_) {
+    parameterClient_ =
+        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
+  } else {
+    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                                 FLAGS_trainer_id == 0);
+  }
+
+  // init new parameter and gradient.
+  newParameters_ = initNewParameter(PARAMETER_VALUE);
+  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
+
+  // Initialize the parameters: one trainer gets the opportunity to initialize
+  // the parameters and send them to the parameter server, while the others
+  // get the initialized parameters from the parameter server.
+  if (paddle_begin_init_params(parameterClient_)) {
+    LOG(INFO) << "paddle_begin_init_params start";
+    // NOTE: convert V1 OptimizationConfig proto to V2 OptimizerConfig.
+    // This makes golang pserver compatible with handy V1 demos.
+    // TODO(wuyi): Refine or remove these ugly converting lines
+    OptimizerConfig optimizerConfigV2;
+    if (trainerConfig_.learning_method() == "momentum") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    } else if (trainerConfig_.learning_method() == "adagrad") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adagrad()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+    } else if (trainerConfig_.learning_method() == "adadelta") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adadelta);
+      optimizerConfigV2.mutable_adadelta()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
+    } else if (trainerConfig_.learning_method() == "adam") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
+      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
+      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
+      optimizerConfigV2.mutable_adam()->set_epsilon(
+          trainerConfig_.adam_epsilon());
+    } else {
+      LOG(ERROR) << "got unsupported v1 optimizer config: "
+                 << trainerConfig_.learning_method();
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    }
+
+    if (trainerConfig_.learning_rate_schedule() == "constant") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
+      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
+          trainerConfig_.learning_rate_decay_a());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
+          trainerConfig_.learning_rate_decay_b());
+    } else {
+      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
+                 << trainerConfig_.learning_rate_schedule() << ", set to const";
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    }
+
+    // overwrite optimizerConfigV2 for per-parameter(layer) configs
+    for (int i = 0; i < parameterSize(); ++i) {
+      // FIXME(typhoonzero): paramConfig always have default values,
+      // how to check if it's default?
+      // TODO(typhoonzero): log output: optimizerConfigV2.DebugString();
+      LOG(INFO) << "trainerConfig_: " << trainerConfig_.DebugString();
+      // send param and config to pserver
+      std::string bytes = optimizerConfigV2.SerializeAsString();
+      const char *array = bytes.data();
+      int size = (int)bytes.size();
+      paddle_init_param(
+          parameterClient_, *newParameters_[i], (void *)array, size);
+    }
+    paddle_finish_init_params(parameterClient_);
+    LOG(INFO) << "paddle_begin_init_params done";
+  } else {
+    paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  }
+
+  LOG(INFO) << "NewRemoteParameterUpdater initialized";
+}
+
+void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
+
+void NewRemoteParameterUpdater::finishBatch(real cost) {
+  // send gradient to parameter server.
+  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
+  // get the updated parameter from parameterClient.
+  paddle_get_params(parameterClient_, newParameters_, parameterSize());
+
+  // clear gradient after update parameter.
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+}
+
+void NewRemoteParameterUpdater::startPass() {}
+
+bool NewRemoteParameterUpdater::finishPass() { return true; }
+}  // namespace paddle
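
Condensing the above, the trainer-side protocol is an initialization race followed by a send-gradients/fetch-parameters loop. A hedged sketch using only the cclient calls that appear in this file (buffers and optimizer-config bytes are supplied by the caller; error handling omitted):

    #include "libpaddle_pserver_cclient.h"

    void runTrainerLoop(char *spec, bool is_trainer0, paddle_parameter **params,
                        paddle_parameter **grads, int n, void *opt_conf,
                        int opt_conf_len, int num_batches) {
      paddle_pserver_client client = paddle_new_pserver_client(spec, is_trainer0);
      if (paddle_begin_init_params(client)) {
        // Exactly one trainer wins the race: it pushes the initial values and
        // the serialized optimizer config for every parameter.
        for (int i = 0; i < n; ++i) {
          paddle_init_param(client, *params[i], opt_conf, opt_conf_len);
        }
        paddle_finish_init_params(client);
      } else {
        // The others pull the initialized values instead.
        paddle_get_params(client, params, n);
      }
      for (int b = 0; b < num_batches; ++b) {
        // ... forward/backward fills the gradient buffers here ...
        paddle_send_grads(client, grads, n);   // push gradients
        paddle_get_params(client, params, n);  // pull updated parameters
      }
      paddle_pserver_client_release(client);
    }
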
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
new file mode 100644
index 0000000000..6223ba427c
--- /dev/null
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -0,0 +1,121 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <functional>
+#include <thread>
+#include "OptimizerConfig.pb.h"
+#include "ParameterUpdater.h"
+#include "libpaddle_pserver_cclient.h"
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * New remote parameter updater for dense parameters, based on the Go pserver cclient.
+ */
+class NewRemoteParameterUpdater : public ParameterUpdater {
+public:
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec);
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec,
+                            const bool useEtcd);
+  ~NewRemoteParameterUpdater() {
+    releaseNewParameter(newParameters_);
+    releaseNewParameter(newGradients_);
+    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
+  }
+
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  batch training is stateful, which helps with performance
+   *        tuning and SGD optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
+
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+
+protected:
+  /**
+   * work need to do after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+
+private:
+  int parameterSize() { return (int)parameters_.size(); }
+
+  /**
+   * Initialize the paddle_parameter array for the Go pserver cclient.
+   * @param type  the parameter type (value or gradient) to wrap
+   */
+  paddle_parameter** initNewParameter(ParameterType type) {
+    paddle_parameter** new_params =
+        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
+    for (int i = 0; i < parameterSize(); ++i) {
+      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+      memset(new_params[i], 0, sizeof(paddle_parameter));
+    }
+
+    for (int i = 0; i < parameterSize(); ++i) {
+      ParameterPtr param = parameters_[i];
+      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+      new_params[i]->name = (char*)param->getName().c_str();
+      new_params[i]->content =
+          (unsigned char*)(param->getBuf(type).get()->getData());
+      new_params[i]->content_len =
+          (int)param->getBuf(type).get()->getSize() * sizeof(real);
+    }
+    return new_params;
+  }
+
+  void releaseNewParameter(paddle_parameter** newParams) {
+    if (newParams != nullptr) {
+      for (int i = 0; i < parameterSize(); ++i) {
+        free(newParams[i]);
+      }
+      free(newParams);
+    }
+  }
+
+protected:
+  const OptimizationConfig& trainerConfig_;
+  /// internal parameter client object for exchanging data with pserver
+  paddle_pserver_client parameterClient_;
+  /// the parameters for new pserver client
+  paddle_parameter** newParameters_;
+  /// the gradients for the new pserver client
+  paddle_parameter** newGradients_;
+  /// the specification of the parameter servers, e.g. "host1:port,host2:port"
+  std::string pserverSpec_;
+  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
+  bool useEtcd_;
+};
+
+}  // namespace paddle
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 80664fa877..16e676d602 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -175,7 +175,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
     }
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
     FILE* fp = fopen(featFile.c_str(), "ab+");
-    PCHECK(!ferror(fp)) << "Fail to open " << featFile;
+    CHECK(!ferror(fp)) << "Fail to open " << featFile;
 
     size_t sampleNum = featMatrices[0]->getHeight();
     for (size_t i = 0; i < sampleNum; ++i) {
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index b68e29cd5e..3e4a2b5fa8 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -137,6 +137,10 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
     }
   }
 
+  if (FLAGS_use_mkldnn) {
+    CHECK_EQ(FLAGS_trainer_count, 1) << "MKLDNN only needs 1 trainer";
+  }
+
   if (testing) {
     LOG(INFO) << "trainer: in testing mode";
     if (config_->getOptConfig().use_sparse_remote_updater() ||
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 60ac8459a1..2b68d89e48 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -28,6 +28,8 @@ DECLARE_bool(with_cost);
 DECLARE_bool(with_gpu);
 DECLARE_bool(parallel_nn);
 DECLARE_string(config_args);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkl_packed);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
@@ -44,6 +46,8 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
   configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
              << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
              << ",parallel_nn=" << FLAGS_parallel_nn
+             << ",use_mkldnn=" << FLAGS_use_mkldnn
+             << ",use_mkl_packed=" << FLAGS_use_mkl_packed
              << ",cudnn_version=" << hl_get_cudnn_lib_version();
   if (!FLAGS_config_args.empty()) {
     configArgs << "," << FLAGS_config_args;
@@ -62,11 +66,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
   m->conf = config;
 }
 
-TrainerConfigHelper::~TrainerConfigHelper() {
-  if (m) {
-    delete m;
-  }
-}
+TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
 
 const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
 
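With the two new flags appended, the config_args string handed to paddle.trainer.config_parser now carries the MKL switches as well; an illustrative value (flag settings are examples only) is:

    trainer_id=0,local=1,with_cost=1,use_gpu=0,parallel_nn=0,use_mkldnn=1,use_mkl_packed=0,cudnn_version=5005
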
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 08b2d8a38e..bd518d8598 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,20 +1,17 @@
-################# test_Compare ############################
-add_unittest_without_exec(test_Compare
-    test_Compare.cpp)
-add_test(NAME test_Compare
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
+function(trainer_test TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
 
-################# test_Trainer ###########################
-add_unittest_without_exec(test_Trainer
-    test_Trainer.cpp)
-add_test(NAME test_Trainer
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&
-        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+trainer_test(test_Compare)
+trainer_test(test_PyDataProviderWrapper)
+trainer_test(test_recurrent_machine_generation)
+trainer_test(test_Trainer)
 
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -23,60 +20,13 @@ if(WITH_PYTHON)
   add_unittest_without_exec(test_TrainerOnePass
       test_TrainerOnePass.cpp)
   add_test(NAME test_TrainerOnePass
-    COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-          ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-          ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
+          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
-############### test_CompareTwoOpts ###################
-add_unittest_without_exec(test_CompareTwoOpts
-    test_CompareTwoOpts.cpp)
-add_test(NAME test_CompareTwoOpts
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
-            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
-            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
-################# test_CompareSparse ##################
-add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
-    COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-              ./.set_port.sh -p port -n 6
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-endif()
-################# test_recurrent_machine_generation ###############
-add_unittest_without_exec(test_recurrent_machine_generation
-    test_recurrent_machine_generation.cpp)
-add_test(NAME test_recurrent_machine_generation
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
-#################### test_PyDataProviderWrapper #########################
-add_unittest_without_exec(test_PyDataProviderWrapper
-    test_PyDataProviderWrapper.cpp)
-
-add_test(NAME test_PyDataProviderWrapper
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
+        ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
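
Note: each trainer_test(...) call above expands to the same add_unittest_without_exec/add_test pair that was previously written out by hand for every binary, with PYTHON_PATH placing both the python/ tree and paddle/trainer/tests on the test's Python path.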
diff --git a/paddle/trainer/tests/chunking.conf b/paddle/trainer/tests/chunking.conf
deleted file mode 100644
index d88df919df..0000000000
--- a/paddle/trainer/tests/chunking.conf
+++ /dev/null
@@ -1,125 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-TrainData(ProtoData(
-  files = 'trainer/tests/train_files.txt',
-  usage_ratio = 1.0,
-))
-
-TestData(ProtoData(
-  files = 'trainer/tests/test_files.txt'
-))
-
-default_initial_std(1)
-default_decay_rate(4e-4)
-default_device(0)
-
-Inputs("features", "word", "pos", "chunk")
-
-Outputs("crf")
-
-Layer(
-    name = "features",
-    type = "data",
-    size = 4339,
-)
-
-Layer(
-    name = "word",
-    type = "data",
-    size = 478,
-)
-
-Layer(
-    name = "pos",
-    type = "data",
-    size = 45
-)
-
-Layer(
-    name = "chunk",
-    type = "data",
-    size = 23
-)
-
-Layer(
-    name = "output",
-    type = "mixed",
-    size = 23,
-    bias = False,
-    device = -1,
-    inputs = [
-        FullMatrixProjection("features", parameter_name="feature_weights"),
-    #    TableProjection("word"),
-    #    TableProjection("pos"),
-    ],
-)
-
-Layer(
-    name = "crf",
-    type = "crf",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Layer(
-    name = "crf_decoding",
-    type = "crf_decoding",
-    size = 23,
-    device = -1,
-    inputs = [
-        Input("output", parameter_name="crfw"),
-        "chunk"
-    ]
-)
-
-Evaluator(
-    name = "error",
-    type = "sum",
-    inputs = "crf_decoding",
-)
-
-'''
-# chuck evaluator cannot be used for GPU training
-Evaluator(
-    name = "chunk_f1",
-    type = "chunk",
-    inputs = ["crf_decoding", "chunk"],
-    chunk_scheme = "IOB",
-    num_chunk_types = 11,
-)
-'''
-
-Settings(
-    algorithm = 'sgd',
-    batch_size = 100,
-    average_window = 0.5,
-    max_average_window = 2500,
-    learning_rate = 1e-1,
-    learning_rate_decay_a = 5e-7,
-    learning_rate_decay_b = 0.75,
-    l1weight = 0,
-    l2weight = 1,
-    c1 = 0.0001,
-    backoff = 0.5,
-    owlqn_steps = 100,
-    max_backoff = 5,
-)
diff --git a/paddle/trainer/tests/data_bin_part b/paddle/trainer/tests/data_bin_part
deleted file mode 100644
index 66ede391b0..0000000000
Binary files a/paddle/trainer/tests/data_bin_part and /dev/null differ
��T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��20��T��I�O��T��4��7��>��4����3��D��F��D��S�D��A,*��T��I�O��T��4��>����3��D��F��D��S�D��A���T��I�O��T����T��I�O��T��&$��T��I�O��T��0��Q��7��J��6��J����7&$��T��I�O��T��0��Q��7��J��6��J����7��T��I�O��T����T��I�O��T��20��T��I�O��T��B��6��J��7��6��8��T��7��P��4��B��T)'��T��I�O��T��B��6ќJ��6��8��7��4��B��T���T��I�O��T����T��I�O��T�� ��R��T��I�O��T��>����U��@Ԛ< ��R��T��I�O��T��>����U��@Ԛ<��T��I�O��T����T��I�O��T��hf��R��T��>��I�O��T��>����U��P��4��>�4—P��=��A��N��,��:��L���%����!��*��*��P��4��>٬J��=��$��@Ԛ<SQ��R��T��>��I�O��T��>����U��4��>�4��=��A��N��,��:��L��*��*��4��>٬J��=��$��@Ԛ<���T��I�O��T����T��I�O��T��53��R��P��4��J��T��I�O��T��>����Sߢ?��U��>��9��@Ԛ<,*��R��4��T��I�O��T��>����S��?��>��9��@Ԛ<��T��I�O��T����T��I�O��T��DB��R��P��4��J��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A><��R��4��T��I�O��T�>��M�K��J��I�O��T��K��K����D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A��G	��%��A��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M����&����'��������%��I��I��A��$��ۏ"��&����'��������%��I��I����A��G��&����'��������%��I��I����:��A��G��D�3��A��T��(����%����!����A��G}{��&����'��������%��I��I��A����&����'��������%��I��I����A��&����'��������%��I��I����:��A��D�3��A��T��V����A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A��%����A������ ��%��A��A������ ���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��C��V��2��%��0��J��%��2��C��W��F��T��O��W��W)'��%��C��V��2��%��0��%��2��W��F��T��O��9���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��B��F��%��J��W����D��G��%��A��G��@��F��:��=#!��%��<��%��J����D��%��A��@��:��=���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��MJH��D��9��G��M��>��A��Qٟ@��D��B�U��,�G߇;�G��3��M��Vٟ@��6��D��P�D��A><��DٚG��>��A��Qٟ@��D��K��,�G߇;�G��3��M��V��5��D��P�D��A���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<\Z��A��P��I��D��K��4��,�G��,�G��,�G߇;��5��>��,��V��Cʿ7��N��P��I��>��>��>��V��0��>��@Ԛ<��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M/-����A����G��%��;̽>��MŹ��(Źʿ��@��@��@)'����A����G��%��;�>Ź��(Źʿ��@��@���A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M��%��A������%����A��%��A������A��A��,�G߇;�G߇;��%��>��M��A��,�G߇;�G߇;��%��>��M20��%��D��J��W��.��>��=��V��%��J��W����G��%����A)'��%��D��J��W��.��>��=��V��%��J����G��A��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��86��$����&���� ��C��2̙EϪJֈD��T�9��J��9��@����A��B/-������ ��C��2̙EϪJֈD��T��J��9��@����A��B��� 
��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'�� ��2��E��C��$����&��E̛<��0��>��W��T�� ��2��E��C����E��0��>��W��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��)'��$����&���� ��C��2��G��E��9ֈD��@Ԛ<#!������ ��C��2��G��E��9ֈD��@Ԛ<��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��;9��Sޡ8��$����&��>��&��2̙E�� ֈD��>ܤK��$��'��&��9�Q��')'��S����>��&��2̙E�� ֈD��>ܤK��ƋQ��'��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD���~��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B��$����&��,��&ίB��>��T��7��>��K��U��V��J�J��K��U��Q��T��I��1���R��/��0��Qec��6��A��B��6��T�� ��2��EۈX��D��:ۈX��>ў7��&��B����,��&ίB��>��T��7��K��V����Q��I��1���R��/��Q��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD����$����&��2��@�� ��8�,��T����2��@�� ��,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��_]��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,�9��:�� ��>��I\Z��$֗>��A��S�� ��1��9��E��Ź��4��(����>��&��2��4�� ��E��B߻WֈD��1��H��%��,��:�� ��>��I��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��A?��$����&���� ۈX��2��@��Q��T��W��N��EܾW��,��;��P��T����,��T86������ ۈX��2��@��Q��T��W��N��EܾW��,��;ٱP����,��T��� ��2��EֈD��$����&�� ��2��EֈD��)'��$��� ��2̙EֈD��>ܤK��"��6��"����&#!��$��� ��2̙EֈD��>ܤK��"��6��"�� ��2��EֈD��$����&�� ��2��EֈD��53��ޥ0��C��E��$����&��0��>�� ��2��EֈD��J��<��=�@,*��ޥ0��C��E����0��>�� ��2��EֈD�J��=�@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��Hؕ7��;��E��E��@��;��Dؕ7��;��E��E��@���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��Xŷ5��D��/��D��/��Xŷ5��D��D���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D ��;��D��H��B��U��>��U��W��6��T��;��DΑB��>��U��6���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��D��H��D��H��T��;��D��D��D��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��>	��;��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��B��D��/��>	��B��D��>���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H��;��1��>��D��;��D��H��=��<��T��;��D��=��T���;��1��>��D��H��;��1��>��D,*��;��>��D��H��6��6��;��D��H��9��F��A��@Ԛ<#!��;��>��D��6��;��D��9��F��A��@Ԛ<��;��1��>��D��H
��;��1��>��D��;ӈ5��U��D��>��D��H��D��H��;�5��D��>��D��D���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;;9��K��6������>��H��E��1��K��/��Q��4��D�G��K��I����A��B86��K��6������>��H��E��1��K��/��Q��4��G��K��I����A��B���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;#!��D��E��1ߢ?��0��8��I��D��<��B��B ��D��E��1�?��8��I��D��<��B��B���E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I��E��1��?��0��;��E��1��?��0��;��E��1ߢ?��0	��E��1�?���E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;PNڤ5��5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<MKܤ5ַ;��>��E��1��?��1��B��T��/��>׆B��/��1��/��6��9��I�P��T��R��;��I��@Ԛ<���E��1��?��0��;��E��1��?��0��;GE��D��K��O��F��H��E��1��K��/��Q��4��D�G��K��O��JܤK��>��6��D��G��@��K20���H��E��1��K��/��Q��4��G��ܤK��>��6��D��@��K��E��1��?��0��;��E��1��?��0��;A?��A׆B��?��K��U��E��I��3�R��>��7��D��E��1��?��P��;��6��6��@Ԛ<;9��A׆B��?��K��U��E��I��3��>��7��D��E��1��?��P��;��6��@Ԛ<Q��E��1��?��0��;��E��1��?��0��;��1��A�?��Iַ;	��1��A��I�������E��1��?��0��;��E��1��?��0��;ַ;��E��1��?��,��;��@Ԛ<ַ;��E��1��?��,��;��@Ԛ<��E��1��?��0��;��E��1��?��0��;53��A��U��E��1��A��Iٟ@��;��N��?�9��8��5��D��@��@��@/-��A��U��E��1��A��@��;��N��?�9��8��5��D��@��@���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F)'��A�OݰF��B��F��A����S��F��>��L��S��2 ��A�O�F��F��A����F��>��L��S���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F86ݰF��B��S��F��Q��B��J��7��6��8��T��7��QݰF��B��S��F��B&$�F��F��Q��BќJ��6��8��7��Q�F��F��B���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F ��AסE��T��/ݰF��Bٟ@��3��@Ԛ<��A��T�Fٟ@��3��@Ԛ<���G��=ݰF��B��S��F��G��=�F��F#!��B��N��0ݰF��B��S��F��2��Uа.��T��B��N�F��F��2��*��G��=ݰF��B��S��F��G��=�F��F��S��F��U��R��7��T��F��U��7��T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T���� ��V��>б��1��1�F֎T���� 
��Vб���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J/-��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@��@,*��S��A��S��1��1�F֎T��=��>щQ��C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J)'������1��1�F֎T��=��?��N��;��7��8��K������1��1�F֎T����7��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J����=��?��N��;��C�;����M������C�;����M���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��J��6������!A?��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��7��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<;9����1��1�F֎T��B��J��Hį-��H��U��Hڶ>��2��>��A��R��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J ӪN��1��1�F֎T����E��@��@��@ӪN��1��1�F֎T����E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J86��1��1�F֎T��0��3��V��C��J��7��6��8��T��7��1��1�F֎T20��1��1�F֎T��0��3��V��CќJ��6��8��7��1��1�F֎T���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J��1��1�F֎T��@��?��1��1�F֎T��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JSQ��D��R�0��7��>��I�8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��APN��D��R�0��7��>��8Ҳ0��2��A��Xڃ��N��>��1��1�F֎T��A��K��Aٟ@�H��D��P�D��A���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53��1��1�F֎T��W��"�����)����$������I��K��4��6)'��1��1�F֎T��W��"�����)������I��4���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JJH����7��&��:֎T��1��1�F֎T����T��T��T����=��?��N��;��T����������!53����7��&��:֎T��1��1�F֎T����T��T��T������T��K���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<&$��C��E��>��1��1�F֎T��@��0��=��@Ԛ<���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�JPN��1��1�F֎T����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!.,��1��1�F֎T��������7����G����T��T��T�+���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�Jki��1��1�F֎T��K��Sħ;��S�� ��C��9��>��>��4��K��.��T��RҲ0��A��G�B�@��>�����=��?��N��;�����)��ʪ\Z��1��1�F֎T��K��S��S�� ��Cޖ>��>��4��K��.��T��RҲ0��A��G�B�@��>����������)��ʪ���1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J53����&��1��1�F֎T��R��B��O��E��V����C��E��@��@��@,*��&��1��1�F֎T��R��B��O��E����C��E��@��@���1��1�F֎T��P��A�J��1��1�F֎T��P��A�J20��ڶ>��S��:—P��G��2��&��*��*��1��1�F֎T��@Ԛ<,*����S��:��I��2��&��*��*��1��1�F֎T��@Ԛ<��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J,*����7��D��T��1��1�F֎T��6��U��=��?��N��;����7��T��1��1�F֎T��6�����1��1�F֎T��P��A�J��1��1�F֎T��P��A�JA?����7��D��2��T��:֎T��1��1�F֎T�����=��?��N��;��T��T��K�;20����7��2��T��:֎T��1��1�F֎T������T��T��K�;��1��1�F֎T��P��A�J��1��1�F֎T��P��A�J_]��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B������!YW��$��U��-£-��E��7��-Ҳ0��AʡH��9�D��S��&��1��1�F֎T��$����U��-��CɤU��TҲ0�A��B���86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/��
-������N��W��=��H��D��E�8��K��D��G��@��K/-��X��1ʡH��9��7΂��N��W��/��D�8��K��D��@��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C\Z��-��A��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K�
-��:��K��4��8��?��:��T><��-��A��X��1ʡH��9��7����N��W��/��D�C�
-��:��4��8��?��:��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cb`��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K����?����I��U��>��D��E��?��T΂:��C̛<A?��X��1ʡH��9��7����N��W��/��D��E����?����I��U��D��?΂:��C̛<�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K΂:��4��T/-��X��1ʡH��9��7����N��W��/��D�8��K΂:��4��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��>��K��X��1��K����N��W��/��D��>�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CSQ��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��O��K��D��O��G��D��O��6��G20��X��1ʡH��9��7����N��W��/��D��O��D��G��D��6��G�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CJH��X�,��1�9��CʡH��9��7�/�
-������/��@��C��H��W��D��EģC��KùB��N��L,*��X��1ʡH��9��7����N޻/��W��D�CùB��N��L�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�Cqo��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��>��K����L��2��8��A��W��T��6��O�0��U—P��D��7��>��6��;PN��X��1ʡH��9��7����N��W��/��D��>����L��P��A��W��6��O�0��U—P��D��7��>��6��;�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C><��D��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��K&$��D��X��1ʡH��9��7����N��W��/��D��E�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CPN��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�/��K��I���/�/��C��T20��X��1ʡH��9��7����N��W��/��D��/��I���/��C��T�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C;9��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��G����I#!��X��1��K����N��W��/��D��G����I86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��E��P��K ��X��1��K����N��W��/��D��P��K�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�C86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E�8��K�/��6��T,*��X��1ʡH��9��7����N��W��/��D�8��K�/��6�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��EģC��K��GģC��:��7����B��W��T53��X��1ʡH��9��7����N��W��/��D�C��GģC��7����B��W86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CMK��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K��:΂:��<��B��B20��X��1ʡH��9��7����N��W޻/��D�C��:΂:��<��B��B�86��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CGE��X�,��1�9��CʡH��9��7�/�
-������N��W��C��H��D��EģC��K΂:��6��T)'��X��1ʡH��9��7����N��W޻/��D�C΂:��686��X�,��1�9��C��K�/�
-������N��W��=��H��D��EģC��K��X��1��K����N��W��/��D�CVT��X�,��1�9��CʡH��9��7�/�
-������N��W��=��H��D��E��P��K��G��D��O��6΂:��P��6��G;9��X��1ʡH��9��7����N��W��/��D��P��K��G��D��6΂:��P��6��G���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��6�O��D��S�D��A��N��-��<��6��6��D��S�D��A���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6><��-��I��6��6�O��E��6��0��FǂS��H��A��V��T��J��D��8��D��A��P/-��-��I��6��6��E��6��FǂS��H��A��V��T��D��8��A���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6#!��N��B��-��<��6��O��C��8��A��9��9��N��-��<��6��O��C��8��A��9���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6��<��C��6��=�R��J��<��C��6�R��J���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6)'��D��-��I��H��D��6��/��E��6��-��1��1��6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��6 кB��-��<ԋ/��C��6��6��JƱC��TкB��-��<ԋ/��C��6��6��JϱC���N��B��-��<��6��6��N��-��<��6��686��-��I�I��6�I��6�I��6��6�O��U��>��E��6��D��S�D��A53��-��I�I��6�I��6�I��6��6��U��>��E��6��D��S�D��A��N��B��-��<��6��6��N��-��<��6��6,*��N��B��-��<�I��6��C��?����6��P����6��T&$��N��-��<�I��6��C��?����6��P����6���N��B��-��<��6��6��N��-��<��6��6_]��-��I��6��D��D��9��D��6��6��>��=��/��,ֈ;��N��?�K�C��L��3��;ނB��/��6��/��7��T��Nؕ7؄/��ESQ��-��I��6��D��9��6��6��>��=��/��,ֈ;��N��?�K�C��L�3ނB��/��6��/��7��T��Nڕ7��E��N��B��-��<��6��6��N��-��<��6��686��5��-��<��H��C��?��D��A��P��;��0��T��?��6��T��)����!&$��5��-��<��H��C��?��A��;��T��6��T��)���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��U��İU��7�/��5�.��W��@ßN��W��F�/��U��I��T20��4��U��İU��7�/��5�.��W��@ßN��W�/��U��I��T���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐWН?,*��4��U��İU��7�/��5�.��W��@ßN��W�/̐W���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/;9��U��İU��7�/�.��W��@ßN����ۏ"����(����!��U��E��T(&��U��İU��7�/�.��W��@ßN�[��U��E��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I��T/-��4��U��İU��7�/��5�.��W��@ßN��9��TŇ7��I���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��0��W��F�9��G��3��Q��T86��4��U��İU��7�/��5�.��W��@ßN��0��W�9��G��3��Q��T���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/,*��4��U��İU��7�/��5�.��W��@ßN��W��F�/)'��4��U��İU��7�/��5�.��W��@ßN��W�/���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/,*��4��İU��7�/��5��:��S��9İUН?̛<��7��T&$��4��İU��7�/��5��:��S��9İU��?��7���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/��R��>Н?��T	��R��>ܞ?���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�//-����U��İU��7�/���.��W��@ßN�1��T��7̛<,*����U��İU��7�/���.��W��@ßN��1��7̛<���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/53��4��İU��7�/��5��:��S��9İU��:��4��K����"����!,*��4��İU��7�/��5��:��S��9İU��:��4��K��"���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/;9��4��U��İU��7�/��5�.��W��@ßN��W��F�?�9��G��HН?��T20��4��U��İU��7�/��5�.��W��@ßN��W�?�9��G��/���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/GE����U��İU��7�/�.����W��@ßN��W��F�/ɴ9Н?��Tɴ9ʡH��9��?�/��T;9����U��İU��7�/�.����W��@ßN��W�/ɴ9ܞ?ɴ9��9�/��T���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/#!��4��U��İU��7�/��5�.��W��@ßN#!��4��U��İU��7�/��5�.��W��@ßN���İU��7�/��İU��7�/86��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��W��A��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��A��A��İU��7�/��İU��7�/86��4��U��İU��7�/��5�.��W��@ßN��W��F�/̝5̛<��Q��T20��4��U��İU��7�/��5�.��W��@ßN��W�/̝5̛<��Q���İU��7�/��İU��7�/20��4��U��İU��7�/��5�.��W��@ßN��W��F�/ÐW��W/-��4��U��İU��7�/��5�.��W��@ßN��W�/ÐW��W��İU��7�/��İU��7�/53��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q��T20��T��1��4��U��İU��7�/��5�.��:��S��9İU��W��Q���İU��7�/��İU��7�/ �
-��N��U��İU��7�/�.��@��K �
-��N��U��İU��7�/�.��@��K��İU��7�/��İU��7�/A?��4��U��İU��7�/��5�.��W��@ßN��W��F�/��E��:��Tɴ9��:��T�;86��4��U��İU��7�/��5�.��W��@ßN��W�/��E��:ɴ9��:�;�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��K��F��Eڶ>��FˎW��B��D��I��K��T�����)��ʪ/-��K��F��Eڶ>��FˎW��B��D��I��K�����)��ʪ�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J&$��Sį-��K��>��J���N��T���C��T��T#!��Sį-��K��>��J�ϞN���C��T��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J кB��6��Sį-��K��I��K��T��:��KкB��6��Sį-��K��I��K��:��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��;��Kʗ,��/��Sտ7��P��C��@��;��B ��;��Kʗ,��/��Sտ7��P��C��;��B�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T20��3��B��B��D��K��6��S��9��A��@��S��@��0��6��0��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J/-��;��Kʗ,��/—P��L��>��C��B��F�R��K�A��K��B,*��;��Kʗ,��/—P��L��>��C��B��F�R��K�A��K�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J)'��;��Kʗ,��/��S��K��D͙7��I��R��N͙7��T&$��;��Kʗ,��/��S��K��D��I��R��N͙7��T�į-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J��-��K��-��Kį-��K��EˎWٟ@��6֬4��Jį-��K��EˎW��5֬4��J#!��S��Kб��J��7��6��8��T��7��U��>��S��KбќJ��6��8��7��U��>���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;��,��B��R��/��>��4��7��,��B��R��/��>��4��7���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9ѹ6��7��,��BƸ=��D��J��7��.��K��/��B��9��A��=��B��@��@��@&$չ6��,��BƸ=��D��J��*��/��B��A��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;;9��R��/��B��,��B��.��P��԰��'��0��V��A������Uѹ6��F��G,*��R��/��B��,��B��.��P��0��V��A��"��Uݹ6��G���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;zx��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��N��J��7��6��8��T��7��;��2��/ޟEŮ<��N��6��9��A��=��Bǭ;��HΆO��-��5_]��R��/��,��B��C��M��R��/��@��B��B�R��-��P��2�K��O��NќJ��6��8��7��;��R��6��N��A�HΆO��-��5���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;86���
-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@��K/-��>ß1��QİL��	��R��/Ξ)��,��B��WβI��3��I��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;\Z��(����<��7����N��6����B����=��G�;��3��>��7����K������������������
���#!��<����K����������
������R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;)'��N��6��@��4��,��B��H��A��R��/��D��@Ԛ<&$��N��@��4��,��B��H��A��R��/��D��@Ԛ<���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;/-��R��/��,��B��@��Hٟ@ʜ2��I��A��N��6��@��@��@)'��R��/��,��B��@��Hٟ@ʜ2��I��A��N��@��@���R��/��,��B��;��R��/��,��B��;)'��NЃB��;��W�$��,��BΞ)��9��"��@��@��@&$��NЃB��;��W�$��,��BΞ)��9��"��@��@��R��/��,��B��;��R��/��,��B��;,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B,*��/��>��,��B��J��>��,��B��J��>��,��B��A��B���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B,*ʡH��9��=��7��B��;��U��>��C��E��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U53��B��7��;��U��S�L��8��7��B��9��7��7��	��N����H��&$��B��;��U��S�L��8����	��N����H�����B��7��8��;��U��B��8��;��U&$��B��7��;��>��8��N��@���>��;��G��B ��B��;��>��8��N��@���>��;��G��B��7��8��;��U��B��8��;��U,*��B��7��;��>��8��N��@Ɓ-��6��7Ɓ-��6��H��T#!��B��;��>��8��N��@ȁ-��7ȁ-��H��T���B��7��8��;��U��B��8��;��U��7��B��;��U��>��C��8�,��T��7��B��;��U��>��C��,��T��B��7��8��;��U��B��8��;��U��B��;��U�L��C��8�,��T��B��;��U�L��C��,��T���B��7��8��;��U��B��8��;��U��B��;��U��>��C��@��K��B��;��U��>��C��@��K��B��7��8��;��U��B��8��;��U)'��7��B��;��U��>��C��B��U��8��J��<��B��B)'��7��B��;��U��>��C��B��U��8��J��<��B��B���B��7��8��;��U��B��8��;��U ��B��;��U��>��C��7��C��<��B��B��B��;��U��>��C��7��<��B��B��B��7��8��;��U��B��8��;��U,*��B��7��>��;��U��N��8��C��.��V��I��<��7��; ��B��>��;��U��N��8��C��.��I��7���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��@��?��J��.ϭB��@���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB)'��J��.ʭB�/��L��
��F��;��F��?��8�,��T ��J��.ϭB��L��
��F��;��F��,��T���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��'��G��N��O��C&$��J��.ϭB��L����F��O��'��G��N��O��C���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB,*��J��.ʭB�/��L����F��U��O��L��B��<��B��B&$��J��.ϭB��L����F��O��L��B��<��B��B���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/��>��L��J��.ϭB��>��L���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB53��J��.ʭB�/��8��N��J��.ʭB�/�G��>��9��8��F��>��T,*��J��.ϭB��8��N��J��.ϭB�G��>��9��8��F��>���F��U��/��J��.ʭB�/��F��J��.ϭBMK��D��V��D��:��J��TʭB�/��>ڶ>��9Ԛ<��Q��D��M��/��F��U��7��>�G��J��E��@Ԛ<A?��D��V��D��:��J��TϭB��>��9Ԛ<��Q��D��M��F��7��>�G��J��E��@Ԛ<��F��U��/��J��.ʭB�/��F��J��.ϭB/-��J��.ʭB�/��8��I��C¨0��3��?��;��9��<��>��T��J��.ϭB��8��IϨ0��-��<��>���F��U��/��J��.ʭB�/��F��J��.ϭB#!��J��.ʭB�/��L����F��U��O��@��K��J��.ϭB��L����F��O��@��K��F��U��/��J��.ʭB�/��F��J��.ϭB��J��.ʭB�/�;��J��6��J��.ϭB�;��J��6���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��?��;��8��W��B��=��&��;��WɾS��2��S��C��I��9)'��?��;��8��W��B��=��&��;��W��2��S��C��9���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=/-��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.��T,*��H޽B��;��8��A��E��0��W��B��=щQ��U��P��.���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JH��W��B��R��P��I��9��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��G��OŒA��TDB��W��B��R��P��I��=��5��0��<��G��0��G��8��7��W��G��Q��D��2��GŒA��T���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20��P��G��,��D��N��G��8��0��6��W��B��=��C��=��S��7,*��P��G��,��D��N��G��8��5��W��B��=��C��S��7���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=GE��W��=��D��,��?��R��;��G��0��G��8��D��N��@��W��G��7ӽD��I��E��CӽD��I><��W��=��D��7��R��;��G��0��G��8��D��N��@��W��GӽD��I��EӽD��I���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<20޽B��R��0��W��B��>��=�M��>��I��?��;��8щQ��@Ԛ<���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R�
�/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=)'��;��8��0��W��B��=��D��>щQ��D��S�D��A&$��;��8��0��W��B��=ӗ>щQ��D��S�D��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=JHԓ4��5��9��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T�!����!����";9ԓ4��5��D��0ԓ4��B��=����S��R��J�>��E��;��8��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=#!��;��8��>��E��6��Q��W��B��=��@��N ��;��>��E��6��Q��W��B��=��@��N���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=��R����8��G��8��>��=��>��P��R����8��G��8��>��=��>��P���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=VTԓ4��5��9��D��0ԓ4��B��=��O��<��G��BDŽP�B�@��;��8��>׽R��G��6��S��T�!����!����"DBԓ4��5��D��0ԓ4��B��=��O��<��G��BDŽP��B��;��8��>׽R��G��6��S��T��X���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,DB��G׫;��@��2��>��H��8��G�K��0��G��8��W��B��=��F��?��H��G��,��H��,���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=\Z��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>��TYW��7��W��C����Ȼ����������2��2��H��G��/��C��N�K��0��8��W��=ߌ,��3��=��G��Gև9��>���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=����H��$��,��G��G��8��8��W��-��B��G��H��H��H��$��,��G��G��8��8��W��-��B��G��H��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��H��D��E��T��L��B��L��=��,��K����H��$��,��G��G��8��8��W��-��B��G��H��H��$��,��G��G��8��8��W��-��B��G��H��Q��H��$��,��G��G��8��8��W��-��B��G��H��D��E��T��L��B��L��,��K���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=86��G��,��D��N��G��8��0��6��W��B��=��C��=��Pֈ;̛<��A��T/-��G��,��D��N��G��8��5��W��B��=��C��Pֈ;̛<��A���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��T��C��R�J��G��<��8��Q��G��8��O��6��0��G��6��U��<��8��Gڶ>��S��=86��C�J��G��<��8��Q��G��8��O��6��0��G��6��<��8��G��S��=���G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=SQDŽP�B��;��8��>׽R��G��>��G��8��;��?��Sԓ4��5��9��D��0ԓ4��B��=��R��/��A��E��A��TMKDŽP�B��;��8��>׽R��G��>��G��8��;
��?��Sԓ4��5��D��0ԓ4��B��=��R��/��A��E��A��G��8ԓ4��B��W��C��=��G��8ԓ4��B��W��C��=DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7DB��S��9��I��/��C��D��<��8�J��Gԓ4��G��W��B��-��R��N��=��
-��K��F�7�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��20����
����Ͳ��4ʉ5��/��%��D�H��G��A��A��O��C��4ˉ5��%��D�H��A��A��O��C�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5�� ����
����ʉ5ޚT��D��G��@��K����5��D��@��K�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��&$����
����ۚKʉ5��R��G̛<��"����&��ۚK݉5��G̛<��"�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��ʉ5����
������8�,��Tʉ5����,��T�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��20��7��������
����ʉ5ޚT��4��L��/ȈX��<��B��B��7��5��4��L��/ȈX��<��B��B�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��,*����
������H���6��=��>ʉ5��B��-��A��B#!����H���6��=��>ʉ5��B��-��A��B�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��;9����
����ۚK��4ʉ5��G��8��O��E��>έ;��L�S��DʡH��9�;,*��ۚK��4ʉ5��G��O��E��>٭;�S��DʡH��9�;�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��#!��@����
������>ʉ5��D��S�D��A��@����>ʉ5��D��S�D��A�����
����ʉ5��	��ʉ5��ʉ5����
������@��Kʉ5����@��K����
����ʉ5��	��ʉ5��&$������$��6����
������6ʉ5��@Ԛ<������$��6����6ʉ5��@Ԛ<�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6��@��K��-��;��@��K��-��;�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��653��.��H�B��@��M��6��4��A��6�O��I��0щQ��U��P��.��T/-��.��H�B��@��M��6��4��A��6��I��0щQ��U��P��.�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6><��.��H�B��@��M��6��4��A��6�O��H��A��V��T��J��D��8��D��A��P20��.��H�B��@��M��6��4��A��6��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6)'��.��4�9�B�3��I��6�O��F��U��P��U��T#!��.��4�9�B�3��I��6��F��U��P��U�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6A?ڤ5��5��D��>��.��1�B��@��D��4��A��=������
��@��6�O��G��;��P20ܤ5��D��>��.��1�B��@��4��A��=����@��6��G��;��P�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6GE��.��J��S��=��H�B��@��D��H��4��A��D��A��P��;��0��T��?��6��T��)����!,*��.��S��H�B��@��H��4��A��A��;��T��6��T��)�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��620��.�B��K��M��4��A��H��A��V��T��J��D��8��D��A��P)'��.�B��K��M��4��A��H��A��V��T��D��8��A�,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��D��4��A��,��H�B��5��4��A��6�O��U��P��.��T)'��D��4��A��,��H�B��5��4��A��6��U��P��.,*��,�B��A��@��D��6��4��A��E��5��4��A��6�O&$��,�B��A��@��6��4��A��E��5��4��A��6/-��.��H�B��@��D��4��A��=��6�O��G��U��P��9��T#!��.��H�B��@��4��A��S��G��U��P��9���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��@��G��.��M��T��A�/��B��@��G��.��M��T��Q��8ޚT��N��G��K��T��O��T,*��@��G��6��A�/��@��G��6��Q��8��+��K��T��O���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M)'��@��G��@��M��T�/��-��5��6��P��9�?ַ;#!��@��G��@��M�/��-��5��6��P��9��?���@��G��M��T	��@��G��M����@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��T��C�3��G�3��G��9ܞN��T��T��O��C�3��G��9��8��Iַ;��@��G��.��M��T��G��@��=��@��G��M��T��.��@��M��Tki��@��G��6��G��=��@��G��M��.��@��M��C�3��G�3��G��9��N��T��C�3��G��9��I��@��G��6��G��=��@��G��M��.��@��M��@��G��M��T	��@��G��M86��@��G��.�4��@ϚL��4��M��T�;��M�4߹-��W��Hԓ6��Iַ;&$��@��G��.��@��4��M��6߹-��W��Hԓ6��I���@��G��M��T	��@��G��M&$��@��G��.��M��T߹-�5��T��O��O��Iַ;��@��G��6߹-�5��T��O��I��@��G��M��T	��@��G��M����.��@��M��T��.��M��T��@��M��TܞN��D��>��.��M��T��E��=��.��M��T��=��.��M��T�IϪJ��1��.��M��@��G��.��@��M��T��D��C�3��G��9��8��Iַ;\Z��.��@��M��6��@��M�N��>��6��E��=��6��=��6�IϪJ��1��.��M��@��G��.��@��M��D��C�3��G��9��I���@��G��M��T	��@��G��M&$��.��M��@��G��M��T�J��-��U��@ؙD��T#!��.��M��@��G��M�J��-��U��@ؙD��T��@��G��M��T	��@��G��M)'��'��=��.��@��G��M��T��I��B��.��<��B��B#!��'��=��.��@��G��M��I��B��<��B��B���@��G��M��T	��@��G��M\Z��.��M��@��G��M��T��.��M��T��.��@��M��T��@��M��T��E��M��T��=��.��M��T��C�3��G��9��8��Iַ;><��.��M��@��G��M��6��.��@��M��@��M��E��M��=��6��C�3��G��9��I��@��G��M��T	��@��G��M/-��@��G��=��@��G��.��M��T��=��.��M��T�I��G��@ ��@��G��=��@��G��6��=��6�I��G���@��G��M��T	��@��G��M><��VܞN��T��>�� ���� ��B�IɤU��1��.��@��G��M��T��C�3��G��9/-��V��N��>��B�IɤU��1��.��@��G��M��C�3��G��9��@��G��M��T	��@��G��MDB��M��U��@��G��@��M��T��@��M��T��M��T��M��U��,��H��P��5ѳBʈF��P��?53��M��@��G��@��M��@��M��M��M��,��H��P��5ѳBʈF��P��?���@��G��M��T	��@��G��M;9��@��G��.��M��T��,��;��M��T��7��3��;��E��=��5��7��T��Iַ;)'��@��G��6��,��;��M��7��;��E��5��7��T��I��@��G��M��T	��@��G��M53��@��G��.��M��Tַ;��@��G��.��M��T��D��,��B��Pַ;Υ6&$��@��G��6ַ;��@��G��6��D��,��Pַ;Υ6c������I��6�;��0ڳQ	��+��0ڳQ ��I��6�;ٟ@�9ٟ@��0��A��@Ԛ<��+��9��0��A��@Ԛ<K������I��6�;��0ڳQ	��+��0ڳQ��I��6�;��-��N	��+��-��N�������I��6�;��0ڳQ	��+��0ڳQ20��I��6�;��0ʭBќ:��-��W��I��6�;��I��6��>��S��2&$��+��0ʭBќ:��-��W��I��6��I��6��>��So������I��6�;��0ڳQ	��+��0ڳQ&$��U��I��I��6�;��-��N�1��D��@��@��@��U��I��+��-��N�1ځD��@�������I��6�;��0ڳQ	
��+��0ڳQSQ��������I��6�;��0��9��6�W��I��-��:��P��U��PޜF��T��I—P��R��M��T��I��6ޜF��6JH��������+��0��9��6�W��I��-��:��P��U��PޜF��T��I��R��M��T��I��6ޜF��6�������I��6�;��0ڳQ	��+��0ڳQA?��Q��2�?��E��C��=��E��@��.��=��9�Q��C��B��9�Q��C��ͦ(����!)'��Q��2�?��E��C��=��@ƋQ��C��BƋQ��C��i������I��6�;��0ڳQ	��+��0ڳQ ��.��I��W��I��6�;��8�T��A��B��.��I��W��+��8�T��A��B�������I��6�;��0ڳQ	��+��0ڳQ86��I��6�;��6��U��=�9��=��>��C�<ʡH����6��I��H�<��T&$��+��6��9��>��C�<ʡH����6��I��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;GE��<��M��N��L��;��;��T��B��T��4��B��T��/��R��6��G��U��K��P��9��PگD��T><��<��M��N��L��;��;��B��4��B��T��/��R��6��G��U��K��9��PگD��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;53ä=��F��9��E��N��L��<��M��N��L��M��T��M��=��E��P��>,*ä=��F��B��N��L��<��M��N��L��M��T��M��E��P���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;����.��M�@��D��>��3��PϪJ��B��E҄J�J��9��R��>�9ֈD��C��S��W��9ٟ@��1��9��2��D��>��9��E��<��M��N��L��A��M��7��S�9��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.��T����.��M�@��D��>��3��PϪJ��B��EԄJ��9��R��>��D��C��S��W�@��1��9��2��D��>��B��<��M��N��L��A��M��S��=�>��D��>��9��@��S��6��;��,��D��P��>��=��/��U��P��.���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;YW��J��9��E��<��M��N��L����5��?��J����7��7��E��B��=����H��Q��2��8����@dž9��V��T��P��HSQ��J��B��<��M��N��L����5��?��J����7��7��B��=����H��Q��2��8����@dž9��V��T��P��H���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;JH��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H�<��TGE��D��9��>��6��E��>��<��M��N��LëO��8��2޽B��4��;щQʡHб��6��T��H��T���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;#!��9��EʕV��<��M��N��LʕV��6��@Ԛ< ��BʕV��<��M��N��LʕV��6��@Ԛ<���<��M��N��L��6��;��<��M��N��L��6��;86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A86��>��I��6��>��<��M��N��L��6��;��,��>��6��;��D��P�D��A��<��M��N��L��6��;��<��M��N��L��6��;PN��N��A��=��<��M��N��L��;��;��T�Rڶ>��S��E��>��C��=��,��B��/��7Ȼ;��T��=��.��LGE��N��A��=��<��M��N��L��;��;��T�R��S��E��>��C��,��B��/��7Ȼ;��T��=��L�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G53��>��W��5��CȥW��G��8��E��<����=��?��N��;†M�8��T)'��>��W��5��CȥW��G��8��E��<����†M��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��Ghf����;��>����>��WȥW��,��:��K��>��;����=��?��N��;����7��=��?��N��;��G����T��T��T������
-�� ����!FD����;��>����>��WȥW��,��:��K��>��;��������7����G����T��T��T�+�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>��W��5��C����WȥW��G��8��E��<��=��?��N��;����T��T��T��G�8̛<86��>��W��5��C����WȥW��G��8��E��<������T��T��T��G��8�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;20��ȥW��>��W��2��G�/��I֣.ŞG��9�/��;����7�;�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>����>��Q��R��@��8��S֗T��7��ȥW��@��@��@/-����>����>��Q��R��@��8��S֗T��7��ȥW��@��@�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G20����>��ȥW��S��8��D��0��;����T����=��?��N��;)'����>��ȥW��S��8��D��0��;����T���������ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G,*��A��>����Q��5��=��Qڶ>��S��ȥW��@��@��@&$��A��>����Q��5��=��Q��S��ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G#!��ȥW��ȥW��K��ȥW��,��:ĝ�� ��ȥW��ȥW��K��ȥW��,��:؝�����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��GDB��>����@��G��W�C����;��9��Q��6��6��B��W����4����ȥW��@��@��@><��>����@��G��W�C����;��9��Q��6��B��W����4����ȥW��@��@����ȥW��>����1ڶ>��S��G�ȥW��>����1��S��G_]��N��9��U��L��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T����=��?��N��;��T����������!DB��N��U��=��>˾3��ȥW��>����G��/��N��Iǡ6����T��T��T������T��K�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�=��D�J��>��?��=��D�J��>��?��=}	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=�J��?��,��=�J��?��,��=�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
-��H��0��6��4��T��	��?�J�=	��?�J�=��D�J��>��?��=��GĊA��>��T��D�J��>��?��=��GĊA��>�	��?�J�=	��?�J�=/-��D��9��D��D�G��?��>�J��>��,��N��D��S�D��A#!��9��G��?��>�J��>�=��D��S�D��A	��?�J�=	��?�J�=��?��E�J׍Q��D��G��@��K��?��E�J׍Q��D��@��K�	��?�J�=	��?�J�=��D��/��F��;	��D��F��;	��?�J�=	��?�J�=,*��D�J��>��?��=��E��?��N��K��L��F��9��@��K)'��D�J��>��?��=��E��?��N��K��L��F��9��@�	��?�J�=	��?�J�=��?��=��E�J�=׍Q��P��B��6��?��=��E�J�=׍Q��P��B	��?�J�=	��?�J�=����;��?��1��K��E�J��>�=׍Q��C��P��D��C��K��9��K��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B�9��K��J��K��>��N�9͝,ڪ3��.��WȻ��B�D��E��A¶7ģC��:��Q����;��?��1��K��E�J��>�=׍Q��C��P��D��C��9��>ٟ@���9��@��9��W��>��4��R��/ҾW��B��1��.�O��>��N����B��8��J��>��N�9Ν,��.��WȻ��B�D��E��NģC��:��Q�	��?�J�=	��?�J�=20����?����>��?�J��>��,��N��1��6��6��=��=��@Ԛ<)'����?����>��?�J��>�=��1��6��=��@Ԛ<	��?�J�=	��?�J�=��D�J��>��?��=��@��K��D�J��>��?��=��@��K�	��?�J�=	��?�J�=><��D�J��>��?��=��4��F��S��CܞN��/����O��������J��-��0��E/-��D�J��>��?��=��4��F��S�N����O����J��7��E	��?�J�=	��?�J�=��?�J�=��4Н?��A��3��A��T��?�J�=��4��A��A�	��?�J�=	��?�J�=)'��V��H��D�J��>��4�=��5��D�3Ȼ;��>��T ��V��D��>��4�=��5��D�3Ȼ;��>	��?�J�=	��?�J�= ��?��E�J�=׍Q��F��K��	��A��B ��?��E�J�=׍Q��F��K��	��A��B�	��?�J�=	��?�J�=��?�J�=��Uа.��T��?�J�=��*	��?�J�=	��?�J�=)'�J��>��?�=ʡH۩R��V��-��T��.��6��.��T&$�J��>��?�=ʡH۩R��V��-��T��.��6��.�	��?�J�=	��?�J�=;9��?��E�J�=׍Q��7��E��7��0��	��NʡH��
-��H��0��6��4��T��53��?��E�J�=׍Q��,��0��	��NʡH��
5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��9��D��B��B��3ҾW��1��9��6��5��9��D��:��Q��T��C��2ʶU��>��3�.ٟ@��6ǽ=��G��@Ԛ<A?��D��B��B��3ҾW��1��9��6��5ՔD��Q��C��2ʶU��>��3ٟ@��6��G��@Ԛ<�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53�1��TН?��>��/��3��>��N��D��3��>��2��H��T��C��.��:)'�1��?��/��3��N��D��3��>��2��H��C��.��:&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GDB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��ADB��5��3��>��R��D��>��B�7��H��L��T��D��>��B�7��:��L��G��D��S�D��A�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��V��>��N��Dͯ?ڹ3�F��1ȇN��;�9��F��G�B)'��V��>��N��Dͯ?ڹ3�F��1ׇN�9��F��G�B&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G,*��5�D��Bڹ3�G��>��<��9�7��>�?��L��S�:)'��5�D��Bڹ3�G��>��<��9�7��>��F��S�:�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G20��;�@�7��:��T��C��U��D��Tڹ3��>��NщQ��A�7��B#!��;�7��:��C��U��Dڹ3��>��NщQ��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��Gki��R��V��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B��Tec��/��>��N��Dڹ3��2��į?��1��T�9��Fܫ7�M��N��6��K��9��D��,��K��6�9��.��1��R��3�R��F����B��O��B�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G53��E��E��O��3��5��B��5��Iٟ@�7��A��:��5��Gς1��6��T&$��E��>��3��B��5��@�7��A��:��5��G��+&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GSQ��6��3��P��7��S��4��D��T�9��I��8��7��1�Dڹ3��2��:��T��C��RٍB��KЅJ��C��G��>��6DB��6��3��P��7��S��C��T��8��7��1�Dڹ3��2��:��C��RٍB��KЅJ��C��>��6�&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��GA?��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��J��D��8��D��A��P86��5�B��J��H����:ɚ��K�7��3��G��H��A��V��T��D��8��A&$��5��D��>��B�7�Dڹ3��2��:��T��C��G#!��5��D��>��B�7�Dڹ3��2��:��C��G&$��>��>��8��R��V��G��Bڹ3��;��2��F��5&$��>��>��8��R��V��G��Bڹ3��;��2��F��5�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��I��N��=��J��T)'يR��2ŞيR��2Ş��8��J��-��I��=��J��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8_]��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��>��G��@��K\Z��>��7��JЁH��?��ʡH��W��O��U��A��7��J��1��H����N����=���F���F��HيR��2��8��G��@��K�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8zx��(����"���������
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;��#_]�
-��U��A��7��J��1��H�
-��N����=���F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;��#�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8><يR��J��TيR��8��T��يR��<��J����N��	��=يR��J��-����8��T;9يR��J��TيR��8��T��يR��<����N��	��=يR��J��-����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��886يR��2��A��8��>يR��2��A��8��>��8��J��-��W��N����8��T,*يR��2ŞيR��2Ş��8��J��-��W��N����8��T�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8VT�
-��U��A��7��J��1��H����=���F���F��HيR��2��8��>�1��0�7����A��@��H۰M�3��AMK�
-��U��A��7��J��1��H����=���F���F��HيR��2��8�1��0�7����@��H�3��A�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8;9يR��2��8��>��9��K��A��8��D��6��P��>��JщQ��N��.��6��@Ԛ<53يR��2��8��9��K��A��8��D��P��>��JщQ��N��.��6��@Ԛ<�����يR��2��8����يR��2��8DB��O��H��D��-��6��J����=���F��HيR��2��8��>��DН?��>��Q��TيR��453��O��D��6��J����=���F��HيR��2��8��D��?��QيR��4����يR��2��8����يR��2��8b`����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��>��D��A��7��J��U��A��7��<��J��:��B��;\Z����U��A��7��J��1��H����N����=б�F���F��HيR��2��8��D��A��7��J��U��A��7��<��:��B��;���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��/��U��P��G��,��N��K��Q��M��/��U��P��G��,��K��Q��M���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��/�J��P��C��9�8��?��U��P��T��/�J��P��9�8��?��U��P��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDA?��/��/��P��/��/��P��O—P��=��-��/��/��P��C�?��K��P��/��/��Pĩ8><��/��/��P��/��/��P��O��=��-��/��/��P��C�?��K��P��/��/��Pĩ8���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD><��N��R��9��K��U��P��S��>��9��S�9Ԛ<��/��D��9��D��R��K��@Ԛ<53��N��R��9��K��U��P��S��>��9��S��1��/��9��R��K��@Ԛ<���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD20��5��9��P��/ַ;��/��P��/��P��A��/��P��?��P��F��7,*��5��9��P��/ַ;��/��P��/��P��/��P��?��P��F���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕDPN��U��P�?İU��H��P��.��F��-��S��5��1��S�S��A��P��K�8��5��G��6�����)��ʪJH��U��P�?İU��H��1��F��-��S��5��1��S�S��A��P��K�8��G��6�����)��ʪ���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD ��U��PʡH��9��8��C��C��H��/��T��U��P��9��C��C��H��/��T���U��P��/ڶ>��D��U��P��/ŕD��/��P��P��Q��A��P��,��9��P��/��P��P��A��P��,��9��P��U��P��/ڶ>��D��U��P��/ŕD��PʰD��/��Fַ;��PʰD��/��1�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��ܷT��1��W��>��/��>��/��C��SܷT��1��W��>��/��U��SܷT��1��W��>��/ܷT��1��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7��(����%����!����"�~��U��W��>��/��>��/��C��S��U��W��>��/��U��S��U��W��>��/��U��W��>��/��>�8��M����6��@��6��6��>ќ:��0��F��6�<��G��>��2��6��7���ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WGE��>ܷT��1��8��W��>��/��2��6��S��C��S��EܷT��1��8��W��>��-��/��7��B��6A?��>��U��8��W��>��/��2��6��S��C��S��E��U��8��W��>��-��/��7��B��6�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��WA?��6��D��Q��6��6��NیV��O��H��2ܷT��1��8��W��/��Q��6��6��;��6��=;9��6��Q��6��6��NیV��O��H��2��U��8��W��/��Q��6��6��;��6��=�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W��8ܷT��1��O��W��=��;��8��U��O��W��=��;�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?��U��8��W��@ܷT��1��8��W	��U��8��W ܷT��1��W��F��M��>Л6��;��@��K��U��W��F��>Л6��;��@�ܷT��1��8��W	��U��8��WܷT��1��8��W��@��?
��U��8��W��@ܷT��1��8��W	��U��8��W20��A��8��9�Q��EܷT��1��G��4��W��E��>��F��W��A��B)'��A��8ƋQ��E��U��G��4��W��E��>��F��A��B���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XMK��:��?��:��?��L��I��M��W��#������D��E��=��X��<��F��#����#��%Ӳ&��Ӳ&��;9��:��?��:��?��L��M��W��#������D��E��=��XѶ<��#��#�Χ���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X ��W��R��:��?�9��Iʉ5��X��@Ԛ<��W��R��:��?��Iʉ5��X��@Ԛ<���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��XVT��9��W��I��>��:��?щQ��V��4��6��V��6��#����6��#��%��6��#��
��6��$����6��#�8���8GE��9��W��I��>��:��?щQ��V��6��V��6��#��6��#��6��#��6����6��#�8���8���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X�9��Wʉ5��X��@��N�9��Wʉ5��X��@��N���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X><��W��R��:��?�9��I��G��>ܤK��V��#����%ѾC��H��T��L��6��L��T53��W��R��:��?��I��G��>ܤK��V��#����%��5��L��6��L��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X;9��<��W��1��/��>��:��?б�����9��WڶU��5���P����R��T53��<��W��1��/��>��:��?���9��WڶU��5���P����R��T���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X53��E��W��N��6��=��A��9��S��:��?�9��I��>��<��G��V��620��E��W��N��6��=��A��9��S��:��?��I��>��<��G��V��6���:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?�9��>��WΚI��5��D��X��K��8��WщQ��#��
��@��@��@/-��:��?̖>��WΚI��5��D��X��K��8��WщQ��#��@��@��:��?�9��WΚI��=��X��:��?�9��WΚI��=��X86��:��?��Gʉ5����B��W��/��U��X��7�A��E��3��D��#����&&$��:��?��G����B��W��/��U��X�A��E�����D��,��?��R��F��D��,��?��R��F;9��,��?��R��F��Q��U��B��D��A��P��;��0��T��?��6��T��)����!&$��,��?��R��F��Q��U��A��;��T��6��T��)��D��,��?��R��F��D��,��?��R��F/-��?��,��F��R��>��,��6��2ɀ?��E�B��P��2��2��>)'��8��F��R��>��,��6��2ɀ?��E�B��P��2��2���D��,��?��R��F��D��,��?��R��FMK��D��=��D��3��Dٟ@��F��R��?��,��1��@��?��>��1�9��Kٟ@�9ٟ@�-��4��,��@Ԛ<><��D��D��Rٟ@��F��R��8��1��@��?��>��1�9��K��9�-��4��,��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��>��BϨH��,��@��?��,��6��D��P�D��A&$��R��F��>��B؋8��@��8��6��D��P�D��A���D��,��?��R��F��D��,��?��R��F><��D��,��?��R��F��?ϨH��.��?��R��J��V��9��S��6��>��EщQ��@Ԛ<53��D��,�.��F��?��H�.��J��V��9��S��6��>��EщQ��@Ԛ<��D��,��?��R��F��D��,��?��R��F,*��R��F��,��?��H��A��V��T��J��D��8��D��A��P ��R��8��?��H��A��V��T��D��8��A���D��,��?��R��F��D��,��?��R��FGE��D��3��D����R�I��F��,��2��?��.����@��PیV��D��H��A��V��D��A��P��T><��D��R����R�I��F��,��2��?��.����@��PیV��D��H��A��V��A��T��D��,��?��R��F��D��,��?��R��F20��A��9��=��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<,*��9��R��F��,��L��?��.��,��K��,��6щQ��@Ԛ<���D��,��?��R��F��D��,��?��R��FA?��D��R��F��,íB��?��2��D��9��7��I��6��.��2��9��DܤK��4��@��@��@86��D��R��F��,íB��?��2��D��9��7��I��6��.��2�D��4��@��@��D��,��?��R��F��D��,��?��R��F��,��?��R��F��U��P��U��T��,�.��F��U��P��U���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I><��S��9��C��X��7ֈ?�N��X��Iַ;��W��N��I��,ڶ>��T��0��N��6�Q20��S��9��Xֈ?�N��X��I��W��N��,ڶ>��T��0��N��6�Q���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I)'��9��C��Xֈ?�N��X��Iַ;��B�<ނB��<��T#!��9��Xֈ?�N��X��I��B�<ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��:��-��9��C��Xֈ?�N��X��Iַ;��:��-��9��Xֈ?�N��X��I���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��Iec��/��S��B��X��D��/��S��:��X��D��<Υ6��1یV��0��/��S��B��X��U��B��O��B��E��B��V��B��,��B��-ނB��<��TMK��/��S��B��D��/��S�:��D��<Υ6��V��0��/��S��B��UüO��E��B��V��,��-ނB��<��T���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I,*��9��C��Xֈ?�N��X��Iַ;��M��/��TۓR��7��K ��9��Xֈ?�N��X��I��M��TۓR��7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I#!��9��C��FۨV��T��/��9��7ʡHб��6��9��F��8��/��9��7ʡHб��6���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I ��9��C��Xֈ?�N��X��Iַ;�7��4��9��Xֈ?�N��X��I�7���9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I&$��9��C��Xֈ?�N��X��Iַ;��/��N��7��2��9��Xֈ?�N��X��I��/��NĚ7��9��C��Xֈ?�N��X��Iַ;��9��Xֈ?�N��X��I��9��C��4ֈ?��Iַ;��6��B��T��9��4ֈ?��I��B��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OPN��D��D��7��O��C��-��S�O֊2��>��S��DɵO��6��8�9��H��A��V��T��J��D��8��D��A��PA?��D��D��7��C��-��S�O��>��S��DɵO��6��8�9��H��A��V��T��D��8��A���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�OA?��7��8��B��Bر/��D��2ѺKٟ@��6��T��C��M��U�<��F������!����"20��7��8��B��Bر/��D��2ѺK��5��T��C��M��U�<��F��	���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O ��5��7��:��C��D�O֊2ѺK��@Ԛ<��5��7��:��C��D�OѺK��@Ԛ<���7��B�O֊2	
��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODBкB��9��N��7��:��C�O֊2��>ٟ@��6߇;��1��G��3��F��7;Q��6��7;Q��T><кB��9��N��7��:��C�O��>��5߇;��1��G��3��F��7;Q��6��7;Q��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�ODB��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N�3��>��M><��7��O��B��6��2��>��R��@��2��A��5��7;QԚ<��7;Q��T��7��N��>���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O)'��V��D��D��7��B��C��9��2��>��/��6��7��T)'��V��D��D��7��B��C��9��2��>��/��6��7��T���7��B�O֊2	��7��B�O)'��1��7��>��B��@��2��A��6�O��D��P�D��A&$��1��7��>��B��@��2��A��6��D��P�D��A��7��B�O֊2	��7��B�O&$��V��7�J��R��1��:��2��R��<��@��@��@#!��V��7�J��R��1��:��2��R��<��@��@���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M;9��2��8��=��S��0��M��2��8��G��N��0��6��W��,��6��4����6��T20ſ2��=��S��0��Mſ2��G��N��0��6��W��,��6��4����6���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��M/-��2��8��>��M��2��8��1��S��6��MۓR��9��T��,��K&$ƿ2��>��Mƿ2��1��5��MۓR��9��T��,��K���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��A��2��8��=χ7��1��S��6��M��N��1��SщQχ7��=�R��J,*��Aſ2��=χ7��1��5��M��N��1��SщQχ7�R��J���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M��S��6��M��E��S��2��8��@Ԛ<��5��M��E��Sſ2��@Ԛ<���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8ٟ@��6�Q��>��D��1��.��S��6��M��>��=����6��P����6��T20ſ2��5�Q��>��D��1��.��5��M��>��=����6��P����6��2��8��>��1��S��6��Mſ2��>��1��5��MYW��D��NԚ<��2��8��1��S��6��M��1�H��3��Vٟ@��2��8��1��D��A��P��;��0��T��?��6��T��)����!86��Nſ2��1��5��M��1�H��3��Vٟ@ſ2��1��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M><��2��8��1��S��6��M��>��1�H��3��PیV��D��H��A��V��D��A��P��T20ſ2��1��5��M��>��1�H��3��PیV��D��H��A��V��A��T��2��8��>��1��S��6��Mſ2��>��1��5��M53��M��Vٟ@��2��8��D��A��P��;��0��T��?��6��T��)����!��Mٟ@ſ2��A��;��T��6��T��)���2��8��>��1��S��6��Mſ2��>��1��5��M;9��K��S��6��MԚ<��2��8��D��A��P��;��0��T��?��6��T��)����!#!��K��5��MԚ<ſ2��A��;��T��6��T��)��2��8��>��1��S��6��Mſ2��>��1��5��M#!��A��2��8��1��S��6��M��D��S�D��A��Aƿ2��1��5��M��D��S�D��A�����—P��J��>��R��JЍ—P��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��JA?��V��N��N��,̥6��:��D��9��S��J��6�O��Q��Nέ;��L�S��DʡH��9�;86��V��N��,̥6��:��D��9��S��J��6��Q��N٭;�S��DʡH��9�;�����—P��J��>��R��JЍ—P��J��>��R��J/-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����—P��J��>��R��JЍ—P��J��>��R��J53����DȂ3��@��>��Q—P��J—P��J��>��R��C��R��A�7��B/-����DȂ3��@��>��Q—P��J—P��J��>��R��C��R��+�����—P��J��>��R��JЍ—P��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����—P��J��>��R��JЍ—P��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��D��J��A�7��B/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��D��J��+�����—P��J��>��R��JЍ—P��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��
J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����—P��J��>��R��JЍ—P��J��>��R��J��G��7��T��Q��-��G��7��T��Q��-�����—P��J��>��R��JЍ—P��J��>��R��J)'����Q—P��J��R��6�9��:��A��D��S�D��A#!����Q—P��J��R��9��A��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��JDB�����������A��2ûR�9��?��A��>��;��B��TûR�9��?��A��>��5��653��T�A��2ûR�9��?��A��>��;��BûR�9��?��A��>��5��6�����—P��J��>��R��JЍ—P��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��J><������6��J��D��9��S��J��6ȻW̑-�9ٟ@—P��J��>��R��J��@Ԛ<86����6��J��D��9��S��J��6ȻW̑- @—P��J��>��R��J��@Ԛ<�����—P��J��>��R��JЍ—P��J��>��R��J/-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����—P��J��>��R��JЍ—P��J��>��R��JMK����6��6�D��J��Q—P��L��>��J��R��J��J��Q—P��L��@��B��J��9��Uڤ5��5��@Ԛ<GE����6��6�D��J��Q—P��L��>��J��R��J��J��Q—P��L��@��B��J��Uܤ5��@Ԛ<�����—P��J��>��R��JЍ—P��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����—P��J��>��R��JЍ—P��J��>��R��J����4��B��4յG��W��G��X��F����4��B��4��W��X�����—P��J��>��R��JЍ—P��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����—P��J��>��R��JЍ—P��J��>��R��J,*������R��:��D��>ڝJ��R��K��2��D��G��@��K&$Ѝ��R��:��D��>ڝJ��R��K��2��D��@��K�����—P��J��>��R��JЍ—P��J��>��R��J)'����Q—P��J��R��6�9��:��A��D��S�D��A#!����Q—P��J��R��9��A��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��J;9��5��$��,��U��,������6��D��>��:��5��JЂJ��9��W��W����C/-��5��$��,��,Ѝ��6��D��>��:��5��JЂJ��9āR��A�����—P��J��>��R��JЍ—P��J��>��R��J,*��M��:��������D��>��J��6߻W��D��S�D��A,*��M��:��������D��>��J��6߻W��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��J�����A��F��8�,��TЍ�A��F��,��T�����—P��J��>��R��JЍ—P��J��>��R��J/-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ</-����R߻W—PۃJ��>��J��R��6߻W��,ƛK��9��@Ԛ<����—P��J��>��R��JЍ—P��J��>��R��J	����4��B	����4��B�����—P��J��>��R��JЍ—P��J��>��R��Jqo��=��>����Q��H��,��<��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R��N��7��>�8��0��9��D��S��PԮK߀3hf��=��>����Q��H��,��5�Wį?��;��>—P��J��B—PۃJ��D��9��S�I�F��>��J��6��R�7��>�8��0��9��D��S��PٮK����—P��J��>��R��JЍ—P��J��>��R��J/-��U��C��U��TʡH��>��/��X��>����>�A��2���� ��U��UʡH��>��/��X����>��2Ѝ�����—P��J��>��R��JЍ—P��J��>��R��JPN������P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6ǽ=��E��X��>��PGEЍ��P��;��>��L��C��D��9��<��?��>��J��9��K��B�D��L��=Ė1��6��E��>��P����—P��J��>��R��JЍ—P��J��>��R��J�����A��F��P��C��<��B��BЍ�A��F��P��<��B��B�����—P��J��>��R��JЍ—P��J��>��R��J)'����Q—P��J��R��6�9��:��A��D��S�D��A#!����Q—P��J��R��9��A��D��S�D��A����—P��J��>��R��JЍ—P��J��>��R��J&$����U��R��:��D��>��J����B����/��T&$����U��R��:��D��>��J����B����/��T���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB���2��<��;��>
��2��<��;��>���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�)'��V����2��P��K��C�4��EȯB��-�;��J��6&$��V����2��P��K��C�4��EȯB��-�;ϜJ���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�20�T��3��=��C��;��D��9��>��:��C��O��-֛7��<��B��B,*�T��=��C��;��D��9��>��:��C��O��-��<��B��B���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�/-ȯB��K��C��;��9��;��L�V��6�����)��ʪ��,*ȯB��K��C��;��;��L�V��6�����)��ʪ�����P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�53��2��<��F��2��D��C��D��:��LܾW��X��F��H��F��N��I��9)'��2��<��,��D��:��LܾW��X��F��H��F��N��I���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��H����D��C��,ȯB��J��P��I��@Ԛ<#!��H����D��C��,ȯB��J��P��I��@Ԛ<���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A,*��>����2��P��:ȯB��K��6��N�K��D��S�D��A���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�#!��P��N��ȯB��>��9��H��-�B�V��6#!��P��N��ȯB��>��9��H��-�B�V��6���P��C��1��1ȯB���P��C��1ȯB�&$��&��D��C��1ȯB��I��9��1��P��I��@Ԛ<#!��&��D��C��1ȯB��I��1��P��I��@Ԛ<��P��C��1��1ȯB���P��C��1ȯB�JH����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;�9��V��;��K��XܤK��$GE����D��P��RȯB��I��H��,��5��6��:��L��I��B��,��I��;��V��;��K��XܤK��$���
��X˩5�R�9��:��
��X˩5ֲ9/-��'��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9,*ȏBҲU��>��R��<��G��I��X��I��C��E��#��CҮJ��B��>ɸ<��I��X��I��C��#߭J���
��X˩5�R�9��:��
��X˩5ֲ9GEݩ5��T��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<DBݩ5��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9GE��
��B߹-�;��:��XܷT��6˩5��J˩5��4��
��B߹-�;��:��XܷT��6˩5��/��7;9��
��B��-��:��XܷT��6��J˩5��4��
��B��-��:��XܷT��6��/��7���
��X˩5�R�9��:��
��X˩5ֲ9��D��B��:��
��>˩5��A��K��B��:��
��>˩5��A��K��
��X˩5�R�9��:��
��X˩5ֲ9��6��T��'��
߹-��X��6˩5��6��T��
߹-��X��6���
��X˩5�R�9��:��
��X˩5ֲ9/-��'��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9A?��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��9��D˩5ƛK��6��@��@��@;9��Uٟ@����5߹-��:��X��D˩5��I��:��X��B��D˩5ƛK��6��@��@���
��X˩5�R�9��:��
��X˩5ֲ9GEݩ5��T��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<DBݩ5��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9GE��
��B��I�;��:��XܷT��6˩5��J˩5��4��
��B��I�;��:��XܷT��6˩5��/��7A?��
��B��I�;��:��XܷT��6��J˩5��4��
��B��I�;��:��XܷT��6��/��7���
��X˩5�R�9��:��
��X˩5ֲ9��D��B��:��
��>˩5��A��K��B��:��
��>˩5��A��K��
��X˩5�R�9��:��
��X˩5ֲ9;9��5˱U̾-��C��3��C��I��Q��:����>��
��:��X��>��6˩5��,��;86��5˱U̾-��C��3��C��I��Q��:����>��
��:��X��>��6��,��;���
��X˩5�R�9��:��
��X˩5ֲ9/-��'��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9\Z��
��B��H��
��6ӻB��O��
߹-��:��XܷT��B��H��
߹-��XܷT��;��W��;����N����=��	�F��J˩5��4YW��
��B��H��
��6��O��
߹-��:��XܷT��B��H��
߹-��XܷT��;��W��;����N����=��	�F��J˩5��4���
��X˩5�R�9��:��
��X˩5ֲ9GEݩ5��T��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<DBݩ5��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9)'��
��>��6˩5��5�W�R�9��:��D��S�D��A ��
��>��6��5�Wֲ9��D��S�D��A���
��X˩5�R�9��:��
��X˩5ֲ9��D��B��:��
��>˩5��A��K��B��:��
��>˩5��A��K��
��X˩5�R�9��:��
��X˩5ֲ9A?б��=��	�F߹-��=��X��
��B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X��
��B��H��F��S��T��T��9��CگD��/���
��X˩5�R�9��:��
��X˩5ֲ9/-��'��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9nl�R��A߹-��:��X��>��
��B��6˩5��1��D��0�;��Hٟ@�R�9��:��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<_]�R��A߹-��:��X��>��
��B��6��1��0��Hٟ@ֲ9��K��B��B��>�5��I��B��E��K�R��F��T��D�>��6��@Ԛ<���
��X˩5�R�9��:��
��X˩5ֲ9GEݩ5��T��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<DBݩ5��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9GE��
��K��B��6��N��E��I��:��X��5�R�9��:˩5��U�I�R��>��:��D��S�D��AA?��
��K��B��6��N��E��I��:��X��5ֲ9˩5��U�I�R��>��:��D��S�D��A���
��X˩5�R�9��:��
��X˩5ֲ9��D��B��:��
��>˩5��A��K��B��:��
��>˩5��A��K��
��X˩5�R�9��:��
��X˩5ֲ9DB߹-��:��XܷT��6��H߹-��:��XܷT��6˩5��Q��'��
��Ѳ��B��6ӻB��O��453߹-��:��XܷT��6߹-��:��XܷT��6��Q��
��Ѳ��B��0��4���
��X˩5�R�9��:��
��X˩5ֲ9/-��'��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<,*��
��ڲ߹-��:��X��>˩5��6�I��:��,��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9/-��0��:��X��6˩5��0��:��X��6˩5�>��4��6��4��T&$��0��:��X��6��0��:��X��6�>��4��6��T���
��X˩5�R�9��:��
��X˩5ֲ9GEݩ5��T��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<DBݩ5��C��
��B��6��:��X��/ݩ5ٟ@��5��U�I��:��
��.��X��>˩5��G��@Ԛ<��
��X˩5�R�9��:��
��X˩5ֲ9A?б��=��	�F߹-��=��X��
��B˩5��H��F��S��T��:��TʡH��?��CگD��/86б��=��	�F߹-��=��X��
��B��H��F��S��T��T��9��CگD��/���
��X˩5�R�9��:��
��X˩5ֲ9��D��B��:��
��>˩5��A��K��B��:��
��>˩5��A��K��
��X˩5�R�9��:��
��X˩5ֲ9\Z��D�R��A��9į?߹-��=��X��>��6˩5��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��AYW��D�R��A��9į?߹-��=��X��>��6��H��0��-��D��E��0��6��EщQ��I��.��6щQ��2��2��D��S�D��A�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��H��E��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��;��W��H��E��K��K��9ݠ.��E��T����N��;��W��H��E��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��K��E��	��9ݠ.��E��T ����N��9��;��W��K��	��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��;��W��K��E��K��9ݠ.��E��T����N��;��W��K��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��E��K��9ݠ.��E��T ����N��F��;��W��E��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��F��;��W��2��T��9��K��9ݠ.��E��T&$����N��F��;��W��2��T��9��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K#!����N��F��;��W��H��K��9ݠ.��E��T ����N��F��;��W��H��K��9��E��T�����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K&$����N��F��;��W��H��E��K��9ݠ.��E��T#!����N��F��;��W��H��E��K��9��E��T����N��F��;��W��H��4��E��K����N��F��;��W��H��4��E��K)'����N��9ݠ.��;��W��H��E��	��9ݠ.��E��T#!����N��9��;��W��H��E��	��9��E��T���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��T��6��7��8��:��T��6��7��Iַ;—P��=��8��-)'��:��T��6��7��8��:��T��6��7��I��=��8��-���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I ��T��1��8��:��C��T��6��7��Iַ;��T��1��8��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��Iܥ6��0��T��6��7ȣ8��Iַ;ܥ6��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��:��0��E��U��P��U��,��I��:��T��6��7��Iַ;#!��:��0��8��P��,��I��:��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��0��T��6��7ȣ8��Iַ;��0��T��6��7��I���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I��:��C��T��6��7��Iַ;��@��?��:��T��6��7��I��@���:��C��T��6��7��Iַ;��:��T��6��7��I 
��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I/-��:��C��T��6��7��Iַ;��:��C��T��6��7��Iַ;��;#!��:��T��6��7��I��:��T��6��7��I��;���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I,*��0��E��U��4��J��8��:��C��T��6��7��Iַ;ܥ6 ��0��8��4��J��:��T��6��7��Iܥ6���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I)'��:��C��T��6��7��Iַ;��M��/��TۓR��7��K��:��T��6��7��I��M��TۓR��7���:��C��T��6��7��Iַ;��:��T��6��7��I ��7��E��U��:��C��T��6��7��Iַ;��7��8��:��T��6��7��I��:��C��T��6��7��Iַ;��:��T��6��7��I����P����P���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B�:��D��>�7��5��.��T�:��D��>�7��5��.��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BMK��%��X��6��Xޡ8��X��S��X��8��X��N��X��.��X��C��X�C��X��F��X��2��X��4��X��CA?��%��X��6��Xޡ8��X��X��8��X��N��X��X��X��X��F��X��2��X��4��X��C���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BSQİF��E��1��;��/��6��4��X۹/��>��O��X۹/��>��TʭB��S��>��OʭB��S��>��T��U��>��6��K53İF��B��/��6��X��>��O��X��>��B��>��O��B��>��U��>��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B��/��4��?����B��O��B��T��/��4��?����B��O��B���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��>��L��?��?��H��F��?����F��T#!��/��4��3��>��L��?��H��F����F��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��BGE˛5��9��/��=��T��4��>��X��?ޡ8��R��V��4��>��E��1��;��6��T��4��4��K��2,*���-��4��>����/��4��>��B��6��T��5��K��2���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��BVT��D�G��:��/��4��X��>��3��?��X��?��F��B��T��F��?ޡ8��H��?��.��:��FʭB��.��4��?��F��6><��G��:��/��4��X��>��3��?��X��F��B��T��Fޡ8��H��.��F����F��6���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��BVT��E��1��;��>��C��6��P��K��H��,��-��X��?��7�1�E��7��0����NʡH����H��0��6��4��T��DB��B��>��C��6��P��K��H��,��-��X����0����NʡH����H��0��6��4��T�����4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B#!��U�/��4��X��>��3��B��?��8�,��T ��U�/��4��X��>��3��B��?��,��T���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B/-��-��4��4��6��M��;����-��>��>��@��W��>��W��>)'��-��5��6��M��;����-��>��>��@��>��W��>���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T53��?��4��1��K��>��F����7��>�>��D��<��(��6հL��3��T���4��X��>��E��1��;
��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B_]��E��1��;��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��(��������!MK��B��1��-��X��?��P��@��4��B��S��?��H��-��M��>ԁ:�F��T��,��;��J��8��L�0��X���4��X��>��E��1��;��4��X��>��B ��E��1��;��>��X��H��MʭB��W��T��B��>��X��H��MʭB��W��4��X��>��E��1��;��4��X��>��B)'��/��4��3��?��>��L��H��J��X�/��E��N��B#!��/��4��3��?��>��L��H��J��X��E��N���4��X��>��E��1��;��4��X��>��B/-��-��X��E��1��;��7߹-��W��D��7Օ��Nծ��H����-��X��B��Օ��Nծ��H����4��X��>��E��1��;��4��X��>��B����3��H��2��4��C��M΄/ǟ9��=��Tޡ8��?������$����������������������ڻ��������������(����$�������������������!����3��H��2��4��C��Mτ/��-�8������$����������������������ڻ��������������(����$�������������������!���4��X��>��E��1��;��4��X��>��B ��E��1��;��/��6��4��D��G��@��K��B��/��6��D��@��K��4��X��>��E��1��;��4��X��>��B,*��/��4��3��?��>��L��H��J��X�/��E�1ʞ:��-)'��/��4��3��?��>��L��H��J��X��E�1ʞ:��-���4��X��>��E��1��;��4��X��>��Bki��E��1��;��M�I��B��>ю2��/��4��A��T��2��3��W��S��;��X��Iю2��Xю2��>ю2��U��A��T��X��I��X�����)����PN��B��M��I��>ю2��/��4��F��2��3��W��S��;��X��2�2ю2��U��F��X��I�����)������4��X��>��E��1��;��4��X��>��B,*��H��E��1��;��>��/��4��H��?��L��B��<��B��B#!��H��B��>��/��4��H��L��B��<��B��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��9��G��O��B��Q��T��2��>����P��V��P��.��5��A��J��>��P)'��9��G��O��B��L��>����P��P�.��J��>��P���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FA?�C��O��W��>��M�>��B��W��A��Q��T��2��9��6��O��8��G��D��S�D��A;9�C��O��W��>��M�>��B��W��A��L��9��6��O��8��G��D��S�D��A���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��9��G�>��B��Q��T��2��>��V��J��7��6��8��T��7��=&$��9��G�>��B��L��>��VќJ��6��8��7��=���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��
-��F��W��L����S��J������$����2����A��B/-��
-��F��W��L����S��J������$����2����A��B���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��N��F��H��F��O��F��O��V��V��A��4��@��K&$��N��F��H��F��O��F��O��V��V��A��4��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86�>��B��Q��T��2��>΂P��F��;�/��U��
��N��5��L��U��ٶ,*�>��B��L��>΂P��F��;�/��U��
��N��5��L��U���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!�>��B��W��B��Q��T��2��>��F��@Ԛ<�>��B��W��B��L��>��F��@Ԛ<���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F#!��W��2��E�>��D��Q��T��2ϩN��F��B��W��E�>��D��LϩN��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��FMK��R��9��G��M��W��W��A��I�>��B��N��=��=�9��=��A��>��M��N��S��9��=��A�7��B86��RךG��W��W��A��I�>��B��N��=��=��>��M��N��S��9��=��+��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F��8��F��5��R��.��U��E��S��2��8��F��=��.��U��E��S���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F;9��D��9��D��I�>��B��W��R��Q��T��2��>��S��Q�U��>��V��@Ԛ<&$��9��I�>��B��W��R��L��S��>��V��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��J��>��R��8��"����������F��K��%��F��J��>��R������F��%��F���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F86��D��S��8��G�>��B��W��A��Q��T��2��>��M��@��?��@��@��@/-��D��S��8��G�>��B��W��A��L��>��M��@��?��@��@��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F53��D��9��6��M��E��K�>��B��Q��T��2��>��V��D��@��@��@&$��D��9��6��E��K�>��B��L��>��V��@��@���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F&$��W��>��V�>��B��W��A��Q��T��2��@Ԛ< ��W��>��V�>��B��W��A��L��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F ֖F��>��P��Mމ6��J��6��J����7��+��Mމ6��J��6��J����7���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F)'��4�>��B��W��A��Q��T��2��>��J��F��@Ԛ< ��4�>��B��W��A��L��>��S��@Ԛ<��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��U�Mӛ?�1��?��7��F��,��7���M�����R��Q#!��U�Mӛ?�1��?���M�����R��Q���W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F20��R��0��W��6�>��BйS��Q��T��2��>��F��D��S�D��A,*��R��0��W��6�>��BйS��L��>��F��D��S�D��A��W��A�>��B��Q��T��2��>��F��W��A�>��B��L��>��F/-��D��9��6��M�>��B��W��A��Q��T��2��D��S�D��A&$��D��9��6�>��B��W��A��L��D��S�D��A���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IDZ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IDZ.��>��4��I��2��I��D��T��0��I��I��D��0��I,*��I��D��T��0��1�,��I��C��D��T��0��I��D��T#!��I��D��T��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I/-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ</-��0��I��9��Q��6��S��=��K��I��8��K��I��:��@Ԛ<���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��IA?��D��T��3��I��5��8��D��T��3��8��I��5��D��T��3��X��5��8��I����?53��D��3��I��5��8��D��3��8��I��5��D��3��X��5��8��I��2���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I&$��I��D��T��P��D��N��0��I��0��I��@Ԛ<��I��D��P��D��0��0��I��@Ԛ<���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I/-��0��I��D��T��Fַ;��8��-����8��T��������!��0��D��1��8��-����8�����I��D��T��0��I��I��D��0��I��0��I��4��@Ԛ<��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��D��0��I��D��0��I��4��D��0��IDZ.��>��4��I����?/-��D��0��I��D��0��I��4��D��0��IDZ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��IDZ.��>��4��@Ԛ<��0��IDZ.��>��4��@Ԛ<��I��D��T��0��I��I��D��0��I20��I��D��T��,�;��0��1�,��I��C��D��T��0��I��D��T)'��I��D��T��,�;��0��1��I��D��0��I��D��T���I��D��T��0��I��I��D��0��IDB��D��T��3��0��I��D��T��3��0��I��4��D��T��3��0��IDZ.��>��4��I����?86��D��3��0��I��D��3��0��I��4��D��3��0��IDZ.��>��4��I��2��I��D��T��0��I
��I��D��0��IDB��,��TܷT��0��I��,��TܷT��0��I��4��,��TܷT��0��IDZ.��>��4��I����?86��,ܷT��0��I��,ܷT��0��I��4��,ܷT��0��IDZ.��>��4��I��2���I��D��T��0��I��I��D��0��I&$��R��I��D��T��N��0��I��0��I��4��@Ԛ<��R��I��D��0��0��I��4��@Ԛ<��I��D��T��0��I��I��D��0��IMK��D��T��3��N��0��I��D��T��3��N��0��I��4��D��T��3��N��0��IDZ.��>��4��I����?/-��D��3��0��D��3��0��4��D��3��0DZ.��>��4��I��2���I��D��T��0��I��I��D��0��I��0��I��5��I����?��0��I��5��I��2��I��D��T��0��I��I��D��0��I��0��IػK��4��@��K��0��IػK��4��@��K���I��D��T��0��I��I��D��0��I��0��I��Iַ;��4��D��G��@��K��0��I��I��4��D��@��K��I��D��T��0��I��I��D��0��I��0��I��D��G��@��K��0��I��D��@��K���I��D��T��0��I��I��D��0��I#!��I��0��IػK��I��0��I��4��I��0��I#!��I��0��IػK��I��0��I��4��I��0��I��I��D��T��0��I��I��D��0��I53��D��T��3��I��D��T��3��Iַ;��D��T��3�O�I��I����?#!��D��3��I��D��3��I��D��3�O��I��2��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��B��6	�L��B��6��L��7ٟ@��8��Lٟ@��8�,*��6—P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8��L��7��@��K	�L��@��K��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8��L��7��6��?	�L��6��?��L��7ٟ@��8��Lٟ@��8�,*��6—P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�,*��6�L��7��8��>ٟ@��H��F��@��F��7��6��>��P)'��6�L��8��>ٟ@��H��F��@��F��7��6��>��P��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�20��A��7�L��7�Hٟ@��8��E��P��;��:��P��O��@��@��@,*��A��7�L�Hٟ@��8��E��P��;��:��P��O��@��@��L��7ٟ@��8��Lٟ@��8�,*��6—P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�—P��,�L��7��?��6��0���?��6��0��L��7ٟ@��8��Lٟ@��8��L��7��@��?�L��@�L��7ٟ@��8��Lٟ@��8�/-�L��7ٟ@��8��A��R��>��:��6��>��N��D��S�D��A,*�Lٟ@��8��A��R��>��:��6��>��N��D��S�D��A��L��7ٟ@��8��Lٟ@��8�,*��6—P��,�L��7ٟ@��8��H��7��@�K��7��@Ԛ<#!��6�ٟ@��8��H��7��@�K��7��@Ԛ<�L��7ٟ@��8��Lٟ@��8�wu�L��7��D��F��6�L��7��B��7�L��7��6��<��6—P��,�L��7��
-�Gٟ@��8��6��7��@��7��5�L��7��8��>ٟ@�;��F��J��>��N��1�S_]�L��D��F��6�L��B��7�L��6Ǥ<���
-�Gٟ@��8��6��7��@��7��5�L��8��>ٟ@�;��F��J��>��N��1�S�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��C��P��I��/��C��/��9��?��T#!�0��=����P��I��/��C��/��9��?��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S86¨0ʽ=��=��Tɾ=��C��6��=��Tɾ=��C��6��A�A��N��T��A��T#!�0��=��T�6��=��T�6��A��A��T��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��W����7��>��7��C��<��B��B)'�0��=��Tɾ=��W����7��>��7��C��<��B��B�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-��=��Tɾ=��>¨0ʽ=ʇX��Qޢ<�Qɾ=��Cݰ?��Q��.&$��=��Tɾ=��>�0ʇX��Qޢ<�Q̾=ݰ?��Q�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��J��6��O��T¨0��A��=��Tɾ=��J��6��K�0ҳ��O��T�0ҳ��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SJH¨0ʽ=��PʇX��D�Q��=��Tɾ=��C��>ΉX˛5¨0��A��/��T��D¨0��A��/�A��4�J53�-ʇX��D�Q��=����>ΉX˛5�0��/��T��D�0��/��A�J�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S)'¨0��A��=��Tɾ=��W�9��L��/͒�A��4��T�0��=��Tɾ=��W��/͒�A��4�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��SSQ¨0ʽ=��P��N��=��Tɾ=��C��@��Eޢ<�Qɾ=��C��D��/��D��/��QİU��4��4����������
/-�-��N��=����@��Eޢ<�Q̾=��D��/��D��9��4�*�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S,*¨0��A��=��Tɾ=��6��=��T��3��O��T��D�A��4&$�0��=��Tɾ=��6��=��T��3��O��T��D��A�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��=��Tɾ=��6����8�,��T�0��=��Tɾ=��6����,��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0��A��=��Tɾ=��S��N��.��W��0�A��T����(����"�0��=��T̗<��.��0�A��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��SDB��S��4��8¨0ʽ=��P��=��Tɾ=��Cϛ)ϛ)�)�)�Q��Tɾ=��C��9��8��K��T/-ФO��8�-��=��ϛ)ϛ)�)�)�Q����9��8��K��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S¨0ʽ=��>��=��Tɾ=��R��@Ԛ<�0��>��=��Tɾ=��R��@Ԛ<¨0��A��=��Tɾ=��S�0��=��Tɾ=��S#!��E��=¨0ʽ=��=��Tɾ=��.��8��?̛<��=�0��=��Tɾ=��.��?�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S20¨0ʽ=��Dٟ@ޢ<�Qɾ=��@��=��Tɾ=��C��P��H��/��4&$�0��Dٟ@ޢ<�Qɾ=��@��=����H��/��4¨0��A��=��Tɾ=��S�0��=��Tɾ=��S;9¨0��A��=��Tɾ=��C��E��S��S��.��PщQ¨0��A�A��4��D�A��T&$�0��=����E��S��*щQ�0��A��D�A��T�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S/-¨0��A��D��>��=��Tɾ=��CѲ/��D��T����(����"�0��D��>��=��Ѳ/��D��T��¨0��A��=��Tɾ=��S�0��=��Tɾ=��S ¨0��A��D��>��=��Tɾ=��C��@��K�0��D��>��=����@��K�¨0��A��=��Tɾ=��S�0��=��Tɾ=��S53¨0��A��=��Tɾ=��C��D�A��4��A�A��T��UʡH��9�A��/,*�0��=��T˾=��D��A��A�A��T��UʡH��9�A��/¨0��A��=��Tɾ=��S�0��=��Tɾ=��S��=��Tɾ=��C��6¨0��A��T��=��T�6�0��T�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/)'��/��/Æ.��J��:��N��L��J��S�1��/��G��B ��/��/Æ.��J��:��N��L��W��/��G�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/��S�1�D��?	��W�D��?�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/53��S�1��/��E��7��0��C��/��7��7����S�1��/����A��B#!��W��/��E����7����W��/����A��Bw	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/�A��B�A��B�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/20��9��J��/��?ſQ��5ߕJ��C��M��C��R��U��RН?��Q��T)'��9��J��/��?ſQ��5ߕJ��C��M��C��R����Q�	��S�1��/��W��/��S�1��/��B��;��A�A��T��W��/��B��A�A��T	��S�1��/��W��/><��B��U��>��9��@��V��W��F�?��Wַ;��;�E��-��S�1Н?��>��A��T20��B��>��9��@��V��W��F�?��Wַ;��;�E��-��W��?��A�	��S�1��/��W��/߹-��CʡH��9��7��Qן9ں-ʡH��9��7��Qן9	��S�1��/��W��/��S�1��/��I�A��6��W��/��I�A��6���E��G��?��>��-��E��G��?��>��-/-��G��?�R�1��4��2��T��N��5��=�7��@��P��:��J#!��G��?�1��4��2��T��5��=��@��P��J��E��G��?��>��-��E��G��?��>��-)'��G��?��>��-��P��L΅/��Bڶ>��S��J��@Ԛ< ��G��?��>��-΅/��B��S��J��@Ԛ<���E��G��?��>��-��E��G��?��>��-��G��?��T�4��G��?��T�4��E��G��?��>��-��E��G��?��>��-��G��W��-��T��G��*���E��G��?��>��-��E��G��?��>��-��E�,��G��?�/��-��"��D��:��E��G��?�/��-��"��D��E��G��?��>��-��E��G��?��>��-20��G��?��>��-��G��6��4��?��9ʉ5��;˫N¶;�P��N��T,*��G��?��>��-��G��4��?��9ʉ5��;ΫN�P��N��T���E��G��?��>��-��E��G��?��>��-86��G��?��>��-��2��2΅/��8��B��?¶7ģC��CщQ��D��P�D��A,*��G��?��>��-΅/��8��B��N�CщQ��D��P�D��A��E��G��?��>��-��E��G��?��>��-20����N��E�,��G��?��>��-��?¶7ʡH��W��B��:ģC��O#!����N��E��G��?��>��-��N��W��:�C���E��G��?��>��-��E��G��?��>��-&$��G��?��>��-��8��G��?��>��-��4��-��2#!��G��?��>��-��G��?��>��-��4��-��2��E��G��?��>��-��E��G��?��>��- ��G��?��>��-�/��.��BʭBѡ8¶;��G��?��>��-�/��BʭBѡ8¶;���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>,*��K�=�9��:ׄ9��?��DϪJ��P��>؞C��@��@��@ 
��=��:ׄ9��?��DϪJ��Pρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>��>؞C��1��9��Tׄ9��?��@Ԛ<ρ>��1��9��Tׄ9��?��@Ԛ<���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>Ư8��Hׄ9��?��>؞C��@��@��@Ư8��Hׄ9��?ρ>��@��@���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>ׄ9��?��=��7ׄ9��?��=��7���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>&$��>؞Cׄ9��?��6��R�1��T��D��P�D��A ρ>ׄ9��?��6�1��T��D��P�D��A���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>DB����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��I��>؞Cб†M�8��6><����=����>��1��9��Tׄ9��?ׄ9��B��9��>��>Ư8��Iρ>б†M��8���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>#!��U—P۴2��>��M��N��,��B��MСG��T��U��P��>��M��N��B��MСG��T���>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>;9��>؞Cׄ9��?��B��:�9ڶ>��S��T��=��O��>��I��,��T�J��@Ԛ<53ρ>ׄ9��?��B��:�9��S��T��=��O��>��I��,��T�J��@Ԛ<��>ׄ9��?ϪJ��J�1��>��>ׄ9��?ϪJ��J�1��>hf��1��	��T��Sׄ9��?��A��J�9��J��O��T��,��Q��
��S��F��>��T��9��P��,��1�R��>؞Cб��:��6�����)��ʪ_]��1��	��T��Sׄ9��?��A˱9��O��T��,��Q��
diff --git a/paddle/trainer/tests/gen_proto_data.py b/paddle/trainer/tests/gen_proto_data.py
deleted file mode 100644
index 8cc6d44673..0000000000
--- a/paddle/trainer/tests/gen_proto_data.py
+++ /dev/null
@@ -1,279 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from cStringIO import StringIO
-
-import paddle.proto.DataFormat_pb2 as DataFormat
-from google.protobuf.internal.encoder import _EncodeVarint
-
-import logging
-import pprint
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates end of a sequence.
-
-cutoff: a list of numbers. If count of a feature is smaller than this,
- it will be ignored.
-if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-i-th column.
-
-return a list of dict for each column
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def encode_varint(v):
-    out = StringIO()
-    _EncodeVarint(out.write, v)
-    return out.getvalue()
-
-
-def write_proto(file, message):
-    s = message.SerializeToString()
-    packed_len = encode_varint(len(s))
-    file.write(packed_len + s)
-
-
-'''
-if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
-existed in dicts[i] will be assigned to id 0.
-if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
-in dicts[i].
-'''
-
-
-def gen_proto_file(input_file, dicts, oov_policy, output_file):
-    def write_sequence(out, sequence):
-        num_features = len(dicts)
-        is_beginning = True
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            sample = DataFormat.DataSample()
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample.id_slots.append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample.id_slots.append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample.id_slots.append(0)
-
-            if patterns:
-                dim = 0
-                vec = sample.vector_slots.add()
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.ids.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.ids.append(dim + 0)
-
-                    dim += len(dicts[i])
-
-            sample.is_beginning = is_beginning
-            is_beginning = False
-            write_proto(out, sample)
-
-    num_features = len(dicts)
-    f = open(input_file, 'rb')
-    out = open(output_file, 'wb')
-
-    header = DataFormat.DataHeader()
-    if patterns:
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
-        slot_def.dim = sum(
-            [len(dicts[i]) for i in xrange(num_original_columns, len(dicts))])
-        logger.info("feature_dim=%s" % slot_def.dim)
-
-    for i in xrange(num_original_columns):
-        slot_def = header.slot_defs.add()
-        slot_def.type = DataFormat.SlotDef.INDEX
-        slot_def.dim = len(dicts[i])
-
-    write_proto(out, header)
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            write_sequence(out, sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-    out.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
-
-
-dict2 = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-if __name__ == '__main__':
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('trainer/tests/train.txt', cutoff, oov_policy)
-    dicts[2] = dict2
-    gen_proto_file('trainer/tests/train.txt', dicts, oov_policy,
-                   'trainer/tests/train_proto.bin')
-    gen_proto_file('trainer/tests/test.txt', dicts, oov_policy,
-                   'trainer/tests/test_proto.bin')
diff --git a/paddle/trainer/tests/mnist.list b/paddle/trainer/tests/mnist.list
deleted file mode 100644
index 703e87753d..0000000000
--- a/paddle/trainer/tests/mnist.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/mnist_bin_part
diff --git a/paddle/trainer/tests/mnist_bin_part b/paddle/trainer/tests/mnist_bin_part
deleted file mode 100644
index 08b93a0ebb..0000000000
Binary files a/paddle/trainer/tests/mnist_bin_part and /dev/null differ
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index 4aa64961d0..eaa8b9baf6 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -1,3 +1,17 @@
+//   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 /*
  * Copyright 2009-2010 Cybozu Labs, Inc.
  * Copyright 2011-2014 Kazuho Oku
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto
deleted file mode 100644
index f189b21e86..0000000000
Binary files a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto and /dev/null differ
diff --git a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist b/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
deleted file mode 100644
index 8b041cd664..0000000000
--- a/paddle/trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.protolist
+++ /dev/null
@@ -1 +0,0 @@
-./trainer/tests/pydata_provider_wrapper_dir/test_pydata_provider_wrapper.proto
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_a.conf b/paddle/trainer/tests/sample_trainer_config_opt_a.conf
deleted file mode 100644
index b1744db8d6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_a.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_opt_b.conf b/paddle/trainer/tests/sample_trainer_config_opt_b.conf
deleted file mode 100644
index b1744db8d6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_opt_b.conf
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-################################### Data Configuration ###################################
-TrainData(ProtoData(files = "trainer/tests/mnist.list"))
-################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
-         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
-################################### Network Configuration ###################################
-data = data_layer(name ="input", size=784)
-
-fc1 = fc_layer(input=data, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-fc2 = fc_layer(input=fc1, size=800,
-               bias_attr=True,
-               act=SigmoidActivation())
-
-output = fc_layer(input=[fc1, fc2], size=10,
-                  bias_attr=True,
-                  act=SoftmaxActivation())
-
-lbl = data_layer(name ="label", size=1)
-
-cost = classification_cost(input=output, label=lbl)
-outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf b/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
deleted file mode 100644
index d19222360c..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_qb_rnn.conf
+++ /dev/null
@@ -1,154 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(        
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        Layer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            type = "recurrent",
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            inputs = Input(slot_names[i] + "_embedding_" + network_name,
-                           parameter_name = "rnn1.w0")
-        )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_config_rnn.conf b/paddle/trainer/tests/sample_trainer_config_rnn.conf
deleted file mode 100644
index b720d4d5a6..0000000000
--- a/paddle/trainer/tests/sample_trainer_config_rnn.conf
+++ /dev/null
@@ -1,180 +0,0 @@
-#edit-mode: -*- python -*-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
-
-# Note: when making change to this file, please make sure
-# sample_trainer_config_qb_rnn.conf is changed accordingly so that the uniitest
-# for comparing these two nets can pass (test_CompareTwoNets)
-
-default_initial_std(0.1)
-default_device(0)
-
-word_dim = 1451594
-l1 = 0
-l2 = 0
-
-model_type("recurrent_nn")
-
-sparse_update = get_config_arg("sparse_update", bool, False)
-
-TrainData(ProtoData(
-            type = "proto_sequence",
-            files = ('trainer/tests/train.list'), 
-            ))
-
-Settings(
-    algorithm='sgd',
-    batch_size=100,
-    learning_rate=0.0001,
-    learning_rate_decay_a=4e-08,
-    learning_rate_decay_b=0.0,
-    learning_rate_schedule='poly',
-)
-
-
-wordvec_dim = 128
-layer2_dim = 96
-layer3_dim = 96
-hidden_dim = 128
-
-slot_names = ["qb", "qw", "tb", "tw"]
-
-def SimpleRecurrentLayer(name, 
-                         size, 
-                         active_type, 
-                         bias, 
-                         input_layer_name, 
-                         parameter_name,
-                         seq_reversed = False):
-    RecurrentLayerGroupBegin(name + "_layer_group", 
-                             in_links=[input_layer_name], 
-                             out_links=[name],
-                             seq_reversed=seq_reversed)
-    memory_name = Memory(name=name, size=size)
-    Layer(
-        name = name,
-        type = "mixed",
-        size = size,
-        active_type = active_type,
-        bias = bias,
-        inputs = [IdentityProjection(input_layer_name),
-                  FullMatrixProjection(memory_name,
-                                       parameter_name = parameter_name,
-                                       ),
-                  ]
-        )
-    RecurrentLayerGroupEnd(name + "_layer_group")
-
-
-def ltr_network(network_name,
-                word_dim=word_dim,
-                wordvec_dim=wordvec_dim,
-                layer2_dim=layer2_dim,
-                layer3_dim=layer3_dim,
-                hidden_dim=hidden_dim,
-                slot_names=slot_names,
-                l1=l1,
-                l2=l2):
-
-    slotnum = len(slot_names)
-    for i in xrange(slotnum):
-        Inputs(slot_names[i] + network_name)
-    for i in xrange(slotnum):
-        Layer(
-            name = slot_names[i] + network_name,
-            type = "data",
-            size = word_dim,
-            device = -1,
-        )
-        Layer(
-            name = slot_names[i] + "_embedding_" + network_name,
-            type = "mixed",
-            size = wordvec_dim,
-            bias = False,
-            device = -1,
-            inputs = TableProjection(slot_names[i] + network_name,
-                                     parameter_name = "embedding.w0",
-                                     decay_rate_l1=l1,
-                                     sparse_remote_update = True,
-                                     sparse_update = sparse_update,
-                                     ),
-        )
-        SimpleRecurrentLayer(
-            name = slot_names[i] + "_rnn1_" + network_name,
-            size = hidden_dim,
-            active_type = "tanh",
-            bias = Bias(initial_std = 0,
-                        parameter_name = "rnn1.bias"),
-            input_layer_name = slot_names[i] + "_embedding_" + network_name,
-            parameter_name = "rnn1.w0",
-            )
-        Layer(
-            name = slot_names[i] + "_rnnlast_" + network_name,
-            type = "seqlastins",
-            inputs = [
-                slot_names[i] + "_rnn1_" + network_name,
-            ],
-        )
-    Layer(
-        name = "layer2_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer2_dim,
-        bias = Bias(parameter_name = "layer2.bias"),
-        inputs = [Input(slot_name + "_rnnlast_" + network_name, 
-                        parameter_name = "_layer2_" + slot_name + ".w", 
-                        decay_rate = l2, 
-                        initial_smart = True) for slot_name in slot_names]
-    )
-    Layer(
-        name = "layer3_" + network_name,
-        type = "fc",
-        active_type = "tanh",
-        size = layer3_dim,
-        bias = Bias(parameter_name = "layer3.bias"),
-        inputs = [
-            Input("layer2_" + network_name, 
-                  parameter_name = "_layer3.w", 
-                  decay_rate = l2, 
-                  initial_smart = True),
-        ]
-    )
-    Layer(
-        name = "output_" + network_name,
-        type = "fc",
-        size = 1,
-        bias = False,
-        inputs = [
-                  Input("layer3_" + network_name,
-                       parameter_name = "_layerO.w"),
-                 ],
-        )
-
-
-ltr_network("left")
-ltr_network("right")
-Inputs("label")
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-    )
-Outputs("cost", "qb_rnnlast_left")
-Layer(
-    name = "cost",
-    type = "rank-cost",
-    inputs = ["output_left", "output_right", "label"],
-    )
diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
index d669fbc40c..741a0aa71d 100644
--- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf
@@ -35,7 +35,7 @@ def outer_step(dummy_data):
                                  embedding_size=num_words)]
 
     def inner_step(dummy_memory, predict_word):
-        
+
         # simplified RNN for testing
         with mixed_layer(size=num_words) as layer:
             layer += full_matrix_projection(input=predict_word,
@@ -46,15 +46,15 @@ def outer_step(dummy_data):
                                                 param_attr=ParamAttr(name="wordvec"))
 
         return out
-    
+
     beam_gen = beam_search(name="rnn_gen",
                            step=inner_step,
                            input=gen_inputs,
                            bos_id=0,
                            eos_id=num_words-1,
                            beam_size=2 if beam_flag else 1,
-                           num_results_per_sample=2 if beam_flag else 1,
-                           max_length=10) 
+                           num_results_per_sample=1,
+                           max_length=10)
     return beam_gen
 
 beam_gen_concat = recurrent_group(name="rnn_gen_concat",
diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
index 2b337282f6..58d27f15ae 100644
--- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf
+++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf
@@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2),
                              embedding_size=num_words)]
 
 def step(dummy_memory, predict_word):
-    
+
     # simplified RNN for testing
     with mixed_layer(size=num_words) as layer:
         layer += full_matrix_projection(input=predict_word,
@@ -44,7 +44,7 @@ def step(dummy_memory, predict_word):
                                             param_attr=ParamAttr(name="wordvec"))
 
     return out
-    
+
 beam_gen = beam_search(name="rnn_gen",
                        step=step,
                        input=gen_inputs,
@@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen",
                        eos_id=num_words-1,
                        beam_size=2 if beam_flag else 1,
                        num_results_per_sample=2 if beam_flag else 1,
-                       max_length=10) 
+                       max_length=10)
 
 seqtext_printer_evaluator(input=beam_gen,
                           id_input=sent_id,
diff --git a/paddle/trainer/tests/simple_sparse_neural_network.py b/paddle/trainer/tests/simple_sparse_neural_network.py
index 9604e1b9b4..970fb466dc 100644
--- a/paddle/trainer/tests/simple_sparse_neural_network.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network.py
@@ -1,6 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
-settings(batch_size=128, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
+settings(batch_size=17, learning_method=AdaGradOptimizer(), learning_rate=1e-4)
 
 file_list = 'trainer/tests/fake_file_list.list'
 
@@ -12,7 +26,7 @@ define_py_data_sources2(
 
 embedding = embedding_layer(
     input=data_layer(
-        name="word_ids", size=65536),
+        name="word_ids", size=8191),
     size=128,
     param_attr=ParamAttr(sparse_update=True))
 prediction = fc_layer(input=embedding, size=10, act=SoftmaxActivation())
diff --git a/paddle/trainer/tests/simple_sparse_neural_network_dp.py b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
index 8bfd1f37e7..49043c9175 100644
--- a/paddle/trainer/tests/simple_sparse_neural_network_dp.py
+++ b/paddle/trainer/tests/simple_sparse_neural_network_dp.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer.PyDataProvider2 import provider, integer_sequence, integer_value
 import random
 
@@ -7,15 +21,15 @@ def init_hook(settings, is_train, **kwargs):
 
 
 @provider(
-    input_types={'word_ids': integer_value(65536),
+    input_types={'word_ids': integer_value(8191),
                  'label': integer_value(10)},
     min_pool_size=0,
     init_hook=init_hook)
 def process(settings, filename):
     if settings.is_train:
-        data_size = 2**20
-    else:
         data_size = 2**10
+    else:
+        data_size = 2**5
 
     for _ in xrange(data_size):
-        yield random.randint(0, 65535), random.randint(0, 9)
+        yield random.randint(0, 8190), random.randint(0, 9)
diff --git a/paddle/trainer/tests/test.txt b/paddle/trainer/tests/test.txt
deleted file mode 100644
index 3ad503b34f..0000000000
--- a/paddle/trainer/tests/test.txt
+++ /dev/null
@@ -1,1000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
diff --git a/paddle/trainer/tests/testPyDataWrapper.py b/paddle/trainer/tests/testPyDataWrapper.py
index 2c29a27433..a76eeeacb9 100644
--- a/paddle/trainer/tests/testPyDataWrapper.py
+++ b/paddle/trainer/tests/testPyDataWrapper.py
@@ -20,28 +20,6 @@ import random
 import json
 import string
 
-
-@provider(slots=[
-    SparseNonValueSlot(10), DenseSlot(2), SparseValueSlot(10), StringSlot(1),
-    IndexSlot(3)
-])
-def processNonSequenceData(obj, filename):
-    with open(filename, "rb") as f:
-        for line in f:
-            slots_str = line.split(';')
-            index = int(slots_str[0])
-            non_values = map(int, slots_str[1].split()[1:])
-            dense = map(float, slots_str[2].split()[1:])
-            strs = slots_str[4].strip().split(' ', 1)[1]
-
-            def __values_mapper__(s):
-                s = s.split(":")
-                return int(s[0]), float(s[1])
-
-            values = map(__values_mapper__, slots_str[3].split()[1:])
-            yield [non_values, dense, values, strs, index]
-
-
 SPARSE_ID_LIMIT = 1000
 SPARSE_ID_COUNT = 100
 SEQUENCE_LIMIT = 50
@@ -146,8 +124,6 @@ def processSubSeqAndGenerateData(obj, name):
 
 
 if __name__ == "__main__":
-    pvd = processNonSequenceData("test.txt")
-    print pvd.getNextBatch(100)
     pvd = processSeqAndGenerateData("_")
     print pvd.getNextBatch(100)
     pvd = processSubSeqAndGenerateData("_")
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index e855a8fe2e..f3a964acb6 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
 }
 
 int main(int argc, char** argv) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   exit(0);
 #endif
   paddle::initMain(argc, argv);
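Throughout the test hunks in this commit, `#ifdef PADDLE_ONLY_CPU` guards become `#ifndef PADDLE_WITH_CUDA`: the polarity flips because the old macro was defined on CPU-only builds, while the new one is defined when CUDA support is compiled in. A minimal sketch of the equivalence (both macro names come from the hunks; everything else is illustrative):

#include <cstdio>

int main() {
#ifndef PADDLE_WITH_CUDA
  // New guard, equivalent to the old "#ifdef PADDLE_ONLY_CPU" branch:
  // taken on CPU-only builds, so GPU-dependent tests bail out early.
  std::printf("CPU-only build: skipping GPU path\n");
  return 0;
#else
  // Equivalent to the old "#ifndef PADDLE_ONLY_CPU" branch: CUDA is
  // available, so the GPU tests can run.
  std::printf("CUDA build: running GPU path\n");
  return 0;
#endif
}

Getting the polarity wrong would silently invert which builds run the GPU tests, which is why every guard in the remaining hunks flips ifdef/ifndef along with the rename.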
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
deleted file mode 100644
index 383505f813..0000000000
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
-#include <algorithm>
-#include <cstdlib>
-
-#include "paddle/trainer/Trainer.h"
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-DECLARE_int32(gpu_id);
-
-DECLARE_bool(local);
-DECLARE_bool(use_gpu);
-
-DECLARE_string(config);
-DECLARE_string(nics);
-
-DEFINE_string(config_file_a, "", "config of one network to compare");
-DEFINE_string(config_file_b, "", "config of another network to compare");
-DEFINE_bool(need_high_accuracy,
-            true,
-            "whether need to run in double accuracy (recommended)");
-DEFINE_double(
-    max_diff_ratio,
-    0.0f,
-    "max diff ratio allowed for outputs and parameters (value/gradient)");
-
-struct ComData {
-  vector<Argument> outArgs;
-  vector<ParameterPtr> parameters;
-};
-
-void calcGradient(ComData& data, const string configFile) {
-  FLAGS_config = configFile;
-
-  FLAGS_local = true;
-  FLAGS_use_gpu = false;
-
-  FLAGS_nics = "";
-
-  *ThreadLocalRand::getSeed() = 0;
-  srand(0);
-
-  Trainer trainer;
-  trainer.init(TrainerConfigHelper::createFromFlagConfig(), false);
-
-  data.parameters = trainer.getGradientMachine()->getParameters();
-  trainer.getDataProvider()->setSkipShuffle();
-  trainer.train();
-}
-
-void checkBuffer(real* A,
-                 const char* desA,
-                 real* B,
-                 const char* desB,
-                 size_t len,
-                 size_t width = 1) {
-  int nNum = 0;
-  for (size_t i = 0; i < len; ++i) {
-    real diff = fabs(A[i] - B[i]);
-    if (diff > 0.0f &&
-        diff / std::max(fabs(A[i]), fabs(B[i])) > FLAGS_max_diff_ratio) {
-      nNum++;
-      LOG(INFO) << "Row: " << i / width << ", " << desA << " : " << A[i]
-                << "    " << desB << " : " << B[i];
-    }
-  }
-  EXPECT_EQ(0, nNum);
-  LOG(INFO) << "\n\n";
-}
-
-void compareGradient(ComData& comDataA, ComData& comDataB) {
-  vector<Argument> outArgsA = comDataA.outArgs;
-  vector<Argument> outArgsB = comDataB.outArgs;
-
-  for (size_t i = 0; i < outArgsA.size(); ++i) {
-    CpuMatrix matA(outArgsA[i].value->getHeight(),
-                   outArgsA[i].value->getWidth());
-    CpuMatrix matB(outArgsB[i].value->getHeight(),
-                   outArgsB[i].value->getWidth());
-
-    matA.copyFrom(*outArgsA[i].value);
-    matB.copyFrom(*outArgsB[i].value);
-
-    LOG(INFO) << "\n--------------------------------"
-              << " Check Network Output_" << i << ":"
-              << " -------------------------------------\n";
-    checkBuffer(matA.getData(),
-                "network A output",
-                matB.getData(),
-                "network B output",
-                matA.getElementCnt(),
-                matA.getWidth());
-  }
-
-  vector<ParameterPtr>& parametersA = comDataA.parameters;
-  vector<ParameterPtr>& parametersB = comDataB.parameters;
-
-  LOG(INFO) << "\n\n--------------------------------"
-            << " Check Gradient Machine Parameters:"
-            << " -------------------------------------\n";
-  for (size_t i = 0; i < parametersA.size(); ++i) {
-    ParameterPtr parameterA, parameterB;
-    parameterA = parametersA[i];
-    parameterB = parametersB[i];
-
-    CpuVector paraA(parameterA->getSize());
-    CpuVector paraB(parameterB->getSize());
-    paraA.copyFrom(*parameterA->getBuf(PARAMETER_VALUE));
-    paraB.copyFrom(*parameterB->getBuf(PARAMETER_VALUE));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_VALUE:  " << parameterA->getName()
-              << " ; size : " << paraA.getSize() << " ------------";
-    checkBuffer(paraA.getData(),
-                "Network A",
-                paraB.getData(),
-                "Network B",
-                paraA.getSize());
-
-    CpuVector gradA(*parameterA->getBuf(PARAMETER_GRADIENT));
-    CpuVector gradB(*parameterB->getBuf(PARAMETER_GRADIENT));
-
-    LOG(INFO) << "\n\n----------- PARAMETER_GRADIENT: " << parameterA->getName()
-              << " ; size : " << gradA.getSize() << " -----------";
-    checkBuffer(gradA.getData(),
-                "Network A",
-                gradB.getData(),
-                "Network B",
-                gradA.getSize());
-  }
-}
-
-TEST(Trainer, create) {
-  ComData dataA;
-  calcGradient(dataA, FLAGS_config_file_a);
-  LOG(INFO) << "\n\ntraining of Network A is finished\n\n";
-
-  ComData dataB;
-  calcGradient(dataB, FLAGS_config_file_b);
-  LOG(INFO) << "\n\ntraining of the Network B is finished\n\n";
-
-  compareGradient(dataA, dataB);
-}
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  initPython(argc, argv);
-
-#ifndef PADDLE_TYPE_DOUBLE
-  if (FLAGS_need_high_accuracy) {
-    LOG(INFO) << "skip test because it needs high accuracy";
-    return 0;
-  }
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-4;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in low accuracy mode";
-  }
-#else
-  if (FLAGS_max_diff_ratio == 0.0f) {
-    FLAGS_max_diff_ratio = 2e-7;
-    LOG(INFO) << "auto set max_diff_ratio " << FLAGS_max_diff_ratio
-              << " in high accuracy mode";
-  }
-#endif
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
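The deleted test_CompareTwoOpts.cpp compared two networks element by element using a relative-difference threshold. Stripped of the Paddle types, the core check in checkBuffer reduces to the following self-contained sketch (countMismatches is an illustrative name, not part of the original file):

#include <algorithm>
#include <cmath>
#include <cstddef>

// An element mismatches when its absolute difference, normalized by the
// larger of the two magnitudes, exceeds maxDiffRatio (the role played by
// FLAGS_max_diff_ratio above). Returns the mismatch count, which the
// deleted test asserted to be zero.
int countMismatches(const float* a, const float* b, size_t len,
                    float maxDiffRatio) {
  int mismatches = 0;
  for (size_t i = 0; i < len; ++i) {
    float diff = std::fabs(a[i] - b[i]);
    if (diff > 0.0f &&
        diff / std::max(std::fabs(a[i]), std::fabs(b[i])) > maxDiffRatio) {
      ++mismatches;
    }
  }
  return mismatches;
}

Normalizing by the larger magnitude makes the tolerance scale-free, which is why one max_diff_ratio could be shared across network outputs, parameter values, and gradients.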
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index 66ec65e340..92dc8aa9ec 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -25,45 +25,9 @@ limitations under the License. */
 #include <unordered_set>
 #include "picojson.h"
 
-void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
 void checkValue(std::vector<paddle::Argument>& arguments, picojson::array& arr);
 const std::string kDir = "./trainer/tests/pydata_provider_wrapper_dir/";
 
-TEST(PyDataProviderWrapper, NoSequenceData) {
-  paddle::DataConfig conf;
-  conf.set_type("py");
-  conf.set_load_data_module(std::string("testPyDataWrapper"));
-  conf.set_load_data_object(std::string("processNonSequenceData"));
-  conf.set_async_load_data(false);
-  conf.clear_files();
-  conf.set_files(kDir + "test_pydata_provider_wrapper.list");
-  paddle::DataProviderPtr provider(paddle::DataProvider::create(conf, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromPy;
-  provider->getNextBatch(100, &batchFromPy);
-
-  paddle::DataConfig conf2;
-  conf2.set_type("proto");
-  conf2.set_async_load_data(false);
-  conf2.clear_files();
-  conf2.set_files(kDir + "test_pydata_provider_wrapper.protolist");
-
-  provider.reset(paddle::DataProvider::create(conf2, false));
-  provider->setSkipShuffle();
-  provider->reset();
-  paddle::DataBatch batchFromProto;
-  provider->getNextBatch(100, &batchFromProto);
-
-  std::vector<paddle::Argument>& pyArguments = batchFromPy.getStreams();
-  std::vector<paddle::Argument>& protoArguments = batchFromProto.getStreams();
-  EXPECT_EQ(pyArguments.size(), protoArguments.size());
-
-  for (size_t i = 0; i < pyArguments.size(); ++i) {
-    checkEqual(protoArguments[i], pyArguments[i]);
-  }
-}
-
 TEST(PyDataProviderWrapper, SequenceData) {
   paddle::DataConfig conf;
   conf.set_type("py");
@@ -148,66 +112,6 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-void checkEqual(const paddle::Argument& expect,
-                const paddle::Argument& actual) {
-  if (expect.value) {
-    EXPECT_TRUE(actual.value != nullptr);
-    paddle::Matrix* e = expect.value.get();
-    paddle::Matrix* a = actual.value.get();
-    EXPECT_EQ(e->getWidth(), a->getWidth());
-    EXPECT_EQ(e->getHeight(), a->getHeight());
-    if (dynamic_cast<paddle::CpuSparseMatrix*>(e)) {
-      paddle::CpuSparseMatrix* se = dynamic_cast<paddle::CpuSparseMatrix*>(e);
-      paddle::CpuSparseMatrix* sa = dynamic_cast<paddle::CpuSparseMatrix*>(a);
-      EXPECT_EQ(se->getFormat(), sa->getFormat());
-      EXPECT_EQ(se->getElementCnt(), sa->getElementCnt());
-      size_t rowSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getElementCnt()
-                           : se->getHeight() + 1;
-      size_t colSize = se->getFormat() == paddle::SPARSE_CSC
-                           ? se->getWidth() + 1
-                           : se->getElementCnt();
-      for (size_t i = 0; i < rowSize; ++i) {
-        EXPECT_EQ(se->getRows()[i], sa->getRows()[i]);
-      }
-      for (size_t i = 0; i < colSize; ++i) {
-        EXPECT_EQ(se->getCols()[i], sa->getCols()[i]);
-      }
-      if (se->getValueType() == paddle::FLOAT_VALUE) {
-        EXPECT_EQ(paddle::FLOAT_VALUE, sa->getValueType());
-        for (size_t i = 0; i < se->getElementCnt(); ++i) {
-          EXPECT_EQ(se->getValue()[i], sa->getValue()[i]);
-        }
-      }
-    } else if (dynamic_cast<paddle::CpuMatrix*>(e)) {
-      EXPECT_EQ(e->getElementCnt(), a->getElementCnt());
-      for (size_t i = 0; i < e->getElementCnt(); ++i) {
-        EXPECT_EQ(e->getData()[i], a->getData()[i]);
-      }
-    }
-  }
-
-  if (expect.ids) {
-    EXPECT_TRUE(actual.ids != nullptr);
-    paddle::VectorT<int>* e = expect.ids.get();
-    paddle::VectorT<int>* a = actual.ids.get();
-    EXPECT_EQ(e->getSize(), a->getSize());
-    for (size_t i = 0; i < e->getSize(); ++i) {
-      EXPECT_EQ(e->getData()[i], a->getData()[i]);
-    }
-  }
-
-  if (expect.strs) {
-    EXPECT_TRUE(actual.strs != nullptr);
-    std::vector<std::string>* e = expect.strs.get();
-    std::vector<std::string>* a = actual.strs.get();
-    EXPECT_EQ(e->size(), a->size());
-    for (size_t i = 0; i < e->size(); ++i) {
-      EXPECT_EQ((*e)[i], (*a)[i]);
-    }
-  }
-}
-
 void checkValue(std::vector<paddle::Argument>& arguments,
                 picojson::array& arr) {
   // CHECK SLOT 0, Sparse Value.
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 264bc46ebc..394038cf73 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -24,7 +24,6 @@ using namespace std;     // NOLINT
 static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_hsigmoid.conf";
-static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
@@ -51,7 +50,7 @@ void checkGradientTest(const string& configFile,
 
 TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
 
 TEST(checkGradient, multiGpu) {
@@ -95,13 +94,6 @@ TEST(checkGradient, multi) {
 
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
-TEST(checkGradient, chunk) {
-  checkGradientTest(configFile3, false, false);
-#ifndef PADDLE_ONLY_CPU
-  checkGradientTest(configFile3, true, true);
-#endif
-}
-
 TEST(checkGradient, non_parallel) {
   checkGradientTest(configFile4, false, false);
 }
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 4d0174f784..b2a93d4d5e 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
 // 1. test trainer (cpu, gpu).
 TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
 
 TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
@@ -94,31 +94,31 @@ TEST(trainerOnePass, parallel) {
 #endif
 
 // 2. test average_window.
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(average_window, gpu) {
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
 }
 
 TEST(average_window, gpu2) {
-  FLAGS_num_passes = 100;
+  FLAGS_num_passes = 20;
   trainerOnePassTest(configFile1, true, false, 2, 0.01);
   FLAGS_num_passes = 1;
 }
 
 TEST(average_window, gpu4) {
-  FLAGS_num_passes = 100;
+  FLAGS_num_passes = 20;
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
   FLAGS_num_passes = 1;
 }
 
 TEST(average_window_cpu, gpu2) {
-  FLAGS_num_passes = 100;
+  FLAGS_num_passes = 20;
   trainerOnePassTest(configFile1, true, false, 2, 0.01, true);
   FLAGS_num_passes = 1;
 }
 
 TEST(average_window_cpu, gpu4) {
-  FLAGS_num_passes = 100;
+  FLAGS_num_passes = 20;
   trainerOnePassTest(configFile1, true, false, 4, 0.01, true);
   FLAGS_num_passes = 1;
 }
@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkRemoteUpdater, gpuTrainer) {
   checkRemoteParameterUpdaterTest(configFile1, true, false);
 }
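The average_window hunks above cut FLAGS_num_passes from 100 to 20 purely to shorten test runtime; each test still resets the flag to 1 afterwards so later tests in the same binary see the default. A minimal sketch of that override-and-restore pattern with gflags (trainerOnePassTest here is a stand-in for the real helper; the gflags namespace spelling varies by version):

#include <gflags/gflags.h>

DEFINE_int32(num_passes, 1, "number of training passes");

// Stand-in for the real trainerOnePassTest() helper.
static void trainerOnePassTest() { /* runs FLAGS_num_passes passes */ }

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);

  FLAGS_num_passes = 20;  // shortened from 100 to keep the test fast
  trainerOnePassTest();
  FLAGS_num_passes = 1;   // restore the default for subsequent tests
  return 0;
}

Restoring the flag inside the same test matters because gflags values are process-global and the test cases run sequentially in one binary.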
diff --git a/paddle/trainer/tests/test_config.conf b/paddle/trainer/tests/test_config.conf
index d1bb9b877f..2f86aaa753 100644
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -15,12 +15,7 @@
 
 from paddle.trainer_config_helpers import *
 
-TrainData(ProtoData(
-    files = "dummy_list",
-    constant_slots = [1.0],
-    async_load_data = True))
-
-TestData(SimpleData(
+TrainData(SimpleData(
     files = "trainer/tests/sample_filelist.txt",
     feat_dim = 3,
     context_len = 0,
diff --git a/paddle/trainer/tests/test_files.txt b/paddle/trainer/tests/test_files.txt
deleted file mode 100644
index 49002677a8..0000000000
--- a/paddle/trainer/tests/test_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/test_proto.bin
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 03446b3b2f..a8fbe31c2b 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
 #ifndef PADDLE_TYPE_DOUBLE
 
 TEST(RecurrentGradientMachine, test_generation) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   const auto useGpuConfs = {false};
 #else
   const auto useGpuConfs = {true, false};
@@ -124,6 +124,8 @@ TEST(RecurrentGradientMachine, test_generation) {
                      bool beam_search) {
     FLAGS_config_args = beam_search ? "beam_search=1" : "beam_search=0";
     for (auto useGpu : useGpuConfs) {
+      LOG(INFO) << configFile << " useGpu=" << useGpu
+                << " beam_search=" << beam_search;
       testGeneration(configFile, useGpu, hasSubseq, expRetFile);
     }
   };
diff --git a/paddle/trainer/tests/train.list b/paddle/trainer/tests/train.list
deleted file mode 100644
index f41e8e8893..0000000000
--- a/paddle/trainer/tests/train.list
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/data_bin_part
diff --git a/paddle/trainer/tests/train.txt b/paddle/trainer/tests/train.txt
deleted file mode 100644
index 2313aee987..0000000000
--- a/paddle/trainer/tests/train.txt
+++ /dev/null
@@ -1,5000 +0,0 @@
-Confidence NN B-NP
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-is VBZ B-VP
-widely RB I-VP
-expected VBN I-VP
-to TO I-VP
-take VB I-VP
-another DT B-NP
-sharp JJ I-NP
-dive NN I-NP
-if IN B-SBAR
-trade NN B-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-, , O
-due JJ B-ADJP
-for IN B-PP
-release NN B-NP
-tomorrow NN B-NP
-, , O
-fail VB B-VP
-to TO I-VP
-show VB I-VP
-a DT B-NP
-substantial JJ I-NP
-improvement NN I-NP
-from IN B-PP
-July NNP B-NP
-and CC I-NP
-August NNP I-NP
-'s POS B-NP
-near-record JJ I-NP
-deficits NNS I-NP
-. . O
-
-Chancellor NNP O
-of IN B-PP
-the DT B-NP
-Exchequer NNP I-NP
-Nigel NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-restated VBN I-NP
-commitment NN I-NP
-to TO B-PP
-a DT B-NP
-firm NN I-NP
-monetary JJ I-NP
-policy NN I-NP
-has VBZ B-VP
-helped VBN I-VP
-to TO I-VP
-prevent VB I-VP
-a DT B-NP
-freefall NN I-NP
-in IN B-PP
-sterling NN B-NP
-over IN B-PP
-the DT B-NP
-past JJ I-NP
-week NN I-NP
-. . O
-
-But CC O
-analysts NNS B-NP
-reckon VBP B-VP
-underlying VBG B-NP
-support NN I-NP
-for IN B-PP
-sterling NN B-NP
-has VBZ B-VP
-been VBN I-VP
-eroded VBN I-VP
-by IN B-PP
-the DT B-NP
-chancellor NN I-NP
-'s POS B-NP
-failure NN I-NP
-to TO B-VP
-announce VB I-VP
-any DT B-NP
-new JJ I-NP
-policy NN I-NP
-measures NNS I-NP
-in IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-last JJ B-NP
-Thursday NNP I-NP
-. . O
-
-This DT B-NP
-has VBZ B-VP
-increased VBN I-VP
-the DT B-NP
-risk NN I-NP
-of IN B-PP
-the DT B-NP
-government NN I-NP
-being VBG B-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-base NN B-NP
-rates NNS I-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-from IN B-PP
-their PRP$ B-NP
-current JJ I-NP
-15 CD I-NP
-% NN I-NP
-level NN I-NP
-to TO B-VP
-defend VB I-VP
-the DT B-NP
-pound NN I-NP
-, , O
-economists NNS B-NP
-and CC O
-foreign JJ B-NP
-exchange NN I-NP
-market NN I-NP
-analysts NNS I-NP
-say VBP B-VP
-. . O
-
-`` `` O
-The DT B-NP
-risks NNS I-NP
-for IN B-PP
-sterling NN B-NP
-of IN B-PP
-a DT B-NP
-bad JJ I-NP
-trade NN I-NP
-figure NN I-NP
-are VBP B-VP
-very RB B-ADVP
-heavily RB I-ADVP
-on IN B-PP
-the DT B-NP
-down JJ I-NP
-side NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Chris NNP B-NP
-Dillow NNP I-NP
-, , O
-senior JJ B-NP
-U.K. NNP I-NP
-economist NN I-NP
-at IN B-PP
-Nomura NNP B-NP
-Research NNP I-NP
-Institute NNP I-NP
-. . O
-
-`` `` O
-If IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-another DT B-NP
-bad JJ I-NP
-trade NN I-NP
-number NN I-NP
-, , O
-there EX B-NP
-could MD B-VP
-be VB I-VP
-an DT B-NP
-awful JJ I-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-, , O
-'' '' O
-noted VBD B-VP
-Simon NNP B-NP
-Briscoe NNP I-NP
-, , O
-U.K. NNP B-NP
-economist NN I-NP
-for IN B-PP
-Midland NNP B-NP
-Montagu NNP I-NP
-, , O
-a DT B-NP
-unit NN I-NP
-of IN B-PP
-Midland NNP B-NP
-Bank NNP I-NP
-PLC NNP I-NP
-. . O
-
-Forecasts NNS B-NP
-for IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-range VBP B-VP
-widely RB B-ADVP
-, , O
-but CC O
-few JJ B-NP
-economists NNS I-NP
-expect VBP B-VP
-the DT B-NP
-data NNS I-NP
-to TO B-VP
-show VB I-VP
-a DT B-NP
-very RB I-NP
-marked VBN I-NP
-improvement NN I-NP
-from IN B-PP
-the DT O
-# # O
-2 CD O
-billion CD O
--LRB- ( O
-$ $ B-ADJP
-3.2 CD O
-billion CD O
--RRB- ) O
-deficit NN B-NP
-in IN B-PP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-reported VBD B-VP
-for IN B-PP
-August NNP B-NP
-. . O
-
-The DT B-NP
-August NNP I-NP
-deficit NN I-NP
-and CC O
-the DT B-NP
-# # I-NP
-2.2 CD I-NP
-billion CD I-NP
-gap NN I-NP
-registered VBN B-VP
-in IN B-PP
-July NNP B-NP
-are VBP B-VP
-topped VBN I-VP
-only RB B-ADVP
-by IN B-PP
-the DT B-NP
-# # I-NP
-2.3 CD I-NP
-billion CD I-NP
-deficit NN I-NP
-of IN B-PP
-October NNP B-NP
-1988 CD I-NP
-. . O
-
-Sanjay NNP B-NP
-Joshi NNP I-NP
-, , O
-European JJ B-NP
-economist NN I-NP
-at IN B-PP
-Baring NNP B-NP
-Brothers NNPS I-NP
-& CC I-NP
-Co. NNP I-NP
-, , O
-said VBD B-VP
-there EX B-NP
-is VBZ B-VP
-no DT B-NP
-sign NN I-NP
-that IN B-SBAR
-Britain NNP B-NP
-'s POS B-NP
-manufacturing NN I-NP
-industry NN I-NP
-is VBZ B-VP
-transforming VBG I-VP
-itself PRP B-NP
-to TO B-VP
-boost VB I-VP
-exports NNS B-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-same JJ I-NP
-time NN I-NP
-, , O
-he PRP B-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-pessimistic JJ I-ADJP
-about IN B-PP
-the DT B-NP
-outlook NN I-NP
-for IN B-PP
-imports NNS B-NP
-, , O
-given VBN B-PP
-continued VBD B-NP
-high JJ I-NP
-consumer NN I-NP
-and CC I-NP
-capital NN I-NP
-goods NNS I-NP
-inflows NNS I-NP
-. . O
-
-He PRP B-NP
-reckons VBZ B-VP
-the DT B-NP
-current JJ I-NP
-account NN I-NP
-deficit NN I-NP
-will MD B-VP
-narrow VB I-VP
-to TO B-PP
-only RB B-NP
-# # I-NP
-1.8 CD I-NP
-billion CD I-NP
-in IN B-PP
-September NNP B-NP
-. . O
-
-However RB B-ADVP
-, , O
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-he PRP B-NP
-believes VBZ B-VP
-that IN B-SBAR
-a DT B-NP
-reduction NN I-NP
-in IN B-PP
-raw JJ B-NP
-material NN I-NP
-stockbuilding VBG I-NP
-by IN B-PP
-industry NN B-NP
-could MD B-VP
-lead VB I-VP
-to TO B-PP
-a DT B-NP
-sharp JJ I-NP
-drop NN I-NP
-in IN B-PP
-imports NNS B-NP
-. . O
-
-Combined VBN B-PP
-with IN B-PP
-at IN B-ADVP
-least JJS I-ADVP
-some DT B-NP
-rebound NN I-NP
-in IN B-PP
-exports NNS B-NP
-after IN B-PP
-August NNP B-NP
-'s POS B-NP
-unexpected JJ I-NP
-decline NN I-NP
-, , O
-the DT B-NP
-deficit NN I-NP
-could MD B-VP
-narrow VB I-VP
-to TO B-PP
-as RB B-NP
-little JJ I-NP
-as IN I-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-. . O
-
-Mr. NNP B-NP
-Briscoe NNP I-NP
-, , O
-who WP B-NP
-also RB B-ADVP
-forecasts VBZ B-VP
-a DT B-NP
-# # I-NP
-1.3 CD I-NP
-billion CD I-NP
-current JJ I-NP
-account NN I-NP
-gap NN I-NP
-, , O
-warns VBZ B-VP
-that IN B-SBAR
-even RB B-SBAR
-if IN I-SBAR
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-are VBP B-VP
-bullish JJ B-ADJP
-for IN B-PP
-sterling NN B-NP
-, , O
-the DT B-NP
-currency NN I-NP
-wo MD B-VP
-n't RB I-VP
-advance VB I-VP
-much JJ B-NP
-because IN B-SBAR
-investors NNS B-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-see VB I-VP
-further JJ B-NP
-evidence NN I-NP
-of IN B-PP
-the DT B-NP
-turnaround NN I-NP
-before IN B-PP
-adjusting VBG B-VP
-positions NNS B-NP
-. . O
-
-Nevertheless RB B-ADVP
-, , O
-he PRP B-NP
-noted VBD B-VP
-, , O
-`` `` O
-No DT B-NP
-one PRP I-NP
-will MD B-VP
-want VB I-VP
-to TO I-VP
-go VB I-VP
-into IN B-PP
-the DT B-NP
-trade NN I-NP
-figures NNS I-NP
-without IN B-PP
-a DT B-NP
-flat JJ I-NP
-position NN I-NP
-'' '' O
-in IN B-PP
-the DT B-NP
-pound NN I-NP
-. . O
-
-Meanwhile RB B-ADVP
-, , O
-overall JJ B-NP
-evidence NN I-NP
-on IN B-PP
-the DT B-NP
-economy NN I-NP
-remains VBZ B-VP
-fairly RB B-ADJP
-clouded VBN I-ADJP
-. . O
-
-In IN B-PP
-his PRP$ B-NP
-Mansion NNP I-NP
-House NNP I-NP
-speech NN I-NP
-, , O
-Mr. NNP B-NP
-Lawson NNP I-NP
-warned VBD B-VP
-that IN B-SBAR
-a DT B-NP
-further JJ I-NP
-slowdown NN I-NP
-can MD B-VP
-be VB I-VP
-expected VBN I-VP
-as IN B-SBAR
-the DT B-NP
-impact NN I-NP
-of IN B-PP
-the DT B-NP
-last JJ I-NP
-rise NN I-NP
-in IN B-PP
-interest NN B-NP
-rates NNS I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-takes VBZ B-VP
-effect NN B-NP
-. . O
-
-U.K. JJ B-NP
-base NN I-NP
-rates NNS I-NP
-are VBP B-VP
-at IN B-PP
-their PRP$ B-NP
-highest JJS I-NP
-level NN I-NP
-in IN B-PP
-eight CD B-NP
-years NNS I-NP
-. . O
-
-But CC O
-consumer NN B-NP
-expenditure NN I-NP
-data NNS I-NP
-released VBD B-VP
-Friday NNP B-NP
-do VBP B-VP
-n't RB I-VP
-suggest VB I-VP
-that IN B-SBAR
-the DT B-NP
-U.K. NNP I-NP
-economy NN I-NP
-is VBZ B-VP
-slowing VBG I-VP
-that DT B-ADVP
-quickly RB I-ADVP
-. . O
-
-The DT B-NP
-figures NNS I-NP
-show VBP B-VP
-that DT O
-spending NN B-NP
-rose VBD B-VP
-0.1 CD B-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-third JJ I-NP
-quarter NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-and CC O
-was VBD B-VP
-up IN B-ADVP
-3.8 CD B-NP
-% NN I-NP
-from IN B-PP
-a DT B-NP
-year NN I-NP
-ago RB B-ADVP
-. . O
-
-This DT B-NP
-compares VBZ B-VP
-with IN B-PP
-a DT B-NP
-1.6 CD I-NP
-% NN I-NP
-rise NN I-NP
-in IN B-PP
-the DT B-NP
-second NN I-NP
-from IN B-PP
-the DT B-NP
-first JJ I-NP
-quarter NN I-NP
-and CC O
-a DT B-NP
-5.4 CD I-NP
-% NN I-NP
-increase NN I-NP
-from IN B-PP
-the DT B-NP
-second JJ I-NP
-quarter NN I-NP
-of IN B-PP
-1988 CD B-NP
-. . O
-
-Mr. NNP B-NP
-Dillow NNP I-NP
-said VBD B-VP
-the DT B-NP
-data NNS I-NP
-show VBP B-VP
-the DT B-NP
-economy NN I-NP
-`` `` O
-is VBZ B-VP
-still RB B-ADVP
-quite RB B-ADJP
-strong JJ I-ADJP
-, , O
-'' '' O
-but CC O
-suggestions NNS B-NP
-that IN B-SBAR
-much NN B-NP
-of IN B-PP
-the DT B-NP
-spending NN I-NP
-went VBD B-VP
-on IN B-PP
-services NNS B-NP
-rather RB B-PP
-than IN I-PP
-consumer NN B-NP
-goods NNS I-NP
-should MD B-VP
-reduce VB I-VP
-fears NNS B-NP
-of IN B-PP
-more JJR B-NP
-import NN I-NP
-rises NNS I-NP
-. . O
-
-Certainly RB B-ADVP
-, , O
-the DT B-NP
-chancellor NN I-NP
-has VBZ B-VP
-made VBN I-VP
-it PRP B-NP
-clear JJ B-ADJP
-that IN B-SBAR
-he PRP B-NP
-is VBZ B-VP
-prepared VBN I-VP
-to TO I-VP
-increase VB I-VP
-interest NN B-NP
-rates NNS I-NP
-again RB B-ADVP
-if IN B-SBAR
-necessary JJ B-ADJP
-to TO B-VP
-both DT I-VP
-ensure VB I-VP
-that IN B-SBAR
-a DT B-NP
-substantial JJ I-NP
-slowdown NN I-NP
-does VBZ B-VP
-take VB I-VP
-place NN B-NP
-and CC O
-that DT O
-sterling NN B-NP
-does VBZ B-VP
-n't RB I-VP
-decline VB I-VP
-further JJ B-ADVP
-. . O
-
-Thursday NNP B-NP
-, , O
-he PRP B-NP
-reminded VBD B-VP
-his PRP$ B-NP
-audience NN I-NP
-that IN B-SBAR
-the DT B-NP
-government NN I-NP
-`` `` O
-can MD B-VP
-not RB I-VP
-allow VB I-VP
-the DT B-NP
-necessary JJ I-NP
-rigor NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-to TO B-VP
-be VB I-VP
-undermined VBN I-VP
-by IN B-PP
-exchange NN B-NP
-rate NN I-NP
-weakness NN I-NP
-. . O
-'' '' O
-
-Analysts NNS B-NP
-agree VBP B-VP
-there EX B-NP
-is VBZ B-VP
-little JJ B-NP
-holding NN B-VP
-sterling NN B-NP
-firm NN B-ADJP
-at IN B-PP
-the DT B-NP
-moment NN I-NP
-other JJ B-ADJP
-than IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-that IN B-SBAR
-rates NNS B-NP
-will MD B-VP
-be VB I-VP
-pushed VBN I-VP
-higher JJR B-ADJP
-if IN B-SBAR
-necessary JJ B-ADJP
-. . O
-
-And CC O
-, , O
-they PRP B-NP
-warn VBP B-VP
-, , O
-any DT B-NP
-further JJ I-NP
-drop NN I-NP
-in IN B-PP
-the DT B-NP
-government NN I-NP
-'s POS B-NP
-popularity NN I-NP
-could MD B-VP
-swiftly RB I-VP
-make VB I-VP
-this DT B-NP
-promise NN I-NP
-sound NN B-VP
-hollow JJ B-ADJP
-. . O
-
-Sterling NNP B-NP
-was VBD B-VP
-already RB I-VP
-showing VBG I-VP
-some DT B-NP
-signs NNS I-NP
-of IN B-PP
-a DT B-NP
-lack NN I-NP
-of IN B-PP
-confidence NN B-NP
-in IN B-PP
-Mr. NNP B-NP
-Lawson NNP I-NP
-'s POS B-NP
-promise NN I-NP
-Friday NNP B-NP
-. . O
-
-In IN B-PP
-European JJ B-NP
-trading NN I-NP
-it PRP B-NP
-declined VBD B-VP
-to TO B-PP
-$ $ B-NP
-1.5890 CD I-NP
-and CC O
-2.9495 CD B-NP
-marks NNS I-NP
-from IN B-PP
-$ $ B-NP
-1.5940 CD I-NP
-and CC O
-2.9429 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-Economists NNS B-NP
-suggested VBD B-VP
-that IN B-SBAR
-if IN B-SBAR
-the DT B-NP
-pound NN I-NP
-falls VBZ B-VP
-much JJ B-NP
-below IN B-PP
-2.90 CD B-NP
-marks NNS I-NP
-, , O
-the DT B-NP
-government NN I-NP
-will MD B-VP
-be VB I-VP
-forced VBN I-VP
-to TO I-VP
-increase VB I-VP
-rates NNS B-NP
-to TO B-PP
-16 CD B-NP
-% NN I-NP
-, , O
-both DT B-VP
-to TO I-VP
-halt VB B-VP
-any DT B-NP
-further JJ I-NP
-decline NN I-NP
-and CC O
-ensure VB B-VP
-that IN B-SBAR
-the DT B-NP
-balance NN I-NP
-of IN B-PP
-monetary JJ B-NP
-policy NN I-NP
-remains VBZ B-VP
-unchanged JJ B-ADJP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-Market NNP I-NP
-Activity NN I-NP
-
-The DT B-NP
-dollar NN I-NP
-posted VBD B-VP
-gains NNS B-NP
-in IN B-PP
-quiet JJ B-NP
-trading NN I-NP
-as IN B-SBAR
-concerns NNS B-NP
-about IN B-PP
-equities NNS B-NP
-abated VBN B-VP
-. . O
-
-Foreign JJ B-NP
-exchange NN I-NP
-dealers NNS I-NP
-said VBD B-VP
-that IN B-SBAR
-the DT B-NP
-currency NN I-NP
-market NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-distance VB I-VP
-itself PRP B-NP
-from IN B-PP
-the DT B-NP
-volatile JJ I-NP
-stock NN I-NP
-exchange NN I-NP
-, , O
-which WDT B-NP
-has VBZ B-VP
-preoccupied VBN I-VP
-the DT B-NP
-market NN I-NP
-since IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-, , O
-when WRB B-ADVP
-the DT B-NP
-Dow NNP I-NP
-Jones NNP I-NP
-Industrial NNP I-NP
-Average NNP I-NP
-plunged VBD B-VP
-more JJR B-NP
-than IN I-NP
-190 CD I-NP
-points NNS I-NP
-. . O
-
-Currency NN B-NP
-analysts NNS I-NP
-predict VBP B-VP
-that IN B-SBAR
-in IN B-PP
-the DT B-NP
-coming VBG I-NP
-week NN I-NP
-the DT B-NP
-foreign JJ I-NP
-exchange NN I-NP
-market NN I-NP
-will MD B-VP
-shift VB I-VP
-its PRP$ B-NP
-focus NN I-NP
-back RB B-ADVP
-to TO B-PP
-economic JJ B-NP
-fundamentals NNS I-NP
-, , O
-keeping VBG B-VP
-a DT B-NP
-close NN I-NP
-eye NN I-NP
-out IN B-ADVP
-for IN B-PP
-any DT B-NP
-signs NNS I-NP
-of IN B-PP
-monetary JJ B-NP
-easing NN I-NP
-by IN B-PP
-U.S. NNP B-NP
-Federal NNP I-NP
-Reserve NNP I-NP
-. . O
-
-Late RB B-ADVP
-in IN B-PP
-the DT B-NP
-New NNP I-NP
-York NNP I-NP
-trading NN I-NP
-day NN I-NP
-, , O
-the DT B-NP
-dollar NN I-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-1.8578 CD B-NP
-marks NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-1.8470 CD B-NP
-marks NNS I-NP
-late JJ B-NP
-Thursday NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-was VBD B-VP
-also RB I-VP
-changing VBG I-VP
-hands NNS B-NP
-at IN B-PP
-142.43 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-141.70 CD B-NP
-yen NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-late JJ B-NP
-Thursday NNP I-NP
-. . O
-
-In IN B-PP
-Tokyo NNP B-NP
-on IN B-PP
-Monday NNP B-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-currency NN I-NP
-opened VBD B-VP
-for IN B-PP
-trading NN B-NP
-at IN B-PP
-141.95 CD B-NP
-yen NN I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-Tokyo NNP I-NP
-close NN I-NP
-of IN B-PP
-141.35 CD B-NP
-yen NN I-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-Commodity NNP I-NP
-Exchange NNP I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-, , O
-gold NN B-NP
-for IN B-PP
-current JJ B-NP
-delivery NN I-NP
-settled VBD B-VP
-at IN B-PP
-$ $ B-NP
-367.30 CD I-NP
-an DT B-NP
-ounce NN I-NP
-, , O
-up IN B-ADVP
-20 CD B-NP
-cents NNS I-NP
-. . O
-
-Estimated VBN B-NP
-volume NN I-NP
-was VBD B-VP
-a DT B-NP
-light NN I-NP
-2.4 CD I-NP
-million CD I-NP
-ounces NNS I-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Hong NNP B-NP
-Kong NNP I-NP
-Monday NNP B-NP
-, , O
-gold NN B-NP
-was VBD B-VP
-quoted VBN I-VP
-at IN B-PP
-$ $ B-NP
-366.50 CD I-NP
-an DT B-NP
-ounce NN I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Limited NNP I-NP
-Partnership NNP I-NP
-said VBD B-VP
-it PRP B-NP
-proposed VBD B-VP
-to TO I-VP
-acquire VB I-VP
-A.P. NNP B-NP
-Green NNP I-NP
-Industries NNP I-NP
-Inc. NNP I-NP
-for IN B-PP
-$ $ B-NP
-40 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-In IN B-PP
-an DT B-NP
-Oct. NNP I-NP
-19 CD I-NP
-letter NN I-NP
-to TO B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-'s POS B-NP
-board NN I-NP
-, , O
-East NNP B-NP
-Rock NNP I-NP
-said VBD B-VP
-the DT B-NP
-offer NN I-NP
-is VBZ B-VP
-subject NN B-ADJP
-to TO B-PP
-the DT B-NP
-signing NN I-NP
-of IN B-PP
-a DT B-NP
-merger NN I-NP
-agreement NN I-NP
-by IN B-PP
-no DT B-ADVP
-later RB I-ADVP
-than IN B-PP
-Oct. NNP B-NP
-31 CD I-NP
-. . O
-
-The DT B-NP
-letter NN I-NP
-, , O
-attached VBN B-VP
-to TO B-PP
-a DT B-NP
-filing NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-, , O
-said VBD B-VP
-the DT B-NP
-approval NN I-NP
-is VBZ B-VP
-also RB B-ADVP
-contingent JJ B-ADJP
-upon IN B-PP
-obtaining VBG B-VP
-satisfactory JJ B-NP
-financing NN I-NP
-. . O
-
-An DT B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-official NN I-NP
-declined VBD B-VP
-to TO I-VP
-comment VB I-VP
-on IN B-PP
-the DT B-NP
-filing NN I-NP
-. . O
-
-The DT B-NP
-$ $ I-NP
-40-a-share JJ I-NP
-proposal NN I-NP
-values VBZ B-VP
-the DT B-NP
-company NN I-NP
-at IN B-PP
-about RB B-NP
-$ $ I-NP
-106.6 CD I-NP
-million CD I-NP
-. . O
-
-A.P. NNP B-NP
-Green NNP I-NP
-currently RB B-ADVP
-has VBZ B-VP
-2,664,098 CD B-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-. . O
-
-Its PRP$ B-NP
-stock NN I-NP
-closed VBD B-VP
-at IN B-PP
-$ $ B-NP
-38 CD I-NP
-, , O
-up IN B-ADVP
-$ $ B-NP
-1.875 CD I-NP
-, , O
-in IN B-PP
-national JJ B-NP
-over-the-counter JJ I-NP
-trading NN I-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-is VBZ B-VP
-a DT B-NP
-Mexico NNP I-NP
-, , I-NP
-Mo. NNP I-NP
-, , I-NP
-maker NN I-NP
-of IN B-PP
-refractory JJ B-NP
-products NNS I-NP
-. . O
-
-East NNP B-NP
-Rock NNP I-NP
-also RB B-ADVP
-said VBD B-VP
-in IN B-PP
-the DT B-NP
-filing NN I-NP
-that IN B-SBAR
-it PRP B-NP
-boosted VBD B-VP
-its PRP$ B-NP
-stake NN I-NP
-in IN B-PP
-A.P. NNP B-NP
-Green NNP I-NP
-to TO B-PP
-8.7 CD B-NP
-% NN I-NP
-. . O
-
-It PRP B-NP
-now RB B-ADVP
-holds VBZ B-VP
-233,000 CD B-NP
-A.P. NNP I-NP
-Green NNP I-NP
-common JJ I-NP
-shares NNS I-NP
-, , O
-including VBG B-PP
-30,000 CD B-NP
-shares NNS I-NP
-bought VBD B-VP
-last JJ B-NP
-Thursday NNP I-NP
-for IN B-PP
-$ $ B-NP
-35.50 CD I-NP
-to TO I-NP
-$ $ I-NP
-36.50 CD I-NP
-a DT B-NP
-share NN I-NP
-. . O
-
-New NNP B-NP
-York-based JJ I-NP
-John NNP I-NP
-Kuhns NNP I-NP
-and CC I-NP
-Robert NNP I-NP
-MacDonald NNP I-NP
-control NN B-VP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-sole JJ I-NP
-general JJ I-NP
-partner NN I-NP
-of IN B-PP
-East NNP B-NP
-Rock NNP I-NP
-Partners NNP I-NP
-L.P NNP I-NP
-. . O
-
-The DT B-NP
-sole JJ I-NP
-limited JJ I-NP
-partner NN I-NP
-of IN B-PP
-the DT B-NP
-partnership NN I-NP
-is VBZ B-VP
-Westwood NNP B-NP
-Brick NNP I-NP
-Lime NNP I-NP
-Inc. NNP I-NP
-, , O
-an DT B-NP
-indirect JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Westwood NNP B-NP
-Group NNP I-NP
-Inc NNP I-NP
-. . O
-
-Both DT B-NP
-Westwood NNP B-NP
-Brick NNP I-NP
-and CC O
-Westwood NNP B-NP
-Group NNP I-NP
-are VBP B-VP
-based VBN I-VP
-in IN B-PP
-Boston NNP B-NP
-. . O
-
-Freight NN B-NP
-rates NNS I-NP
-, , O
-declining VBG B-VP
-for IN B-PP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-decade NN I-NP
-because IN B-PP
-of IN I-PP
-competition NN B-NP
-spurred VBN B-VP
-by IN B-PP
-deregulation NN B-NP
-, , O
-are VBP B-VP
-bottoming VBG I-VP
-out IN B-PRT
-, , O
-turning VBG B-VP
-upward RB B-ADVP
-and CC O
-threatening VBG B-VP
-to TO I-VP
-fuel VB I-VP
-inflation NN B-NP
-. . O
-
-Trucking NNP B-NP
-, , I-NP
-shipping VBG I-NP
-and CC I-NP
-air-freight NN I-NP
-companies NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-rate NN B-NP
-increases NNS I-NP
-, , O
-scheduled VBN B-VP
-for IN B-PP
-this DT B-NP
-fall NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-reflecting VBG B-VP
-higher JJR B-NP
-costs NNS I-NP
-and CC O
-tightened VBD B-NP
-demand NN I-NP
-for IN B-PP
-freight NN B-NP
-transport NN I-NP
-. . O
-
-Major JJ B-NP
-shippers NNS I-NP
-say VBP B-VP
-they PRP B-NP
-expect VBP B-VP
-freight NN B-NP
-rates NNS I-NP
-to TO B-VP
-rise VB I-VP
-at IN B-ADVP
-least JJS I-ADVP
-as RB B-ADVP
-fast RB I-ADVP
-as IN B-PP
-inflation NN B-NP
-and CC B-ADVP
-maybe RB I-ADVP
-faster RBR B-ADVP
-in IN B-PP
-the DT B-NP
-next JJ I-NP
-few JJ I-NP
-years NNS I-NP
-. . O
-
-That DT B-NP
-'s VBZ B-VP
-a DT B-NP
-big JJ I-NP
-change NN I-NP
-from IN B-PP
-recent JJ B-NP
-years NNS I-NP
-when WRB B-ADVP
-freight NN B-NP
-haulage NN I-NP
-was VBD B-VP
-a DT B-NP
-bright JJ I-NP
-spot NN I-NP
-for IN B-PP
-U.S. NNP B-NP
-productivity NN I-NP
-, , O
-helping VBG B-VP
-to TO I-VP
-restrain VB I-VP
-inflation NN B-NP
-and CC O
-make VB B-VP
-U.S. NNP B-NP
-industry NN I-NP
-more RBR B-ADJP
-competitive JJ I-ADJP
-abroad RB B-ADVP
-. . O
-
-`` `` O
-Demand NN B-NP
-has VBZ B-VP
-caught VBN I-VP
-up IN B-PRT
-with IN B-PP
-the DT B-NP
-supply NN I-NP
-of IN B-PP
-certain JJ B-NP
-types NNS I-NP
-of IN B-PP
-freight NN B-NP
-transportation NN I-NP
-, , O
-and CC O
-rates NNS B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-move VB I-VP
-up IN B-ADVP
-'' '' O
-at IN B-PP
-a DT B-NP
-rate NN I-NP
-`` `` O
-close RB B-ADJP
-to TO B-PP
-or CC O
-slightly RB B-ADJP
-more JJR I-ADJP
-than IN B-PP
-the DT B-NP
-inflation NN I-NP
-rate NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Clifford NNP B-NP
-Sayre NNP I-NP
-, , O
-director NN B-NP
-of IN B-PP
-logistics NNS B-NP
-at IN B-PP
-Du NNP B-NP
-Pont NNP I-NP
-Co NNP I-NP
-. . O
-
-Shippers NNS B-NP
-surveyed VBN B-VP
-recently RB B-ADVP
-by IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-University NNP I-NP
-said VBD B-VP
-they PRP B-NP
-expect VBP B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-, , I-NP
-storage NN I-NP
-and CC I-NP
-distribution NN I-NP
-costs NNS I-NP
-to TO B-VP
-rise VB I-VP
-about IN B-NP
-4 CD I-NP
-% NN I-NP
-this DT B-NP
-year NN I-NP
-. . O
-
-Only RB B-NP
-10 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-250 CD I-NP
-shippers NNS I-NP
-polled VBN B-VP
-expected VBN B-VP
-their PRP$ B-NP
-freight-transport JJ I-NP
-costs NNS I-NP
-to TO B-VP
-decrease VB I-VP
-, , O
-compared VBN B-PP
-with IN B-PP
-30 CD B-NP
-% NN I-NP
-who WP B-NP
-had VBD B-VP
-looked VBN I-VP
-to TO B-PP
-freight VB B-NP
-transport NN I-NP
-to TO B-VP
-reduce VB I-VP
-costs NNS B-NP
-in IN B-PP
-past JJ B-NP
-years NNS I-NP
-. . O
-
-`` `` O
-This DT B-NP
-is VBZ B-VP
-the DT B-NP
-first JJ I-NP
-year NN I-NP
-since IN B-PP
-transportation NN B-NP
-deregulation NN I-NP
-in IN B-PP
-1980 CD B-NP
-that IN B-ADVP
-we PRP B-NP
-have VBP B-VP
-had VBN I-VP
-such JJ B-NP
-a DT I-NP
-dramatic JJ I-NP
-and CC I-NP
-broad-based JJ I-NP
-upturn NN I-NP
-in IN B-PP
-perceived VBN B-NP
-transportation NN I-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Bernard NNP B-NP
-LaLonde NNP I-NP
-, , O
-a DT B-NP
-transportation NN I-NP
-logistics NNS I-NP
-professor NN I-NP
-at IN B-PP
-Ohio NNP B-NP
-State NNP I-NP
-in IN B-PP
-Columbus NNP B-NP
-. . O
-
-The DT B-NP
-deregulation NN I-NP
-of IN B-PP
-railroads NNS B-NP
-and CC I-NP
-trucking NN I-NP
-companies NNS I-NP
-that WDT B-NP
-began VBD B-VP
-in IN B-PP
-1980 CD B-NP
-enabled VBD B-VP
-shippers NNS B-NP
-to TO B-VP
-bargain VB I-VP
-for IN B-PP
-transportation NN B-NP
-. . O
-
-Carriers NNP B-NP
-could MD B-VP
-use VB I-VP
-their PRP$ B-NP
-equipment NN I-NP
-more RBR B-ADVP
-efficiently RB I-ADVP
-, , O
-leading VBG B-VP
-to TO B-PP
-overcapacity NN B-NP
-they PRP B-NP
-were VBD B-VP
-eager JJ B-ADJP
-to TO B-VP
-fill VB I-VP
-. . O
-
-Shippers NNS B-NP
-cut VBP B-VP
-about RB B-NP
-$ $ I-NP
-35 CD I-NP
-billion CD I-NP
-from IN B-PP
-their PRP$ B-NP
-annual JJ I-NP
-, , I-NP
-inter-city JJ I-NP
-truck NN I-NP
-and CC I-NP
-rail NN I-NP
-costs NNS I-NP
-, , O
-to TO B-PP
-about RB B-NP
-$ $ I-NP
-150 CD I-NP
-billion CD I-NP
-, , O
-or CC O
-about IN B-NP
-6.4 CD I-NP
-% NN I-NP
-of IN B-PP
-gross JJ B-NP
-national JJ I-NP
-product NN I-NP
-, , O
-down RB B-ADVP
-from IN B-PP
-8 CD B-NP
-% NN I-NP
-of IN B-PP
-GNP NNP B-NP
-in IN B-PP
-1981 CD B-NP
-. . O
-
-But CC O
-with IN B-PP
-much NN B-NP
-of IN B-PP
-the DT B-NP
-inefficiency NN I-NP
-squeezed VBN B-VP
-out IN B-PP
-of IN B-PP
-the DT B-NP
-freight-transport JJ I-NP
-system NN I-NP
-, , O
-rising VBG B-NP
-costs NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-reflected VBN I-VP
-directly RB B-ADVP
-in IN B-PP
-higher JJR B-NP
-freight NN I-NP
-rates NNS I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-are VBP B-VP
-saying VBG I-VP
-` `` O
-the DT B-NP
-party NN I-NP
-'s POS B-VP
-over IN B-ADJP
-, , O
-' '' O
-'' '' O
-said VBD B-VP
-Mr. NNP B-NP
-LaLonde NNP I-NP
-. . O
-
-`` `` O
-Shippers NNS B-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-look VB I-VP
-for IN B-PP
-transportation-cost JJ B-NP
-savings NNS I-NP
-as IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-for IN B-PP
-the DT B-NP
-last JJ I-NP
-eight CD I-NP
-or CC I-NP
-nine CD I-NP
-years NNS I-NP
-. . O
-
-Transport NN B-NP
-rates NNS I-NP
-wo MD B-VP
-n't RB I-VP
-be VB I-VP
-an DT B-NP
-opportunity NN I-NP
-for IN B-PP
-offsetting VBG B-VP
-cost NN B-NP
-increases NNS I-NP
-in IN B-PP
-other JJ B-NP
-segments NNS I-NP
-of IN B-PP
-the DT B-NP
-economy NN I-NP
-. . O
-'' '' O
-
-Robert NNP B-NP
-Delaney NNP I-NP
-, , O
-a DT B-NP
-consultant NN I-NP
-at IN B-PP
-Arthur NNP B-NP
-D. NNP I-NP
-Little NNP I-NP
-Inc. NNP I-NP
-, , O
-Cambridge NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-said VBD B-VP
-`` `` O
-We PRP B-NP
-'ve VBP B-VP
-gotten VBN I-VP
-all PDT B-NP
-the DT I-NP
-benefits NNS I-NP
-of IN B-PP
-deregulation NN B-NP
-in IN B-PP
-freight-cost JJ B-NP
-reductions NNS I-NP
-. . O
-
-Now RB B-ADVP
-we PRP B-NP
-are VBP B-VP
-starting VBG I-VP
-to TO I-VP
-see VB I-VP
-real JJ B-NP
-freight-rate JJ I-NP
-increases NNS I-NP
-as IN B-SBAR
-carriers NNS B-NP
-replace VBP B-VP
-equipment NN B-NP
-, , O
-pay VB B-VP
-higher JJR B-NP
-fuel NN I-NP
-costs NNS I-NP
-and CC O
-pay VB B-VP
-more JJR B-NP
-for IN B-PP
-labor NN B-NP
-. . O
-
-You PRP B-NP
-'ll MD B-VP
-see VB I-VP
-carriers NNS B-NP
-try VB B-VP
-to TO I-VP
-recoup VB I-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-price NN I-NP
-cutting VBG I-NP
-that WDT B-NP
-occurred VBD B-VP
-previously RB B-ADVP
-. . O
-'' '' O
-
-Not RB B-NP
-everyone NN I-NP
-believes VBZ B-VP
-that IN B-SBAR
-the DT B-NP
-good JJ I-NP
-times NNS I-NP
-are VBP B-VP
-over IN B-ADJP
-for IN B-PP
-shippers NNS B-NP
-. . O
-
-`` `` O
-There EX B-NP
-'s VBZ B-VP
-still RB B-ADVP
-a DT B-NP
-lot NN I-NP
-of IN B-PP
-pressure NN B-NP
-on IN B-PP
-rates NNS B-NP
-in IN B-PP
-both DT B-NP
-rail NN I-NP
-and CC I-NP
-truck NN I-NP
-, , O
-'' '' O
-said VBD B-VP
-Gerard NNP B-NP
-McCullough NNP I-NP
-, , O
-lecturer NN B-NP
-in IN B-PP
-transportation NN B-NP
-at IN B-PP
-Massachusetts NNP B-NP
-Institute NNP I-NP
-of IN B-PP
-Technology NNP B-NP
-. . O
-
-Less-than-truckload JJ B-NP
-companies NNS I-NP
-, , O
-which WDT B-NP
-carry VBP B-VP
-the DT B-NP
-freight NN I-NP
-of IN B-PP
-several JJ B-NP
-shippers NNS I-NP
-in IN B-PP
-each DT B-NP
-truck NN I-NP
-trailer NN I-NP
-, , O
-discounted VBD B-VP
-away RB B-ADVP
-a DT B-NP
-4.7 CD I-NP
-% NN I-NP
-rate NN I-NP
-increase NN I-NP
-implemented VBD B-VP
-last JJ B-NP
-April NNP I-NP
-. . O
-
-The DT B-NP
-carriers NNS I-NP
-were VBD B-VP
-competing VBG I-VP
-fiercely RB B-ADVP
-for IN B-PP
-market NN B-NP
-share NN I-NP
-. . O
-
-Railroad-rate JJ B-NP
-increases NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-restrained VBN I-VP
-by IN B-PP
-weakening VBG B-NP
-rail-traffic JJ I-NP
-levels NNS I-NP
-and CC O
-keen JJ B-NP
-competition NN I-NP
-for IN B-PP
-freight NN B-NP
-from IN B-PP
-trucks NNS B-NP
-. . O
-
-An DT B-NP
-official NN I-NP
-at IN B-PP
-Consolidated NNP B-NP
-Freightways NNP I-NP
-Inc. NNP I-NP
-, , O
-a DT B-NP
-Menlo NNP I-NP
-Park NNP I-NP
-, , I-NP
-Calif. NNP I-NP
-, , I-NP
-less-than-truckload JJ I-NP
-carrier NN I-NP
-, , O
-said VBD B-VP
-rate NN B-NP
-discounting NN I-NP
-in IN B-PP
-that DT B-NP
-industry NN I-NP
-has VBZ B-VP
-begun VBN I-VP
-to TO I-VP
-`` `` O
-stabilize VB B-VP
-. . O
-'' '' O
-
-Consolidated NNP B-NP
-Freightways NNP I-NP
-plans VBZ B-VP
-to TO I-VP
-raise VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-5.3 CD B-NP
-% NN I-NP
-late JJ B-NP
-this DT I-NP
-year NN I-NP
-or CC O
-early JJ B-NP
-next JJ I-NP
-year NN I-NP
-, , O
-and CC O
-at IN B-NP
-least JJS I-NP
-two CD I-NP
-competitors NNS I-NP
-have VBP B-VP
-announced VBN I-VP
-similar JJ B-NP
-increases NNS I-NP
-. . O
-
-Truckers NNS B-NP
-are VBP B-VP
-`` `` O
-trying VBG B-VP
-to TO I-VP
-send VB I-VP
-signals NNS B-NP
-that IN B-SBAR
-they PRP B-NP
-need VBP B-VP
-to TO I-VP
-stop VB I-VP
-the DT B-NP
-bloodletting NN I-NP
-, , O
-forget VB B-VP
-about IN B-PP
-market NN B-NP
-share NN I-NP
-and CC O
-go VB B-VP
-for IN B-PP
-higher JJR B-NP
-rates NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Michael NNP B-NP
-Lloyd NNP I-NP
-, , O
-an DT B-NP
-analyst NN I-NP
-at IN B-PP
-Salomon NNP B-NP
-Bros NNP I-NP
-. . O
-
-And CC O
-`` `` O
-shippers NNS B-NP
-are VBP B-VP
-getting VBG I-VP
-the DT B-NP
-feeling NN I-NP
-that IN B-SBAR
-they PRP B-NP
-have VBP B-VP
-played VBN I-VP
-one CD B-NP
-trucker NN I-NP
-off IN B-ADVP
-against IN B-PP
-another DT B-NP
-as RB B-NP
-much JJ I-NP
-as IN B-SBAR
-they PRP B-NP
-can MD B-VP
-, , O
-'' '' O
-he PRP B-NP
-said VBD B-VP
-. . O
-
-Air-freight NN B-NP
-carriers NNS I-NP
-raised VBD B-VP
-their PRP$ B-NP
-rates NNS I-NP
-for IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-going VBG B-VP
-across IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-to TO B-PP
-Asia NNP B-NP
-by IN B-PP
-about IN B-NP
-20 CD I-NP
-% NN I-NP
-earlier RBR B-NP
-this DT I-NP
-month NN I-NP
-. . O
-
-And CC O
-Japan NNP B-NP
-Air NNP I-NP
-Lines NNPS I-NP
-said VBD B-VP
-it PRP B-NP
-plans VBZ B-VP
-to TO I-VP
-boost VB I-VP
-its PRP$ B-NP
-rates NNS I-NP
-a DT B-NP
-further JJ I-NP
-25 CD I-NP
-% NN I-NP
-over IN B-PP
-the DT B-NP
-next JJ I-NP
-two CD I-NP
-years NNS I-NP
-. . O
-
-Such JJ B-NP
-rate NN I-NP
-increases NNS I-NP
-`` `` O
-will MD B-VP
-increase VB I-VP
-the DT B-NP
-total JJ I-NP
-cost NN I-NP
-of IN B-PP
-U.S. NNP B-NP
-products NNS I-NP
-and CC O
-slow JJ B-VP
-down RP B-PRT
-the DT B-NP
-rate NN I-NP
-of IN B-PP
-increase NN B-NP
-of IN B-PP
-U.S. NNP B-NP
-exports NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Richard NNP B-NP
-Connors NNP I-NP
-, , O
-a DT B-NP
-senior JJ I-NP
-vice NN I-NP
-president NN I-NP
-of IN B-PP
-Yusen NNP B-NP
-Air NNP I-NP
-& CC I-NP
-Sea NNP I-NP
-Service NNP I-NP
-U.S.A. NNP I-NP
-Inc. NNP I-NP
-, , O
-the DT B-NP
-U.S. NNP I-NP
-air-freight-forwarding JJ I-NP
-subsidiary NN I-NP
-of IN B-PP
-Nippon NNP B-NP
-Yusen NNP I-NP
-Kaisha NNP I-NP
-of IN B-PP
-Japan NNP B-NP
-. . O
-
-Ship NN B-NP
-companies NNS I-NP
-carrying VBG B-VP
-bulk NN B-NP
-commodities NNS I-NP
-, , O
-such JJ B-PP
-as IN I-PP
-oil NN B-NP
-, , O
-grain NN B-NP
-, , O
-coal NN B-NP
-and CC O
-iron NN B-NP
-ore NN I-NP
-, , O
-have VBP B-VP
-been VBN I-VP
-able JJ B-ADJP
-to TO B-VP
-increase VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-in IN B-PP
-the DT B-NP
-last JJ I-NP
-couple NN I-NP
-of IN B-PP
-years NNS B-NP
-. . O
-
-Some DT B-NP
-bulk NN I-NP
-shipping VBG I-NP
-rates NNS I-NP
-have VBP B-VP
-increased VBN I-VP
-`` `` O
-3 CD B-NP
-% NN I-NP
-to TO I-NP
-4 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-past JJ I-NP
-few JJ I-NP
-months NNS I-NP
-, , O
-'' '' O
-said VBD B-VP
-Salomon NNP B-NP
-'s POS B-NP
-Mr. NNP I-NP
-Lloyd NNP I-NP
-. . O
-
-And CC O
-ship NN B-NP
-lines NNS I-NP
-carrying VBG B-VP
-containers NNS B-NP
-are VBP B-VP
-also RB I-VP
-trying VBG I-VP
-to TO I-VP
-raise VB I-VP
-their PRP$ B-NP
-rates NNS I-NP
-. . O
-
-Carriers NNP B-NP
-boosted VBD B-VP
-rates NNS B-NP
-more JJR B-NP
-than IN I-NP
-10 CD I-NP
-% NN I-NP
-in IN B-PP
-the DT B-NP
-North NNP I-NP
-Atlantic NNP I-NP
-between IN B-PP
-the DT B-NP
-U.S. NNP I-NP
-and CC O
-Europe NNP B-NP
-last JJ B-NP
-September NNP I-NP
-, , O
-hoping VBG B-VP
-to TO I-VP
-partly RB I-VP
-restore VB I-VP
-rates NNS B-NP
-to TO B-PP
-earlier JJR B-NP
-levels NNS I-NP
-. . O
-
-Ship NN B-NP
-lines NNS I-NP
-operating VBG B-VP
-in IN B-PP
-the DT B-NP
-Pacific NNP I-NP
-plan NN B-VP
-to TO I-VP
-raise VB I-VP
-rates NNS B-NP
-on IN B-PP
-containers NNS B-NP
-carrying VBG B-VP
-U.S. NNP B-NP
-exports NNS I-NP
-to TO B-PP
-Asia NNP B-NP
-about IN B-NP
-10 CD I-NP
-% NN I-NP
-, , O
-effective JJ B-ADJP
-next JJ B-NP
-April NNP I-NP
-. . O
-
-MGM NNP B-NP
-Grand NNP I-NP
-Inc. NNP I-NP
-said VBD B-VP
-it PRP B-NP
-filed VBD B-VP
-a DT B-NP
-registration NN I-NP
-statement NN I-NP
-with IN B-PP
-the DT B-NP
-Securities NNP I-NP
-and CC I-NP
-Exchange NNP I-NP
-Commission NNP I-NP
-for IN B-PP
-a DT B-NP
-public JJ I-NP
-offering NN I-NP
-of IN B-PP
-six CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-. . O
-
-The DT B-NP
-Beverly NNP I-NP
-Hills NNP I-NP
-, , I-NP
-Calif.-based JJ I-NP
-company NN I-NP
-said VBD B-VP
-it PRP B-NP
-would MD B-VP
-have VB I-VP
-26.9 CD B-NP
-million CD I-NP
-common JJ I-NP
-shares NNS I-NP
-outstanding JJ B-ADJP
-after IN B-PP
-the DT B-NP
-offering NN I-NP
-. . O
-
-The DT B-NP
-hotel NN I-NP
-and CC I-NP
-Gaming NNP I-NP
-company NN I-NP
-said VBD B-VP
-Merrill NNP B-NP
-Lynch NNP I-NP
-Capital NNP I-NP
-Markets NNPS I-NP
-will MD B-VP
-lead VB I-VP
-the DT B-NP
-underwriters NNS I-NP
-. . O
-
-Proceeds NNS B-NP
-from IN B-PP
-the DT B-NP
-sale NN I-NP
-will MD B-VP
-be VB I-VP
-used VBN I-VP
-for IN B-PP
-remodeling VBG B-NP
-and CC I-NP
-refurbishing VBG I-NP
-projects NNS I-NP
-, , B-PP
-as RB I-PP
-well RB I-PP
-as IN I-PP
-for IN B-PP
-the DT B-NP
-planned VBN I-NP
-MGM NNP I-NP
-Grand NNP I-NP
-hotel\/casino NN I-NP
-and CC I-NP
-theme NN I-NP
-park NN I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-stewed JJ B-VP
-over IN B-PP
-a DT B-NP
-letter NN I-NP
-from IN B-PP
-his PRP$ B-NP
-manager NN I-NP
-putting VBG B-VP
-him PRP B-NP
-on IN B-PP
-probation NN B-NP
-for IN B-PP
-insubordination NN B-NP
-. . O
-
-Mr. NNP B-NP
-Stone NNP I-NP
-thought VBD B-VP
-the DT B-NP
-discipline NN I-NP
-was VBD B-VP
-unfair JJ B-ADJP
-; : O
-he PRP B-NP
-believed VBD B-VP
-that IN B-SBAR
-his PRP$ B-NP
-manager NN I-NP
-wanted VBD B-VP
-to TO I-VP
-get VB I-VP
-rid JJ B-ADJP
-of IN B-PP
-him PRP B-NP
-for IN B-PP
-personal JJ B-NP
-reasons NNS I-NP
-. . O
-
-Unable JJ B-ADJP
-to TO B-VP
-persuade VB I-VP
-the DT B-NP
-manager NN I-NP
-to TO B-VP
-change VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-, , O
-he PRP B-NP
-went VBD B-VP
-to TO B-PP
-a DT B-NP
-`` `` I-NP
-company NN I-NP
-court NN I-NP
-'' '' O
-for IN B-PP
-a DT B-NP
-hearing NN I-NP
-. . O
-
-At IN B-PP
-the DT B-NP
-scheduled VBN I-NP
-time NN I-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-entered VBD B-VP
-a DT B-NP
-conference NN I-NP
-room NN I-NP
-in IN B-PP
-a DT B-NP
-building NN I-NP
-near IN B-PP
-where WRB B-ADVP
-he PRP B-NP
-worked VBD B-VP
-. . O
-
-After IN B-SBAR
-the DT B-NP
-three CD I-NP
-members NNS I-NP
-of IN B-PP
-the DT B-NP
-court NN I-NP
-introduced VBD B-VP
-themselves PRP B-NP
-, , O
-the DT B-NP
-chairman NN I-NP
-of IN B-PP
-the DT B-NP
-panel NN I-NP
-said VBD B-VP
-: : O
-`` `` O
-Go VB B-VP
-ahead RB B-ADVP
-and CC O
-tell VB B-VP
-us PRP B-NP
-what WP B-NP
-happened VBD B-VP
-. . O
-
-We PRP B-NP
-may MD B-VP
-ask VB I-VP
-questions NNS B-NP
-as IN B-SBAR
-you PRP B-NP
-go VBP B-VP
-along IN B-PRT
-, , O
-or CC O
-we PRP B-NP
-may MD B-VP
-wait VB I-VP
-until IN B-PP
-the DT B-NP
-end NN I-NP
-. . O
-'' '' O
-
-No DT B-NP
-lawyers NNS I-NP
-or CC I-NP
-tape NN I-NP
-recorders NNS I-NP
-were VBD B-VP
-present JJ B-ADJP
-. . O
-
-The DT B-NP
-only RB I-NP
-extra JJ I-NP
-people NNS I-NP
-were VBD B-VP
-a DT B-NP
-couple NN I-NP
-of IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-, , O
-one CD B-NP
-of IN B-PP
-whom WP B-NP
-knew VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-'s POS B-NP
-case NN I-NP
-intimately RB B-ADVP
-and CC O
-would MD B-VP
-help VB I-VP
-fill VB I-VP
-in IN B-PRT
-any DT B-NP
-facts NNS I-NP
-needed VBN B-VP
-to TO B-VP
-give VB I-VP
-the DT B-NP
-court NN I-NP
-the DT B-NP
-full JJ I-NP
-picture NN I-NP
-. . O
-
-Over IN B-PP
-a DT B-NP
-cup NN I-NP
-of IN B-PP
-coffee NN B-NP
-, , O
-Mr. NNP B-NP
-Stone NNP I-NP
-told VBD B-VP
-his PRP$ B-NP
-story NN I-NP
-. . O
-
-He PRP B-NP
-talked VBD B-VP
-about IN B-NP
-20 CD I-NP
-minutes NNS I-NP
-. . O
-
-When WRB B-ADVP
-he PRP B-NP
-was VBD B-VP
-through IN B-ADJP
-, , O
-the DT B-NP
-court NN I-NP
-members NNS I-NP
-asked VBD B-VP
-many JJ B-NP
-questions NNS I-NP
-, , O
-then RB B-ADVP
-the DT B-NP
-chairman NN I-NP
-said VBD B-VP
-they PRP B-NP
-would MD B-VP
-like VB I-VP
-to TO I-VP
-hear VB I-VP
-his PRP$ B-NP
-manager NN I-NP
-'s POS B-NP
-side NN I-NP
-and CC O
-talk VB B-VP
-to TO B-PP
-witnesses NNS B-NP
-. . O
-
-The DT B-NP
-chairman NN I-NP
-promised VBD B-VP
-Mr. NNP B-NP
-Stone NNP I-NP
-a DT B-NP
-decision NN I-NP
-within IN B-PP
-two CD B-NP
-weeks NNS I-NP
-. . O
-
-Bob NNP B-NP
-Stone NNP I-NP
-is VBZ B-VP
-a DT B-NP
-fictional JJ I-NP
-name NN I-NP
-, , O
-but CC O
-the DT B-NP
-incident NN I-NP
-described VBN B-VP
-is VBZ B-VP
-real JJ B-ADJP
-. . O
-
-It PRP B-NP
-happened VBD B-VP
-at IN B-PP
-Northrop NNP B-NP
-Corp. NNP I-NP
-in IN B-PP
-Los NNP B-NP
-Angeles NNP I-NP
-. . O
-
-The DT B-NP
-court NN I-NP
-is VBZ B-VP
-called VBN I-VP
-the DT B-NP
-Management NNP I-NP
-Appeals NNP I-NP
-Committee NNP I-NP
-, , O
-or CC O
-just RB B-NP
-`` `` I-NP
-MAC NNP I-NP
-, , O
-'' '' O
-and CC O
-it PRP B-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-hear VB I-VP
-a DT B-NP
-couple NN I-NP
-of IN I-NP
-dozen NN I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-. . O
-
-Alter VB B-VP
-some DT B-NP
-details NNS I-NP
-of IN B-PP
-this DT B-NP
-example NN I-NP
-and CC O
-it PRP B-NP
-could MD B-VP
-be VB I-VP
-taking VBG I-VP
-place NN B-NP
-today NN B-ADVP
-at IN B-PP
-Federal NNP B-NP
-Express NNP I-NP
-in IN B-PP
-Memphis NNP B-NP
-, , O
-the DT B-NP
-Defense NNP I-NP
-and CC I-NP
-Underseas NNP I-NP
-Systems NNP I-NP
-divisions NNS I-NP
-of IN B-PP
-Honeywell NNP B-NP
-in IN B-PP
-Minneapolis NNP B-NP
-, , O
-a DT B-NP
-General NNP I-NP
-Electric NNP I-NP
-plant NN I-NP
-in IN B-PP
-Columbia NNP B-NP
-, , O
-Md. NNP B-NP
-, , O
-or CC O
-a DT B-NP
-number NN I-NP
-of IN B-PP
-other JJ B-NP
-companies NNS I-NP
-. . O
-
-These DT B-NP
-firms NNS I-NP
-are VBP B-VP
-pioneers NNS B-NP
-in IN B-PP
-a DT B-NP
-significant JJ I-NP
-new JJ I-NP
-trend NN I-NP
-in IN B-PP
-the DT B-NP
-corporate JJ I-NP
-world NN I-NP
-: : O
-the DT B-NP
-rise NN I-NP
-of IN B-PP
-what WP B-NP
-I PRP B-NP
-call VBP B-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-. . O
-
-Although IN B-SBAR
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-is VBZ B-VP
-practiced VBN I-VP
-today NN B-NP
-in IN B-PP
-few JJ B-NP
-companies NNS I-NP
--- : O
-perhaps RB B-ADVP
-40 CD B-NP
-to TO I-NP
-60 CD I-NP
--- : O
-it PRP B-NP
-is VBZ B-VP
-one CD B-NP
-of IN B-PP
-the DT B-NP
-fastest JJS I-NP
-developing VBG I-NP
-trends NNS I-NP
-in IN B-PP
-industry NN B-NP
-. . O
-
-In IN B-PP
-the DT B-NP
-coming VBG I-NP
-decade NN I-NP
-a DT B-NP
-majority NN I-NP
-of IN B-PP
-people-oriented JJ B-NP
-companies NNS I-NP
-are VBP B-VP
-likely JJ B-ADJP
-to TO B-VP
-adopt VB I-VP
-it PRP B-NP
-. . O
-
-Corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-appeals NNS B-VP
-to TO B-PP
-management NN B-NP
-for IN B-PP
-a DT B-NP
-variety NN I-NP
-of IN B-PP
-reasons NNS B-NP
-. . O
-
-It PRP B-NP
-reduces VBZ B-VP
-lawsuits NNS B-NP
-from IN B-PP
-disgruntled JJ B-NP
-employees NNS I-NP
-and CC I-NP
-ex-employees NNS I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-reduced VBN B-NP
-legal JJ I-NP
-costs NNS I-NP
-and CC O
-better RBR B-NP
-public JJ I-NP
-relations NNS I-NP
-. . O
-
-It PRP B-NP
-helps VBZ B-VP
-to TO I-VP
-keep VB I-VP
-out IN B-PRT
-unions NNS B-NP
-. . O
-
-It PRP B-NP
-increases VBZ B-VP
-employee NN B-NP
-commitment NN I-NP
-to TO B-PP
-the DT B-NP
-company NN I-NP
-, , O
-with IN B-PP
-all DT B-NP
-that WDT B-NP
-means VBZ B-VP
-for IN B-PP
-efficiency NN B-NP
-and CC O
-quality NN B-NP
-control NN I-NP
-. . O
-
-What WP B-NP
-must MD O
-your PRP$ B-NP
-management NN I-NP
-team NN I-NP
-do VBP B-VP
-to TO B-VP
-establish VB I-VP
-corporate JJ B-NP
-due JJ I-NP
-process NN I-NP
-? . O
-
-Here RB B-ADVP
-are VBP B-VP
-four CD B-NP
-key JJ I-NP
-steps NNS I-NP
-: : O
-
-1 CD B-LST
-. . O
-Make VB B-VP
-sure JJ B-ADJP
-you PRP B-NP
-have VBP B-VP
-a DT B-NP
-strong JJ I-NP
-personnel NNS I-NP
-department NN I-NP
-. . O
-
-It PRP B-NP
-must MD B-VP
-be VB I-VP
-able JJ B-ADJP
-to TO B-VP
-handle VB I-VP
-most RBS B-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-that WDT B-NP
-can MD B-VP
-not RB I-VP
-be VB I-VP
-solved VBN I-VP
-in IN B-PP
-the DT B-NP
-trenches NNS I-NP
-by IN B-PP
-managers NNS B-NP
-and CC O
-their PRP$ B-NP
-subordinates NNS I-NP
-, , O
-else RB B-ADVP
-the DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicators NNS I-NP
-will MD B-VP
-be VB B-VP
-inundated VBN I-VP
-with IN B-PP
-cases NNS B-NP
-. . O
-
-At IN B-PP
-Polaroid NNP B-NP
-, , O
-the DT B-NP
-Personnel NNP I-NP
-Policy NNP I-NP
-Planning NNP I-NP
-Committee NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-about IN I-NP
-20 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-; : O
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-many JJ I-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-are VBP B-VP
-resolved VBN I-VP
-at IN B-PP
-earlier JJR B-NP
-stages NNS I-NP
-. . O
-
-At IN B-PP
-TWA NNP B-NP
-, , O
-the DT B-NP
-System NNP I-NP
-Board NNP I-NP
-of IN B-PP
-Adjustment NNP B-NP
-hears VBZ B-VP
-50 CD B-NP
-to TO I-NP
-75 CD I-NP
-cases VBZ I-NP
-a DT B-NP
-year NN I-NP
-, , O
-only RB B-NP
-a DT I-NP
-fraction NN I-NP
-of IN B-PP
-the DT B-NP
-complaints NNS I-NP
-brought VBN B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-At IN B-PP
-Citicorp NNP B-NP
-, , O
-the DT B-NP
-Problem NNP I-NP
-Review NNP I-NP
-Board NNP I-NP
-may MD B-VP
-hear VB I-VP
-only RB B-NP
-12 CD I-NP
-or CC I-NP
-so RB I-NP
-cases VBZ I-NP
-because IN B-PP
-of IN I-PP
-personnel NNS B-NP
-'s POS B-NP
-skill NN I-NP
-in IN B-PP
-complaint-resolution NN B-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-typical JJ I-NP
-year NN I-NP
-, , O
-up IN B-NP
-to TO I-NP
-20 CD I-NP
-% NN I-NP
-of IN B-PP
-the DT B-NP
-work NN I-NP
-force NN I-NP
-goes VBZ B-VP
-to TO B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-with IN B-PP
-complaints NNS B-NP
-of IN B-PP
-unfair JJ B-NP
-treatment NN I-NP
-. . O
-
-In IN B-PP
-a DT B-NP
-large JJ I-NP
-company NN I-NP
-that WDT B-NP
-means VBZ B-VP
-many JJ B-NP
-hundreds NNS I-NP
-of IN B-PP
-complaints NNS B-NP
-for IN B-PP
-personnel NNS B-NP
-to TO B-VP
-handle VB I-VP
-. . O
-
-2 CD B-LST
-. . O
-Formally RB B-ADVP
-or CC I-ADVP
-informally RB I-ADVP
-, , O
-train NN B-VP
-all DT B-NP
-your PRP$ I-NP
-managers NNS I-NP
-and CC I-NP
-supervisors NNS I-NP
-in IN B-PP
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-due-process NN I-NP
-approach NN I-NP
-. . O
-
-See VB B-VP
-that IN B-SBAR
-they PRP B-NP
-know VBP B-VP
-company NN B-NP
-personnel NNS I-NP
-policy NN I-NP
-backwards RB B-ADVP
-and CC I-ADVP
-forwards RB I-ADVP
-, , O
-for IN O
-it PRP B-NP
-is VBZ B-VP
-the DT B-NP
-`` `` I-NP
-law NN I-NP
-'' '' O
-governing VBG B-VP
-company NN B-NP
-courts NNS I-NP
-and CC I-NP
-adjudicators NNS I-NP
-. . O
-
-Coach NNP B-VP
-them PRP B-NP
-in IN B-PP
-handling NN B-VP
-complaints NNS B-NP
-so RB B-SBAR
-that IN I-SBAR
-they PRP B-NP
-can MD B-VP
-resolve VB I-VP
-problems NNS B-NP
-immediately RB B-ADVP
-. . O
-
-In IN B-SBAR
-case NN O
-managers NNS B-NP
-and CC O
-personnel NNS B-NP
-specialists NNS I-NP
-are VBP B-VP
-unsuccessful JJ B-ADJP
-and CC O
-subordinates NNS B-NP
-take VBP B-VP
-their PRP$ B-NP
-complaints NNS I-NP
-to TO B-PP
-a DT B-NP
-company NN I-NP
-court NN I-NP
-or CC I-NP
-adjudicator NN I-NP
-, , O
-teach VB B-VP
-managers NNS B-NP
-to TO B-VP
-accept VB I-VP
-reversals NNS B-NP
-as IN B-PP
-a DT B-NP
-fact NN I-NP
-of IN B-PP
-business NN B-NP
-life NN I-NP
-, , O
-for IN O
-in IN B-PP
-a DT B-NP
-good JJ I-NP
-due-process NN I-NP
-system NN I-NP
-they PRP B-NP
-are VBP B-VP
-bound VBN I-VP
-to TO I-VP
-happen VB I-VP
-. . O
-
-In IN B-PP
-the DT B-NP
-15 CD I-NP
-companies NNS I-NP
-I PRP B-NP
-studied VBD B-VP
-, , O
-reversal NN B-NP
-rates NNS I-NP
-range VBP B-VP
-on IN B-PP
-the DT B-NP
-average NN I-NP
-from IN B-PP
-20 CD B-NP
-% NN I-NP
-to TO B-PP
-40 CD B-NP
-% NN I-NP
-. . O
-
-3 CD B-LST
-. . O
-Decide VB B-VP
-whether IN O
-you PRP B-NP
-want VBP B-VP
-a DT B-NP
-panel NN I-NP
-system NN I-NP
-or CC O
-a DT B-NP
-single JJ I-NP
-adjudicator NN I-NP
-. . O
-
-A DT B-NP
-panel NN I-NP
-system NN I-NP
-like IN B-PP
-that DT B-NP
-in NN B-PP
-the DT B-NP
-Bob NNP I-NP
-Stone NNP I-NP
-example NN I-NP
-enjoys VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-high JJ B-NP
-credibility NN I-NP
-and CC O
-, , O
-for IN B-PP
-the DT B-NP
-panelists NNS I-NP
-, , O
-mutual JJ B-NP
-support NN I-NP
-. . O
-
-An DT B-NP
-adjudicator NN I-NP
-system NN I-NP
--- : O
-that DT B-INTJ
-is VBZ I-INTJ
-, , O
-an DT B-NP
-investigator NN I-NP
-who WP B-NP
-acts VBZ B-VP
-first JJ B-ADVP
-as IN B-PP
-a DT B-NP
-fact-finder NN I-NP
-and CC O
-then RB O
-switches VBZ B-VP
-hats NNS B-NP
-and CC O
-arbitrates VBZ B-VP
-the DT B-NP
-facts NNS I-NP
--- : O
-has VBZ B-VP
-such JJ B-NP
-advantages NNS I-NP
-as IN B-PP
-speed NN B-NP
-, , O
-flexibility NN B-NP
-and CC O
-maximum JJ B-NP
-privacy NN I-NP
-. . O
-
-International NNP B-NP
-Business NNP I-NP
-Machines NNPS I-NP
-and CC O
-Bank NNP B-NP
-of IN B-PP
-America NNP B-NP
-are VBP B-VP
-among IN B-PP
-the DT B-NP
-companies NNS I-NP
-using VBG B-VP
-the DT B-NP
-single-adjudicator JJ I-NP
-approach NN I-NP
-. . O
-
-4 CD B-LST
-. . O
-Make VB B-VP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-visible JJ B-ADJP
-. . O
-
-It PRP B-NP
-wo MD B-VP
-n't RB I-VP
-do VB I-VP
-any DT B-NP
-good NN I-NP
-for IN B-PP
-anybody NN B-NP
-unless IN B-SBAR
-employees NNS B-NP
-know VBP B-VP
-about IN B-PP
-it PRP B-NP
-. . O
-
-Most JJS B-NP
-managements NNS I-NP
-hesitate VBP B-VP
-to TO I-VP
-go VB I-VP
-all DT B-ADVP
-out NN I-ADVP
-in IN B-PP
-advertising VBG B-VP
-their PRP$ B-NP
-due-process NN I-NP
-systems NNS I-NP
-for IN B-PP
-fear NN B-NP
-of IN B-PP
-encouraging VBG B-VP
-cranks NNS B-NP
-and CC O
-chronic JJ B-NP
-soreheads NNS I-NP
-to TO B-VP
-file VB I-VP
-complaints NNS B-NP
-. . O
-
-On IN B-PP
-the DT B-NP
-other JJ I-NP
-hand NN I-NP
-, , O
-they PRP B-NP
-make VBP B-VP
-sure JJ B-ADJP
-at IN B-PP
-a DT B-NP
-minimum NN I-NP
-that IN B-SBAR
-their PRP$ B-NP
-systems NNS I-NP
-are VBP B-VP
-described VBN I-VP
-in IN B-PP
-their PRP$ B-NP
-employee NN I-NP
-handbooks NNS I-NP
-and CC O
-talked VBD B-VP
-up IN B-PRT
-by IN B-PP
-personnel NNS B-NP
-specialists NNS I-NP
-. . O
-
-Smith-Kline NNP B-NP
-Beecham NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-and CC O
-sometimes RB B-VP
-features VBZ I-VP
-its PRP$ B-NP
-grievance NN I-NP
-procedure NN I-NP
-in IN B-PP
-closed-circuit JJ B-NP
-TV NN I-NP
-programs NNS I-NP
-. . O
-
-Naturally RB B-ADVP
-, , O
-one CD B-NP
-of IN B-PP
-the DT B-NP
-best JJS I-NP
-ways NNS I-NP
-to TO B-VP
-guarantee VB I-VP
-visibility NN B-NP
-for IN B-PP
-your PRP$ B-NP
-due-process NN I-NP
-system NN I-NP
-is VBZ B-VP
-for IN B-SBAR
-top JJ B-NP
-management NN I-NP
-to TO B-VP
-support VB I-VP
-it PRP B-NP
-. . O
-
-At IN B-PP
-IBM NNP B-NP
-, , O
-the DT B-NP
-company NN I-NP
-'s POS B-NP
-Open NNP I-NP
-Door NNP I-NP
-system NN I-NP
-is VBZ B-VP
-sometimes RB B-ADVP
-the DT B-NP
-subject NN I-NP
-of IN B-PP
-memorandums NNS B-NP
-from IN B-PP
-the DT B-NP
-chief JJ I-NP
-executive NN I-NP
-. . O
-
-Federal NNP B-NP
-Express NNP I-NP
-goes VBZ B-VP
-further JJ B-ADVP
-in IN B-PP
-this DT B-NP
-respect NN I-NP
-than IN B-PP
-any DT B-NP
-company NN I-NP
-I PRP B-NP
-know VBP B-VP
-of IN B-PP
-with IN B-PP
-both DT B-NP
-Frederick NNP B-NP
-Smith NNP I-NP
-and CC O
-James NNP B-NP
-Barksdale NNP I-NP
-, , O
-chief JJ B-NP
-executive NN I-NP
-and CC O
-chief JJ B-NP
-operating VBG I-NP
-officer NN I-NP
-, , O
-respectively RB B-ADVP
-, , O
-sitting VBG B-VP
-in IN B-PRT
-on IN B-PP
-the DT B-NP
-Appeals NNP I-NP
-Board NNP I-NP
-almost RB B-NP
-every DT I-NP
-Tuesday NNP I-NP
-to TO B-VP
-decide VB I-VP
-cases NNS B-NP
-. . O
-
-Mr. NNP B-NP
-Ewing NNP I-NP
-is VBZ B-VP
-a DT B-NP
-consultant NN I-NP
-based VBN B-VP
-in IN B-PP
-Winchester NNP B-NP
-, , O
-Mass. NNP B-NP
-, , O
-and CC O
-author NN B-NP
-of IN B-PP
-`` `` O
-Justice NNP B-NP
-on IN B-PP
-the DT B-NP
-Job NNP I-NP
-: : O
-Resolving NNP B-VP
-Grievances NNP B-NP
-in IN B-PP
-the DT B-NP
-Nonunion NNP I-NP
-Workplace NN I-NP
-'' '' O
--LRB- ( O
-Harvard NNP B-NP
-Business NNP I-NP
-School NNP I-NP
-Press NNP I-NP
-, , O
-1989 CD B-NP
--RRB- ) O
-. . O
-
-Tokyo NNP B-NP
-stocks NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-active JJ B-NP
-trading NN I-NP
-Friday NNP B-NP
-, , O
-marking VBG B-VP
-the DT B-NP
-fourth JJ I-NP
-consecutive JJ I-NP
-daily JJ I-NP
-gain NN I-NP
-since IN B-PP
-Monday NNP B-NP
-'s POS B-NP
-sharp JJ I-NP
-fall NN I-NP
-. . O
-
-London JJ B-NP
-shares NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-in IN B-PP
-thin JJ B-NP
-trading NN I-NP
-. . O
-
-At IN B-PP
-Tokyo NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-of IN B-PP
-225 CD B-NP
-selected VBN I-NP
-issues NNS I-NP
-was VBD B-VP
-up IN B-ADVP
-112.16 CD B-NP
-points NNS I-NP
-to TO B-PP
-35486.38 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-advanced VBD B-VP
-266.66 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-early JJ B-NP
-trading NN I-NP
-in IN B-PP
-Tokyo NNP B-NP
-Monday NNP B-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-index NN I-NP
-rose VBD B-VP
-101.98 CD B-NP
-points NNS I-NP
-to TO B-PP
-35588.36 CD B-NP
-. . O
-
-Friday NNP B-NP
-'s POS B-NP
-volume NN I-NP
-on IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-one CD B-NP
-billion CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-862 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-Winners NNS B-NP
-outpaced VBD B-VP
-losers NNS B-NP
-, , O
-572 CD B-ADVP
-to TO I-ADVP
-368 CD I-ADVP
-, , O
-while IN B-SBAR
-181 CD B-NP
-issues NNS I-NP
-remained VBD B-VP
-unchanged JJ B-ADJP
-. . O
-
-With IN B-SBAR
-investors NNS B-NP
-relieved VBN B-ADJP
-at IN B-PP
-the DT B-NP
-overnight JJ I-NP
-gain NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-small-lot JJ B-NP
-buying NN I-NP
-orders NNS I-NP
-streamed VBD B-VP
-into IN B-PP
-the DT B-NP
-market NN I-NP
-from IN B-PP
-early JJ B-NP
-morning NN I-NP
-, , O
-making VBG B-VP
-traders NNS B-NP
-believe VBP B-VP
-the DT B-NP
-market NN I-NP
-was VBD B-VP
-back RB B-ADVP
-to TO B-PP
-normal JJ B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-, , O
-which WDT B-NP
-reached VBD B-VP
-as RB B-ADJP
-high JJ I-ADJP
-as IN B-PP
-35611.38 CD B-NP
-right NN B-ADVP
-after IN B-PP
-the DT B-NP
-opening NN I-NP
-, , O
-surrendered VBD B-VP
-part NN B-NP
-of IN B-PP
-its PRP$ B-NP
-early JJ I-NP
-advance NN I-NP
-toward IN B-PP
-the DT B-NP
-end NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-because IN B-PP
-of IN I-PP
-profit-taking NN B-NP
-. . O
-
-`` `` O
-Investors NNS B-NP
-, , B-NP
-especially RB I-NP
-dealers NNS B-NP
-, , O
-do VBP B-VP
-n't RB I-VP
-want VB I-VP
-to TO I-VP
-hold VB I-VP
-a DT B-NP
-position NN I-NP
-over IN B-PP
-the DT B-NP
-weekend NN I-NP
-, , O
-'' '' O
-a DT B-NP
-trader NN I-NP
-at IN B-PP
-Dai-ichi NNP B-NP
-Securities NNP I-NP
-said VBD B-VP
-, , O
-adding VBG B-VP
-, , O
-though RB B-ADVP
-, , O
-that IN B-SBAR
-the DT B-NP
-trading NN I-NP
-mood NN I-NP
-remained VBD B-VP
-positive JJ B-ADJP
-through IN B-PP
-the DT B-NP
-afternoon NN I-NP
-session NN I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-Stock NNP I-NP
-Price NNP I-NP
-Index NNP I-NP
--LRB- ( O
-Topix NNP B-NP
--RRB- ) O
-of IN B-PP
-all DT B-NP
-issues NNS I-NP
-listed VBN B-VP
-in IN B-PP
-the DT B-NP
-First NNP I-NP
-Section NN I-NP
-, , O
-which WDT B-NP
-gained VBD B-VP
-22.78 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-14.06 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.53 CD B-NP
-% NN I-NP
-, , O
-at IN B-PP
-2679.72 CD B-NP
-. . O
-
-The DT B-NP
-Second JJ I-NP
-Section NN I-NP
-index NN I-NP
-, , O
-which WDT B-NP
-rose VBD B-VP
-15.72 CD B-NP
-points NNS I-NP
-Thursday NNP B-NP
-, , O
-was VBD B-VP
-up IN B-ADVP
-11.88 CD B-NP
-points NNS I-NP
-, , O
-or CC O
-0.32 CD B-NP
-% NN I-NP
-, , O
-to TO B-VP
-close VB I-VP
-at IN B-PP
-3717.46 CD B-NP
-. . O
-
-Volume NN B-NP
-in IN B-PP
-the DT B-NP
-second JJ I-NP
-section NN I-NP
-was VBD B-VP
-estimated VBN I-VP
-at IN B-PP
-30 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-up IN B-ADVP
-from IN B-PP
-28 CD B-NP
-million CD I-NP
-Thursday NNP B-NP
-. . O
-
-In IN B-PP
-turmoil NN B-NP
-caused VBN B-VP
-by IN B-PP
-the DT O
-previous JJ B-NP
-Friday NNP I-NP
-'s POS B-NP
-plunge NN I-NP
-in IN B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-, , O
-the DT B-NP
-Nikkei NNP I-NP
-marked VBD B-VP
-a DT B-NP
-sharp JJ I-NP
-647.33-point JJ I-NP
-fall NN I-NP
-Monday NNP B-NP
-. . O
-
-But CC O
-the DT B-NP
-Nikkei NNP I-NP
-fell VBD B-VP
-an DT B-NP
-overall JJ I-NP
-1.8 CD I-NP
-% NN I-NP
-in IN B-PP
-value NN B-NP
-that DT B-NP
-day NN I-NP
-compared VBN B-PP
-with IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS I-NP
-far RB B-ADJP
-sharper JJR I-ADJP
-6.9 CD B-ADJP
-% NN I-ADJP
-drop NN B-NP
-on IN B-PP
-Oct. NNP B-NP
-13 CD I-NP
-. . O
-
-The DT B-NP
-Tokyo NNP I-NP
-market NN I-NP
-'s POS B-NP
-resiliency NN I-NP
-helped VBD B-VP
-participants NNS B-NP
-to TO B-VP
-regain VB I-VP
-confidence NN B-NP
-gradually RB B-ADVP
-as IN B-SBAR
-they PRP B-NP
-spent VBD B-VP
-more JJR B-NP
-time NN I-NP
-on IN B-PP
-analyzing VBG B-VP
-factors NNS B-NP
-that WDT B-NP
-caused VBD B-VP
-the DT B-NP
-Friday NNP I-NP
-plunge NN I-NP
-and CC O
-realized VBD B-VP
-these DT B-NP
-problems NNS I-NP
-were VBD B-VP
-unique JJ B-ADJP
-to TO B-PP
-New NNP B-NP
-York NNP I-NP
-stocks NNS I-NP
-and CC B-ADJP
-not RB I-ADJP
-directly RB B-ADJP
-related VBN I-ADJP
-to TO B-PP
-Tokyo NNP B-NP
-. . O
-
-The DT B-NP
-Nikkei NNP I-NP
-continued VBD B-VP
-to TO I-VP
-gain VB I-VP
-for IN B-PP
-the DT B-NP
-rest NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-, , O
-adding VBG B-VP
-1017.69 CD B-NP
-points NNS I-NP
-in IN B-PP
-four CD B-NP
-days NNS I-NP
--- : O
-more JJR B-VP
-than IN I-VP
-erasing VBG I-VP
-Monday NNP B-NP
-'s POS B-NP
-losses NNS I-NP
-. . O
-
-But CC O
-further JJ B-NP
-major JJ I-NP
-advances NNS I-NP
-on IN B-PP
-the DT B-NP
-Nikkei NNP I-NP
-are VBP B-VP
-n't RB I-VP
-foreseen VBN I-VP
-this DT B-NP
-week NN I-NP
-by IN B-PP
-market NN B-NP
-observers NNS I-NP
-. . O
-
-Investors NNS B-NP
-are VBP B-VP
-still RB I-VP
-waiting VBG I-VP
-to TO I-VP
-see VB I-VP
-how WRB B-ADVP
-the DT B-NP
-U.S. NNP I-NP
-government NN I-NP
-will MD B-VP
-decide VB I-VP
-on IN B-PP
-interest NN B-NP
-rates NNS I-NP
-and CC O
-how WRB B-ADVP
-the DT B-NP
-dollar NN I-NP
-will MD B-VP
-be VB I-VP
-stabilized VBN I-VP
-. . O
-
-Some DT B-NP
-high-priced JJ I-NP
-issues NNS I-NP
-made VBD B-VP
-a DT B-NP
-comeback NN I-NP
-Friday NNP B-NP
-. . O
-
-Pioneer NNP B-NP
-surged VBD B-VP
-450 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-3.16 CD I-NP
--RRB- ) O
-to TO B-PP
-6,050 CD B-NP
-yen NN I-NP
--LRB- ( O
-$ $ B-NP
-42.60 CD I-NP
--RRB- ) O
-. . O
-
-Kyocera NNP B-NP
-advanced VBD B-VP
-80 CD B-NP
-yen NN I-NP
-to TO B-PP
-5,440 CD B-NP
-. . O
-
-Fanuc NNP B-NP
-gained VBD B-VP
-100 CD B-NP
-to TO B-PP
-7,580 CD B-NP
-. . O
-
-Breweries NNP B-NP
-attracted VBD B-VP
-investors NNS B-NP
-because IN B-PP
-of IN I-PP
-their PRP$ B-NP
-land NN I-NP
-property NN I-NP
-holdings NNS I-NP
-that WDT B-NP
-could MD B-VP
-figure VB I-VP
-in IN B-PP
-development NN B-NP
-or CC O
-other JJ B-NP
-plans NNS I-NP
-, , O
-traders NNS B-NP
-said VBD B-VP
-. . O
-
-Sapporo NNP B-NP
-gained VBD B-VP
-80 CD B-NP
-to TO B-PP
-1,920 CD B-NP
-and CC O
-Kirin NNP B-NP
-added VBD B-VP
-60 CD B-NP
-to TO B-PP
-2,070 CD B-NP
-. . O
-
-Housings NNS B-NP
-, , I-NP
-constructions NNS I-NP
-and CC I-NP
-pharmaceuticals NNS I-NP
-continued VBD B-VP
-to TO I-VP
-be VB I-VP
-bought VBN I-VP
-following VBG B-PP
-Thursday NNP B-NP
-'s POS B-NP
-gains NNS I-NP
-because IN B-PP
-of IN I-PP
-strong JJ B-NP
-earnings NNS I-NP
-outlooks NNS I-NP
-. . O
-
-Daiwa NNP B-NP
-House NNP I-NP
-gained VBD B-VP
-50 CD B-NP
-to TO B-PP
-2,660 CD B-NP
-. . O
-
-Misawa NNP B-NP
-Homes NNP I-NP
-was VBD B-VP
-up IN B-ADVP
-20 CD B-NP
-at IN B-PP
-2,960 CD B-NP
-. . O
-
-Kajima NNP B-NP
-advanced VBD B-VP
-40 CD B-NP
-to TO B-PP
-2,120 CD B-NP
-and CC O
-Ohbayashi NNP B-NP
-added VBD B-VP
-50 CD B-NP
-to TO B-PP
-1,730 CD B-NP
-. . O
-
-Fujisawa NNP B-NP
-added VBD B-VP
-80 CD B-NP
-to TO B-PP
-2,010 CD B-NP
-and CC O
-Mochida NNP B-NP
-advanced VBD B-VP
-230 CD B-NP
-to TO B-PP
-4,400 CD B-NP
-. . O
-
-London JJ B-NP
-share NN I-NP
-prices NNS I-NP
-were VBD B-VP
-influenced VBN I-VP
-largely RB B-ADVP
-by IN B-PP
-declines NNS B-NP
-on IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-and CC O
-weakness NN B-NP
-in IN B-PP
-the DT B-NP
-British JJ I-NP
-pound NN I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-Financial NNP I-NP
-Times-Stock NNP I-NP
-Exchange NNP I-NP
-100-share JJ I-NP
-index NN I-NP
-ended VBD B-VP
-10.2 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-2179.1 CD B-NP
-, , O
-above IN B-ADVP
-its PRP$ B-NP
-intraday JJ I-NP
-low NN I-NP
-of IN B-PP
-2176.9 CD B-NP
-, , B-ADVP
-but CC I-ADVP
-off IN B-ADVP
-the DT B-NP
-day NN I-NP
-'s POS I-NP
-high NN B-NP
-of IN B-PP
-2189 CD B-NP
-. . O
-
-The DT B-NP
-index NN I-NP
-finished VBD B-VP
-2.4 CD B-NP
-% NN I-NP
-under IN B-PP
-its PRP$ B-NP
-close NN I-NP
-of IN B-PP
-2233.9 CD B-NP
-the DT B-NP
-previous JJ I-NP
-Friday NNP I-NP
-, , O
-although IN B-SBAR
-it PRP B-NP
-recouped VBD B-VP
-some DT B-NP
-of IN B-PP
-the DT B-NP
-sharp JJ I-NP
-losses NNS I-NP
-staged VBD B-VP
-early JJ B-NP
-last JJ I-NP
-week NN I-NP
-on IN B-PP
-the DT B-NP
-back RB I-NP
-of IN B-PP
-Wall NNP B-NP
-Street NNP I-NP
-'s POS B-NP
-fall NN I-NP
-. . O
-
-London NNP B-NP
-was VBD B-VP
-weak JJ B-ADJP
-throughout IN B-PP
-Friday NNP B-NP
-'s POS B-NP
-trading NN I-NP
-, , O
-however RB B-ADVP
-, , O
-on IN B-PP
-what WP B-NP
-dealers NNS B-NP
-attributed VBD B-VP
-to TO B-PP
-generally RB B-NP
-thin JJ I-NP
-interest NN I-NP
-ahead RB B-ADVP
-of IN B-PP
-the DT B-NP
-weekend NN I-NP
-and CC O
-this DT B-NP
-week NN I-NP
-'s POS I-NP
-potentially RB B-ADJP
-important JJ I-ADJP
-U.K. NNP B-NP
-trade NN I-NP
-figures NNS I-NP
-for IN B-PP
-September NNP B-NP
-. . O
-
-The DT B-NP
-FT-SE NNP I-NP
-100 CD I-NP
-largely RB B-ADVP
-remained VBD B-VP
-within IN B-PP
-an DT B-NP
-11-point JJ I-NP
-range NN I-NP
-establshed VBN B-VP
-within IN B-PP
-the DT B-NP
-first JJ I-NP
-hour NN I-NP
-of IN B-PP
-trading NN B-NP
-before IN B-PP
-it PRP B-NP
-eased VBD B-VP
-to TO B-PP
-an DT B-NP
-intraday JJ I-NP
-low JJ I-NP
-late RB B-ADVP
-in IN B-PP
-the DT B-NP
-session NN I-NP
-when WRB B-ADVP
-a DT B-NP
-flurry NN I-NP
-of IN B-PP
-program NN B-NP
-selling VBG I-NP
-pushed VBN B-VP
-Wall NNP B-NP
-Street NNP I-NP
-lower JJR B-ADVP
-. . O
-
-The DT B-NP
-FT NNP I-NP
-30-share JJ I-NP
-index NN I-NP
-closed VBD B-VP
-11.0 CD B-NP
-points NNS I-NP
-lower JJR B-ADVP
-at IN B-PP
-1761.0 CD B-NP
-. . O
-
-Volume NN B-NP
-was VBD B-VP
-extremely RB B-ADJP
-thin JJ I-ADJP
-at IN B-PP
-351.3 CD B-NP
-million CD I-NP
-shares NNS I-NP
-, , O
-the DT B-NP
-lightest JJS I-NP
-volume NN I-NP
-of IN B-PP
-the DT B-NP
-week NN I-NP
-and CC O
-modestly RB B-ADVP
-under IN B-PP
-Thursday NNP B-NP
-'s POS B-NP
-387.4 CD I-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-the DT B-NP
-day NN I-NP
-'s POS B-NP
-action NN I-NP
-was VBD B-VP
-featureless JJ B-ADJP
-outside IN B-PP
-some DT B-NP
-response NN I-NP
-to TO B-PP
-sterling NN B-NP
-'s POS B-NP
-early JJ I-NP
-weakness NN I-NP
-against IN B-PP
-the DT B-NP
-mark NN I-NP
-, , O
-and CC O
-fears NNS B-NP
-that IN B-SBAR
-Wall NNP B-NP
-Street NNP I-NP
-might MD B-VP
-open RB I-VP
-lower JJR B-ADVP
-after IN B-PP
-its PRP$ B-NP
-strong JJ I-NP
-leap NN I-NP
-forward RB B-ADVP
-Thursday NNP B-NP
-. . O
-
-They PRP B-NP
-added VBD B-VP
-that IN B-SBAR
-market-makers NNS B-NP
-were VBD B-VP
-largely RB I-VP
-sidelined VBN I-VP
-after IN B-PP
-aggressively RB B-VP
-supporting VBG I-VP
-the DT B-NP
-market NN I-NP
-Thursday NNP B-NP
-in IN B-PP
-their PRP$ B-NP
-quest NN I-NP
-to TO B-VP
-cover VB I-VP
-internal JJ B-NP
-shortages NNS I-NP
-of IN B-PP
-FT-SE NNP B-NP
-100 CD I-NP
-shares NNS I-NP
-. . O
-
-Interest NN B-NP
-may MD B-VP
-remain VB I-VP
-limited JJ B-ADJP
-into IN B-PP
-tomorrow NN B-NP
-'s POS B-NP
-U.K. NNP I-NP
-trade NN I-NP
-figures NNS I-NP
-, , O
-which WDT B-NP
-the DT B-NP
-market NN I-NP
-will MD B-VP
-be VB I-VP
-watching VBG I-VP
-closely RB B-ADVP
-to TO B-VP
-see VB I-VP
-if IN B-SBAR
-there EX B-NP
-is VBZ B-VP
-any DT B-NP
-improvement NN I-NP
-after IN B-PP
-disappointing JJ B-NP
-numbers NNS I-NP
-in IN B-PP
-the DT B-NP
-previous JJ I-NP
-two CD I-NP
-months NNS I-NP
-. . O
-
-The DT B-NP
-key JJ I-NP
-corporate JJ I-NP
-news NN I-NP
-of IN B-PP
-the DT B-NP
-day NN I-NP
-was VBD B-VP
-that IN B-SBAR
-British JJ B-NP
-Airways NNPS I-NP
-decided VBD B-VP
-to TO I-VP
-withdraw VB I-VP
-from IN B-PP
-a DT B-NP
-management-led JJ I-NP
-bid NN I-NP
-for IN B-PP
-UAL NNP B-NP
-Corp. NNP I-NP
-, , O
-the DT B-NP
-parent NN I-NP
-of IN B-PP
-United NNP B-NP
-Airlines NNPS I-NP
-. . O
-
-British JJ B-NP
-Airways NNPS I-NP
-rose VBD B-VP
-initially RB B-ADVP
-after IN B-PP
-announcing VBG B-VP
-its PRP$ B-NP
-withdrawal NN I-NP
-from IN B-PP
-the DT B-NP
-UAL NNP I-NP
-deal NN I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-they PRP B-NP
-viewed VBD B-VP
-the DT O
-initial JJ O
-# # O
-390-million CD O
--LRB- ( O
-$ $ B-ADJP
-622 CD O
-million CD O
--RRB- ) O
-outlay NN B-NP
-for IN B-PP
-a DT B-NP
-15 CD I-NP
-% NN I-NP
-stake NN I-NP
-in IN B-PP
-the DT B-NP
-airline NN I-NP
-as IN B-PP
-a DT B-NP
-bit NN I-NP
-much JJ I-NP
-. . O
-
-Its PRP$ B-NP
-shares NNS I-NP
-slid VBD B-VP
-in IN B-PP
-late JJ B-NP
-dealings NNS I-NP
-to TO B-VP
-close VB I-VP
-a DT B-NP
-penny NN I-NP
-per IN B-PP
-share NN B-NP
-lower JJR B-ADVP
-at IN B-PP
-197 CD B-NP
-pence NN I-NP
-. . O
-
-The DT B-NP
-airline NN I-NP
-was VBD B-VP
-the DT B-NP
-most RBS I-NP
-active JJ I-NP
-FT-SE NNP I-NP
-100 CD I-NP
-at IN B-PP
-8.2 CD B-NP
-million CD I-NP
-shares NNS I-NP
-traded VBN B-VP
-. . O
-
-The DT B-NP
-next JJ I-NP
-most RBS I-NP
-active JJ I-NP
-top-tier JJ I-NP
-stock NN I-NP
-was VBD B-VP
-B.A.T NNP B-NP
-Industries NNPS I-NP
-, , O
-the DT B-NP
-target NN I-NP
-of IN B-PP
-Sir NNP B-NP
-James NNP I-NP
-Goldsmith NNP I-NP
-'s POS B-NP
-# # B-ADJP
-13.4 CD O
-billion CD O
-bid NN B-NP
-. . O
-
-The DT B-NP
-company NN I-NP
-gained VBD B-VP
-shareholder NN B-NP
-approval NN I-NP
-Thursday NNP B-NP
-to TO B-VP
-restructure VB I-VP
-in IN B-PP
-a DT B-NP
-bid NN I-NP
-to TO B-VP
-fend VB I-VP
-off IN B-PRT
-the DT B-NP
-hostile JJ I-NP
-takeover NN I-NP
-. . O
-
-Sir NNP B-NP
-James NNP I-NP
-said VBD B-VP
-Thursday NNP B-NP
-night NN I-NP
-that IN B-SBAR
-his PRP$ B-NP
-plans NNS I-NP
-for IN B-PP
-the DT B-NP
-takeover NN I-NP
-had VBD B-VP
-n't RB I-VP
-changed VBN I-VP
-. . O
-
-B.A.T NNP B-NP
-ended VBD B-VP
-the DT B-NP
-day NN I-NP
-at IN B-PP
-778 CD B-NP
-, , O
-down JJ B-ADVP
-5 NN B-NP
-, , O
-on IN B-PP
-turnover NN B-NP
-of IN B-PP
-7.5 CD B-NP
-million CD I-NP
-shares NNS I-NP
-. . O
-
-Dealers NNS B-NP
-said VBD B-VP
-it PRP B-NP
-was VBD B-VP
-hit VBN I-VP
-by IN B-PP
-some DT B-NP
-profit-taking NN I-NP
-after IN B-PP
-gains NNS B-NP
-since IN B-PP
-mid-week NN B-NP
-. . O
-
-In IN B-PP
-other JJ B-NP
-active JJ I-NP
-shares NNS I-NP
-, , O
-Trusthouse NNP B-NP
-Forte NNP I-NP
-shed VB B-VP
-10 CD B-NP
-to TO B-PP
-294 CD B-NP
-on IN B-PP
-volume NN B-NP
-of IN B-PP
-6.4 CD B-NP
-million CD I-NP
-shares NNS I-NP
-after IN B-PP
-a DT B-NP
-Barclays NNP I-NP
-De NNP I-NP
-Zoete NNP I-NP
-Wedd NNP I-NP
-downgrading NN I-NP
-, , O
-while IN B-SBAR
-Hillsdown NNP B-NP
-Holdings NNP I-NP
-, , O
-a DT B-NP
-food NN I-NP
-products NNS I-NP
-concern VBP I-NP
-, , O
-was VBD B-VP
-boosted VBN I-VP
-2 CD B-NP
-to TO B-PP
-271 CD B-NP
-after IN O
-it PRP B-NP
-disclosed VBD B-VP
-it PRP B-NP
-would MD B-VP
-seek VB I-VP
-shareholder NN B-NP
-approval NN I-NP
-to TO B-VP
-begin VB I-VP
-share NN B-NP
-repurchases NNS I-NP
-. . O
-
-Elsewhere RB B-ADVP
-in IN B-PP
-Europe NNP B-NP
-, , O
-share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Stockholm NNP B-NP
-, , I-NP
-Brussels NNP I-NP
-and CC I-NP
-Milan NNP I-NP
-. . O
-
-Prices NNS B-NP
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Frankfurt NNP B-NP
-, , I-NP
-Zurich NNP I-NP
-, , I-NP
-Paris NNP I-NP
-and CC I-NP
-Amsterdam NNP I-NP
-. . O
-
-South JJ B-NP
-African JJ I-NP
-gold NN I-NP
-stocks NNS I-NP
-closed VBD B-VP
-moderately RB B-ADVP
-lower JJR I-ADVP
-. . O
-
-Share NN B-NP
-prices NNS I-NP
-closed VBD B-VP
-higher JJR B-ADVP
-in IN B-PP
-Sydney NNP B-NP
-, , O
-Taipei NNP B-NP
-, , O
-Wellington NNP B-NP
-, , O
-Manila NNP B-NP
-, , O
-Hong NNP B-NP
-Kong NNP I-NP
-and CC O
-Singapore NNP B-NP
-and CC O
-were VBD B-VP
-lower JJR B-ADJP
-in IN B-PP
-Seoul NNP B-NP
-. . O
-
-Here RB B-ADVP
-are VBP B-VP
-price NN B-NP
-trends NNS I-NP
-on IN B-PP
-the DT B-NP
-world NN I-NP
-'s POS B-NP
-major JJ I-NP
-stock NN I-NP
-markets NNS I-NP
-, , O
-as IN B-SBAR
-calculated VBN B-VP
-by IN B-PP
-Morgan NNP B-NP
-Stanley NNP I-NP
-Capital NNP I-NP
-International NNP I-NP
-Perspective NNP I-NP
-, , O
-Geneva NNP B-NP
-. . O
-
-To TO B-VP
-make VB I-VP
-them PRP B-NP
-directly RB B-ADJP
-comparable JJ I-ADJP
-, , O
-each DT B-NP
-index NN I-NP
-is VBZ B-VP
-based VBN I-VP
-on IN B-PP
-the DT B-NP
-close NN I-NP
-of IN B-PP
-1969 CD B-NP
-equaling VBG B-VP
-100 CD B-NP
-. . O
-
-The DT B-NP
-percentage NN I-NP
-change NN I-NP
-is VBZ B-VP
-since IN B-PP
-year-end NN B-NP
-. . O
-
-The DT B-NP
-U.S. NNP I-NP
-is VBZ B-VP
-required VBN I-VP
-to TO I-VP
-notify VB I-VP
-foreign JJ B-NP
-dictators NNS I-NP
-if IN B-SBAR
-it PRP B-NP
-knows VBZ B-VP
-of IN B-PP
-coup NN B-NP
-plans NNS I-NP
-likely JJ B-ADJP
-to TO B-VP
-endanger VB I-VP
-their PRP$ B-NP
-lives NNS I-NP
-, , O
-government NN B-NP
-officials NNS I-NP
-said VBD B-VP
-. . O
-
-The DT B-NP
-notification NN I-NP
-policy NN I-NP
-was VBD B-VP
-part NN B-NP
-of IN B-PP
-a DT B-NP
-set NN I-NP
-of IN B-PP
-guidelines NNS B-NP
-on IN B-PP
-handling NN B-VP
-coups NNS B-NP
-outlined VBN B-VP
-in IN B-PP
-a DT B-NP
-secret JJ I-NP
-1988 CD I-NP
-exchange NN I-NP
-of IN B-PP
-letters NNS B-NP
-between IN B-PP
-the DT B-NP
-Reagan NNP I-NP
-administration NN I-NP
-and CC O
-the DT B-NP
-Senate NNP I-NP
-Intelligence NNP I-NP
-Committee NNP I-NP
-. . O
-
-The DT B-NP
-existence NN I-NP
-of IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-has VBZ B-VP
-become VBN I-VP
-known VBN I-VP
-since IN B-SBAR
-President NNP B-NP
-Bush NNP I-NP
-disclosed VBD B-VP
-them PRP B-NP
-privately RB B-ADVP
-to TO B-PP
-seven CD B-NP
-Republican NNP I-NP
-senators NNS I-NP
-at IN B-PP
-a DT B-NP
-White NNP I-NP
-House NNP I-NP
-meeting NN I-NP
-last JJ B-NP
-Monday NNP I-NP
-. . O
-
-Officials NNS B-NP
-familiar JJ B-ADJP
-with IN B-PP
-the DT B-NP
-meeting NN I-NP
-said VBD B-VP
-Mr. NNP B-NP
-Bush NNP I-NP
-cited VBD B-VP
-the DT B-NP
-policy NN I-NP
-as IN B-PP
-an DT B-NP
-example NN I-NP
-of IN B-PP
-the DT B-NP
-sort NN I-NP
-of IN B-PP
-congressional JJ B-NP
-requirements NNS I-NP
-the DT B-NP
-administration NN I-NP
-contends VBZ B-VP
-contribute VB B-VP
-to TO B-PP
-the DT B-NP
-failure NN I-NP
-of IN B-PP
-such JJ B-NP
-covert JJ I-NP
-actions NNS I-NP
-as IN B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-futile JJ I-NP
-effort NN I-NP
-to TO B-VP
-oust VB I-VP
-Panamanian JJ B-NP
-dictator NN I-NP
-Manuel NNP I-NP
-Noriega NNP I-NP
-. . O
-
-According VBG B-PP
-to TO B-PP
-the DT B-NP
-officials NNS I-NP
-, , O
-Mr. NNP B-NP
-Bush NNP I-NP
-even RB B-ADVP
-read VB B-VP
-to TO B-PP
-the DT B-NP
-senators NNS I-NP
-selections NNS B-NP
-from IN B-PP
-a DT B-NP
-highly RB I-NP
-classified VBN I-NP
-letter NN I-NP
-from IN B-PP
-the DT B-NP
-committee NN I-NP
-to TO B-PP
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-discussing VBG B-VP
-the DT B-NP
-guidelines NNS I-NP
-. . O
-
-They PRP B-NP
-said VBD B-VP
-the DT B-NP
-president NN I-NP
-conceded VBD B-VP
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-did VBD B-VP
-n't RB I-VP
-affect VB I-VP
-his PRP$ B-NP
-decision NN I-NP
-to TO B-VP
-lend VB I-VP
-only RB B-NP
-minor JJ I-NP
-support NN I-NP
-to TO B-PP
-this DT B-NP
-month NN I-NP
-'s POS B-NP
-Panama NNP I-NP
-coup NN I-NP
-effort NN I-NP
-. . O
-
-No DT B-NP
-notification NN I-NP
-was VBD B-VP
-ever RB I-VP
-considered VBN I-VP
-, , O
-officials NNS B-NP
-said VBD B-VP
-, , O
-apparently RB B-ADVP
-because IN B-SBAR
-the DT B-NP
-U.S. NNP I-NP
-did VBD B-VP
-n't RB I-VP
-think VB I-VP
-the DT B-NP
-coup NN I-NP
-plotters NNS I-NP
-intended VBN B-VP
-to TO I-VP
-kill VB I-VP
-Mr. NNP B-NP
-Noriega NNP I-NP
-, , O
-but CC O
-merely RB B-VP
-sought VBD I-VP
-to TO I-VP
-imprison VB I-VP
-him PRP B-NP
-. . O
-
-What WP B-NP
-'s VBZ B-VP
-more JJR B-NP
-, , O
-both DT B-NP
-administration NN B-NP
-and CC O
-congressional JJ B-NP
-officials NNS I-NP
-hint VBP B-VP
-that IN B-SBAR
-the DT B-NP
-notification NN I-NP
-requirement NN I-NP
-is VBZ B-VP
-likely JJ B-ADJP
-to TO B-VP
-be VB I-VP
-dropped VBN I-VP
-from IN B-PP
-the DT B-NP
-guidelines NNS I-NP
-on IN B-PP
-coup NN B-NP
-attempts NNS I-NP
-that WDT B-NP
-are VBP B-VP
-being VBG I-VP
-rewritten VBN I-VP
-by IN B-PP
-the DT B-NP
-panel NN I-NP
-and CC O
-the DT B-NP
-White NNP I-NP
-House NNP I-NP
-. . O
-
-The DT B-NP
-rewriting VBG I-NP
-was VBD B-VP
-launched VBN I-VP
-at IN B-PP
-a DT B-NP
-meeting NN I-NP
-between IN B-PP
-Mr. NNP B-NP
-Bush NNP I-NP
-and CC O
-intelligence NN B-NP
-committee NN I-NP
-leaders NNS I-NP
-Oct. NNP B-NP
-12 CD I-NP
-, , O
-a DT B-NP
-few JJ I-NP
-days NNS I-NP
-before IN B-PP
-the DT B-NP
-meeting NN I-NP
-at IN B-PP
-which WDT B-NP
-the DT B-NP
-president NN I-NP
-complained VBD B-VP
-about IN B-PP
-the DT B-NP
-rules NNS I-NP
-. . O
-
-However RB B-ADVP
-, , O
-the DT B-NP
-disclosure NN I-NP
-of IN B-PP
diff --git a/paddle/trainer/tests/train_files.txt b/paddle/trainer/tests/train_files.txt
deleted file mode 100644
index 1c26891495..0000000000
--- a/paddle/trainer/tests/train_files.txt
+++ /dev/null
@@ -1 +0,0 @@
-trainer/tests/train_proto.bin
diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp
deleted file mode 100644
index a6dbdcae3f..0000000000
--- a/paddle/utils/BarrierStat.cpp
+++ /dev/null
@@ -1,340 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/utils/BarrierStat.h"
-#include <string.h>
-#include <sys/types.h>
-#include <algorithm>
-#include <iomanip>
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Stat.h"
-
-DEFINE_bool(log_barrier_abstract,
-            true,
-            "if true, show abstract of barrier performance");
-DEFINE_int32(log_barrier_lowest_nodes,
-             5,
-             "how many lowest node will be logged");
-DEFINE_bool(log_barrier_show_log,
-            false,  // for performance tuning insight
-            "if true, always show barrier abstract even with little gap");
-
-namespace paddle {
-
-std::ostream &operator<<(std::ostream &output, const BarrierStatBase &stat) {
-  if (FLAGS_log_barrier_abstract) {
-    std::lock_guard<std::mutex> guard(stat.lock_);
-    stat.showAbstract(output);
-  }
-  return output;
-}
-
-BarrierStatBase::BarrierStatBase(uint16_t numConnThreads,
-                                 const std::string &name)
-    : totSamples_(0), numConnThreads_(numConnThreads), name_(name) {
-  abstract_.resize(numConnThreads_);
-  if (FLAGS_log_barrier_show_log) {
-    rateThreshold_ = 0.0;
-  } else {
-    /* probability of an abnormal node:
-     * p = 1/n + (n/8)/(n+1), where n = number of nodes, n > 1.
-     * if the freq of the slowest trainerId is larger than p,
-     * log the FLAGS_log_barrier_lowest_nodes slowest lastTrainerIds.
-     * numConnThreads_ indicates the number of nodes.
-     */
-    float n = (float)numConnThreads;
-    rateThreshold_ = 1.0 / n + (n / 8.0) / (n + 1.0);
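-    // Worked example (illustrative): with n = 10 nodes,
-    // rateThreshold_ = 1/10 + (10/8)/11 ≈ 0.214, so a node is reported only
-    // when it finishes last in more than about 21% of the samples.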
-  }
-}
-
-BarrierEndStat::BarrierEndStat(uint16_t numConnThreads, const std::string &name)
-    : BarrierStatBase(numConnThreads, name) {
-  timeVector_.reset(new TimeVectorEnd(numConnThreads_));
-  reset(true);
-  LOG(INFO) << " create barrierEndStat: " << name
-            << " endBarrier warning rate: " << rateThreshold_;
-}
-
-/*
- * Note:
- * by design, each pserver entity owns a separate statSet, since
- * different pservers run separately.
- */
-void BarrierEndStat::updateStat(struct timeval &cur, int32_t trainerId) {
-  CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
-
-  std::lock_guard<std::mutex> guard(lock_);
-  timeVector_->addTimeval(cur, trainerId);
-
-  if (timeVector_->full()) {
-    std::lock_guard<std::mutex> abstractGuard(abstractLock_);
-    auto id = timeVector_->getLastTrainerId();
-    auto delta = timeToMicroSecond(timeVector_->getDelta());
-    auto secondDelta = timeToMicroSecond(timeVector_->get1NDelta());
-    auto lastTwoDelta = timeToMicroSecond(timeVector_->getMinus1NDelta());
-    auto midDelta = timeToMicroSecond(timeVector_->getMidNDelta());
-    // discard the first sample, since it is probably abnormal.
-    if (totSamples_) {
-      abstract_[id].freq++;
-
-      if (delta < abstract_[id].minDelta) {
-        abstract_[id].minDelta = delta;
-      }
-      if (delta > abstract_[id].maxDelta) {
-        abstract_[id].maxDelta = delta;
-      }
-      abstract_[id].totDelta += delta;
-      abstract_[id].totSecondDelta += secondDelta;
-      abstract_[id].totLastTwoDelta += lastTwoDelta;
-      abstract_[id].totMidDelta += midDelta;
-
-      // update totAbstract_
-      totAbstract_.freq++;
-      if (delta < totAbstract_.minDelta) {
-        totAbstract_.minDelta = delta;
-      }
-      if (delta > totAbstract_.maxDelta) {
-        totAbstract_.maxDelta = delta;
-      }
-      totAbstract_.totDelta += delta;
-      totAbstract_.totSecondDelta += secondDelta;
-      totAbstract_.totLastTwoDelta += lastTwoDelta;
-      totAbstract_.totMidDelta += midDelta;
-    }
-
-    totSamples_++;
-    timeVector_->reset();
-  }
-}
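-
-// Illustrative trace of the accumulation above (times assumed): with four
-// trainers arriving at relative times 0, 2, 5 and 9, getDelta() = 9,
-// get1NDelta() = 7, getMinus1NDelta() = 4 and getMidNDelta() = 4; after the
-// first (discarded) sample, each value is added to the last-arriving
-// trainer's Abstract entry as well as to totAbstract_.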
-
-void BarrierEndStat::reset(bool clearRawData) {
-  int32_t i = 0;
-
-  totSamples_ = 0;
-
-  std::lock_guard<std::mutex> guard(abstractLock_);
-
-  if (clearRawData) {
-    timeVector_->reset();
-  }
-
-  for (auto &abstract : abstract_) {
-    memset((void *)&abstract, 0, sizeof(abstract));
-    abstract.minDelta = UINT64_MAX;
-    abstract.trainerId = i++;
-  }
-  memset((void *)&totAbstract_, 0, sizeof(Abstract));
-  totAbstract_.minDelta = UINT64_MAX;
-}
-
-void BarrierEndStat::showAbstract(std::ostream &output) const {
-  // the case of "<= 2 pservers" is not supported
-  if (numConnThreads_ <= 2 || !totSamples_) {
-    return;
-  }
-
-  // copy the per-trainer stats so they can be sorted by freq
-  std::vector<struct Abstract> outputAbstract = abstract_;
-  std::sort(outputAbstract.begin(),
-            outputAbstract.end(),
-            [](const struct Abstract &a, const struct Abstract &b) {
-              return a.freq > b.freq;
-            });
-
-  auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
-  if (rate < rateThreshold_) {
-    return;
-  }
-
-  output << std::setw(20) << name_ << std::endl;
-
-  /*
-   * Note:
-   * avgGap:        the average delta between the 1st and nth arriving trainers
-   * avgSecondGap:  the average delta between the 2nd and nth arriving trainers
-   * avgLastTwoGap: the average delta between the (n-1)th and nth arriving trainers
-   * avgMidGap:     the average delta between the (n/2)th and nth arriving trainers
-   * rate:          samples / totSamples
-   *
-   * the stat is per trainer when trainer_id is set; totAbstract is the
-   * same stat over the scope of all trainers.
-   */
-  output << std::setw(42) << " " << std::setw(15) << "trainerId"
-         << std::setw(15) << "avgGap" << std::setw(15) << "avgSecondGap"
-         << std::setw(15) << "avgLastTwoGap" << std::setw(15) << "avgMidGap"
-         << std::setw(10) << "rate" << std::setw(10) << "samples"
-         << std::setw(10) << "totSamples" << std::endl;
-  // show totAbstract; it is valuable when lastTrainerId is evenly distributed
-  if (!totAbstract_.freq) return;
-  output << std::setw(42) << " " << std::setw(15) << "totAbstract"
-         << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
-         << std::setw(15)
-         << (totAbstract_.totSecondDelta / totAbstract_.freq) * 0.001
-         << std::setw(15)
-         << (totAbstract_.totLastTwoDelta / totAbstract_.freq) * 0.001
-         << std::setw(15)
-         << (totAbstract_.totMidDelta / totAbstract_.freq) * 0.001
-         << std::setw(10) << (float)totAbstract_.freq / (float)totSamples_
-         << std::setw(10) << (float)totAbstract_.freq << std::setw(10)
-         << (float)totSamples_ << std::endl;
-
-  // show lastTrainerId abstract
-  int count = 0;
-  for (auto &abstract : outputAbstract) {
-    if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
-      break;
-    }
-    // output format control
-    output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
-           << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
-           << std::setw(15) << (abstract.totSecondDelta / abstract.freq) * 0.001
-           << std::setw(15)
-           << (abstract.totLastTwoDelta / abstract.freq) * 0.001
-           << std::setw(15) << (abstract.totMidDelta / abstract.freq) * 0.001
-           << std::setw(10) << (float)abstract.freq / (float)totSamples_
-           << std::setw(10) << (float)abstract.freq << std::setw(10)
-           << (float)totSamples_ << std::endl;
-  }
-}
-
-BarrierDeltaStat::BarrierDeltaStat(uint16_t numConnThreads,
-                                   const std::string &name)
-    : BarrierStatBase(numConnThreads, name) {
-  timeVector_.reset(new TimeVectorDelta(numConnThreads_));
-  reset(true);
-  LOG(INFO) << " create barrierDeltaStat: " << name
-            << " barrierDelta warning rate: " << rateThreshold_;
-}
-
-void BarrierDeltaStat::updateStat(uint64_t delta, int32_t trainerId) {
-  CHECK_LT(trainerId, numConnThreads_) << "trainerId is invalid in barrier";
-
-  std::lock_guard<std::mutex> guard(lock_);
-  timeVector_->addTimeval(delta, trainerId);
-
-  if (timeVector_->full()) {
-    std::lock_guard<std::mutex> abstractGuard(abstractLock_);
-    auto id = timeVector_->getMaxTrainerId();
-    auto delta = timeVector_->getDelta();
-    // discard the first sample, since it is probably abnormal.
-    if (totSamples_) {
-      abstract_[id].freq++;
-
-      if (delta < abstract_[id].minDelta) {
-        abstract_[id].minDelta = delta;
-      }
-      if (delta > abstract_[id].maxDelta) {
-        abstract_[id].maxDelta = delta;
-      }
-      abstract_[id].totDelta += delta;
-
-      // update totAbstract_
-      totAbstract_.freq++;
-      if (delta < totAbstract_.minDelta) {
-        totAbstract_.minDelta = delta;
-      }
-      if (delta > totAbstract_.maxDelta) {
-        totAbstract_.maxDelta = delta;
-      }
-      totAbstract_.totDelta += delta;
-    }
-
-    totSamples_++;
-    timeVector_->reset();
-  }
-}
-
-void BarrierDeltaStat::reset(bool clearRawData) {
-  int32_t i = 0;
-
-  totSamples_ = 0;
-
-  std::lock_guard<std::mutex> guard(abstractLock_);
-
-  if (clearRawData) {
-    timeVector_->reset();
-  }
-
-  for (auto &abstract : abstract_) {
-    memset((void *)&abstract, 0, sizeof(abstract));
-    abstract.minDelta = UINT64_MAX;
-    abstract.trainerId = i++;
-  }
-  memset((void *)&totAbstract_, 0, sizeof(Abstract));
-  totAbstract_.minDelta = UINT64_MAX;
-}
-
-void BarrierDeltaStat::showAbstract(std::ostream &output) const {
-  // the case of "<= 2 pservers" is not supported
-  if (numConnThreads_ <= 2 || !totSamples_) {
-    return;
-  }
-
-  // copy the per-trainer stats so they can be sorted by freq
-  std::vector<struct Abstract> outputAbstract = abstract_;
-  std::sort(outputAbstract.begin(),
-            outputAbstract.end(),
-            [](const struct Abstract &a, const struct Abstract &b) {
-              return a.freq > b.freq;
-            });
-
-  auto rate = (float)outputAbstract[0].freq / (float)totSamples_;
-  if (rate < rateThreshold_) {
-    return;
-  }
-
-  output << std::setw(20) << name_ << std::endl;
-
-  /* Note:
-   * Gap means the delta from all trainers' forwardbackward
-   * avgGap: average Gap in log_period batches
-   * minGap: min Gap in log_period batches
-   * maxGap: max Gap in log_period batches
-   * trainerId: the slowest trainer_id
-   *
-   * the stat is based on per trainer if trainer_id is set, totAbstract is
-   * stat based on all trainers scope.
-   */
-  output << std::setw(42) << " " << std::setw(15) << "trainerId"
-         << std::setw(15) << "avgGap" << std::setw(10) << "minGap"
-         << std::setw(10) << "maxGap" << std::setw(10) << "rate"
-         << std::setw(10) << "samples" << std::setw(10) << "totSamples"
-         << std::endl;
-  // show totAbstract; it is valuable when lastTrainerId is evenly distributed
-  if (!totAbstract_.freq) return;
-  output << std::setw(42) << " " << std::setw(15) << "totAbstract"
-         << std::setw(15) << (totAbstract_.totDelta / totAbstract_.freq) * 0.001
-         << std::setw(10) << totAbstract_.minDelta * 0.001 << std::setw(10)
-         << totAbstract_.maxDelta * 0.001 << std::setw(10)
-         << (float)totAbstract_.freq / (float)totSamples_ << std::setw(10)
-         << (float)totAbstract_.freq << std::setw(10) << (float)totSamples_
-         << std::endl;
-
-  // show lastTrainerId abstract
-  int count = 0;
-  for (auto &abstract : outputAbstract) {
-    if (!abstract.freq || count++ >= FLAGS_log_barrier_lowest_nodes) {
-      break;
-    }
-    // output format control
-    output << std::setw(42) << " " << std::setw(15) << abstract.trainerId
-           << std::setw(15) << (abstract.totDelta / abstract.freq) * 0.001
-           << std::setw(10) << abstract.minDelta * 0.001 << std::setw(10)
-           << abstract.maxDelta * 0.001 << std::setw(10)
-           << (float)abstract.freq / (float)totSamples_ << std::setw(10)
-           << (float)abstract.freq << std::setw(10) << (float)totSamples_
-           << std::endl;
-  }
-}
-}  // namespace paddle
diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h
deleted file mode 100644
index a9c925eff6..0000000000
--- a/paddle/utils/BarrierStat.h
+++ /dev/null
@@ -1,425 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-#include <sys/time.h>
-#include <iostream>
-#include <list>
-#include <memory>
-#include <mutex>
-#include <string>
-#include <unordered_map>
-
-#include "Locks.h"
-#include "Logging.h"
-#include "ThreadLocal.h"
-
-namespace paddle {
-
-inline uint64_t timeToMicroSecond(struct timeval time) {
-  return time.tv_sec * 1000000LU + time.tv_usec;
-}
-
-class TimeVectorEnd {
-  /*
-   * helper class for gathering all barrier performance data
-   * that has a time-point property.
-   * frequently used in the barrier performance tuning API, e.g.
-   * to find the slowest node in sync-sgd training.
-   */
-public:
-  explicit TimeVectorEnd(uint16_t size) : size_(size) {
-    index_ = 0;
-    timeArray_.resize(size);
-    trainerIds_.resize(size);
-  }
-  ~TimeVectorEnd() {}
-
-  uint16_t size() { return size_; }
-
-  bool full() { return index_ == size_; }
-
-  bool empty() { return index_ == 0; }
-
-  void reset() { index_ = 0; }
-
-  void addTimeval(struct timeval time, int32_t trainerId) {
-    timeArray_[index_] = time;
-    trainerIds_[index_] = trainerId;
-    index_++;
-  }
-
-  struct timeval getDelta() const {
-    struct timeval delta;
-    CHECK_GT(size_, 1) << "not support with 1 pserver";
-    timersub(&timeArray_[size_ - 1], &timeArray_[0], &delta);
-    return delta;
-  }
-
-  /* 2, n delta */
-  struct timeval get1NDelta() const {
-    CHECK_GT(size_, 2) << "not support with less than 2 pservers";
-    struct timeval delta;
-    timersub(&timeArray_[size_ - 1], &timeArray_[1], &delta);
-    return delta;
-  }
-
-  /* n-1, n delta */
-  struct timeval getMinus1NDelta() const {
-    CHECK_GT(size_, 2) << "not supported with 2 or fewer pservers";
-    struct timeval delta;
-    timersub(&timeArray_[size_ - 1], &timeArray_[size_ - 2], &delta);
-    return delta;
-  }
-
-  /* n/2, n delta */
-  struct timeval getMidNDelta() const {
-    CHECK_GT(size_, 2) << "not supported with 2 or fewer pservers";
-    struct timeval delta;
-    timersub(&timeArray_[size_ - 1], &timeArray_[size_ / 2], &delta);
-    return delta;
-  }
-
-  int32_t getLastTrainerId() const { return trainerIds_[index_ - 1]; }
-
-private:
-  uint16_t size_;
-  uint16_t index_;
-  std::vector<struct timeval> timeArray_;
-  std::vector<int32_t> trainerIds_;
-};
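-
-// Usage sketch (hypothetical driver, mirroring BarrierEndStat::updateStat):
-//   TimeVectorEnd tv(numConnThreads);
-//   tv.addTimeval(arrivalTime, trainerId);   // one entry per arriving trainer
-//   if (tv.full()) {                         // all trainers have arrived
-//     struct timeval spread = tv.getDelta(); // first-to-last arrival gap
-//     int32_t slowest = tv.getLastTrainerId();
-//     tv.reset();
-//   }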
-
-class TimeVectorDelta {
-  /*
-   * helper class for gathering performance data that has a time-delta
-   * property, e.g. examining the distribution of forwardBackward
-   * time across all cluster nodes.
-   */
-public:
-  explicit TimeVectorDelta(uint16_t size)
-      : size_(size), min_(UINT64_MAX), max_(0) {
-    index_ = 0;
-    timeArray_.resize(size);
-  }
-  ~TimeVectorDelta() {}
-
-  uint16_t size() { return size_; }
-
-  bool full() { return index_ == size_; }
-
-  bool empty() { return index_ == 0; }
-
-  void reset() {
-    index_ = 0;
-    min_ = UINT64_MAX;
-    max_ = 0;
-  }
-
-  void addTimeval(uint64_t delta, int32_t trainerId) {
-    timeArray_[index_] = delta;
-    index_++;
-    if (delta < min_) {
-      min_ = delta;
-    }
-    if (delta > max_) {
-      max_ = delta;
-      maxTrainerId_ = trainerId;
-    }
-  }
-
-  uint64_t getDelta() const {
-    CHECK_GT(size_, 1) << "not supported with 1 pserver";
-    return max_ - min_;
-  }
-
-  /* 2, n delta */
-  uint64_t get1NDelta() const {
-    CHECK_GT(size_, 2) << "not supported with 2 or fewer pservers";
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* n-1, n delta */
-  uint64_t getMinus1NDelta() const {
-    CHECK_GT(size_, 2) << "not supported with 2 or fewer pservers";
-    LOG(FATAL) << "Not implemented";
-  }
-
-  /* n/2, n delta */
-  uint64_t getMidNDelta() const {
-    CHECK_GT(size_, 2) << "not supported with 2 or fewer pservers";
-    LOG(FATAL) << "Not implemented";
-  }
-
-  int32_t getMaxTrainerId() const { return maxTrainerId_; }
-
-private:
-  uint16_t size_;
-  uint16_t index_;
-  std::vector<uint64_t> timeArray_;
-
-private:
-  uint64_t min_;
-  uint64_t max_;
-  int32_t maxTrainerId_;
-};
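-
-// Usage sketch (hypothetical, mirroring BarrierDeltaStat::updateStat):
-//   TimeVectorDelta tv(numConnThreads);
-//   tv.addTimeval(bpDeltaUs, trainerId);  // one delta per trainer, in us
-//   if (tv.full()) {
-//     uint64_t spread = tv.getDelta();    // max_ - min_ across trainers
-//     int32_t slowest = tv.getMaxTrainerId();
-//     tv.reset();
-//   }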
-
-// stats over all samples, in microseconds (us)
-struct Abstract {
-  // last trainerId for barrier end, maxDelta trainerId for barrier delta
-  int32_t trainerId;
-  uint64_t minDelta;
-  uint64_t maxDelta;
-  uint64_t totDelta;
-  // the first one is probably the node itself, so it is discarded.
-  uint64_t totSecondDelta;
-  // to confirm whether the last node destroys barrier performance.
-  uint64_t totLastTwoDelta;
-  // n/2-n delta
-  uint64_t totMidDelta;
-  uint64_t freq;
-};
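-
-// Derived reporting, as computed in the showAbstract() implementations:
-//   avgGap (ms) = (totDelta / freq) * 0.001   (deltas are accumulated in us)
-//   rate        = freq / totSamples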
-
-// barrier performance tuning stats
-class BarrierStatBase {
-public:
-  BarrierStatBase(uint16_t numConnThreads, const std::string &name);
-
-  virtual ~BarrierStatBase() {}
-
-  // if called at the pserver end, trainerId means the trainer's id.
-  // by default the trainer does not use trainerId, so it is set to -1
-  virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) = 0;
-  virtual void updateStat(uint64_t delta, int32_t trainerId = -1) = 0;
-
-  const std::string &getName() { return name_; }
-
-  virtual void reset(bool clearRawData = true) {}
-  // since timeVector_ is not stateful, it is not clear whether the
-  // barrier delta is correct. if one timestamp is lost, all data
-  // from the barrier stat becomes rubbish. -_-
-  virtual bool checkPassBarrier() {
-    LOG(INFO) << "bug implementation found";
-    return false;
-  }
-
-protected:
-  virtual void showAbstract(std::ostream &output) const {}
-  friend std::ostream &operator<<(std::ostream &output,
-                                  const BarrierStatBase &stat);
-
-protected:
-  mutable std::mutex lock_;
-  std::mutex abstractLock_;  // see note on updateStat
-  // arrival frequency for each barrier trainer
-  std::vector<struct Abstract> abstract_;
-  // it is valuable when doing perf-tuning, if lastTrainerId follows a
-  // uniform distribution
-  struct Abstract totAbstract_;
-  uint64_t totSamples_;
-
-protected:
-  uint16_t numConnThreads_;  // total updates needed
-  float rateThreshold_;
-  std::string name_;
-};
-
-// the end-time of arrival at a real/forged barrier position
-class BarrierEndStat : public BarrierStatBase {
-public:
-  BarrierEndStat(uint16_t numConnThreads, const std::string &name);
-  ~BarrierEndStat() {}
-
-  virtual void updateStat(struct timeval &cur, int32_t trainerId = -1);
-  virtual void updateStat(uint64_t delta, int32_t trainerId = -1) {
-    LOG(INFO) << "have no delta updateStat in BarrierEndStat";
-  }
-  virtual void reset(bool clearRawData = true);
-  virtual bool checkPassBarrier() { return timeVector_->empty(); }
-
-protected:
-  /*
-   * LOG:
-   * readAllBlocks_denseUpdater
-   * trainerId   avgGap    avgSecondGap  avgLastTwoGap  avgMidGap  rate
-   * 44          86.702    81.022        9.984          50.472     0.144737
-   * 46          87.723    82.939        8.737          50.019     0.118421
-   * 35          100.923   96.752        14.305         61.979     0.0657895
-   * log_barrier_abstract, log_barrier_lowest_nodes, log_barrier_threshold
-   * control details.
-   */
-  virtual void showAbstract(std::ostream &output) const;
-
-private:
-  std::unique_ptr<TimeVectorEnd> timeVector_;
-};
-
-// the delta-time across different trainers,
-// e.g. to find the degree of imbalance of BP time at the pserver end.
-// each entry in timeVector_ is a BP delta; the evaluation is done on these deltas.
-class BarrierDeltaStat : public BarrierStatBase {
-public:
-  BarrierDeltaStat(uint16_t numConnThreads, const std::string &name);
-  ~BarrierDeltaStat() {}
-
-  virtual void updateStat(uint64_t delta, int32_t trainerId = -1);
-  virtual void updateStat(struct timeval &cur, int32_t trainerId = -1) {
-    LOG(INFO) << "BarrierDeltaStat has no timeval updateStat";
-  }
-
-  virtual void reset(bool clearRawData = true);
-
-  virtual bool checkPassBarrier() { return timeVector_->empty(); }
-
-protected:
-  virtual void showAbstract(std::ostream &outPut) const;
-
-private:
-  // store delta time in uint64_t, eg BP time of all trainers
-  std::unique_ptr<TimeVectorDelta> timeVector_;
-};
-
-// to distinguish different contexts for the same parallel threads, and
-// different threads with the same code segment, just use tagName to tag the
-// run-time position.
-// in Sparse, sendParallel threads can not only run in the push&pull stage
-// with the same thread group, but also run in the pull&push stage with a
-// different thread group; the tag is used to distinguish different run-time
-// barrier positions.
-// trainerId in REGISTER_BARRIER_TIMER_SERVER is used to retrieve the lowest
-// trainer nodes.
-
-// end barrier
-#define __REGISTER_BARRIER_TIMER_SERVER(                            \
-    set, statName, numConnThreads, trainerId, ...)                  \
-  do {                                                              \
-    if (numConnThreads > 2) {                                       \
-      std::string internalName =                                    \
-          std::string(statName) + std::string(__VA_ARGS__);         \
-      BarrierStatPtr __stat =                                       \
-          (set).getStat(numConnThreads, internalName, BARRIER_END); \
-      struct timeval cur;                                           \
-      gettimeofday(&cur, nullptr);                                  \
-      __stat->updateStat(cur, trainerId);                           \
-    }                                                               \
-  } while (0);
-
-// end barrier with user-defined timer
-#define __REGISTER_BARRIER_TIMER_SERVER_SET(                        \
-    set, statName, numConnThreads, trainerId, cur, ...)             \
-  do {                                                              \
-    if (numConnThreads > 2) {                                       \
-      std::string internalName =                                    \
-          std::string(statName) + std::string(__VA_ARGS__);         \
-      BarrierStatPtr __stat =                                       \
-          (set).getStat(numConnThreads, internalName, BARRIER_END); \
-      __stat->updateStat(cur, trainerId);                           \
-    }                                                               \
-  } while (0);
-
-// delta barrier
-#define __REGISTER_BARRIER_DELTA_SERVER_SET(                          \
-    set, statName, numConnThreads, trainerId, delta, ...)             \
-  do {                                                                \
-    if (numConnThreads > 2) {                                         \
-      std::string internalName =                                      \
-          std::string(statName) + std::string(__VA_ARGS__);           \
-      BarrierStatPtr __stat =                                         \
-          (set).getStat(numConnThreads, internalName, BARRIER_DELTA); \
-      __stat->updateStat(delta, trainerId);                           \
-    }                                                                 \
-  } while (0);
-
-// check end barrier
-#define __CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...)   \
-  do {                                                              \
-    std::string internalName =                                      \
-        std::string(statName) + std::string(__VA_ARGS__);           \
-    BarrierStatPtr __stat =                                         \
-        (set).getStat(numConnThreads, internalName, BARRIER_END);   \
-    PCHECK(__stat->checkPassBarrier()) << internalName              \
-                                       << ": invalid barrier data"; \
-  } while (0);
-
-/*
- * Note:
- * with the sync-sgd algorithm in cluster mode, many synchronization actions
- * exist at the pserver end. these synchronization actions impact the
- * efficiency of parameter exchange. the synchronization (barrier) GAP is
- * composed of many factors, such as forwardBackward variance and network
- * fluctuation. we try to make a quantitative analysis of these factors, so
- * we designed these barrier timers to capture the performance. the timers
- * can also be placed at implicit barrier positions.
- *
- * example:
- * in the sync-sgd algorithm, each parameter server waits for all gradients
- * from all trainers, thus an explicit barrier point exists before doing
- * optimization. a barrier timer located before that point can sense the
- * barrier condition.
- *
- */
-
-// try to capture which trainer is the slowest node in sync-sgd at the pserver.
-#define REGISTER_SLOW_NODES_PROBE(                 \
-    set, statName, numConnThreads, trainerId, ...) \
-  __REGISTER_BARRIER_TIMER_SERVER(                 \
-      (set), statName, numConnThreads, trainerId, __VA_ARGS__)
-// try to check if all threads or trainers have passed barriers for data
-// accuracy.
-#define CHECK_BARRIER_TIMER(set, statName, numConnThreads, ...) \
-  __CHECK_BARRIER_TIMER((set), statName, numConnThreads, __VA_ARGS__)
-
-#ifdef PADDLE_DISABLE_TIMER
-
-#define REGISTER_BARRIER_TIMER_SERVER( \
-    set, statName, numConnThreads, trainerId, ...)
-#define REGISTER_BARRIER_TIMER_SERVER_SET( \
-    set, statName, numConnThreads, trainerId, cur, ...)
-#define REGISTER_BARRIER_DELTA_SERVER_SET( \
-    set, statName, numConnThreads, trainerId, cur, ...)
-
-#else
-
-/*
- * sensing the barrier time distribution for all parallel threads.
- * it provides a low-level API for the slow-node check
- * (REGISTER_SLOW_NODES_PROBE)
- */
-#define REGISTER_BARRIER_TIMER_SERVER(             \
-    set, statName, numConnThreads, trainerId, ...) \
-  __REGISTER_BARRIER_TIMER_SERVER(                 \
-      (set), statName, numConnThreads, trainerId, __VA_ARGS__)
-
-/*
- * sensing the barrier time distribution for all parallel threads,
- * but the time point for barrier performance is set by the user.
- * eg, with this api, you can get an implicit barrier point such as the
- * beginning-time distribution for receiving data.
- */
-#define REGISTER_BARRIER_TIMER_SERVER_SET(              \
-    set, statName, numConnThreads, trainerId, cur, ...) \
-  __REGISTER_BARRIER_TIMER_SERVER_SET(                  \
-      (set), statName, numConnThreads, trainerId, cur, __VA_ARGS__)
-
-// try to capture the time delta from all trainers, such as forwardBackward
-// time, which implies computation fluctuation
-#define REGISTER_BARRIER_DELTA_SERVER_SET(                \
-    set, statName, numConnThreads, trainerId, delta, ...) \
-  __REGISTER_BARRIER_DELTA_SERVER_SET(                    \
-      (set), statName, numConnThreads, trainerId, delta, __VA_ARGS__)
-
-#endif  // PADDLE_DISABLE_TIMER
-}  // namespace paddle
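
For reference, the removed TimeVector classes reduced a window of per-trainer
arrival timestamps to min/max/average gap statistics. Below is a minimal
standalone sketch of that reduction; GapStats and computeGaps are illustrative
names, not Paddle API.

#include <algorithm>
#include <cstdint>
#include <numeric>
#include <vector>

struct GapStats {
  uint64_t minDelta;  // smallest non-trivial arrival gap
  uint64_t maxDelta;  // barrier GAP: last arrival minus first arrival
  double avgDelta;    // mean arrival offset from the first arrival
};

GapStats computeGaps(std::vector<uint64_t> arrivalsUs) {
  if (arrivalsUs.empty()) return GapStats{0, 0, 0.0};
  std::sort(arrivalsUs.begin(), arrivalsUs.end());
  std::vector<uint64_t> deltas;
  for (uint64_t t : arrivalsUs) deltas.push_back(t - arrivalsUs.front());
  GapStats s;
  // deltas[0] is always 0 (the first arriver), so skip it for the minimum,
  // mirroring the "first one is probably itself, so discard it" note above.
  s.minDelta = deltas.size() > 1 ? deltas[1] : 0;
  s.maxDelta = deltas.back();
  s.avgDelta =
      std::accumulate(deltas.begin(), deltas.end(), 0.0) / deltas.size();
  return s;
}
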
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index af59951752..7a4977935e 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -17,7 +17,7 @@ add_library(paddle_utils STATIC
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
 add_style_check_target(paddle_utils ${UTIL_SOURCES}
     ${UTIL_ARCH_SOURCES})
-add_dependencies(paddle_utils gen_proto_cpp)
+add_dependencies(paddle_utils paddle_proto ${external_project_dependencies})
 if(WITH_TESTING)
     add_subdirectory(tests)
 endif()
diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
deleted file mode 100644
index cebca5a2a3..0000000000
--- a/paddle/utils/Compiler.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-/**
- * This header defines some useful attributes for each compiler. It is the
- * abstraction layer over compilers.
- */
-#ifdef __GNUC__
-#define GCC_VERSION \
-  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
-#else
-#define GCC_VERSION
-#endif
-
-/**
- * __must_check macro. It makes the function's return value required to be
- * used; otherwise a compile warning is raised. Paddle also treats all
- * compile warnings as errors.
- */
-#if GCC_VERSION >= 30400
-#define __must_check __attribute__((warn_unused_result))
-#else
-#define __must_check
-#endif
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 6992e85622..52a6df9497 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -55,13 +55,17 @@ public:
    *        Else, just set status to popping.
    */
   void pop(const T& item) {
-    pushing() = false;
     auto& s = this->stack();
     if (item == s.top()) {
       s.pop();
     }
   }
 
+  /**
+   * @brief Indicate whether we are at the forward or backward stage of
+   *        computation.
+   */
+  void set_stage(bool isForward) { pushing() = isForward; }
+
   /**
    * @brief clear current thread stack.
    */
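
With pop() no longer clearing the pushing flag, the stage is now toggled
explicitly. A hedged sketch of the intended call pattern follows; the layer
names and the trainOneBatch wrapper are illustrative, while push/pop/set_stage
follow the header above.

#include <string>
#include "paddle/utils/CustomStackTrace.h"

void trainOneBatch(paddle::CustomStackTrace<std::string>& tracer) {
  tracer.set_stage(true);  // forward: layers are pushed as they execute
  tracer.push("fc_layer");
  tracer.push("softmax_layer");

  tracer.set_stage(false);  // backward: layers are popped in reverse order
  tracer.pop("softmax_layer");
  tracer.pop("fc_layer");
}
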
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 9b5ad21724..2e5ff76a06 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef DYNAMIC_LOAD_H_
-#define DYNAMIC_LOAD_H_
+#pragma once
 
 #include <dlfcn.h>
 #include <memory>
@@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
-
-#endif  // DYNAMIC_LOAD_H_
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index cda1b5c37d..7cde983060 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -19,7 +19,21 @@ limitations under the License. */
 #include <stdio.h>
 #include <memory>
 #include <string>
-#include "Compiler.h"
+
+/**
+ * __must_check macro. It makes the function's return value required to be
+ * used; otherwise a compile warning is raised. Paddle also treats all
+ * compile warnings as errors.
+ */
+#ifdef __GNUC__
+#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
+#else
+#define __must_check
+#endif
 
 namespace paddle {
 
@@ -112,9 +126,11 @@ public:
   }
 
   /**
-   * @brief operator bool, return True if there is something error.
+   * @brief check this status by glog.
+   * @note It is a temporary method used while cleaning up the Paddle code.
+   *       It will be removed later.
    */
-  operator bool() const { return !this->isOK(); }
+  void check() const { CHECK(this->isOK()) << msg(); }
 
   /**
    * @brief isOK return True if there is no error.
@@ -122,13 +138,6 @@ public:
    */
   bool isOK() const { return msg_ == nullptr; }
 
-  /**
-   * @brief check this status by glog.
-   * @note It is a temp method used during cleaning Paddle code. It will be
-   *       removed later.
-   */
-  void check() const { CHECK(this->isOK()) << msg(); }
-
 private:
   std::shared_ptr<std::string> msg_;
 };
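
Since Error no longer converts to bool, call sites either test isOK()
explicitly or call check() to CHECK-fail via glog. A sketch of the resulting
pattern, where loadModel is an illustrative function, not Paddle API:

#include <string>
#include "paddle/utils/Error.h"
#include "paddle/utils/Logging.h"

paddle::Error __must_check loadModel(const std::string& path) {
  if (path.empty()) return paddle::Error("model path is empty");
  return paddle::Error();  // a default-constructed Error means OK
}

void example() {
  paddle::Error err = loadModel("model.bin");
  if (!err.isOK()) {
    LOG(ERROR) << err.msg();
  }
  loadModel("model.bin").check();  // CHECK-fails with the message on error
}
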
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 320f671ed9..ea47cf23eb 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -14,12 +14,26 @@ limitations under the License. */
 
 #include "Flags.h"
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
+#ifdef PADDLE_WITH_MKLDNN
+// TODO(TJ): change to true when MKLDNN layers support multi-inputs
+DEFINE_bool(use_mkldnn, false, "Default is false; keep using CPU training");
+#else
+DEFINE_bool(use_mkldnn, false, "Only support CPU training");
+#endif
+
+#ifdef PADDLE_WITH_MKLML
+// TODO(TJ): change to true when fully confirmed
+DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization");
+#else
+DEFINE_bool(use_mkl_packed, false, "Do not use MKL Packed Optimization");
+#endif
+
 DEFINE_bool(parallel_nn,
             false,
             "Whether to use multi-threads to calculate one neural network."
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index dc4faef833..b64295bca0 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -40,3 +40,5 @@ DECLARE_bool(show_layer_stat);
 DECLARE_string(predict_file);
 DECLARE_bool(prev_batch_state);
 DECLARE_string(init_model_path);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkl_packed);
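
A minimal sketch of how the new flags are consumed at runtime; the
selectBackend dispatcher is illustrative, while the flags themselves are
defined in Flags.cpp above.

#include "paddle/utils/Flags.h"

void selectBackend() {
  if (FLAGS_use_mkldnn) {
    // dispatch to MKL-DNN implementations of supported layers
  } else if (FLAGS_use_mkl_packed) {
    // dispatch to MKL packed-GEMM optimized recurrent layers
  } else {
    // default CPU path
  }
}
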
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index c7194d3bf1..ff1b1bf888 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -97,34 +97,6 @@ std::ostream& operator<<(std::ostream& outPut, const Stat& stat) {
   return outPut;
 }
 
-BarrierStatPtr StatSet::getStat(uint16_t numConnThreads,
-                                const std::string& name,
-                                BarrierStatType bType) {
-  {
-    ReadLockGuard guard(lock_);
-    auto it = barrierStatSet_.find(name);
-    if (it != barrierStatSet_.end()) {
-      return it->second;
-    }
-  }
-
-  std::lock_guard<RWLock> guard(lock_);
-  // test again with lock_guard
-  auto it = barrierStatSet_.find(name);
-  if (it != barrierStatSet_.end()) {
-    return it->second;
-  }
-
-  BarrierStatPtr stat;
-  if (bType == BARRIER_END) {
-    stat = std::make_shared<BarrierEndStat>(numConnThreads, name);
-  } else if (bType == BARRIER_DELTA) {
-    stat = std::make_shared<BarrierDeltaStat>(numConnThreads, name);
-  }
-  auto ret = barrierStatSet_.insert(std::make_pair(name, stat));
-  return ret.first->second;
-}
-
 void StatSet::printSegTimerStatus() {
   ReadLockGuard guard(lock_);
   LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
@@ -135,46 +107,20 @@ void StatSet::printSegTimerStatus() {
   }
 }
 
-void StatSet::printBarrierTimerStatus() {
-  ReadLockGuard guard(lock_);
-  if (barrierStatSet_.empty()) {
-    return;
-  }
-  // control barrierAbstract at runtime, so enable compilation
-  LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-            << "======= BarrierStatSet status ======" << std::endl;
-  for (auto& stat : barrierStatSet_) {
-    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ')
-              << *(stat.second);
-  }
-}
-
 void StatSet::printAllStatus() {
 #ifndef PADDLE_DISABLE_TIMER
   printSegTimerStatus();
 #endif
-  printBarrierTimerStatus();
   LOG(INFO) << std::setiosflags(std::ios::left)
             << "--------------------------------------------------"
             << std::endl;
 }
 
-void StatSet::printStatus(const std::string& name) {
-  ReadLockGuard guard(lock_);
-  auto iter = statSet_.find(name);
-  CHECK(iter != statSet_.end()) << name << " is not registered in " << name_;
-  LOG(INFO) << *(iter->second);
-}
-
 void StatSet::reset(bool clearRawData) {
   ReadLockGuard guard(lock_);
   for (auto& stat : statSet_) {
     stat.second->reset();
   }
-  // reset barrierStat
-  for (auto& stat : barrierStatSet_) {
-    stat.second->reset(clearRawData);
-  }
 }
 
 void StatSet::setThreadInfo(const std::string& name, bool flag) {
@@ -184,13 +130,6 @@ void StatSet::setThreadInfo(const std::string& name, bool flag) {
   iter->second->setThreadInfo(flag);
 }
 
-void StatSet::deleteStat(const std::string& name) {
-  std::lock_guard<RWLock> guard(lock_);
-  auto iter = statSet_.find(name);
-  CHECK(iter != statSet_.end()) << name << " is not registered in " << name_;
-  statSet_.erase(iter);
-}
-
 StatInfo::~StatInfo() {
   if (stat_) {
     std::lock_guard<std::mutex> guard(stat_->lock_);
diff --git a/paddle/utils/Stat.h b/paddle/utils/Stat.h
index d9cc6e413a..79fd3b8cf0 100644
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
@@ -23,7 +23,6 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 
-#include "BarrierStat.h"
 #include "Locks.h"
 #include "Logging.h"
 #include "ThreadLocal.h"
@@ -60,12 +59,6 @@ public:
 
 class Stat;
 typedef std::shared_ptr<Stat> StatPtr;
-typedef std::shared_ptr<BarrierStatBase> BarrierStatPtr;
-
-enum BarrierStatType {
-  BARRIER_END = 0,
-  BARRIER_DELTA = 1,
-};
 
 class StatSet {
 public:
@@ -74,11 +67,8 @@ public:
 
   // print to LOG(INFO)
   void printSegTimerStatus();
-  void printBarrierTimerStatus();
   void printAllStatus();
 
-  void printStatus(const std::string& name);
-
   StatPtr getStat(const std::string& name) {
     {
       ReadLockGuard guard(lock_);
@@ -93,12 +83,6 @@ public:
     return ret.first->second;
   }
 
-  BarrierStatPtr getStat(uint16_t numConnThreads,
-                         const std::string& name,
-                         BarrierStatType bType);
-
-  void deleteStat(const std::string& name);
-
   // true for showing stats for each thread
   // false for showing stats aggregated over threads
   void setThreadInfo(const std::string& name, bool flag);
@@ -120,7 +104,6 @@ public:
 
 private:
   std::unordered_map<std::string, StatPtr> statSet_;
-  std::unordered_map<std::string, BarrierStatPtr> barrierStatSet_;
   const std::string name_;
   RWLock lock_;
 };
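
StatSet::getStat above keeps the read-lock-then-upgrade idiom that the removed
barrier getStat overload also used. The same idiom, distilled into a
standalone C++17 sketch; Registry is a hypothetical name, and Paddle's RWLock
and ReadLockGuard play the roles of std::shared_mutex and std::shared_lock.

#include <memory>
#include <shared_mutex>
#include <string>
#include <unordered_map>

template <class T>
class Registry {
public:
  std::shared_ptr<T> get(const std::string& name) {
    {
      std::shared_lock<std::shared_mutex> rlock(mu_);  // many readers at once
      auto it = map_.find(name);
      if (it != map_.end()) return it->second;
    }
    std::unique_lock<std::shared_mutex> wlock(mu_);  // exclusive for insert
    auto it = map_.find(name);  // re-check: another thread may have inserted
    if (it != map_.end()) return it->second;
    auto ptr = std::make_shared<T>();
    map_.emplace(name, ptr);
    return ptr;
  }

private:
  std::shared_mutex mu_;
  std::unordered_map<std::string, std::shared_ptr<T>> map_;
};
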
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index a4987c9ec2..0a27b8b97b 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
   ThreadLocal() {
-    PCHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
   }
   ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
 
@@ -65,7 +65,7 @@ public:
     if (!p && createLocal) {
       p = new T();
       int ret = pthread_setspecific(threadSpecificKey_, p);
-      PCHECK(ret == 0);
+      CHECK_EQ(ret, 0);
     }
     return p;
   }
@@ -79,7 +79,7 @@ public:
     if (T* q = get(false)) {
       dataDestructor(q);
     }
-    PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
   }
 
   /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { PCHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
   ~ThreadLocalD() {
     pthread_key_delete(threadSpecificKey_);
     for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
     T* p = (T*)pthread_getspecific(threadSpecificKey_);
     if (!p) {
       p = new T();
-      PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
       updateMap(p);
     }
     return p;
@@ -141,7 +141,7 @@ public:
     if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
       dataDestructor(q);
     }
-    PCHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
     updateMap(p);
   }
 
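
Typical ThreadLocal usage is unchanged by the PCHECK to CHECK_EQ switch; only
the failure report differs (the pthread return code instead of errno). A short
sketch, assuming get() default-creates the per-thread value as in the header
above:

#include <vector>
#include "paddle/utils/ThreadLocal.h"

paddle::ThreadLocal<std::vector<int>> tlsBuffer;

void worker() {
  std::vector<int>* buf = tlsBuffer.get();  // created on first access
  buf->push_back(42);                       // private to the calling thread
}
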
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index b18b73e06a..2755fdd9cd 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -320,6 +320,9 @@ void loadFileList(const std::string& fileListFileName,
 }
 
 double getMemoryUsage() {
+#if defined(__ANDROID__)
+  return 0.0;
+#else
   FILE* fp = fopen("/proc/meminfo", "r");
   CHECK(fp) << "failed to fopen /proc/meminfo";
   size_t bufsize = 256 * sizeof(char);
@@ -357,6 +360,7 @@ double getMemoryUsage() {
   delete[] buf;
   double usedMem = 1.0 - 1.0 * (freeMem + bufMem + cacheMem) / totalMem;
   return usedMem;
+#endif
 }
 
 SyncThreadPool* getGlobalSyncThreadPool() {
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 613844669d..9579881ea3 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -33,6 +33,13 @@ limitations under the License. */
 #include "Flags.h"
 #include "hl_gpu.h"
 
+#if defined(__ANDROID__) && (__ANDROID_API__ < 21)
+inline int rand_r(unsigned int* seedp) {
+  (void)seedp;
+  return rand();
+}
+#endif
+
 /**
  * Loop over the elements in a container
  * TODO(yuyang18): Is this foreach useful? Why not use C++11 foreach,
@@ -211,7 +218,7 @@ protected:
  * *d2* is peer device to enable direct access to by the d1 device.
  */
 inline void enablePeerAccess(int d1, int d2) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (hl_device_can_access_peer(d1, d2)) {
     SetDevice dev(d1);
     hl_device_enable_peer_access(d2);
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index f53d6420bb..004d62451c 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
  * @return return true if paddle compiled with GPU
  */
 constexpr bool isWithGpu() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index 3a0903d1f2..a4e6c8f7b8 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -40,6 +40,8 @@ void Semaphore::wait() { sem_wait(&m->sem); }
 
 void Semaphore::post() { sem_post(&m->sem); }
 
+/// SpinLockPrivate
+
 #ifdef PADDLE_USE_PTHREAD_SPINLOCK
 
 class SpinLockPrivate {
@@ -79,6 +81,8 @@ SpinLock::~SpinLock() { delete m; }
 void SpinLock::lock() { m->lock(); }
 void SpinLock::unlock() { m->unlock(); }
 
+/// ThreadBarrierPrivate
+
 #ifdef PADDLE_USE_PTHREAD_BARRIER
 
 class ThreadBarrierPrivate {
@@ -136,6 +140,8 @@ public:
 
 #endif
 
+/// ThreadBarrier
+
 ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
 ThreadBarrier::~ThreadBarrier() { delete m; }
 void ThreadBarrier::wait() { m->wait(); }
diff --git a/paddle/utils/arch/osx/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
index c8e904d8f9..ac44461578 100644
--- a/paddle/utils/arch/osx/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -15,7 +15,12 @@ limitations under the License. */
 #include "paddle/utils/Excepts.h"
 
 #if defined(__APPLE__) || defined(__OSX__)
-
+#if defined(__arm__) || defined(__arm64__)
+// TODO(liuyiqun): implement the arm version
+int fegetexcept(void) { return -1; }
+int feenableexcept(unsigned int excepts) { return -1; }
+int fedisableexcept(unsigned int excepts) { return -1; }
+#else
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
@@ -48,5 +53,5 @@ int fedisableexcept(unsigned int excepts) {
 
   return (fesetenv(&fenv) ? -1 : old_excepts);
 }
-
+#endif
 #endif
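
A hedged usage sketch for these fe*except shims: on arm/arm64 OSX they are
stubs that return -1, so callers should tolerate failure rather than assume
floating-point traps were enabled. enableFpTraps is an illustrative helper.

#include <fenv.h>
#include "paddle/utils/Excepts.h"

void enableFpTraps() {
  if (feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW) == -1) {
    // unsupported platform (e.g. the arm OSX stubs above); run without traps
  }
}
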
diff --git a/paddle/utils/enable_virtualenv.py b/paddle/utils/enable_virtualenv.py
index ccfaa7c147..4e998381e9 100644
--- a/paddle/utils/enable_virtualenv.py
+++ b/paddle/utils/enable_virtualenv.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 
 
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index aa923b3553..c770ce1698 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -13,6 +13,6 @@ add_executable(
 link_paddle_exe(test_CustomStackTracePrint)
 if(NOT APPLE)
     add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index b5d9f93f13..c320074fba 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) {
       for (size_t i = 0; i < layerSize; ++i) {
         tracer.push("layer_" + paddle::str::to_string(i));
       }
-      tracer.pop("");
       for (size_t i = 0; i < layerSize; ++i) {
         tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
       }
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
index fdf326b17a..6f311fa6b8 100644
--- a/paddle/utils/tests/test_Error.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -18,17 +18,17 @@ limitations under the License. */
 
 TEST(Error, testAll) {
   paddle::Error error;
-  ASSERT_FALSE(error);
+  ASSERT_TRUE(error.isOK());
   error = paddle::Error("I'm the error");
-  ASSERT_TRUE(error);
+  ASSERT_FALSE(error.isOK());
   ASSERT_STREQ("I'm the error", error.msg());
 
   error = paddle::Error("error2");
-  ASSERT_TRUE(error);
+  ASSERT_FALSE(error.isOK());
   ASSERT_STREQ("error2", error.msg());
 
   int i = 3;
   auto error3 = paddle::Error("error%d", i);
-  ASSERT_TRUE(error3);
+  ASSERT_FALSE(error3.isOK());
   ASSERT_STREQ("error3", error3.msg());
 }
diff --git a/paddle/utils/tests/test_StringUtils.cpp b/paddle/utils/tests/test_StringUtils.cpp
index fdc914d1bc..248f58a7f2 100644
--- a/paddle/utils/tests/test_StringUtils.cpp
+++ b/paddle/utils/tests/test_StringUtils.cpp
@@ -18,6 +18,6 @@ limitations under the License. */
 
 TEST(StringUtil, to) {
   ASSERT_NEAR(paddle::str::to<double>("12.45"), 12.45, 1e-5);
-  ASSERT_DEATH(paddle::str::to<double>("12.45x23"), ".*");
-  ASSERT_DEATH(paddle::str::to<int>(""), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<double>("12.45x23"), ".*");
+  ASSERT_DEATH_IF_SUPPORTED(paddle::str::to<int>(""), ".*");
 }
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 62d5b9e38b..556bcd1d7e 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,44 +1,56 @@
-set(proto_filenames
-    DataConfig.proto
-    DataFormat.proto
-    ModelConfig.proto
-    ParameterConfig.proto
-    ParameterService.proto
-    TrainerConfig.proto
-    ParameterServerConfig.proto)
+if (MOBILE_INFERENCE)
+    file(GLOB proto_filenames . ModelConfig.proto ParameterConfig.proto
+         TrainerConfig.proto DataConfig.proto)
+else()
+    file(GLOB proto_filenames . *.proto)
+endif()
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+proto_library(paddle_proto SRCS ${proto_filenames})
 
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
 
 foreach(filename ${proto_filenames})
-    get_filename_component(base_filename ${filename} NAME_WE)
-    set(CUR_PROTO_GEN
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.h
-        ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.cc)
-    set(PROTO_GEN
-        ${PROTO_GEN}
-        ${CUR_PROTO_GEN})
-    add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
-                  --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
-
+    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
+    get_filename_component(FIL_WE ${filename} NAME_WE)
     set(CUR_PROTO_GEN_PY
-        ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
+            ${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
     set(PROTO_GEN_PY
-        ${CUR_PROTO_GEN_PY}
-        ${PROTO_GEN_PY})
+            ${CUR_PROTO_GEN_PY}
+            ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename} ${external_project_dependencies})
+            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+            ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto"
+            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+            DEPENDS ${ABS_FIL} protoc)
 endforeach()
 
-include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
-
-add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
-add_library(paddle_proto STATIC
-    ${PROTO_GEN})
-target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+
+if (WITH_GOLANG)
+    add_custom_target(protoc-gen-go)
+    add_custom_command(TARGET protoc-gen-go
+            COMMAND go 
+            ARGS "get" "-u" "github.com/golang/protobuf/protoc-gen-go")
+
+    set(PROTO_GEN_GO)
+    file(GLOB proto_filenames . OptimizerConfig.proto)
+    foreach(filename ${proto_filenames})
+        message(STATUS ${filename})
+        get_filename_component(ABS_FIL ${filename} ABSOLUTE)
+        get_filename_component(FIL_WE ${filename} NAME_WE)
+        set(CUR_PROTO_GEN_GO
+                ${PADDLE_SOURCE_DIR}/paddle/go/proto/${FIL_WE}.pb.go)
+        set(PROTO_GEN_GO
+                ${CUR_PROTO_GEN_GO}
+                ${PROTO_GEN_GO})
+        add_custom_command(OUTPUT ${CUR_PROTO_GEN_GO}
+                COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+                ARGS "--go_out=${PADDLE_SOURCE_DIR}/go/proto"
+                "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
+                DEPENDS ${ABS_FIL} protoc protoc-gen-go)
+    endforeach()
+    add_custom_target(gen_proto_go ALL DEPENDS ${PROTO_GEN_GO})
+endif()
diff --git a/proto/DataConfig.proto b/proto/DataConfig.proto
index e895c184d9..0cb5d7afbb 100644
--- a/proto/DataConfig.proto
+++ b/proto/DataConfig.proto
@@ -15,14 +15,13 @@ syntax = "proto2";
 
 package paddle;
 
-
 message FileGroupConf {
-  optional uint32 queue_capacity = 1 [default = 1];
+  optional uint32 queue_capacity = 1 [ default = 1 ];
   // how many files to load for a load file thread
-  optional int32 load_file_count = 2 [default = 1];
+  optional int32 load_file_count = 2 [ default = 1 ];
   // how many threads to load files
   // Setting to be 5~10 is appropriate when loading files by hadoop vfs
-  optional int32 load_thread_num = 3 [default = 1];
+  optional int32 load_thread_num = 3 [ default = 1 ];
 };
 
 message DataConfig {
@@ -32,26 +31,28 @@ message DataConfig {
   // name of a text file which contains a list of file names at each line
   optional string files = 3;
 
-  optional int32 feat_dim = 4;//feature dimension of one frame
-  repeated int32 slot_dims = 5;//feature slot dims
-  optional int32 context_len = 6;//max neibour frame numbers
-  optional uint64 buffer_capacity = 7;//the number of samples
+  optional int32 feat_dim = 4;         // feature dimension of one frame
+  repeated int32 slot_dims = 5;        // feature slot dims
+  optional int32 context_len = 6;      // max neighbor frame numbers
+  optional uint64 buffer_capacity = 7; // the number of samples
 
-  //part of data used in training
-  //if not -1, part of train data is used in training
-  optional int64 train_sample_num = 8 [default = -1];
+  // part of data used in training
+  // if not -1, part of train data is used in training
+  optional int64 train_sample_num = 8 [ default = -1 ];
 
-  //The number of documents processed once
-  optional int32  file_load_num = 9 [default = -1];
-  optional bool  async_load_data = 12 [default = false];
+  // The number of documents processed once
+  optional int32 file_load_num = 9 [ default = -1 ];
+  optional bool async_load_data = 12 [ default = false ];
   /// Note the field numbers 10, 11 and 13 have been deprecated.
-  optional bool for_test = 14 [default = false];  // whether this data is for test
+  optional bool for_test = 14
+      [ default = false ]; // whether this data is for test
   optional FileGroupConf file_group_conf = 15;
   repeated int32 float_slot_dims = 16;
 
   /// Note the field numbers 17, 18 and 19 have been deprecated.
 
-  // a list of values which will be used to create additional one dimensional float
+  // a list of values which will be used to create additional one dimensional
+  // float
   // values slots. These one dimensional slots can be used as the weight input
   // for cost layers.
   // Currently this is only supported by ProtoDataProvider.
@@ -65,21 +66,21 @@ message DataConfig {
 
   // for MultiDataProvider
   repeated DataConfig sub_data_configs = 24; // sub dataproviders
-  /*
-   * the ratio of each sub dataproviders:
-   * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
-   * then each mini-batch is combined by 10 instance from A and 90 instances
-   * from B.
-   */
+  /*
+   * the ratio of each sub dataprovider:
+   * e.g. sub dataprovider A's ratio is 1, B's ratio is 9, batch_size is 100,
+   * then each mini-batch is combined from 10 instances of A and 90 instances
+   * of B.
+   */
   optional int32 data_ratio = 25;
   /*
    * if one of the sub dataproviders is running out of data, then
    * (1) it is "main data", then finish current pass.
    * (2) it is not "main data", then reset it, and try getNextBatch again.
    */
-  optional bool is_main_data = 26 [default = true];
+  optional bool is_main_data = 26 [ default = true ];
 
-  // the usage ratio of instances. Setting to 1.0 means the use of all instances.
-  optional double usage_ratio = 27 [default = 1.0];
+  // the usage ratio of instances. Setting to 1.0 means the use of all
+  // instances.
+  optional double usage_ratio = 27 [ default = 1.0 ];
 };
-
diff --git a/proto/DataFormat.proto b/proto/DataFormat.proto
index 19b1499b02..7d963bc29f 100644
--- a/proto/DataFormat.proto
+++ b/proto/DataFormat.proto
@@ -17,27 +17,32 @@ package paddle;
 
 /*
  If values is not empty and ids is empty, this is a dense vector.
- If values is not empty and ids is not empty, this is a sparse vector. The position of each value
+ If values is not empty and ids is not empty, this is a sparse vector. The
+ position of each value
  is specified by ids.
- If values is empty and ids is not empty, this is a sparse vector whose non-zero values are 1.
+ If values is empty and ids is not empty, this is a sparse vector whose non-zero
+ values are 1.
  The position of each 1 is specified by ids.
 */
 message VectorSlot {
-  repeated float values = 1 [packed = true];
-  repeated uint32 ids = 2 [packed = true];
+  repeated float values = 1 [ packed = true ];
+  repeated uint32 ids = 2 [ packed = true ];
   /* For multidimensional data, for example "image width height depth" */
-  repeated uint32 dims = 3 [packed = true];
-  repeated string strs = 4; 
+  repeated uint32 dims = 3 [ packed = true ];
+  repeated string strs = 4;
 };
 
 /*
- SubseqSlot use to record whether VectorSlot or any other slot in future has subseq.
- If not all VectorSlot have subseq, we only store the one who has subseq, and use *slot_id* to record it.
- One vector_slots has one sequence, and it may have N subseq, thus the number of *lens* will be N too. 
+ SubseqSlot is used to record whether a VectorSlot, or any other slot in the
+ future, has subseq.
+ If not all VectorSlots have subseq, we only store the ones that have subseq,
+ and use *slot_id* to record them.
+ One vector_slots has one sequence, and it may have N subseq, thus the number
+ of *lens* will be N too.
 */
 message SubseqSlot {
-  required uint32 slot_id = 1; //the id of slot who has subseq
-  repeated uint32 lens = 2; // lengths of sub-sequence in the slot
+  required uint32 slot_id = 1; // the id of the slot that has subseq
+  repeated uint32 lens = 2;    // lengths of sub-sequence in the slot
 };
 
 message SlotDef {
@@ -45,13 +50,14 @@ message SlotDef {
     VECTOR_DENSE = 0;
     VECTOR_SPARSE_NON_VALUE = 1;
     VECTOR_SPARSE_VALUE = 2;
-    INDEX = 3;  // This can be used as label, or word id, etc.
+    INDEX = 3; // This can be used as label, or word id, etc.
     VAR_MDIM_DENSE = 4;
     VAR_MDIM_INDEX = 5;
     STRING = 6;
   }
   required SlotType type = 1;
-  required uint32 dim = 2;  // For INDEX slots, this means the maximal index plus 1.
+  required uint32 dim =
+      2; // For INDEX slots, this means the maximal index plus 1.
 };
 
 message DataHeader {
@@ -60,11 +66,11 @@ message DataHeader {
 };
 
 message DataSample {
-  optional bool is_beginning = 1 [default = true]; // is the beginning of a sequence
+  optional bool is_beginning = 1
+      [ default = true ]; // is the beginning of a sequence
   repeated VectorSlot vector_slots = 2;
-  repeated uint32 id_slots = 3 [packed = true];
+  repeated uint32 id_slots = 3 [ packed = true ];
   /* use ids of VectorSlot */
   repeated VectorSlot var_id_slots = 4;
   repeated SubseqSlot subseq_slots = 5;
 };
-
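
The three encodings described in the comment above map directly onto the
generated C++ API. An illustrative sketch, assuming the protoc output
DataFormat.pb.h is on the include path:

#include "DataFormat.pb.h"

void buildSlots() {
  paddle::VectorSlot dense;   // values only -> dense vector
  dense.add_values(0.5f);
  dense.add_values(1.5f);

  paddle::VectorSlot sparse;  // values + ids -> sparse vector
  sparse.add_values(3.0f);
  sparse.add_ids(7);          // position of the 3.0 entry

  paddle::VectorSlot binary;  // ids only -> non-zero values are all 1
  binary.add_ids(2);
  binary.add_ids(9);
}
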
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f9b53d6f6..1fbdd5bbd8 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -21,7 +21,6 @@ package paddle;
  * Various structs for the configuration of a neural network
  */
 
-
 message ExternalConfig {
   repeated string layer_names = 1;
   repeated string input_layer_names = 2;
@@ -68,7 +67,7 @@ message ConvConfig {
   required uint32 img_size = 8;
 
   // caffe mode for output size coherence
-  required bool caffe_mode = 9 [default = true];
+  required bool caffe_mode = 9 [ default = true ];
 
   // if filter_size_y is set , this convolutional layer will use
   // filters of size filter_size * filter_size_y pixels.
@@ -83,6 +82,15 @@ message ConvConfig {
 
   // if not set, use img_size
   optional uint32 img_size_y = 14;
+
+  optional uint32 dilation = 15 [ default = 1 ];
+  optional uint32 dilation_y = 16 [ default = 1 ];
+
+  optional uint32 filter_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
+  optional uint32 stride_z = 19 [ default = 1 ];
+  optional uint32 output_z = 20 [ default = 1 ];
+  optional uint32 img_size_z = 21 [ default = 1 ];
 }
 
 message PoolConfig {
@@ -99,7 +107,7 @@ message PoolConfig {
   optional uint32 start = 4;
 
   // Defines the stride size between successive pooling squares.
-  required uint32 stride = 5 [default = 1];
+  required uint32 stride = 5 [ default = 1 ];
 
   // The size of output feature map.
   required uint32 output_x = 6;
@@ -109,7 +117,7 @@ message PoolConfig {
 
   // padding = 4, instructs the net to implicitly
   // pad the images with a 4-pixel border of zeros.
-  optional uint32 padding = 8 [default = 0];
+  optional uint32 padding = 8 [ default = 0 ];
 
   // if not set, use size_x
   optional uint32 size_y = 9;
@@ -125,6 +133,14 @@ message PoolConfig {
 
   // if not set, use padding
   optional uint32 padding_y = 13;
+
+  optional uint32 size_z = 14 [ default = 1 ];
+  optional uint32 stride_z = 15 [ default = 1 ];
+  optional uint32 output_z = 16 [ default = 1 ];
+  optional uint32 img_size_z = 17 [ default = 1 ];
+  optional uint32 padding_z = 18 [ default = 1 ];
+
+  optional bool exclude_mode = 19;
 }
 
 message SppConfig {
@@ -194,6 +210,13 @@ message MaxOutConfig {
   required uint32 groups = 2;
 }
 
+message RowConvConfig { required uint32 context_length = 1; }
+
+message SliceConfig {
+  required uint32 start = 1;
+  required uint32 end = 2;
+}
+
 message ProjectionConfig {
   required string type = 1;
   required string name = 2;
@@ -203,17 +226,21 @@ message ProjectionConfig {
   // For ShiftProjection
   optional int32 context_start = 5;
   optional int32 context_length = 6;
-  optional bool trainable_padding = 7 [default = false];
+  optional bool trainable_padding = 7 [ default = false ];
 
   // For convolution
   optional ConvConfig conv_conf = 8;
   optional int32 num_filters = 9;
 
   // For IdentityOffsetProjection
-  optional uint64 offset = 11 [default = 0];
+  optional uint64 offset = 11 [ default = 0 ];
 
   // For pool
   optional PoolConfig pool_conf = 12;
+
+  // For slice
+  // Each slice output is the input[start, end)
+  repeated SliceConfig slices = 13;
 }
 
 message OperatorConfig {
@@ -223,7 +250,7 @@ message OperatorConfig {
   required uint64 output_size = 4;
 
   // For DotMulOperator
-  optional double dotmul_scale = 5 [default = 1.0];
+  optional double dotmul_scale = 5 [ default = 1.0 ];
 
   // For ConvOperator
   optional ConvConfig conv_conf = 6;
@@ -246,6 +273,7 @@ message ImageConfig {
   // The size of input feature map.
   required uint32 img_size = 8;
   optional uint32 img_size_y = 9;
+  optional uint32 img_size_z = 10 [ default = 1 ];
 }
 
 message PriorBoxConfig {
@@ -262,6 +290,52 @@ message PadConfig {
   repeated uint32 pad_w = 4;
 }
 
+message ReshapeConfig {
+  repeated uint32 height_axis = 1;
+  repeated uint32 width_axis = 2;
+}
+
+message MultiBoxLossConfig {
+  required uint32 num_classes = 1;
+  required float overlap_threshold = 2;
+  required float neg_pos_ratio = 3;
+  required float neg_overlap = 4;
+  required uint32 background_id = 5;
+  required uint32 input_num = 6;
+  optional uint32 height = 7 [ default = 1 ];
+  optional uint32 width = 8 [ default = 1 ];
+}
+
+message DetectionOutputConfig {
+  required uint32 num_classes = 1;
+  required float nms_threshold = 2;
+  required uint32 nms_top_k = 3;
+  required uint32 background_id = 4;
+  required uint32 input_num = 5;
+  required uint32 keep_top_k = 6;
+  required float confidence_threshold = 7;
+  optional uint32 height = 8 [ default = 1 ];
+  optional uint32 width = 9 [ default = 1 ];
+}
+
+message ClipConfig {
+  required double min = 1;
+  required double max = 2;
+}
+
+message ROIPoolConfig {
+  required uint32 pooled_width = 1;
+  required uint32 pooled_height = 2;
+  required float spatial_scale = 3;
+  optional uint32 height = 4 [ default = 1 ];
+  optional uint32 width = 5 [ default = 1 ];
+}
+
+message ScaleSubRegionConfig {
+  required ImageConfig image_conf = 1;
+  required float value = 2;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -279,14 +353,19 @@ message LayerInputConfig {
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
   optional PadConfig pad_conf = 14;
+  optional RowConvConfig row_conv_conf = 15;
+  optional MultiBoxLossConfig multibox_loss_conf = 16;
+  optional DetectionOutputConfig detection_output_conf = 17;
+  optional ClipConfig clip_conf = 18;
+  optional ScaleSubRegionConfig scale_sub_region_conf = 19;
+  optional ROIPoolConfig roi_pool_conf = 20;
 }
 
 message LayerConfig {
-
   required string name = 1;
   required string type = 2;
   optional uint64 size = 3;
-  //optional ActivationConfig activation = 4;
+  // optional ActivationConfig activation = 4;
   optional string active_type = 4;
   repeated LayerInputConfig inputs = 5;
   optional string bias_parameter_name = 6;
@@ -299,7 +378,7 @@ message LayerConfig {
   // (which is how convnets are usually trained). Setting this to
   // false will untie the biases, yielding a separate bias for
   // every location at which the filter is applied.
-  optional bool shared_biases = 8 [default = false];
+  optional bool shared_biases = 8 [ default = false ];
 
   // Valid values are ones that divide the area of the output
   // grid in this convolutional layer. For example if this layer
@@ -317,33 +396,35 @@ message LayerConfig {
 
   // the gpu device which the Layer's data in.
   // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 12 [default = -1];
+  optional int32 device = 12 [ default = -1 ];
 
-  // for recurrent layer. If true, the recurrence runs from the end to the beginning.
-  optional bool reversed = 13 [default = false];
+  // for recurrent layer. If true, the recurrence runs from the end to the
+  // beginning.
+  optional bool reversed = 13 [ default = false ];
 
-  // for lstmemory layer. Different types of nodes have different activation type.
-  optional string active_gate_type  = 14;
+  // for lstmemory layer. Different types of nodes have different activation
+  // type.
+  optional string active_gate_type = 14;
   optional string active_state_type = 15;
 
   // For NCELayer
   // The number of random negative labels for each sample
-  optional int32 num_neg_samples = 16 [default = 10];
+  optional int32 num_neg_samples = 16 [ default = 10 ];
 
   // For NCELayer
   // The distribution for generating the random negative labels.
   // A uniform distribution will be used if not provided
-  repeated double neg_sampling_dist = 17 [packed = true];
+  repeated double neg_sampling_dist = 17 [ packed = true ];
 
   // For MaxLayer
   // default: output VALUE of MaxLayer. set this flag to true for output INDEX
   // INDEX will be put in Argument::value as double values.
-  optional bool output_max_index = 19 [default = false];
+  optional bool output_max_index = 19 [ default = false ];
 
   /// The field number 20 has been deprecated.
 
   // For self-normalized estimation
-  optional double softmax_selfnorm_alpha = 21 [default = 0.1];
+  optional double softmax_selfnorm_alpha = 21 [ default = 0.1 ];
 
   /// The field numbers 22 and 23 have been deprecated.
 
@@ -354,14 +435,14 @@ message LayerConfig {
   optional bool norm_by_times = 25;
 
   // for CostLayers
-  optional double coeff = 26 [default = 1.0];
+  optional double coeff = 26 [ default = 1.0 ];
 
   // for AverageLayer
   // can be set to: 'average', 'sum' or 'squarerootn'
   optional string average_strategy = 27;
 
   // for error clipping
-  optional double error_clipping_threshold = 28 [default = 0.0];
+  optional double error_clipping_threshold = 28 [ default = 0.0 ];
 
   // for operators used by mixed layer
   repeated OperatorConfig operator_confs = 29;
@@ -389,43 +470,44 @@ message LayerConfig {
   optional uint32 beam_size = 39;
 
   // for seqlastins layer, whether select first instead last
-  optional bool select_first = 40 [default = false];
+  optional bool select_first = 40 [ default = false ];
 
   // for seqlastins layer, AverageLayer, MaxLayer and ExpandLayer
   // can be set to: 'non-seq','seq'
-  optional string trans_type = 41 [default = 'non-seq'];
+  optional string trans_type = 41 [ default = 'non-seq' ];
 
   // to indicate whether selective_fc layer
   // is used in sequence generation or not
-  optional bool selective_fc_pass_generation = 42 [default = false];
+  optional bool selective_fc_pass_generation = 42 [ default = false ];
 
   // to indicate whether selective_fc layer take its last input to
   // selected several columns and only compute the multiplications
   // between the input matrices and the selected columns of
   // the parameter matrices of this layer.
   // if set false, selective_fc degrades into fc.
-  optional bool has_selected_colums = 43 [default = true];
+  optional bool has_selected_colums = 43 [ default = true ];
 
   // this parameter is for speed consideration.
   // if number of the selected columns is less than
   // sample number * selective_fc output size * selective_fc_mull_mull_ratio
   // sparse multiplication is used, otherwise, using full multiplication.
-  optional double selective_fc_full_mul_ratio = 44 [default = 0.02];
+  optional double selective_fc_full_mul_ratio = 44 [ default = 0.02 ];
 
   // to indicate how many threads selective_fc uses to accelerate
   // the plain_mul period
   // leave empty or set to 0 to disable multi-thread acceleration
-  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45 [default = 0];
+  optional uint32 selective_fc_parallel_plain_mul_thread_num = 45
+      [ default = 0 ];
 
   // for batch normalization layer
   // if set use_global_stats true, will use the loaded mean and variance.
   optional bool use_global_stats = 46;
 
   // used to compute the moving mean and variance.
-  optional double moving_average_fraction = 47 [default = 0.9];
+  optional double moving_average_fraction = 47 [ default = 0.9 ];
 
   // bias size
-  optional uint32 bias_size = 48 [default = 0];
+  optional uint32 bias_size = 48 [ default = 0 ];
 
   // this parameter can be used as a user-defined parameter when necessary,
   // without changing the proto file.
@@ -440,12 +522,33 @@ message LayerConfig {
   optional uint64 width = 51;
 
   // blank label used in ctc loss
-  optional uint32 blank = 52 [default = 0];
+  optional uint32 blank = 52 [ default = 0 ];
 
-  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which 
+  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
   // controls the scope of pooling operation. can be set > 0.
   // leave empty or set to -1 to disable this stride pooling.
-  optional int32 seq_pool_stride = 53 [default = -1];
+  optional int32 seq_pool_stride = 53 [ default = -1 ];
+
+  // for crop layer
+  optional int32 axis = 54 [ default = 2 ];
+  repeated uint32 offset = 55;
+  repeated uint32 shape = 56;
+
+  // for HuberRegressionLoss
+  optional double delta = 57 [ default = 1.0 ];
+
+  // for 3D data
+  optional uint64 depth = 58 [ default = 1 ];
+
+  // for switch order layer
+  optional ReshapeConfig reshape_conf = 59;
+
+  // for batch normalization layer
+  // The small constant added to the variance to improve numeric stability.
+  optional double epsilon = 60 [ default = 0.00001 ];
+
+  // for factorization machine layer
+  optional uint32 factor_size = 61;
 }
 
 message EvaluatorConfig {
@@ -461,9 +564,9 @@ message EvaluatorConfig {
 
   // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
   // For multi binary labels: true if output > classification_threshold
-  optional double classification_threshold = 6 [default = 0.5];
+  optional double classification_threshold = 6 [ default = 0.5 ];
   // The positive label. -1 means average precision and recall
-  optional int32 positive_label = 7 [default = -1];
+  optional int32 positive_label = 7 [ default = -1 ];
 
   // load dict from this file
   optional string dict_file = 8;
@@ -472,10 +575,10 @@ message EvaluatorConfig {
   optional string result_file = 9;
 
   // top # results for max id printer
-  optional int32 num_results = 10 [default = 1];
+  optional int32 num_results = 10 [ default = 1 ];
 
   // whether to delimit the sequence in the seq_text_printer
-  optional bool delimited = 11 [default = true];
+  optional bool delimited = 11 [ default = true ];
 
   // Used by ChunkEvaluator
   // chunk of these types are not counted
@@ -483,14 +586,23 @@ message EvaluatorConfig {
 
   // Used by ClassificationErrorEvaluator
   // top # classification error
-  optional int32 top_k = 13 [default = 1];
+  optional int32 top_k = 13 [ default = 1 ];
+
+  // Used by DetectionMAPEvaluator
+  optional double overlap_threshold = 14 [ default = 0.5 ];
+
+  optional int32 background_id = 15 [ default = 0 ];
+
+  optional bool evaluate_difficult = 16 [ default = false ];
+
+  optional string ap_type = 17 [ default = "11point" ];
 }
 
 message LinkConfig {
   required string layer_name = 1;
   required string link_name = 2;
   // If true, this link has sub-sequence
-  optional bool has_subseq = 3 [default = false];
+  optional bool has_subseq = 3 [ default = false ];
 }
 
 message MemoryConfig {
@@ -503,18 +615,18 @@ message MemoryConfig {
   optional uint32 boot_with_const_id = 7;
 
   // memory is a sequence, initialized by a sequence boot layer
-  optional bool is_sequence = 6 [default = false];
+  optional bool is_sequence = 6 [ default = false ];
 }
 
 message GeneratorConfig {
   required uint32 max_num_frames = 1;
   required string eos_layer_name = 2;
-  optional int32 num_results_per_sample = 3 [default = 1];
+  optional int32 num_results_per_sample = 3 [ default = 1 ];
 
   // for beam search
-  optional int32 beam_size = 4 [default = 1];
+  optional int32 beam_size = 4 [ default = 1 ];
 
-  optional bool log_prob = 5 [default = true];
+  optional bool log_prob = 5 [ default = true ];
 }
 
 message SubModelConfig {
@@ -524,10 +636,10 @@ message SubModelConfig {
   repeated string output_layer_names = 4;
   repeated string evaluator_names = 5;
 
-  optional bool is_recurrent_layer_group = 6 [default = false];
+  optional bool is_recurrent_layer_group = 6 [ default = false ];
 
   // If true, the recurrence runs from the end to the beginning.
-  optional bool reversed = 7 [default = false];
+  optional bool reversed = 7 [ default = false ];
 
   // name and link name of memory
   repeated MemoryConfig memories = 8;
@@ -541,14 +653,15 @@ message SubModelConfig {
 
   optional GeneratorConfig generator = 11;
 
-  // the id of inlink which share info with outlinks, used in recurrent layer group
+  // the id of the inlink which shares info with outlinks, used in recurrent
+  // layer group
   optional int32 target_inlinkid = 12;
 }
 
 message ModelConfig {
   // type of the model.
   // Currently, "nn", "recurrent_nn" and "recursive_nn" are supported
-  required string type = 1 [default = "nn"];
+  required string type = 1 [ default = "nn" ];
 
   // layers should be ordered in such a way that the forward propagation
   // can be correctly executed by going from the first layer to the last layer
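
A hedged sketch of filling the new ConvConfig dilation fields through the
generated API. It assumes the regenerated ModelConfig.pb.h; the setters for
the pre-existing required fields are assumed from the full message, and the
concrete sizes are placeholder values.

#include "ModelConfig.pb.h"

paddle::ConvConfig makeDilatedConv() {
  paddle::ConvConfig conv;
  conv.set_filter_size(3);
  conv.set_channels(64);
  conv.set_padding(1);
  conv.set_stride(1);
  conv.set_groups(1);
  conv.set_filter_channels(64);
  conv.set_output_x(32);
  conv.set_img_size(32);
  conv.set_caffe_mode(true);
  conv.set_dilation(2);    // new field 15: sample every other input pixel
  conv.set_dilation_y(2);  // new field 16
  return conv;
}
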
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
new file mode 100644
index 0000000000..b341d78d19
--- /dev/null
+++ b/proto/OptimizerConfig.proto
@@ -0,0 +1,164 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+syntax = "proto2";
+
+option optimize_for = LITE_RUNTIME;
+
+package paddle;
+
+message SGDConfig {
+  // SGD
+  // momentum: float >= 0. Parameter updates momentum.
+  // decay: float >= 0. Learning rate decay over each update.
+  // nesterov: boolean. Whether to apply Nesterov momentum.
+  optional double momentum = 21 [ default = 0.0 ];
+  optional double decay = 23 [ default = 0.0 ];
+  optional bool nesterov = 24 [ default = false ];
+}
+
+message AdadeltaConfig {
+  // Adadelta
+  // It is recommended to leave it at the default value.
+  // rho: float >= 0.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+
+  // reference : [Adadelta - an adaptive learning rate
+  // method](http://arxiv.org/abs/1212.5701)
+  optional double rho = 33 [ default = 0.90 ];
+  optional double epsilon = 31 [ default = 1e-5 ];
+  optional double decay = 32 [ default = 0.0 ];
+}
+
+message AdagradConfig {
+  // Adagrad
+  // epsilon: float >= 0.
+  // decay: float >= 0. Learning rate decay over each update.
+
+  // reference : [Adaptive Subgradient Methods for Online Learning and
+  // Stochastic
+  // Optimization](http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+  optional double epsilon = 41 [ default = 1e-5 ];
+  optional double decay = 42 [ default = 0.0 ];
+}
+
+message AdamConfig {
+  // Adam
+  // beta_1: float, 0 < beta < 1. Generally close to 1.
+  // beta_2: float, 0 < beta < 1. Generally close to 1.
+  // epsilon: float >= 0. Fuzz factor.
+  // decay: float >= 0. Learning rate decay over each update.
+  // reference : [Adam - A Method for Stochastic
+  // Optimization](http://arxiv.org/abs/1412.6980v8)
+  optional double beta_1 = 41;
+  optional double beta_2 = 42;
+  optional double epsilon = 43;
+  optional double decay = 44;
+}
+
+message ConstLrConfig {
+  // learning rate policy
+  optional double learning_rate = 1 [ default = 1.0 ];
+}
+
+message LinearLrConfig {
+  // learninRate Policy
+  optional double learning_rate = 1 [ default = 1.0 ];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
+}
+
+message TensorProto {
+  enum DataType {
+    PADDLE_ELEMENT_TYPE_INT32 = 0;
+    PADDLE_ELEMENT_TYPE_UINT32 = 1;
+    PADDLE_ELEMENT_TYPE_INT64 = 2;
+    PADDLE_ELEMENT_TYPE_UINT64 = 3;
+    PADDLE_ELEMENT_TYPE_FLOAT32 = 4;
+    PADDLE_ELEMENT_TYPE_FLOAT64 = 5;
+  }
+  optional DataType data_type = 1;
+  repeated bytes content = 2;
+}
+
+message LrPolicyState {
+  // learning rate policy
+  optional double learning_rate = 1 [ default = 1.0 ];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
+}
+
+message SGDOptimizerState {
+  optional LrPolicyState lr_state = 101;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+}
+
+message AdadeltaOptimizerState {
+  // learning rate policy
+  optional LrPolicyState lr_state = 101;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+  optional TensorProto accum_delta = 3;
+  optional TensorProto update_delta = 4;
+}
+
+message AdagradOptimizerState {
+  optional LrPolicyState lr_state = 101;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto accum_gradient = 2;
+}
+
+message AdamOptimizerState {
+  optional LrPolicyState lr_state = 101;
+  optional double num_sample_passed = 104;
+  // state
+  optional TensorProto parameter = 1;
+  optional TensorProto momentums = 2;
+  optional TensorProto velocitys = 3;
+}
+
+message OptimizerConfig {
+  enum Optimizer {
+    SGD = 1;
+    Adadelta = 2;
+    Adagrad = 3;
+    Adam = 4;
+  }
+  optional Optimizer optimizer = 1;
+  optional SGDConfig sgd = 3;
+  optional AdadeltaConfig adadelta = 4;
+  optional AdagradConfig adagrad = 5;
+  optional AdamConfig adam = 6;
+
+  enum LrPolicy {
+    Const = 0;
+    Linear = 1;
+  }
+  optional LrPolicy lr_policy = 11;
+  optional ConstLrConfig const_lr = 12;
+  optional LinearLrConfig linear_lr = 13;
+
+  // common config of optimizer
+  // gradient clip when L2 exceeding value
+  optional double clip_norm = 101;
+  // gradient clip when L1 exceeding value
+  optional double clip_value = 102;
+}
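As a quick orientation for readers of this new file: the messages above are plain proto2 and can be driven from Python once compiled. A minimal sketch, assuming the file has been run through protoc so that a generated OptimizerConfig_pb2 module is importable (the module name follows protoc's default naming and is an assumption here, not part of this change):

import OptimizerConfig_pb2 as opt_pb

config = opt_pb.OptimizerConfig()
config.optimizer = opt_pb.OptimizerConfig.Adam  # one of the Optimizer enum values
config.adam.beta_1 = 0.9
config.adam.beta_2 = 0.999
config.adam.epsilon = 1e-8

config.lr_policy = opt_pb.OptimizerConfig.Const
config.const_lr.learning_rate = 0.001

config.clip_norm = 5.0  # common gradient-clipping option shared by all optimizers

data = config.SerializeToString()  # bytes, ready to cross the language boundary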
diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto
index cbcd0af598..b13570a2c6 100644
--- a/proto/ParameterConfig.proto
+++ b/proto/ParameterConfig.proto
@@ -25,56 +25,59 @@ enum ParameterInitStrategy {
 }
 
 message ParameterUpdaterHookConfig {
+  // hook type, such as 'pruning'
   required string type = 1;
-  optional string purning_mask_filename = 2;
+  // the ratio of elements to be set to zero in the parameter
+  optional double sparsity_ratio = 2 [ default = 0.6 ];
 }
 
 message ParameterConfig {
   required string name = 1;
   required uint64 size = 2;
-  optional double learning_rate = 3 [default = 1.0];
-  optional double momentum = 4 [default = 0.0];
-  optional double initial_mean = 5 [default = 0.0];
-  optional double initial_std = 6 [default = 0.01];
+  optional double learning_rate = 3 [ default = 1.0 ];
+  optional double momentum = 4 [ default = 0.0 ];
+  optional double initial_mean = 5 [ default = 0.0 ];
+  optional double initial_std = 6 [ default = 0.01 ];
   // use L2-regularization if decay_rate set and decay_rate_l1 not set
-  optional double decay_rate = 7 [default = 0.0];
+  optional double decay_rate = 7 [ default = 0.0 ];
   // use L1-regularization if decay_rate_l1 set
-  optional double decay_rate_l1 = 8 [default = 0.0];
+  optional double decay_rate_l1 = 8 [ default = 0.0 ];
   // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
   repeated uint64 dims = 9;
   // the gpu device which the parameter in.
   // Only used by ParallelNeuralNetork. Ignored otherwise.
-  optional int32 device = 10 [default = -1];
+  optional int32 device = 10 [ default = -1 ];
   // how to init the parameter: 0 -> normal, 1 -> uniform
   // 0: treat initial_mean as mean, initial_std as standard deviation
   // 1: range is (initial_mean - initial_std) to (initial_mean + initial_std)
-  optional int32 initial_strategy = 11 [default = 0];
+  optional int32 initial_strategy = 11 [ default = 0 ];
   // define the variance when init the parameter, by height of the Matrix
-  optional bool initial_smart = 12 [default = false];
+  optional bool initial_smart = 12 [ default = false ];
   // apply regularization every # batches
-  optional int32 num_batches_regularization = 13 [default = 1];
+  optional int32 num_batches_regularization = 13 [ default = 1 ];
   // if is_sparse is true, para is sparse, else para is dense
-  optional bool is_sparse = 14[default = false];
-  // if para is sparse, format should be "csc" or "csr", empty means is not sparse
-  optional string format = 15 [default = ""];
+  optional bool is_sparse = 14 [ default = false ];
+  // if para is sparse, format should be "csc" or "csr"; empty means it is
+  // not sparse
+  optional string format = 15 [ default = "" ];
   // sparse remote update or not
-  optional bool sparse_remote_update = 16 [default = false];
+  optional bool sparse_remote_update = 16 [ default = false ];
   // gradient clipping threshold, no clipping by default
-  optional double gradient_clipping_threshold = 17 [default = 0.0];
+  optional double gradient_clipping_threshold = 17 [ default = 0.0 ];
   // static parameters are fixed when training
-  optional bool is_static = 18 [default = false];
+  optional bool is_static = 18 [ default = false ];
   // para_id should NOT be set by config_parser. It is for
   // internal use.
   optional uint64 para_id = 19;
 
   repeated ParameterUpdaterHookConfig update_hooks = 20;
   // setup load mat -> csr
-  optional bool need_compact = 21 [default = false];
+  optional bool need_compact = 21 [ default = false ];
   // whether to do sparse update for this parameter
-  optional bool sparse_update = 22 [default = false];
+  optional bool sparse_update = 22 [ default = false ];
 
   // whether this parameter is shared or not.
-  optional bool is_shared = 23 [default = false];
+  optional bool is_shared = 23 [ default = false ];
   // parameter block size
-  optional uint64 parameter_block_size = 24 [default = 0];
+  optional uint64 parameter_block_size = 24 [ default = 0 ];
 }
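For context on the replaced field: a pruning hook no longer points at a mask file but carries the target sparsity directly. A minimal sketch of configuring it, assuming a protoc-generated ParameterConfig_pb2 module (the module name is an assumption) and a hypothetical parameter name:

import ParameterConfig_pb2 as param_pb

param = param_pb.ParameterConfig()
param.name = "fc1.w0"  # hypothetical parameter name
param.size = 1024

hook = param.update_hooks.add()
hook.type = "pruning"
hook.sparsity_ratio = 0.8  # 80% of the parameter's elements become zero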
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
index 404f961379..bd63cf35b1 100644
--- a/proto/ParameterServerConfig.proto
+++ b/proto/ParameterServerConfig.proto
@@ -15,13 +15,10 @@ syntax = "proto2";
 
 package paddle;
 
-
 /**
  * Configuration structure for ParameterClient2.
  */
-message ParameterClientConfig {
-  required int32 trainer_id = 1;
-}
+message ParameterClientConfig { required int32 trainer_id = 1; }
 
 /**
  * Configuration structure for ParameterServer2.
@@ -30,24 +27,24 @@ message ParameterServerConfig {
   // Number of ports for sending dense parameter,
   // following ports on parameter server will be visited
   // for sending dense parameter: [port, port+ports_num-1]
-  required int32 ports_num = 1 [default = 1];
+  required int32 ports_num = 1 [ default = 1 ];
   // Number of ports for sending sparse parameter,
   // following ports on parameter server will be visited
   // for sending sparse parameter:
   // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
-  required int32 ports_num_for_sparse = 2 [default = 0];
+  required int32 ports_num_for_sparse = 2 [ default = 0 ];
   // network device name for pservers
-  required string nics = 3 [default = "xgbe0,xgbe1"];
-  required string rdma_tcp = 4 [default = "tcp"];
+  required string nics = 3 [ default = "xgbe0,xgbe1" ];
+  required string rdma_tcp = 4 [ default = "tcp" ];
   // Listening port for pserver
-  required int32 port = 5 [default = 20134];
+  required int32 port = 5 [ default = 20134 ];
   // number of gradient servers
-  required int32 num_gradient_servers = 6 [default = 1];
+  required int32 num_gradient_servers = 6 [ default = 1 ];
   // number of threads for sync op exec
-  required int32 pserver_num_threads = 7 [default = 1];
+  required int32 pserver_num_threads = 7 [ default = 1 ];
   // control config_.async_lagged_grad_discard_ratio() min value
-  required double async_lagged_ratio_min = 8 [default = 1.0];
+  required double async_lagged_ratio_min = 8 [ default = 1.0 ];
   // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
   // use it as default value
-  required double async_lagged_ratio_default = 9 [default = 1.5];
+  required double async_lagged_ratio_default = 9 [ default = 1.5 ];
 }
\ No newline at end of file
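The port-range comments in ParameterServerConfig translate into simple arithmetic. A small self-contained sketch (plain Python, no Paddle APIs) of which ports a parameter client visits:

def pserver_ports(port, ports_num, ports_num_for_sparse):
    """Ports visited for dense and sparse parameters, per the comments above."""
    dense = list(range(port, port + ports_num))
    sparse = list(range(port + ports_num,
                        port + ports_num + ports_num_for_sparse))
    return dense, sparse

# With the defaults above (port=20134, ports_num=1, ports_num_for_sparse=0):
print(pserver_ports(20134, 1, 0))  # ([20134], [])
print(pserver_ports(20134, 2, 2))  # ([20134, 20135], [20136, 20137])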
diff --git a/proto/ParameterService.proto b/proto/ParameterService.proto
index c1c04d8cc5..e3c180ccc3 100644
--- a/proto/ParameterService.proto
+++ b/proto/ParameterService.proto
@@ -23,8 +23,8 @@ package paddle;
  */
 enum ParameterUpdateMode {
   // Set parameter
-   PSERVER_UPDATE_MODE_SET_PARAM = 0;//use local param
-   PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1;//set zero param
+  PSERVER_UPDATE_MODE_SET_PARAM = 0;      // use local param
+  PSERVER_UPDATE_MODE_SET_PARAM_ZERO = 1; // set zero param
 
   // Update parameter once a gradient is received
   PSERVER_UPDATE_MODE_ASYNC_SGD = 2;
@@ -37,7 +37,7 @@ enum ParameterUpdateMode {
 
   // No update. Only get parameters back.
   PSERVER_UPDATE_MODE_GET_PARAM = 5;
-  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6;//only get sparse rows
+  PSERVER_UPDATE_MODE_GET_PARAM_SPARSE = 6; // only get sparse rows
 };
 
 message ParameterBlock {
@@ -80,42 +80,34 @@ message SendParameterRequest {
   optional int32 trainer_id = 7;
 
   // send back parameter type on pserver, PARAMETER_VALUE by default
-  optional int32 send_back_parameter_type = 8 [default = 0];
+  optional int32 send_back_parameter_type = 8 [ default = 0 ];
 
   // forwardbackward time in usec
   optional uint64 forwardbackward_time = 9;
-
 }
 
-message WaitPassStartRequest {
-}
+message WaitPassStartRequest {}
 
-message WaitPassStartResponse {
-}
+message WaitPassStartResponse {}
 
-message WaitPassFinishRequest {
-}
+message WaitPassFinishRequest {}
 
-message WaitPassFinishResponse {
-}
+message WaitPassFinishResponse {}
 
 enum SyncObject {
   SYNC_DEFAULT = 0; // wait for the synchronizeBarrier_
-  SYNC_DATA = 1; // wait for the synchronizeDataBarrier_
+  SYNC_DATA = 1;    // wait for the synchronizeDataBarrier_
 }
 
 message SynchronizeRequest {
-  required SyncObject sync_object_id = 1 [default = SYNC_DEFAULT];
+  required SyncObject sync_object_id = 1 [ default = SYNC_DEFAULT ];
 
   optional int32 trainer_id = 2;
 }
 
-message SynchronizeResponse {
-}
+message SynchronizeResponse {}
 
-message SendParameterResponse  {
-  repeated ParameterBlock blocks = 1;
-}
+message SendParameterResponse { repeated ParameterBlock blocks = 1; }
 
 message SetConfigRequest {
   repeated ParameterConfig param_configs = 1;
@@ -125,26 +117,18 @@ message SetConfigRequest {
   required bool is_sparse_server = 6;
 }
 
-message SetConfigResponse{
-}
+message SetConfigResponse {}
 
-message GetStatusRequest {
-}
+message GetStatusRequest {}
 
-message GetStatusResponse {
-  required PServerStatus status = 1;
-}
+message GetStatusResponse { required PServerStatus status = 1; }
 
-message SetStatusRequest {
-  required PServerStatus status = 1;
-}
+message SetStatusRequest { required PServerStatus status = 1; }
 
-message SetStatusResponse {
-}
+message SetStatusResponse {}
 
 // create a column vector. The size is the dimension of parameter
-message CreateVectorRequest {
-}
+message CreateVectorRequest {}
 
 message CreateVectorResponse {
   // error message. Empty if success
@@ -153,9 +137,7 @@ message CreateVectorResponse {
   required int64 handle = 2;
 }
 
-message ReleaseVectorRequest {
-  required int64 handle = 1;
-}
+message ReleaseVectorRequest { required int64 handle = 1; }
 
 message ReleaseVectorResponse {
   // error message. Empty if success
@@ -164,9 +146,7 @@ message ReleaseVectorResponse {
 
 // Create a column major matrix. The number of rows is the dimension
 // of parameter. The number of columns is specified by num_cols
-message CreateMatrixRequest {
-  required int32 num_cols = 1;
-}
+message CreateMatrixRequest { required int32 num_cols = 1; }
 
 message CreateMatrixResponse {
   // error message. Empty if success
@@ -175,16 +155,13 @@ message CreateMatrixResponse {
   required int64 handle = 2;
 }
 
-message ReleaseMatrixRequest {
-  required int64 handle = 1;
-}
+message ReleaseMatrixRequest { required int64 handle = 1; }
 
 message ReleaseMatrixResponse {
   // error message. Empty if success
   optional string return_message = 1;
 }
 
-
 /**
  * The operations are defined using the variables commented at Operation
  * and OperationResult
@@ -245,36 +222,36 @@ enum MatrixVectorOperation {
 
 message ProtoVector {
   required int64 dim = 1;
-  repeated double values = 2 [packed = true];
+  repeated double values = 2 [ packed = true ];
 }
 
 message ProtoMatrix {
   required int64 num_rows = 1;
   required int64 num_cols = 2;
-  repeated double values = 3 [packed = true];
+  repeated double values = 3 [ packed = true ];
 }
 
 message Operation {
   required MatrixVectorOperation operation = 1;
 
   // vector handles created on the pserver
-  repeated int64 pvectors = 2;        // u, v, w
+  repeated int64 pvectors = 2; // u, v, w
 
   // matrix handles created on the pserver
-  repeated int64 pmatrices = 3;       // A, B, C
+  repeated int64 pmatrices = 3; // A, B, C
 
-  repeated double scalars = 4;  	      // a, b, c
-  repeated ProtoVector vectors = 5;   // x, y, z
-  repeated ProtoMatrix matrices = 6;  // X, Y, Z
+  repeated double scalars = 4;       // a, b, c
+  repeated ProtoVector vectors = 5;  // x, y, z
+  repeated ProtoMatrix matrices = 6; // X, Y, Z
 }
 
 message OperationResult {
   // error message. Empty if success
   optional string return_message = 1;
-//
-  repeated double scalars = 2;  // d, e, f
+  //
+  repeated double scalars = 2;       // d, e, f
   repeated ProtoVector vectors = 3;  // p, q, r
-  repeated ProtoMatrix matrices = 4;  // P, Q, R
+  repeated ProtoMatrix matrices = 4; // P, Q, R
 }
 
 message DoOperationRequest {
@@ -301,18 +278,14 @@ message DoOperationResponse {
   required bool pass_finish = 3;
 }
 
-message LoadValueRequest {
-  required string dir_name = 1;
-}
+message LoadValueRequest { required string dir_name = 1; }
 
 message LoadValueResponse {
   // error message. Empty if success
   optional string return_message = 1;
 }
 
-message SaveValueRequest {
-  required string dir_name = 1;
-}
+message SaveValueRequest { required string dir_name = 1; }
 
 message SaveValueResponse {
   // error message. Empty if success
@@ -331,11 +304,11 @@ enum DataUpdateMode {
   // Client sends its own ref label to pserver
   DATA_UPDATE_MODE_SET_REF_LABEL = 4;
   // Client gets all ref labels from all pservers
-  DATA_UPDATE_MODE_GET_REF_LABEL =5;
+  DATA_UPDATE_MODE_GET_REF_LABEL = 5;
   // Client sends its own ref grad to pserver
-  DATA_UPDATE_MODE_SET_REF_GRAD =6;
+  DATA_UPDATE_MODE_SET_REF_GRAD = 6;
   // Client gets all ref grads from all pservers
-  DATA_UPDATE_MODE_GET_REF_GRAD =7;
+  DATA_UPDATE_MODE_GET_REF_GRAD = 7;
 }
 
 enum SendDataType {
@@ -360,7 +333,7 @@ message DataBlock {
   // byte size of one data type
   required int32 data_size = 2;
   // data_type
-  optional TransDataType data_type = 3 [default = TRANS_DOUBLE];
+  optional TransDataType data_type = 3 [ default = TRANS_DOUBLE ];
 }
 
 message SendDataRequest {
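The Operation/OperationResult comments above encode a convention: pvectors and pmatrices are server-side handles (u, v, w and A, B, C), while scalars, vectors, and matrices carry literal values. A minimal sketch of composing an Operation, assuming a protoc-generated ParameterService_pb2 module (an assumption):

import ParameterService_pb2 as ps_pb

op = ps_pb.Operation()
# op.operation is a required MatrixVectorOperation; its enum members are
# elided in this diff, so it is deliberately left unset in this sketch.
op.pvectors.extend([7, 8])        # server-side handles for u and v
op.scalars.append(0.5)            # scalar a

x = op.vectors.add()              # an inline vector x
x.dim = 3
x.values.extend([1.0, 2.0, 3.0])  # stored packed, per [ packed = true ]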
diff --git a/proto/TrainerConfig.proto b/proto/TrainerConfig.proto
index a819d20d11..aa4e5f4ca0 100644
--- a/proto/TrainerConfig.proto
+++ b/proto/TrainerConfig.proto
@@ -19,15 +19,15 @@ import "ModelConfig.proto";
 package paddle;
 
 message OptimizationConfig {
-  required int32 batch_size = 3;
-  required string algorithm = 4 [default = "async_sgd"];
-  optional int32 num_batches_per_send_parameter = 5 [default = 1];
-  optional int32 num_batches_per_get_parameter = 6 [default = 1];
+  optional int32 batch_size = 3 [ default = 1 ];
+  required string algorithm = 4 [ default = "async_sgd" ];
+  optional int32 num_batches_per_send_parameter = 5 [ default = 1 ];
+  optional int32 num_batches_per_get_parameter = 6 [ default = 1 ];
 
   required double learning_rate = 7;
-  optional double learning_rate_decay_a = 8 [default = 0];
-  optional double learning_rate_decay_b = 9 [default = 0];
-  optional string learning_rate_schedule = 27 [default = "constant"];
+  optional double learning_rate_decay_a = 8 [ default = 0 ];
+  optional double learning_rate_decay_b = 9 [ default = 0 ];
+  optional string learning_rate_schedule = 27 [ default = "constant" ];
   // learning rate will be scaled according to learning_rate_schedule
   // 1), constant:
   // lr = learning_rate
@@ -49,88 +49,92 @@ message OptimizationConfig {
 
   // owlqn related
   // L1-regularization
-  optional double l1weight = 10 [default = 0.1];
+  optional double l1weight = 10 [ default = 0.1 ];
   // L2-regularization
-  optional double l2weight = 11 [default = 0];
+  optional double l2weight = 11 [ default = 0 ];
   // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
   // then accept the step
-  optional double c1 = 12 [default = 0.0001];
+  optional double c1 = 12 [ default = 0.0001 ];
   // multiply the step with "backoff", when wolfe condition doesn't satisfy
-  optional double backoff = 13 [default = 0.5];
+  optional double backoff = 13 [ default = 0.5 ];
   // how many "s"s and "y"s are kept in owlqn
-  optional int32 owlqn_steps = 14 [default = 10];
+  optional int32 owlqn_steps = 14 [ default = 10 ];
   // accept the step if encountered "max_backoff" times of "reduce the step"
-  optional int32 max_backoff = 15 [default = 5];
+  optional int32 max_backoff = 15 [ default = 5 ];
   // L2-regularization coefficient is reduced linearly from iteration 0 to
   // "l2weight_zero_iter", and set to 0 after "l2weight_zero_iter"
   // iterations. set "l2weight_zero_iter" to 0 to disable this strategy.
-  optional int32 l2weight_zero_iter = 17 [default = 0];
+  optional int32 l2weight_zero_iter = 17 [ default = 0 ];
 
   // averaged sgd
   // About average_window * numBatchProcessed parameter are used
   // for average. To be accurate, between average_window * numBatchProcessed
   // and 2 * average_window * numBatchProcessed parameters are used for
   // average.
-  optional double average_window = 18 [default = 0];
-  optional int64 max_average_window = 19 [default = 0x7fffffffffffffff];
+  optional double average_window = 18 [ default = 0 ];
+  optional int64 max_average_window = 19 [ default = 0x7fffffffffffffff ];
 
   //////////////////////////
   // Options Adaptive SGD //
   //////////////////////////
 
-  // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta", "rmsprop"
-  // default learning method("momentum") use global decayed learning rate with momentum.
+  // learning method for sgd/asgd, such as "momentum", "adagrad", "adadelta",
+  // "rmsprop"
+  // default learning method("momentum") use global decayed learning rate with
+  // momentum.
   // "adagrad", "adadelta" and "rmsprop" can set momentum too.
-  optional string learning_method = 23 [default = "momentum"];
-  optional double ada_epsilon = 24 [default = 1e-6];
-  optional double ada_rou = 26 [default = 0.95];
+  optional string learning_method = 23 [ default = "momentum" ];
+  optional double ada_epsilon = 24 [ default = 1e-6 ];
+  optional double ada_rou = 26 [ default = 0.95 ];
 
   // Force to do average in cpu in order to save gpu memory usage
-  optional bool do_average_in_cpu = 25 [default = false];
+  optional bool do_average_in_cpu = 25 [ default = false ];
 
   // delta add rate in pserver, used while num_batches_per_send_parameter>1
   // will be divided by #machines automatically.
-  optional double delta_add_rate = 28 [default = 1.0];
+  optional double delta_add_rate = 28 [ default = 1.0 ];
 
   // We split a large size into smaller mini-batches, whose sizes are
   // determined by mini_batch_size. It only takes effect when there is
   // an ExternalMachine.
-  optional int32 mini_batch_size = 29 [default = 128];
+  optional int32 mini_batch_size = 29 [ default = 128 ];
 
   // automatically set if any one of parameters set sparse remote update flag
-  optional bool use_sparse_remote_updater = 30 [default = false];
+  optional bool use_sparse_remote_updater = 30 [ default = false ];
 
-  // how to update center parameter and feedback to local parameter, 
+  // how to update center parameter and feedback to local parameter,
   // when use local sgd update in cluster training.
-  // A option is elastic_average, proposed by the paper: Deep learning with elastic averaging SGD.
-  // If use elastic_average method, every trainer node should sample from whole data sets.
-  optional string center_parameter_update_method = 31 [default = "average"];
+  // An option is elastic_average, proposed by the paper: Deep learning with
+  // elastic averaging SGD.
+  // If the elastic_average method is used, every trainer node should sample
+  // from the whole data set.
+  optional string center_parameter_update_method = 31 [ default = "average" ];
 
   // shrink sparse parameter value
   // only works if parameter is remote sparse update and has L1 decay rate
-  optional double shrink_parameter_value = 32 [default = 0];
+  optional double shrink_parameter_value = 32 [ default = 0 ];
 
   ////////////////////////////
   // Options Adam Optimizer //
   ////////////////////////////
-  optional double adam_beta1 = 33 [default = 0.9];
-  optional double adam_beta2 = 34 [default = 0.999];
-  optional double adam_epsilon = 35 [default = 1e-8];
+  optional double adam_beta1 = 33 [ default = 0.9 ];
+  optional double adam_beta2 = 34 [ default = 0.999 ];
+  optional double adam_epsilon = 35 [ default = 1e-8 ];
 
   // arguments for learning rate scheduler
   // Format: num1:rate1,num2:rate2,...,numK:rateK
   // For learning_rate_schedule="manual", num is the number of samples,
   // For learning_rate_schedule="pass_manual",
   //  num is the number of passes (starting from 0)
-  optional string learning_rate_args = 36 [default = ""];
- 
+  optional string learning_rate_args = 36 [ default = "" ];
+
   // for async sgd gradient commit control.
   // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
   // current async gradient will be discard silently.
-  optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
+  optional double async_lagged_grad_discard_ratio = 37 [ default = 1.5 ];
 
-  // global threshold for gradient clipping 
-  optional double gradient_clipping_threshold = 38 [default = 0.0];
+  // global threshold for gradient clipping
+  optional double gradient_clipping_threshold = 38 [ default = 0.0 ];
 };
 
 message TrainerConfig {
@@ -141,7 +145,7 @@ message TrainerConfig {
   repeated string config_files = 5;
 
   // the directory to save/load model files for each training path
-  optional string save_dir = 6 [default = "./output/model"];
+  optional string save_dir = 6 [ default = "./output/model" ];
 
   // Path of the initial model parameters.
   // If it was set, start_pass will be ignored.
@@ -149,7 +153,7 @@ message TrainerConfig {
 
   // Start training from this pass.
   // Will load parameter from the previous pass.
-  optional int32 start_pass = 8 [default = 0];
+  optional int32 start_pass = 8 [ default = 0 ];
 
   // file path to the trainer config file
   optional string config_file = 9;
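The learning_rate_args format documented above ("num1:rate1,num2:rate2,...,numK:rateK") is easy to consume; here is an illustrative parser (not a Paddle helper, just a sketch of the format):

def parse_learning_rate_args(args):
    """Parse "num1:rate1,...,numK:rateK" into (int, float) pairs."""
    if not args:
        return []
    return [(int(num), float(rate))
            for num, rate in (item.split(':') for item in args.split(','))]

# For learning_rate_schedule="pass_manual", num counts passes from 0:
print(parse_learning_rate_args("0:1.0,10:0.1,20:0.01"))
# -> [(0, 1.0), (10, 0.1), (20, 0.01)]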
diff --git a/python/.gitignore b/python/.gitignore
index cc7d0ece4a..1ba1d4c9b0 100644
--- a/python/.gitignore
+++ b/python/.gitignore
@@ -2,6 +2,7 @@
 build
 dist
 paddle.egg-info
+paddlepaddle_gpu.egg-info
 .idea
 paddle/proto/*.py
 paddle/proto/*.pyc
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 3640dd3a75..36919ab00b 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,3 @@
-set(OUTPUT_DIR
-    "${CMAKE_CURRENT_BINARY_DIR}/build")
 
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
@@ -7,21 +5,63 @@ file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
 file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 
 set(PY_FILES paddle/__init__.py
-             ${TRAINER_PY_FILES}
-             ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES}
-             ${V2_PY_FILES})
+  ${TRAINER_PY_FILES}
+  ${HELPERS_PY_FILES}
+  ${UTILS_PY_FILES}
+  ${V2_PY_FILES})
+
+add_custom_target(copy_paddle_master)
+
+SET(COPY_PADDLE_MASTER "")
+if(WITH_GOLANG)
+  SET(COPY_PADDLE_MASTER "copy_paddle_master")
+  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
+    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
+    )
+  add_dependencies(copy_paddle_master paddle_master)
+endif(WITH_GOLANG)
+
+set(MKL_SHARED_LIBS "")
+set(MKL_DEPENDS "")
+if(WITH_MKLML)
+  list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB})
+  list(APPEND MKL_DEPENDS mklml)
+endif()
+
+if(WITH_MKLDNN)
+  list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
+  list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
+endif()
+
+if(WITH_GPU)
+  SET(PACKAGE_NAME "paddlepaddle-gpu")
+else()
+  SET(PACKAGE_NAME "paddlepaddle")
+endif()
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
-add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
+
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so
+        DEPENDS paddle_pybind)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/core.so)
+
+
+add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND touch stub.cc
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
-add_custom_target(paddle_python ALL DEPENDS
-    ${OUTPUT_DIR}/.timestamp)
+set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model ${MKL_DEPENDS})
+if(WITH_SWIG_PY)
+    list(APPEND paddle_python_deps python_api_wheel)
+endif()
+add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
@@ -32,6 +72,7 @@ if (WITH_TESTING)
     add_subdirectory(paddle/v2/tests)
     add_subdirectory(paddle/v2/reader/tests)
     add_subdirectory(paddle/v2/plot/tests)
+    add_subdirectory(paddle/v2/fluid/tests)
   endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index f662d68263..1030c94e16 100644
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -11,3 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+try:
+    from version import full_version as __version__
+    from version import commit as __git_commit__
+except ImportError:
+    import sys
+    sys.stderr.write('''Warning when importing paddle: you should not
+     import paddle from the source directory; please install a
+     paddlepaddle*.whl package first.''')
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 7e305e2cd9..05635833bf 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -27,6 +27,14 @@ class SequenceType(object):
     SEQUENCE = 1
     SUB_SEQUENCE = 2
 
+    @classmethod
+    def tostring(cls, value):
+        for k in cls.__dict__:
+            if not k.startswith('__'):
+                if getattr(cls, k) == value:
+                    return cls.__name__ + '.' + k
+        return 'INVALID(' + str(value) + ')'
+
 
 # TODO(yuyang18): Add string data type here.
 class DataType(object):
@@ -35,6 +43,14 @@ class DataType(object):
     SparseValue = 2
     Index = 3
 
+    @classmethod
+    def tostring(cls, value):
+        for k in cls.__dict__:
+            if not k.startswith('__'):
+                if getattr(cls, k) == value:
+                    return cls.__name__ + '.' + k
+        return 'INVALID(' + str(value) + ')'
+
 
 class CacheType(object):
     NO_CACHE = 0  # No cache at all
@@ -69,6 +85,26 @@ class InputType(object):
         self.seq_type = seq_type
         self.type = tp
 
+    def __repr__(self):
+        """
+        Return a human-readable representation like 'InputType(dim=25921,
+            seq_type=SequenceType.NO_SEQUENCE, type=DataType.Dense)'
+        """
+        repr_str = type(self).__name__
+        repr_str += '('
+        serialize_func_map = {
+            'dim': repr,
+            'seq_type': SequenceType.tostring,
+            'type': DataType.tostring
+        }
+        for idx, k in enumerate(self.__slots__):
+            if idx != 0:
+                repr_str += ', '
+            repr_str += (
+                k + '=' + serialize_func_map.get(k, repr)(getattr(self, k)))
+        repr_str += ')'
+        return repr_str
+
 
 def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
     """
@@ -139,7 +175,7 @@ def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
 
 dense_vector = dense_slot
 sparse_binary_vector = sparse_non_value_slot
-sparse_vector = sparse_value_slot
+sparse_float_vector = sparse_value_slot
 integer_value = index_slot
 
 # dense_array can be used for variable-length input feature.
@@ -180,7 +216,7 @@ def sparse_binary_vector_sub_sequence(dim):
     return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def sparse_vector_sequence(dim):
+def sparse_float_vector_sequence(dim):
     """
     Data type of a sequence of sparse vector, which most elements are zero,
     others could be any float value.
@@ -190,11 +226,11 @@ def sparse_vector_sequence(dim):
     :return: An input type object
     :rtype: InputType
     """
-    return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
+    return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
-def sparse_vector_sub_sequence(dim):
-    return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
+def sparse_float_vector_sub_sequence(dim):
+    return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
 def integer_value_sequence(value_range):
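The rename from sparse_vector* to sparse_float_vector* is a breaking change for existing user configs; the spelling change in one place (the dimension is arbitrary):

# Before this change:
#   slot = sparse_vector(10000)
#   seq  = sparse_vector_sequence(10000)
# After this change:
slot = sparse_float_vector(10000)
seq = sparse_float_vector_sequence(10000)
subseq = sparse_float_vector_sub_sequence(10000)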
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5d540664a7..186b91c226 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -73,7 +73,6 @@ To use this from paddle_trainer, paddle_trainer should be called with
 --config_args=extension_module_name=[MODULE_NAME]
 
 '''
-
 import copy
 import logging
 import os
@@ -127,6 +126,7 @@ def init_config_environment(
         g_config=TrainerConfig(),
         g_layer_map={},
         g_parameter_map={},
+        g_parameter_initializer_map={},
         g_extended_config_funcs={},
 
         # store command args of paddle_trainer
@@ -140,8 +140,13 @@ def init_config_environment(
         g_submodel_stack=[],
         g_add_submodel_suffix=False, ):
 
-    for k, v in locals().iteritems():
-        globals()[k] = copy.deepcopy(v)
+    # Iterating directly over locals().iteritems() would change the size of
+    # locals(), because the loop variables k and v are introduced into the
+    # local scope, which breaks the iteration in some environments.
+
+    local_vars = copy.deepcopy(locals())
+    for k, v in local_vars.iteritems():
+        globals()[k] = v
 
 
 # Because type is widely used as a variable name in this code.
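The comment in init_config_environment refers to a general pitfall: binding loop variables while iterating a live locals() view mutates the dict being iterated. A standalone sketch (Python 2 syntax, matching this file):

import copy

def risky():
    a, b = 1, 2
    # May raise "RuntimeError: dictionary changed size during iteration" on
    # implementations where locals() is a live view; CPython function locals
    # are usually a snapshot, hence the hedge "in some env" above.
    for k, v in locals().iteritems():
        pass

def safe():
    a, b = 1, 2
    frozen = copy.deepcopy(locals())  # freeze first, as the fix above does
    for k, v in frozen.iteritems():
        pass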
@@ -328,53 +333,34 @@ def RecurrentLayerGroupWithoutOutLinksBegin(name,
     SubModelBegin(name)
     g_current_submodel.is_recurrent_layer_group = True
     g_current_submodel.reversed = seq_reversed
-    g_current_submodel.target_inlinkid = -1
     in_links_count = 0
     for linkid, link in enumerate(in_links):
         if isinstance(link, basestring):
             name = link
-            has_subseq = False
         else:
             name = link.link_name
-            has_subseq = link.has_subseq
-        # assign target_inlinkid according to target_inlinkname
-        if target_inlinkname == name:
-            g_current_submodel.target_inlinkid = linkid
 
-        if in_links_count == 0:
-            in_links_has_subseq = has_subseq
-        else:
-            config_assert(
-                in_links_has_subseq == has_subseq,
-                "The sequence type of in_links should be the same in RecurrentLayerGroup"
-            )
         in_links_count += 1
         layer_name = MakeLayerNameInParentSubmodel(name)
         layer = g_layer_map[layer_name]
-        if has_subseq:
-            SequenceScatterAgentLayer(name=name, size=layer.size)
-        else:
-            ScatterAgentLayer(name=name, size=layer.size)
+        ScatterAgentLayer(
+            name=name, size=layer.size, width=layer.width, height=layer.height)
 
         pair = g_current_submodel.in_links.add()
         pair.layer_name = layer_name
         pair.link_name = MakeLayerNameInSubmodel(name)
-        pair.has_subseq = has_subseq
 
 
 @config_func
 def RecurrentLayerGroupSetOutLink(link):
     if isinstance(link, basestring):
         name = link
-        has_subseq = False
     else:
         name = link.link_name
-        has_subseq = link.has_subseq
     layer_name = MakeLayerNameInParentSubmodel(name)
     pair = g_current_submodel.out_links.add()
     pair.layer_name = MakeLayerNameInSubmodel(name)
     pair.link_name = layer_name
-    pair.has_subseq = has_subseq
 
 
 def RecurrentLayerGroupSetGenerator(generator=None):
@@ -389,8 +375,7 @@ def RecurrentLayerGroupBegin(name,
                              generator=None,
                              target_inlinkname="",
                              seq_reversed=False):
-    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed,
-                                            target_inlinkname)
+    RecurrentLayerGroupWithoutOutLinksBegin(name, in_links, seq_reversed)
     for link in out_links:
         RecurrentLayerGroupSetOutLink(link)
 
@@ -425,8 +410,6 @@ def RecurrentLayerGroupEnd(name):
         agent_name = GetLayerBaseName(pair.link_name)
         if prev_submodel.HasField("generator"):
             DataLayer(name=agent_name, size=layer.size)
-        elif pair.has_subseq:
-            SequenceGatherAgentLayer(name=agent_name, size=layer.size)
         else:
             GatherAgentLayer(name=agent_name, size=layer.size)
 
@@ -440,22 +423,22 @@ def model_type(name):
 
 @config_class
 class Bias(Cfg):
-    def __init__(
-            self,
-            parameter_name=None,
-            learning_rate=None,
-            momentum=None,
-            decay_rate=None,
-            decay_rate_l1=None,
-            initial_mean=None,
-            initial_std=None,
-            initial_strategy=None,
-            initial_smart=None,
-            num_batches_regularization=None,
-            sparse_remote_update=None,
-            gradient_clipping_threshold=None,
-            is_static=None,
-            is_shared=None, ):
+    def __init__(self,
+                 parameter_name=None,
+                 learning_rate=None,
+                 momentum=None,
+                 decay_rate=None,
+                 decay_rate_l1=None,
+                 initial_mean=None,
+                 initial_std=None,
+                 initial_strategy=None,
+                 initial_smart=None,
+                 num_batches_regularization=None,
+                 sparse_remote_update=None,
+                 gradient_clipping_threshold=None,
+                 is_static=None,
+                 is_shared=None,
+                 initializer=None):
         self.add_keys(locals())
 
 
@@ -466,6 +449,7 @@ class Input(Cfg):
             self,
             input_layer_name,
             parameter_name=None,
+            initializer=None,
             learning_rate=None,
             momentum=None,
             decay_rate=None,
@@ -522,6 +506,7 @@ class Projection(Input):
             initial_std=None,
             initial_strategy=None,
             initial_smart=None,
+            initializer=None,
             num_batches_regularization=None,
             sparse_remote_update=None,
             sparse_update=None,
@@ -579,6 +564,38 @@ class IdentityOffsetProjection(Projection):
                                                        **xargs)
         self.proj_conf.offset = offset
 
+    def calc_output_size(self, input_layer_config):
+        return 0  # depends on the outside MixedLayer
+
+    def calc_parameter_size(self, input_size, output_size):
+        return 0
+
+    def calc_parameter_dims(self, input_size, output_size):
+        return []
+
+
+@config_class
+class SliceProjection(Projection):
+    type = 'slice'
+
+    def __init__(self, input_layer_name, slices, **xargs):
+        super(SliceProjection, self).__init__(input_layer_name, **xargs)
+        input = g_layer_map[input_layer_name]
+        if input.type in ["exconv", "cudnn_conv"]:
+            # the slice operator is for the channel dimension
+            assert input.num_filters is not None
+            channels = input.num_filters
+            image_size = input.size / channels
+            assert slices[len(slices) - 1][1] <= channels
+            for i in xrange(len(slices)):
+                slice = self.proj_conf.slices.add()
+                slice.start = slices[i][0] * image_size
+                slice.end = slices[i][1] * image_size
+                self.size += slice.end - slice.start
+        else:
+            config_assert(False,
+                          'Currently the input should be a convolution layer')
+
     def calc_parameter_size(self, input_size, output_size):
         return 0
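The slice arithmetic in SliceProjection.__init__ maps channel ranges onto flat offsets within the conv output; the same computation on illustrative numbers:

channels, image_size = 8, 16        # e.g. input.size == 128, so size/channels == 16
slices = [(0, 2), (4, 6)]           # channel ranges, end-exclusive

total = 0
for start, end in slices:
    flat_start = start * image_size  # 0 and 64
    flat_end = end * image_size      # 32 and 96
    total += flat_end - flat_start
print(total)                         # 64 == 4 selected channels * 16 pixels each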
 
@@ -861,18 +878,52 @@ class Conv(Cfg):
                  caffe_mode=True,
                  filter_size_y=None,
                  padding_y=None,
-                 stride_y=None):
+                 stride_y=None,
+                 dilation=None,
+                 dilation_y=None):
         self.add_keys(locals())
         if filter_size_y is None:
             self.filter_size_y = filter_size
         if padding_y is None:
             self.padding_y = padding
+        if dilation_y is None:
+            self.dilation_y = dilation
         if stride_y is None:
             self.stride_y = stride
         if output_x is not None:
             config_assert(output_x <= 0)
 
 
+# please refer to the comments in proto/ModelConfig.proto
+@config_class
+class Conv3D(Cfg):
+    def __init__(self,
+                 filter_size,
+                 channels,
+                 padding=None,
+                 stride=None,
+                 groups=None,
+                 filter_channels=None,
+                 output_x=None,
+                 img_size=None,
+                 caffe_mode=True,
+                 filter_size_y=None,
+                 padding_y=None,
+                 stride_y=None,
+                 filter_size_z=None,
+                 padding_z=None,
+                 stride_z=None):
+        self.add_keys(locals())
+        self.filter_size_y = filter_size_y if filter_size_y else filter_size
+        self.filter_size_z = filter_size_z if filter_size_z else filter_size
+        self.padding_y = padding_y if padding_y else padding
+        self.padding_z = padding_z if padding_z else padding
+        self.stride_y = stride_y if stride_y else stride
+        self.stride_z = stride_z if stride_z else stride
+        if output_x is not None:
+            config_assert(output_x <= 0)
+
+
 @config_class
 class BilinearInterp(Cfg):
     def __init__(self, out_size_x=None, out_size_y=None, channels=None):
@@ -895,6 +946,31 @@ class Pool(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Pool3d(Cfg):
+    def __init__(
+            self,
+            pool_type,
+            channels,
+            size_x,
+            size_y=None,
+            size_z=None,
+            start=None,
+            stride=None,  # 1 by default in protobuf
+            stride_y=None,
+            stride_z=None,
+            padding=None,  # 0 by default in protobuf
+            padding_y=None,
+            padding_z=None):
+        self.add_keys(locals())
+        self.filter_size_y = size_y if size_y else size_x
+        self.filter_size_z = size_z if size_z else size_x
+        self.padding_y = padding_y if padding_y else padding
+        self.padding_z = padding_z if padding_z else padding
+        self.stride_y = stride_y if stride_y else stride
+        self.stride_z = stride_z if stride_z else stride
+
+
 @config_class
 class SpatialPyramidPool(Cfg):
     def __init__(self, pool_type, pyramid_height, channels):
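Conv3D and Pool3d above share one defaulting pattern: the y and z extents fall back to the x extent (and the y/z paddings and strides fall back to their 2D counterparts) when unspecified. In isolation:

def expand_3d(x, y=None, z=None):
    """Mirror the Conv3D/Pool3d fallback: y and z default to x."""
    return x, (y if y else x), (z if z else x)

print(expand_3d(3))      # (3, 3, 3)
print(expand_3d(3, 5))   # (3, 5, 3)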
@@ -1045,35 +1121,6 @@ def PyData(files=None,
     return data_config
 
 
-@config_func
-def ProtoData(files=None,
-              type=None,
-              file_group_queue_capacity=None,
-              load_file_count=None,
-              constant_slots=None,
-              load_thread_num=None,
-              **xargs):
-    data_config = create_data_config_proto(**xargs)
-    if type is None:
-        data_config.type = 'proto'
-    else:
-        data_config.type = type
-    data_config.files = files
-
-    # When type="proto_group", one data provider contains at most
-    # load_file_count files, and there are at most
-    # (queue_capacity + load_thread_num + 1) data providers in memory
-    if file_group_queue_capacity is not None:
-        data_config.file_group_conf.queue_capacity = file_group_queue_capacity
-    if load_file_count is not None:
-        data_config.file_group_conf.load_file_count = load_file_count
-    if load_thread_num is not None:
-        data_config.file_group_conf.load_thread_num = load_thread_num
-    if constant_slots:
-        data_config.constant_slots.extend(constant_slots)
-    return data_config
-
-
 #real data for training is actually provided by "sub_data" data providers.
 @config_func
 def MultiData(sub_data=[]):
@@ -1129,8 +1176,14 @@ def TestData(data_config, async_load_data=None):
 
 #caffe_mode: compute the output size using floor instead of ceil,
 #            which is consistent with caffe and CuDNN's convention.
-def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
-    output = (2 * padding + img_size - filter_size) / float(stride)
+def cnn_output_size(img_size,
+                    filter_size,
+                    padding,
+                    stride,
+                    caffe_mode,
+                    dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    output = (2 * padding + img_size - filter_s) / float(stride)
     if caffe_mode:
         return 1 + int(math.floor(output))
     else:
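The dilation support can be checked by hand: a dilated filter covers (filter_size - 1) * dilation + 1 input pixels, and that effective extent replaces filter_size in the output-size formula. A standalone sketch of the caffe_mode (floor) branch:

import math

def output_size(img_size, filter_size, padding, stride, dilation=1):
    """Same arithmetic as cnn_output_size above with caffe_mode=True."""
    filter_s = (filter_size - 1) * dilation + 1  # effective filter extent
    return 1 + int(math.floor(
        (2 * padding + img_size - filter_s) / float(stride)))

# A 3x3 filter with dilation=2 covers the same extent as a plain 5x5 filter:
print(output_size(32, 3, 0, 1, dilation=2))  # 28
print(output_size(32, 5, 0, 1))              # 28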
@@ -1139,8 +1192,14 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
 
 #calculate image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
-def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
-    img_size = (output_size - 1) * stride + filter_size - 2 * padding
+def cnn_image_size(output_size,
+                   filter_size,
+                   padding,
+                   stride,
+                   caffe_mode,
+                   dilation=1):
+    filter_s = (filter_size - 1) * dilation + 1
+    img_size = (output_size - 1) * stride + filter_s - 2 * padding
     if not caffe_mode:
         img_size = img_size + 1
     return img_size
@@ -1159,18 +1218,32 @@ def get_img_size(input_layer_name, channels):
     return img_size, img_size_y
 
 
+def get_img3d_size(input_layer_name, channels):
+    input = g_layer_map[input_layer_name]
+    img_pixels = input.size / channels
+    img_size = input.width
+    img_size_y = input.height
+    img_size_z = input.depth
+
+    config_assert(
+        img_size * img_size_y * img_size_z == img_pixels,
+        "Input layer %s: Incorrect input image size %d * %d * %d for input image pixels %d"
+        % (input_layer_name, img_size, img_size_y, img_size_z, img_pixels))
+    return img_size, img_size_y, img_size_z
+
+
 def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     parse_image(bilinear, input_layer_name, bilinear_conf.image_conf)
     bilinear_conf.out_size_x = bilinear.out_size_x
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
-        'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
-    ], "pool-type %s is not in "
-                  "['max-projection', 'avg-projection', "
+        'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool'
+    ], "pool-type %s is not in " \
+              "['max-projection', 'avg-projection', 'max-pool-with-mask'," \
                   "'cudnn-max-pool', 'cudnn-avg-pool']" % pool.pool_type)
 
     pool_conf.channels = pool.channels
@@ -1194,6 +1267,47 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
                                          pool_conf.stride_y, not ceil_mode)
+    if exclude_mode is not None:
+        pool_conf.exclude_mode = exclude_mode
+
+
+def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode):
+    pool_conf.pool_type = pool.pool_type
+    config_assert(pool.pool_type in ['max-projection', 'avg-projection'],
+                  "pool-type %s is not in "
+                  "['max-projection', 'avg-projection']" % pool.pool_type)
+
+    pool_conf.channels = pool.channels
+
+    pool_conf.size_x = pool.size_x
+    pool_conf.stride = pool.stride
+    pool_conf.padding = pool.padding
+
+    pool_conf.size_y = default(pool.size_y, pool_conf.size_x)
+    pool_conf.size_z = default(pool.size_z, pool_conf.size_x)
+    pool_conf.stride_y = default(pool.stride_y, pool_conf.stride)
+    pool_conf.stride_z = default(pool.stride_z, pool_conf.stride)
+    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
+    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
+
+    pool_conf.img_size, pool_conf.img_size_y, pool_conf.img_size_z = \
+        get_img3d_size(input_layer_name, pool.channels)
+
+    config_assert(not pool.start, "start is deprecated in pooling.")
+
+    if pool.padding is not None:
+        pool_conf.padding = pool.padding
+    pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
+    pool_conf.padding_z = default(pool.padding_z, pool_conf.padding)
+    pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
+                                         pool_conf.padding, pool_conf.stride,
+                                         not ceil_mode)
+    pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
+                                         pool_conf.padding_y,
+                                         pool_conf.stride_y, not ceil_mode)
+    pool_conf.output_z = cnn_output_size(pool_conf.img_size_z, pool_conf.size_z,
+                                         pool_conf.padding_z,
+                                         pool_conf.stride_z, not ceil_mode)
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
@@ -1211,6 +1325,12 @@ def parse_image(image, input_layer_name, image_conf):
         get_img_size(input_layer_name, image_conf.channels)
 
 
+def parse_image3d(image, input_layer_name, image_conf):
+    image_conf.channels = image.channels
+    image_conf.img_size, image_conf.img_size_y, image_conf.img_size_z = \
+        get_img3d_size(input_layer_name, image_conf.channels)
+
+
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
     config_assert(
@@ -1246,6 +1366,12 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
     conv_conf.stride_y = conv.stride_y
     conv_conf.groups = conv.groups
     conv_conf.caffe_mode = conv.caffe_mode
+    if not conv.dilation:
+        conv.dilation = 1
+        conv.dilation_y = 1
+    else:
+        conv_conf.dilation = conv.dilation
+        conv_conf.dilation_y = conv.dilation_y
 
     if not trans:
         conv_conf.filter_channels = conv.channels / conv.groups
@@ -1253,20 +1379,64 @@ def parse_conv(conv, input_layer_name, conv_conf, num_filters, trans=False):
             get_img_size(input_layer_name, conv.channels)
         conv_conf.output_x = cnn_output_size(
             conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
-            conv_conf.stride, conv_conf.caffe_mode)
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
         conv_conf.output_y = cnn_output_size(
             conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
-            conv_conf.stride_y, conv_conf.caffe_mode)
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
     else:
         conv_conf.filter_channels = num_filters / conv.groups
         conv_conf.output_x, conv_conf.output_y = \
             get_img_size(input_layer_name, conv.channels)
+        conv_conf.img_size = cnn_image_size(
+            conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
+            conv_conf.stride, conv_conf.caffe_mode, conv.dilation)
+        conv_conf.img_size_y = cnn_image_size(
+            conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode, conv.dilation_y)
+
+
+#caffe_mode: compute the output size using floor instead of ceil,
+#            which is consistent with caffe and CuDNN's convention.
+def parse_conv3d(conv, input_layer_name, conv_conf, num_filters, trans=False):
+    conv_conf.filter_size = conv.filter_size
+    conv_conf.filter_size_y = conv.filter_size_y
+    conv_conf.filter_size_z = conv.filter_size_z
+    conv_conf.channels = conv.channels
+    conv_conf.padding = conv.padding
+    conv_conf.padding_y = conv.padding_y
+    conv_conf.padding_z = conv.padding_z
+    conv_conf.stride = conv.stride
+    conv_conf.stride_y = conv.stride_y
+    conv_conf.stride_z = conv.stride_z
+    conv_conf.groups = conv.groups
+    conv_conf.caffe_mode = conv.caffe_mode
+
+    if not trans:
+        conv_conf.filter_channels = conv.channels / conv.groups
+        conv_conf.img_size, conv_conf.img_size_y, conv_conf.img_size_z = \
+            get_img3d_size(input_layer_name, conv.channels)
+        conv_conf.output_x = cnn_output_size(
+            conv_conf.img_size, conv_conf.filter_size, conv_conf.padding,
+            conv_conf.stride, conv_conf.caffe_mode)
+        conv_conf.output_y = cnn_output_size(
+            conv_conf.img_size_y, conv_conf.filter_size_y, conv_conf.padding_y,
+            conv_conf.stride_y, conv_conf.caffe_mode)
+        conv_conf.output_z = cnn_output_size(
+            conv_conf.img_size_z, conv_conf.filter_size_z, conv_conf.padding_z,
+            conv_conf.stride_z, conv_conf.caffe_mode)
+    else:
+        conv_conf.filter_channels = num_filters / conv.groups
+        conv_conf.output_x, conv_conf.output_y, conv_conf.output_z = \
+            get_img3d_size(input_layer_name, conv.channels)
         conv_conf.img_size = cnn_image_size(
             conv_conf.output_x, conv_conf.filter_size, conv_conf.padding,
             conv_conf.stride, conv_conf.caffe_mode)
         conv_conf.img_size_y = cnn_image_size(
             conv_conf.output_y, conv_conf.filter_size_y, conv_conf.padding_y,
             conv_conf.stride_y, conv_conf.caffe_mode)
+        conv_conf.img_size_z = cnn_image_size(
+            conv_conf.output_z, conv_conf.filter_size_z, conv_conf.padding_z,
+            conv_conf.stride_z, conv_conf.caffe_mode)
 
 
 def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
@@ -1301,20 +1471,23 @@ def parse_maxout(maxout, input_layer_name, maxout_conf):
 
 # Define an evaluator
 @config_func
-def Evaluator(
-        name,
-        type,
-        inputs,
-        chunk_scheme=None,
-        num_chunk_types=None,
-        classification_threshold=None,
-        positive_label=None,
-        dict_file=None,
-        result_file=None,
-        num_results=None,
-        top_k=None,
-        delimited=None,
-        excluded_chunk_types=None, ):
+def Evaluator(name,
+              type,
+              inputs,
+              chunk_scheme=None,
+              num_chunk_types=None,
+              classification_threshold=None,
+              positive_label=None,
+              dict_file=None,
+              result_file=None,
+              num_results=None,
+              top_k=None,
+              delimited=None,
+              excluded_chunk_types=None,
+              overlap_threshold=None,
+              background_id=None,
+              evaluate_difficult=None,
+              ap_type=None):
     evaluator = g_config.model_config.evaluators.add()
     evaluator.type = type
     evaluator.name = MakeLayerNameInSubmodel(name)
@@ -1348,6 +1521,18 @@ def Evaluator(
     if excluded_chunk_types:
         evaluator.excluded_chunk_types.extend(excluded_chunk_types)
 
+    if overlap_threshold is not None:
+        evaluator.overlap_threshold = overlap_threshold
+
+    if background_id is not None:
+        evaluator.background_id = background_id
+
+    if evaluate_difficult is not None:
+        evaluator.evaluate_difficult = evaluate_difficult
+
+    if ap_type is not None:
+        evaluator.ap_type = ap_type
+
 
 class LayerBase(object):
     def __init__(
@@ -1359,7 +1544,8 @@ class LayerBase(object):
             device=None,
             active_type="",
             drop_rate=0.,
-            coeff=None):
+            coeff=None,
+            error_clipping_threshold=None):
         config_assert('@' not in name,
                       "layer name: %s contain special character @" % name)
         global g_current_submodel
@@ -1378,6 +1564,10 @@ class LayerBase(object):
 
         self.config = g_config.model_config.layers.add()
         assert isinstance(self.config, LayerConfig)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        mkldnn_acts = ['relu', 'tanh', 'softmax']
+        if use_mkldnn and active_type in mkldnn_acts:
+            active_type = "mkldnn_" + active_type
         self.config.name = name
         self.config.type = type
         self.config.active_type = active_type
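The MKLDNN switch above silently rewrites eligible activation names; the mapping in isolation (a sketch, not the Paddle API):

MKLDNN_ACTS = ['relu', 'tanh', 'softmax']

def resolve_active_type(active_type, use_mkldnn):
    """Mirror LayerBase.__init__: prefix supported activations when MKLDNN is on."""
    if use_mkldnn and active_type in MKLDNN_ACTS:
        return 'mkldnn_' + active_type
    return active_type

print(resolve_active_type('relu', True))     # mkldnn_relu
print(resolve_active_type('sigmoid', True))  # sigmoid (no MKLDNN variant)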
@@ -1393,6 +1583,9 @@ class LayerBase(object):
         elif g_default_device is not None:
             self.config.device = g_default_device
 
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+
         for input_index in xrange(len(self.inputs)):
             input = self.inputs[input_index]
             input_config = None
@@ -1480,7 +1673,8 @@ class LayerBase(object):
                     gradient_clipping_threshold=bias.
                     gradient_clipping_threshold,
                     is_static=bias.is_static,
-                    is_shared=bias.is_shared, )
+                    is_shared=bias.is_shared,
+                    initializer=bias.initializer)
             if for_self:
                 self.config.bias_parameter_name = bias.parameter_name
             else:
@@ -1537,7 +1731,8 @@ class LayerBase(object):
             format=format,
             is_static=input_config.is_static,
             is_shared=input_config.is_shared,
-            update_hooks=input_config.update_hooks)
+            update_hooks=input_config.update_hooks,
+            initializer=input_config.initializer)
 
     def set_layer_size(self, size):
         if self.config.size == 0:
@@ -1551,6 +1746,9 @@ class LayerBase(object):
         self.config.height = height
         self.config.width = width
 
+    def set_layer_depth(self, depth):
+        self.config.depth = depth
+
     def set_cnn_layer(self,
                       input_layer_name,
                       height,
@@ -1573,17 +1771,53 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
         self.config.softmax_selfnorm_alpha = softmax_selfnorm_alpha
 
 
+@config_layer('cross_entropy_over_beam')
+class CrossEntropyOverBeamLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        config_assert(len(inputs) % 3 == 0,
+                      "The number of inputs must be a multiple of 3.")
+        super(CrossEntropyOverBeamLayer, self).__init__(
+            name, 'cross_entropy_over_beam', 0, inputs, **xargs)
+        input_num = len(inputs) / 3
+        for i in range(input_num):
+            input_layer = self.get_input_layer(i * 3)
+            config_assert(input_layer.size == 1, (
+                "Inputs of this layer are made up of triples, in which "
+                "the first element gives scores over all candidate paths "
+                "and must have size 1."))
+
+
 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
-        super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+    layer_type = 'fc'
+
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn_wgt = bool(
+            int(g_command_config_args.get("use_mkldnn_wgt", 0)))
+        if use_mkldnn:
+            self.layer_type = 'mkldnn_fc'
+            config_assert(
+                len(inputs) == 1,
+                "MKLDNNFCLayer support one and only one input!")
+        super(FCLayer, self).__init__(
+            name, self.layer_type, size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             psize = self.config.size * input_layer.size
             dims = [input_layer.size, self.config.size]
             format = self.inputs[input_index].format
             sparse = format == "csr" or format == "csc"
-
+            if use_mkldnn:
+                config_assert(not sparse,
+                              "MKLDNNFCLayer do not support sparse format yet")
+                if use_mkldnn_wgt:
+                    dims = [self.config.size, input_layer.size]
             if sparse:
                 psize = self.inputs[input_index].nnz
             else:
@@ -1592,6 +1826,13 @@ class FCLayer(LayerBase):
             self.create_input_parameter(input_index, psize, dims, sparse,
                                         format)
         self.create_bias_parameter(bias, self.config.size)
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+
+
+@config_layer('mkldnn_fc')
+class MKLDNNFcLayer(FCLayer):
+    layer_type = 'mkldnn_fc'
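+    # With use_mkldnn=1 in the command-line config args (an assumed runtime
+    # setting), plain 'fc' layers are already rewritten to this type in
+    # FCLayer above; this subclass mainly allows selecting it explicitly.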
 
 
 @config_layer('selective_fc')
@@ -1647,8 +1888,14 @@ class SelectiveFCLayer(LayerBase):
 
 @config_layer('print')
 class PrintLayer(LayerBase):
-    def __init__(self, name, inputs):
+    def __init__(self, name, inputs, format=None):
         super(PrintLayer, self).__init__(name, 'print', 0, inputs)
+        if format is None:
+            format = "\n".join([
+                "layer=" + input.input_layer_name + " %s"
+                for input in self.inputs
+            ])
+        self.config.user_arg = format
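+        # e.g. with inputs named "prob" and "label", the default user_arg is
+        # "layer=prob %s\nlayer=label %s" (illustrative names only).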
 
 
 @config_layer('priorbox')
@@ -1672,13 +1919,79 @@ class PriorBoxLayer(LayerBase):
         self.config.size = size
 
 
+@config_layer('multibox_loss')
+class MultiBoxLossLayer(LayerBase):
+    def __init__(self, name, inputs, input_num, num_classes, overlap_threshold,
+                 neg_pos_ratio, neg_overlap, background_id, **xargs):
+        super(MultiBoxLossLayer, self).__init__(name, 'multibox_loss', 0,
+                                                inputs)
+        config_assert(
+            len(inputs) == (input_num * 2 + 2),
+            'MultiBoxLossLayer requires input_num * 2 + 2 inputs')
+        config_assert(num_classes > background_id,
+                      'num_classes must be greater than background_id')
+        self.config.inputs[0].multibox_loss_conf.num_classes = num_classes
+        self.config.inputs[
+            0].multibox_loss_conf.overlap_threshold = overlap_threshold
+        self.config.inputs[0].multibox_loss_conf.neg_pos_ratio = neg_pos_ratio
+        self.config.inputs[0].multibox_loss_conf.neg_overlap = neg_overlap
+        self.config.inputs[0].multibox_loss_conf.background_id = background_id
+        self.config.inputs[0].multibox_loss_conf.input_num = input_num
+        self.config.size = 1
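+        # Input count sketch (an assumption for illustration): input_num
+        # location inputs plus input_num confidence inputs, together with the
+        # label and prior-box layers, give the input_num * 2 + 2 asserted
+        # above.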
+
+
+@config_layer('detection_output')
+class DetectionOutputLayer(LayerBase):
+    def __init__(self, name, inputs, size, input_num, num_classes,
+                 nms_threshold, nms_top_k, keep_top_k, confidence_threshold,
+                 background_id, **xargs):
+        super(DetectionOutputLayer, self).__init__(name, 'detection_output', 0,
+                                                   inputs)
+        config_assert(
+            len(inputs) == (input_num * 2 + 1),
+            'DetectionOutputLayer requires input_num * 2 + 1 inputs')
+        config_assert(num_classes > background_id,
+                      'num_classes must be greater than background_id')
+        self.config.inputs[0].detection_output_conf.num_classes = num_classes
+        self.config.inputs[
+            0].detection_output_conf.nms_threshold = nms_threshold
+        self.config.inputs[0].detection_output_conf.nms_top_k = nms_top_k
+        self.config.inputs[0].detection_output_conf.keep_top_k = keep_top_k
+        self.config.inputs[
+            0].detection_output_conf.confidence_threshold = confidence_threshold
+        self.config.inputs[
+            0].detection_output_conf.background_id = background_id
+        self.config.inputs[0].detection_output_conf.input_num = input_num
+        self.config.size = size
+
+
+@config_layer('roi_pool')
+class ROIPoolLayer(LayerBase):
+    def __init__(self, name, inputs, pooled_width, pooled_height, spatial_scale,
+                 num_channels, **xargs):
+        super(ROIPoolLayer, self).__init__(name, 'roi_pool', 0, inputs)
+        config_assert(len(inputs) == 2, 'ROIPoolLayer must have 2 inputs')
+        self.config.inputs[0].roi_pool_conf.pooled_width = pooled_width
+        self.config.inputs[0].roi_pool_conf.pooled_height = pooled_height
+        self.config.inputs[0].roi_pool_conf.spatial_scale = spatial_scale
+        self.set_cnn_layer(name, pooled_height, pooled_width, num_channels)
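+        # Worked example (illustration only): pooled_height=6, pooled_width=6
+        # and num_channels=256 give an output of 6 * 6 * 256 = 9216 per ROI.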
+
+
 @config_layer('data')
 class DataLayer(LayerBase):
-    def __init__(self, name, size, height=None, width=None, device=None):
+    def __init__(self,
+                 name,
+                 size,
+                 depth=None,
+                 height=None,
+                 width=None,
+                 device=None):
         super(DataLayer, self).__init__(
             name, 'data', size, inputs=[], device=device)
         if height and width:
             self.set_layer_height_width(height, width)
+        if depth:
+            self.set_layer_depth(depth)
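+        # Assumed usage sketch, not part of this change: a 3-channel input of
+        # 4x32x32 voxels would be declared as
+        #     DataLayer(name="video", size=3 * 4 * 32 * 32,
+        #               depth=4, height=32, width=32)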
 
 
 '''
@@ -1731,11 +2044,20 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
-        config_assert(len(self.inputs) == 1)
-        config_assert(self.input_layer.size % partial_sum == 0)
+
         input_layer = self.get_input_layer(0)
+        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
+        config_assert(input_layer.size % partial_sum == 0,
+                      "input size must be divisible by partial_sum")
+
+        dims = [1, input_layer.size / partial_sum]
         self.set_layer_size(input_layer.size)
-        self.create_input_parameter(0, input_layer.size / partial_sum)
+        self.config.partial_sum = partial_sum
+        self.create_input_parameter(0, input_layer.size / partial_sum, dims)
+
+        self.set_layer_height_width(self.get_input_layer(0).height,
+                                    self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)
 
 
 @config_layer('conv')
@@ -1755,20 +2077,26 @@ class ConvLayerBase(LayerBase):
         if num_filters is not None:
             self.config.num_filters = num_filters
 
+        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
         use_gpu = int(g_command_config_args.get("use_gpu", 0))
         parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
 
-        # Automatically select cudnn_type for GPU and exconv for CPU
+        # Automatically select cudnn_conv for GPU, exconv for CPU
+        # and mkldnn_conv for MKLDNN
         # if set type=conv, but still reserve the way user specify
-        # exconv or cudnn_conv manually.
+        # exconv, mkldnn_conv or cudnn_conv manually.
         if self.layer_type == "cudnn_conv":
             config_assert(use_gpu, "cudnn_conv only support GPU")
 
+        if self.layer_type == "mkldnn_conv":
+            config_assert(use_mkldnn, "mkldnn_conv only support MKLDNN")
+
         if (use_gpu == 1 and self.layer_type != "exconv" and
+                self.layer_type != "mkldnn_conv" and
             (parallel_nn == 0 or self.config.device > -1)):
             self.layer_type = "cudnn_conv"
         else:
-            self.layer_type = "exconv"
+            self.layer_type = "mkldnn_conv" if use_mkldnn else "exconv"
         # need to specify layer in config
         self.config.type = self.layer_type
 
@@ -1792,7 +2120,7 @@ class ConvLayerBase(LayerBase):
 
     def calc_parameter_size(self, conv_conf):
         return self.config.num_filters * conv_conf.filter_channels \
-                    * (conv_conf.filter_size * conv_conf.filter_size_y)
+               * (conv_conf.filter_size * conv_conf.filter_size_y)
 
 
 @config_layer('exconv')
@@ -1800,6 +2128,11 @@ class ConvLayer(ConvLayerBase):
     layer_type = 'exconv'
 
 
+@config_layer('mkldnn_conv')
+class ConvLayer(ConvLayerBase):
+    layer_type = 'mkldnn_conv'
+
+
 @config_layer('cudnn_conv')
 class ConvLayer(ConvLayerBase):
     layer_type = 'cudnn_conv'
@@ -1876,15 +2209,102 @@ class ConvTransLayer(ConvTransLayerBase):
     layer_type = 'cudnn_convt'
 
 
+@config_layer('conv_3d')
+class Conv3DLayerBase(LayerBase):
+    def __init__(self,
+                 name,
+                 inputs=[],
+                 bias=True,
+                 num_filters=None,
+                 shared_biases=True,
+                 **xargs):
+        super(Conv3DLayerBase, self).__init__(
+            name, self.layer_type, 0, inputs=inputs, **xargs)
+
+        if num_filters is not None:
+            self.config.num_filters = num_filters
+
+        # need to specify layer in config
+        self.config.type = self.layer_type
+
+        trans = False
+        if self.config.type == "deconv3d":
+            trans = True
+
+        if shared_biases is not None:
+            self.config.shared_biases = shared_biases
+
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            conv_conf = self.config.inputs[input_index].conv_conf
+            parse_conv3d(
+                self.inputs[input_index].conv,
+                input_layer.name,
+                conv_conf,
+                num_filters,
+                trans=trans
+            )  # for the z-axis: pad:0, stride:1, filter_size:1, img_size:1
+            psize = self.calc_parameter_size(conv_conf)
+            self.create_input_parameter(input_index, psize)
+            if trans:
+                self.set_cnn_layer(name, conv_conf.img_size_z,
+                                   conv_conf.img_size_y, conv_conf.img_size,
+                                   self.config.num_filters)
+            else:
+                self.set_cnn_layer(name, conv_conf.output_z, conv_conf.output_y,
+                                   conv_conf.output_x, self.config.num_filters)
+
+        psize = self.config.size
+        if shared_biases:
+            psize = self.config.num_filters
+        self.create_bias_parameter(bias, psize, [psize, 1])
+
+    def calc_parameter_size(self, conv_conf):
+        return self.config.num_filters * conv_conf.filter_channels \
+               * (conv_conf.filter_size * conv_conf.filter_size_y \
+                  * conv_conf.filter_size_z)
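+    # Worked example (illustration only): num_filters=64, filter_channels=3
+    # and a 3x3x3 kernel give 64 * 3 * (3 * 3 * 3) = 5184 weights.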
+
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      depth,
+                      height,
+                      width,
+                      channels,
+                      is_print=True):
+        size = depth * height * width * channels
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        self.set_layer_depth(depth)
+        if is_print:
+            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, depth, height, width, size))
+
+
+@config_layer('conv3d')
+class Conv3DLayer(Conv3DLayerBase):
+    layer_type = 'conv3d'
+
+
+@config_layer('deconv3d')
+class Conv3DLayer(Conv3DLayerBase):
+    layer_type = 'deconv3d'
+
+
 @config_layer('norm')
 class NormLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
         super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn = use_mkldnn and self.inputs[
+            0].norm.norm_type == 'cmrnorm-projection'
+        self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
             parse_norm(self.inputs[input_index].norm, input_layer.name,
                        norm_conf)
+            norm_conf.scale = self.inputs[
+                input_index].norm.scale if use_mkldnn else norm_conf.scale
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
             if norm_conf.norm_type == "cross-channel-norm":
@@ -1894,17 +2314,59 @@ class NormLayer(LayerBase):
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
-    def __init__(self, name, inputs, ceil_mode=True, **xargs):
-        super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
+    layer_type = 'pool'
+
+    def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None,
+                 **xargs):
+        use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0))
+        if self.layer_type == "mkldnn_pool":
+            config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN")
+        self.layer_type = 'mkldnn_pool' if use_mkldnn else 'pool'
+        super(PoolLayer, self).__init__(
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf, ceil_mode)
+                       pool_conf, ceil_mode, exclude_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
 
+@config_layer('mkldnn_pool')
+class MKLDNNPoolLayer(PoolLayer):
+    layer_type = 'mkldnn_pool'
+
+
+@config_layer('pool3d')
+class Pool3DLayer(LayerBase):
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+        super(Pool3DLayer, self).__init__(
+            name, 'pool3d', 0, inputs=inputs, **xargs)
+        for input_index in xrange(len(self.inputs)):
+            input_layer = self.get_input_layer(input_index)
+            pool_conf = self.config.inputs[input_index].pool_conf
+            parse_pool3d(self.inputs[input_index].pool, input_layer.name,
+                         pool_conf, ceil_mode)
+            self.set_cnn_layer(name, pool_conf.output_z, pool_conf.output_y,
+                               pool_conf.output_x, pool_conf.channels)
+
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      depth,
+                      height,
+                      width,
+                      channels,
+                      is_print=True):
+        size = depth * height * width * channels
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        self.set_layer_depth(depth)
+        if is_print:
+            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, depth, height, width, size))
+
+
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
     def __init__(self, name, inputs, **xargs):
@@ -1937,6 +2399,31 @@ class PadLayer(LayerBase):
         self.config.size = out_ch * out_h * out_w
 
 
+@config_layer('crop')
+class CropLayer(LayerBase):
+    def __init__(self, name, inputs, axis, offset, shape, **xargs):
+        super(CropLayer, self).__init__(name, 'crop', 0, inputs=inputs, **xargs)
+        self.config.axis = axis
+        self.config.offset.extend(offset)
+        self.config.shape.extend(shape)
+
+        # get channel, width and height from input_0 layer
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+        # only support for 4-dims inputs and NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
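+        # Assumed usage sketch: axis=2 with shape=[1, 3, 64, 64] (NCHW) crops
+        # height and width starting at the given offsets, so the output size
+        # becomes 3 * 64 * 64 from shape[1:].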
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
@@ -1944,11 +2431,13 @@ class BatchNormLayer(LayerBase):
     def __init__(self,
                  name,
                  inputs,
-                 active_type="linear",
                  bias=True,
+                 img3D=False,
                  use_global_stats=True,
+                 epsilon=1e-5,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
+                 mean_var_names=None,
                  **xargs):
         if inputs is None:
             inputs = []
@@ -1962,6 +2451,7 @@ class BatchNormLayer(LayerBase):
         # If not use is_static, even set learning_rate = 0, decay_rate = 0,
         # these paras will change if set average_window in configure.
         use_gpu = bool(int(g_command_config_args.get("use_gpu", 0)))
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
         is_shared = True if not use_gpu else False
         for i in xrange(2):
             inputs.append(
@@ -1975,45 +2465,93 @@ class BatchNormLayer(LayerBase):
 
         parallel_nn = bool(int(g_command_config_args.get("parallel_nn", 0)))
         cudnn_version = int(g_command_config_args.get("cudnn_version", 0))
-        # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU.
-        # Also based on cudnn version.
+        # Automatically select cudnn_batch_norm for GPU, batch_norm for CPU
+        # and mkldnn_batch_norm for MKLDNN.
+        if batch_norm_type == "mkldnn_batch_norm":
+            config_assert(use_mkldnn, "mkldnn_batch_norm only support MKLDNN")
         use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
-            ((not parallel_nn) or self.config.device > -1) and \
-            cudnn_version >= 4007
-        self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
+                not use_mkldnn and batch_norm_type != "mkldnn_batch_norm" and \
+                ((not parallel_nn) or self.config.device > -1)
+        if use_cudnn:
+            self.layer_type = "cudnn_batch_norm"
+        else:
+            self.layer_type = "mkldnn_batch_norm" if use_mkldnn else "batch_norm"
         super(BatchNormLayer, self).__init__(
-            name,
-            self.layer_type,
-            0,
-            active_type=active_type,
-            inputs=inputs,
-            **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
 
         if use_global_stats is not None:
             self.config.use_global_stats = use_global_stats
         if moving_average_fraction is not None:
             self.config.moving_average_fraction = moving_average_fraction
+        if epsilon is not None:
+            assert epsilon >= 1e-5, "epsilon must be no less than 1e-5."
+            self.config.epsilon = epsilon
 
         input_layer = self.get_input_layer(0)
         image_conf = self.config.inputs[0].image_conf
-        parse_image(self.inputs[0].image, input_layer.name, image_conf)
-
-        # Only pass the width and height of input to batch_norm layer
-        # when either of it is non-zero.
-        if input_layer.width != 0 or input_layer.height != 0:
-            self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
-                               image_conf.channels, False)
+        if img3D:
+            parse_image3d(self.inputs[0].image, input_layer.name, image_conf)
+            # Only pass the width and height of input to batch_norm layer
+            # when either of it is non-zero.
+            if input_layer.width != 0 or input_layer.height != 0:
+                self.set_cnn_layer(
+                    input_layer_name=name,
+                    depth=image_conf.img_size_z,
+                    height=image_conf.img_size_y,
+                    width=image_conf.img_size,
+                    channels=image_conf.channels,
+                    is_print=True)
+            else:
+                self.set_layer_size(input_layer.size)
         else:
-            self.set_layer_size(input_layer.size)
+            parse_image(self.inputs[0].image, input_layer.name, image_conf)
+            # Only pass the width and height of input to batch_norm layer
+            # when either of it is non-zero.
+            if input_layer.width != 0 or input_layer.height != 0:
+                self.set_cnn_layer(
+                    input_layer_name=name,
+                    height=image_conf.img_size_y,
+                    width=image_conf.img_size,
+                    channels=image_conf.channels,
+                    is_print=True)
+            else:
+                self.set_layer_size(input_layer.size)
 
         psize = self.calc_parameter_size(image_conf)
         dims = [1, psize]
+        if mean_var_names is not None:
+            assert len(mean_var_names) == 2
+            self.inputs[1].parameter_name = mean_var_names[0]
+            self.inputs[2].parameter_name = mean_var_names[1]
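+            # Usage note (an assumption, based on parameter sharing by name
+            # elsewhere in this file): passing the same mean_var_names to
+            # several batch_norm layers makes them reuse one pair of running
+            # mean/variance parameters.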
+
         self.create_input_parameter(0, psize)
         self.create_input_parameter(1, psize, dims)
         self.create_input_parameter(2, psize, dims)
 
         self.create_bias_parameter(bias, psize)
 
+    def set_cnn_layer(self,
+                      input_layer_name,
+                      depth=None,
+                      height=None,
+                      width=None,
+                      channels=None,
+                      is_print=True):
+        depthIsNone = False
+        if depth is None:
+            depth = 1
+            depthIsNone = True
+        size = depth * height * width * channels
+        self.set_layer_size(size)
+        self.set_layer_height_width(height, width)
+        self.set_layer_depth(depth)
+        if is_print and depthIsNone:
+            print("output for %s: c = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, height, width, size))
+        elif is_print:
+            print("output for %s: c = %d, d = %d, h = %d, w = %d, size = %d" %
+                  (input_layer_name, channels, depth, height, width, size))
+
     def calc_parameter_size(self, image_conf):
         return image_conf.channels
 
@@ -2077,8 +2615,53 @@ class MaxOutLayer(LayerBase):
         maxout_conf = self.config.inputs[0].maxout_conf
         parse_maxout(self.inputs[0].maxout, input_layer.name, maxout_conf)
         out_channels = maxout_conf.image_conf.channels / maxout_conf.groups
-        self.set_cnn_layer(name, g_layer_map[input_layer.name].height,
-                           g_layer_map[input_layer.name].width, out_channels)
+        self.set_cnn_layer(name, maxout_conf.image_conf.img_size_y,
+                           maxout_conf.image_conf.img_size, out_channels)
+
+
+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'row_conv', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'row convolution layer must have one and only one input.')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)
+        psize = context_length * input_layer.size
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, psize, dims)
+
+
+@config_layer('clip')
+class ClipLayer(LayerBase):
+    def __init__(self, name, inputs, min, max, **xargs):
+        super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'ClipLayer must have one and only one input.')
+        config_assert(min < max, 'min must be less than max.')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+        self.config.inputs[0].clip_conf.min = min
+        self.config.inputs[0].clip_conf.max = max
+
+
+@config_layer('scale_shift')
+class ScaleShiftLayer(LayerBase):
+    def __init__(self, name, inputs, bias=True, **xargs):
+        super(ScaleShiftLayer, self).__init__(
+            name, 'scale_shift', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'ScaleShiftLayer must have one and only one input.')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+        self.create_input_parameter(0, 1, [1, 1])
+        self.create_bias_parameter(bias, 1)
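+        # Presumably computes y = w * x + b with one scalar weight w (the
+        # [1, 1] parameter above) and one scalar bias b shared over all
+        # elements.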
 
 
 # key: cost type
@@ -2098,13 +2681,14 @@ def define_cost(class_name, cost_type):
 
 
 define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
+define_cost('CrossEntropyOverBeamCostLayer', 'cross_entropy_over_beam')
 define_cost('RankingCost', 'rank-cost')
 define_cost('AucValidation', 'auc-validation')
 define_cost('PnpairValidation', 'pnpair-validation')
 define_cost('SumOfSquaresCostLayer', 'square_error')
 define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
-define_cost('HuberTwoClass', 'huber')
+define_cost('HuberTwoClassification', 'huber_classification')
 define_cost('SumCost', 'sum_cost')
 define_cost('SmoothL1Cost', 'smooth_l1')
 
@@ -2134,7 +2718,7 @@ Usage:
              max_sort_size = -1, inputs = ["output", "score"])
 
   Input data: Samples of the same query should be loaded as a sequence,
-          by ProtoDataProvider or PyDataProvider etc.. User should provide
+          by PyDataProvider etc. The user should provide
           scores for each sample. The score slot should be the 2nd
           input of lambdaRank layer.
 
@@ -2166,6 +2750,17 @@ class LambdaCost(LayerBase):
         self.config.max_sort_size = max_sort_size
 
 
+@config_layer('huber_regression')
+class HuberRegressionLoss(LayerBase):
+    def __init__(self, name, inputs, delta=1., coeff=1., device=None):
+        super(HuberRegressionLoss, self).__init__(
+            name, 'huber_regression', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, 'HuberRegression must have 2 inputs')
+        self.config.delta = delta
+        self.config.coeff = coeff
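+        # Presumably the standard Huber formulation, with d = y - f(x):
+        #   loss = 0.5 * d^2                      if |d| <= delta
+        #        = delta * (|d| - 0.5 * delta)    otherwise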
+
+
 @config_layer('nce')
 class NCELayer(LayerBase):
     def __init__(self,
@@ -2214,16 +2809,37 @@ class NCELayer(LayerBase):
 
 @config_layer('addto')
 class AddToLayer(LayerBase):
+    layer_type = 'addto'
+
     def __init__(self, name, inputs, bias=True, **xargs):
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_addto":
+            config_assert(use_mkldnn, "mkldnn_addto only support MKLDNN")
+        self.layer_type = 'mkldnn_addto' if use_mkldnn else 'addto'
         super(AddToLayer, self).__init__(
-            name, 'addto', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer')
+
+        layer_size = self.get_input_layer(0).size
+        # Preserve the height, width and depth of the input.
+        layer_with_hwc = self.get_input_layer(0)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
+            assert layer_size == input_layer.size
+            if input_layer.height and input_layer.width and input_layer.depth:
+                layer_with_hwc = input_layer
+
+        self.set_layer_size(layer_with_hwc.size)
+        self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width)
+        self.set_layer_depth(layer_with_hwc.depth)
         self.create_bias_parameter(bias, self.config.size)
 
 
+@config_layer('mkldnn_addto')
+class MKLDNNAddtoLayer(AddToLayer):
+    layer_type = 'mkldnn_addto'
+
+
 @config_layer('agent')
 class AgentLayer(LayerBase):
     def __init__(self, name, size, device=None):
@@ -2231,13 +2847,6 @@ class AgentLayer(LayerBase):
             name, 'agent', size, inputs=[], device=device)
 
 
-@config_layer('sequence_agent')
-class SequenceAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceAgentLayer, self).__init__(
-            name, 'sequence_agent', size, inputs=[], device=device)
-
-
 @config_layer('gather_agent')
 class GatherAgentLayer(LayerBase):
     def __init__(self, name, size, device=None):
@@ -2247,23 +2856,11 @@ class GatherAgentLayer(LayerBase):
 
 @config_layer('scatter_agent')
 class ScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
+    def __init__(self, name, size, width=None, height=None, device=None):
         super(ScatterAgentLayer, self).__init__(
             name, 'scatter_agent', size, inputs=[], device=device)
-
-
-@config_layer('sequence_gather_agent')
-class SequenceGatherAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceGatherAgentLayer, self).__init__(
-            name, 'sequence_gather_agent', size, inputs=[], device=device)
-
-
-@config_layer('sequence_scatter_agent')
-class SequenceScatterAgentLayer(LayerBase):
-    def __init__(self, name, size, device=None):
-        super(SequenceScatterAgentLayer, self).__init__(
-            name, 'sequence_scatter_agent', size, inputs=[], device=device)
+        if height and width:
+            self.set_layer_height_width(height, width)
 
 
 @config_layer('multiplex')
@@ -2281,12 +2878,12 @@ class MultiplexLayer(LayerBase):
 
 
 @config_func
-def Link(
-        name,
-        has_subseq=False, ):
+def Link(name, has_subseq=False):
+    """
+    has_subseq is kept only for backward compatibility; it is ignored.
+    """
     link_config = LinkConfig()
     link_config.link_name = name
-    link_config.has_subseq = has_subseq
     return link_config
 
 
@@ -2319,20 +2916,13 @@ def Memory(name,
         config_assert(name is not None, "name needs cannot be None")
         memory_name = name + "+delay1"
     agent_name = memory_name
-    if is_sequence:
-        config_assert(
-            boot_layer is not None,
-            "there must be boot_layer in network when is_sequence = True")
-        agent_layer = SequenceAgentLayer(agent_name, size)
-    else:
-        agent_layer = AgentLayer(agent_name, size)
+    agent_layer = AgentLayer(agent_name, size)
     config_assert(g_current_submodel.is_recurrent_layer_group,
                   'Memory should be used in recurrent layer group only')
     memory = g_current_submodel.memories.add()
     if name is not None:
         memory.layer_name = MakeLayerNameInSubmodel(name)
     memory.link_name = MakeLayerNameInSubmodel(agent_name)
-    memory.is_sequence = is_sequence
     options = sum((boot_layer is not None, bool(boot_bias),
                    boot_with_const_id is not None))
     config_assert(
@@ -2406,15 +2996,23 @@ class ExpandLayer(LayerBase):
 
 @config_layer('featmap_expand')
 class FeatMapExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None, num_filters=None, bias=False):
+    def __init__(self,
+                 name,
+                 inputs,
+                 num_filters=None,
+                 as_row_vector=True,
+                 bias=False,
+                 **xargs):
         super(FeatMapExpandLayer, self).__init__(
-            name, 'featmap_expand', 0, inputs=inputs, device=device)
+            name, 'featmap_expand', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1, 'ExpandLayer takes 1 and only 1 inputs')
         if num_filters is not None:
             self.config.num_filters = num_filters
         else:
             logger.fatal("FeatMapExpandLayer must specify num_filters.")
+        if not as_row_vector:
+            self.config.user_arg = "as_col_vec"
         self.set_layer_size(self.get_input_layer(0).size * num_filters)
 
 
@@ -2424,14 +3022,16 @@ class MaxLayer(LayerBase):
                  name,
                  inputs,
                  trans_type='non-seq',
-                 active_type='linear',
                  bias=False,
                  output_max_index=None,
+                 stride=-1,
                  **xargs):
         super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1,
+                          'trans_type "seq" does not support a stride window')
         self.config.trans_type = trans_type
-        self.config.active_type = active_type
+        self.config.seq_pool_stride = stride
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             self.set_layer_size(input_layer.size)
@@ -2473,18 +3073,12 @@ class SequenceLastInstanceLayer(LayerBase):
     def __init__(self,
                  name,
                  inputs,
-                 active_type='linear',
                  trans_type='non-seq',
                  bias=False,
                  stride=-1,
                  **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
-            name,
-            'seqlastins',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqlastins', 0, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
         if trans_type == 'seq':
@@ -2500,7 +3094,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
     def __init__(self,
                  name,
                  inputs,
-                 active_type='linear',
                  trans_type='non-seq',
                  bias=False,
                  stride=-1,
@@ -2508,7 +3101,6 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
         super(SequenceFirstInstanceLayer, self).__init__(
             name,
             inputs=inputs,
-            active_type=active_type,
             trans_type=trans_type,
             bias=bias,
             stride=stride,
@@ -2518,14 +3110,9 @@ class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
 
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
         super(SequenceConcatLayer, self).__init__(
-            name,
-            'seqconcat',
-            0,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqconcat', 0, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
         for input_index in xrange(len(self.inputs)):
@@ -2536,20 +3123,9 @@ class SequenceConcatLayer(LayerBase):
 
 @config_layer('seqreshape')
 class SequenceReshapeLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 size,
-                 inputs,
-                 active_type='linear',
-                 bias=False,
-                 **xargs):
+    def __init__(self, name, size, inputs, bias=False, **xargs):
         super(SequenceReshapeLayer, self).__init__(
-            name,
-            'seqreshape',
-            size,
-            inputs=inputs,
-            active_type=active_type,
-            **xargs)
+            name, 'seqreshape', size, inputs=inputs, **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
         self.set_layer_size(size)
@@ -2558,9 +3134,9 @@ class SequenceReshapeLayer(LayerBase):
 
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
+    def __init__(self, name, inputs, bias=False, **xargs):
         super(SubSequenceLayer, self).__init__(
-            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'subseq', 0, inputs=inputs, **xargs)
         config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
         input_layer0 = self.get_input_layer(0)
         size = input_layer0.size
@@ -2568,6 +3144,86 @@ class SubSequenceLayer(LayerBase):
         self.create_bias_parameter(bias, size)
 
 
+@config_layer('seq_slice')
+class SeqSliceLayer(LayerBase):
+    def __init__(self, name, inputs, starts, ends, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of the sequence slice '
+                                      'layer must be a single sequence.')
+        else:
+            inputs = [inputs]
+
+        if starts is not None:
+            if isinstance(starts, list):
+                assert len(starts) == 1, (
+                    'the start indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                starts = starts[0]
+            inputs.append(starts)
+
+        if ends is not None:
+            if isinstance(ends, list):
+                assert len(ends) == 1, (
+                    'the end indices for sequence slice layer cannot '
+                    'be a list having more than one element.')
+                ends = ends[0]
+            inputs.append(ends)
+        assert len(inputs) >= 2, (
+            'the sequence slice layer takes at least two inputs.')
+
+        super(SeqSliceLayer, self).__init__(
+            name, 'seq_slice', 0, inputs=inputs, **xargs)
+
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+
+        if len(inputs) == 3:
+            assert (
+                self.get_input_layer(1).size == self.get_input_layer(2).size), (
+                    'If start and end indices are both given to '
+                    'sequence slice layer, they should have the same width.')
+        elif len(inputs) == 2:
+            self.config.select_first = (starts is not None)
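+        # Usage sketch (assumed call): SeqSliceLayer(name='slice',
+        # inputs=[seq], starts=start_ids, ends=None) takes the two-input
+        # branch above and sets select_first=True.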
+
+
+@config_layer('sub_nested_seq')
+class SubNestedSequenceLayer(LayerBase):
+    def __init__(self, name, inputs, selected_indices, bias=False, **xargs):
+        if isinstance(inputs, list):
+            assert len(inputs) == 1, ('the first input of sub_nested_seq '
+                                      'layer is a single nested sequence.')
+            inputs = inputs[0]
+        if isinstance(selected_indices, list):
+            assert len(selected_indices) == 1, (
+                'the second input of '
+                'sub_nested_seq layer is a single layer which is a '
+                'set of selected indices.')
+            selected_indices = selected_indices[0]
+
+        super(SubNestedSequenceLayer, self).__init__(
+            name,
+            'sub_nested_seq',
+            0,
+            inputs=[inputs, selected_indices],
+            **xargs)
+        input_layer0 = self.get_input_layer(0)
+        size = input_layer0.size
+        self.set_layer_size(size)
+
+
+@config_layer('dot_prod')
+class DotProdLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(DotProdLayer, self).__init__(
+            name, 'dot_prod', 0, inputs, device=device)
+        config_assert(len(inputs) == 2, 'DotProdLayer must have 2 inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            "Two inputs should have the same size.")
+        self.set_layer_size(1)
+
+
 @config_layer('out_prod')
 class OuterProdLayer(LayerBase):
     def __init__(self, name, inputs, device=None):
@@ -2679,6 +3335,30 @@ class SumToOneNormLayer(LayerBase):
         self.set_layer_size(input_layer0.size)
 
 
+@config_layer('row_l2_norm')
+class RowL2NormLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(RowL2NormLayer, self).__init__(
+            name, 'row_l2_norm', 0, inputs=inputs, **xargs)
+        config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+
+
+@config_layer('cos')
+class CosSimLayer(LayerBase):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
+        super(CosSimLayer, self).__init__(
+            name, 'cos', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2,
+            'The CosSimLayer expects two and only two inputs.')
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            'The two inputs of CosSimLayer must have the same dimensionality.')
+        self.config.cos_scale = cos_scale
+
+
 @config_layer('cos_vm')
 class CosSimVecMatLayer(LayerBase):
     def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
@@ -2686,10 +3366,24 @@ class CosSimVecMatLayer(LayerBase):
             name, 'cos_vm', size, inputs=inputs, device=device)
         self.config.cos_scale = cos_scale
         config_assert(
-            len(self.inputs) == 2, 'CosSimVecMatLayer must have 2 inputs')
+            len(self.inputs) == 2, 'The CosSimVecMatLayer must have 2 inputs.')
         config_assert(
             size * self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'Wrong input size for CosSimVecMatLayer')
+            'Wrong input size for CosSimVecMatLayer.')
+
+
+@config_layer('l2_distance')
+class L2DistanceLayer(LayerBase):
+    def __init__(self, name, inputs, device=None):
+        super(L2DistanceLayer, self).__init__(
+            name, 'l2_distance', 1, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 2, ('The L2DistanceLayer must have '
+                                    'two and only two inputs.'))
+        config_assert(
+            self.get_input_layer(0).size == self.get_input_layer(1).size,
+            ('Two inputs of the L2DistanceLayer must have '
+             'the same dimensionality.'))
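+        # The single output per sample is presumably the Euclidean distance
+        # ||x - y||_2 = sqrt(sum_i (x_i - y_i)^2), hence the layer size of 1.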
 
 
 @config_layer('sampling_id')
@@ -2716,13 +3410,16 @@ class AverageLayer(LayerBase):
                  inputs,
                  average_strategy='average',
                  trans_type='non-seq',
-                 active_type='linear',
                  bias=False,
+                 stride=-1,
                  **xargs):
         super(AverageLayer, self).__init__(
-            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
+            name, 'average', 0, inputs=inputs, **xargs)
         self.config.average_strategy = average_strategy
+        if trans_type == 'seq':
+            config_assert(stride == -1,
+                          'trans_type "seq" does not support a stride window')
         self.config.trans_type = trans_type
+        self.config.seq_pool_stride = stride
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -2730,18 +3427,6 @@ class AverageLayer(LayerBase):
         self.create_bias_parameter(bias, self.config.size)
 
 
-@config_layer('cos')
-class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=1, device=None):
-        super(CosSimLayer, self).__init__(
-            name, 'cos', 1, inputs=inputs, device=device)
-        config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
-        config_assert(
-            self.get_input_layer(0).size == self.get_input_layer(1).size,
-            'inputs of CosSimLayer must have same dim')
-        self.config.cos_scale = cos_scale
-
-
 @config_layer('tensor')
 class TensorLayer(LayerBase):
     def __init__(self, name, size, inputs, bias=True, **xargs):
@@ -2761,13 +3446,7 @@ class TensorLayer(LayerBase):
 
 @config_layer('mixed')
 class MixedLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 size=0,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, inputs, size=0, bias=True, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         super(MixedLayer, self).__init__(
             name, 'mixed', size, inputs=inputs, **xargs)
@@ -2849,9 +3528,6 @@ class MixedLayer(LayerBase):
             self.config.bias_size = psize
             self.create_bias_parameter(bias, psize)
 
-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-
 
 # like MixedLayer, but no bias parameter
 @config_func
@@ -2861,21 +3537,41 @@ def ExpressionLayer(name, inputs, **xargs):
 
 @config_layer('concat')
 class ConcatenateLayer(LayerBase):
+    layer_type = 'concat'
+
     def __init__(self, name, inputs, bias=False, **xargs):
         config_assert(inputs, 'inputs cannot be empty')
         config_assert(not bias, 'ConcatenateLayer cannot support bias.')
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        if self.layer_type == "mkldnn_concat":
+            config_assert(use_mkldnn, "mkldnn_concat only support MKLDNN")
+        self.layer_type = 'mkldnn_concat' if use_mkldnn else 'concat'
         super(ConcatenateLayer, self).__init__(
-            name, 'concat', 0, inputs=inputs, **xargs)
+            name, self.layer_type, 0, inputs=inputs, **xargs)
         size = 0
         for input_index in xrange(len(self.inputs)):
+            assert self.get_input_layer(0).height == self.get_input_layer(
+                input_index).height
+            assert self.get_input_layer(0).width == self.get_input_layer(
+                input_index).width
+            assert self.get_input_layer(0).depth == self.get_input_layer(
+                input_index).depth
             input_layer = self.get_input_layer(input_index)
             input = self.inputs[input_index]
             if self.config.size == 0:
                 size += input_layer.size
 
+        self.set_layer_height_width(self.get_input_layer(0).height, \
+                                    self.get_input_layer(0).width)
+        self.set_layer_depth(self.get_input_layer(0).depth)
         self.set_layer_size(size)
 
 
+@config_layer('mkldnn_concat')
+class MKLDNNConcatLayer(ConcatenateLayer):
+    layer_type = 'mkldnn_concat'
+
+
 # like concat layer, but each input layer was processed by a Projection.
 @config_layer('concat2')
 class ConcatenateLayer2(LayerBase):
@@ -2931,8 +3627,13 @@ class ConcatenateLayer2(LayerBase):
 
 @config_layer('recurrent')
 class RecurrentLayer(LayerBase):
+    layer_type = 'recurrent'
+
     def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
-        super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs,
+        use_mkl_packed = bool(
+            int(g_command_config_args.get("use_mkl_packed", 0)))
+        self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent'
+        super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs,
                                              **xargs)
         config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
         input_layer = self.get_input_layer(0)
@@ -3130,6 +3831,16 @@ class CTCLayer(LayerBase):
         config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
 
 
+@config_layer('kmax_seq_score')
+class KmaxSeqScoreLayer(LayerBase):
+    def __init__(self, name, inputs, beam_size, **xargs):
+        super(KmaxSeqScoreLayer, self).__init__(
+            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
+        self.config.beam_size = beam_size
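+        # Presumably outputs, for every sequence, the indices of its
+        # beam_size highest scores (a reading of the layer name, stated here
+        # as an assumption).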
+
+
 @config_layer('warp_ctc')
 class WarpCTCLayer(LayerBase):
     def __init__(self,
@@ -3158,6 +3869,69 @@ class RecurrentLayerGroup(LayerBase):
             name, 'recurrent_layer_group', 0, inputs=[], device=device)
 
 
+@config_layer('switch_order')
+class SwitchOrderLayer(LayerBase):
+    def __init__(self, name, inputs, reshape, **xargs):
+        super(SwitchOrderLayer, self).__init__(
+            name, 'switch_order', 0, inputs=inputs, **xargs)
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            self.config.reshape_conf.height_axis.extend(reshape['height'])
+            self.config.reshape_conf.width_axis.extend(reshape['width'])
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # Because (reshape['width'][0] > 0) is always true,
+            # out_dims[0] is never used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
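+            # e.g. reshape={'height': [0, 1, 2], 'width': [3]} (an assumed
+            # call) groups batch, h and w into the height axes and channels
+            # into the width axis, reordering NCHW data to NHWC.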
+
+
+@config_layer('scale_sub_region')
+class ScaleSubRegionLayer(LayerBase):
+    def __init__(self, name, inputs, value, **xargs):
+        super(ScaleSubRegionLayer, self).__init__(
+            name, 'scale_sub_region', 0, inputs=inputs, **xargs)
+        scale_sub_region_conf = self.config.inputs[0].scale_sub_region_conf
+        scale_sub_region_conf.value = value
+
+        # get channel, width and height from input_0 layer
+        input_layer = self.get_input_layer(0)
+        image_conf = scale_sub_region_conf.image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+        self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
+                           image_conf.channels)
+
+
+@config_layer('factorization_machine')
+class FactorizationMachineLayer(LayerBase):
+    def __init__(self, name, inputs, factor_size, **xargs):
+        super(FactorizationMachineLayer, self).__init__(
+            name, 'factorization_machine', size=1, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'factorization machine layer must have one and only one input.')
+        self.config.factor_size = factor_size
+        input_layer = self.get_input_layer(0)
+        psize = input_layer.size * factor_size
+        dims = [input_layer.size, factor_size]
+        self.create_input_parameter(0, psize, dims)
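+        # Factorization machines model pairwise interactions; presumably this
+        # layer computes the second-order term
+        #     y = sum_{i < j} <v_i, v_j> * x_i * x_j
+        # where each input dimension i has a learned factor_size-dimensional
+        # vector v_i, matching the [input_size, factor_size] parameter above.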
+
+
 # Deprecated, use a new layer specific class instead
 @config_func
 def Layer(name, type, **xargs):
@@ -3172,11 +3946,15 @@ def Layer(name, type, **xargs):
 @config_func
 def ParameterHook(type, **kwargs):
     if type == 'pruning':
-        mask_filename = kwargs.get('mask_filename', None)
-        assert mask_filename is not None
         hook = ParameterUpdaterHookConfig()
         hook.type = type
-        hook.purning_mask_filename = mask_filename
+        sparsity_ratio = kwargs.get('sparsity_ratio', None)
+        if sparsity_ratio is not None:
+            hook.sparsity_ratio = sparsity_ratio
+        return hook
+    elif type == 'dpruning':
+        hook = ParameterUpdaterHookConfig()
+        hook.type = type
         return hook
     else:
         return None
@@ -3204,7 +3982,8 @@ def Parameter(name,
               need_compact=None,
               is_static=None,
               is_shared=None,
-              update_hooks=None):
+              update_hooks=None,
+              initializer=None):
 
     config_assert(name not in g_parameter_map,
                   'Duplicated parameter name: ' + name)
@@ -3283,15 +4062,20 @@ def Parameter(name,
 
     if update_hooks is not None:
         if hasattr(update_hooks, '__call__'):
-            update_hooks = update_hooks(para.name)
+            update_hooks = update_hooks()
 
         if isinstance(update_hooks, list):
             for hook in update_hooks:
                 para.update_hooks.extend([hook])
         else:
-            para.update_hooks.extend(update_hooks)
+            para.update_hooks.extend([update_hooks])
 
     g_parameter_map[name] = para
+    if initializer is not None:
+        config_assert(
+            callable(initializer),
+            "parameter initializer should be a callable object")
+        g_parameter_initializer_map[name] = initializer
 
 
 @config_func
@@ -3546,11 +4330,7 @@ def update_g_config():
     return g_config
 
 
-def begin_parse(config_arg_str=''):
-    '''
-    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
-    passed to config script as a dictionary CONFIG_ARGS
-    '''
+def begin_parse():
     init_config_environment()
     for hook in _parse_config_hooks:
         hook()
@@ -3568,8 +4348,12 @@ def begin_parse(config_arg_str=''):
 
 
 def parse_config(trainer_config, config_arg_str):
-    begin_parse(config_arg_str)
+    '''
+    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
+    passed to config script as a dictionary CONFIG_ARGS
+    '''
 
+    begin_parse()
     config_args = {}
 
     if config_arg_str:
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
old mode 100755
new mode 100644
diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py
index c749fa827f..00efc01c05 100644
--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -17,7 +17,8 @@ __all__ = [
     "IdentityActivation", "LinearActivation", 'SequenceSoftmaxActivation',
     'ExpActivation', "ReluActivation", "BReluActivation", "SoftReluActivation",
     "STanhActivation", "AbsActivation", "SquareActivation", "BaseActivation",
-    "LogActivation", "SqrtActivation", "ReciprocalActivation"
+    "LogActivation", "SqrtActivation", "ReciprocalActivation",
+    "SoftSignActivation"
 ]
 
 
@@ -243,8 +244,20 @@ class ReciprocalActivation(BaseActivation):
     Reciprocal Activation.
 
     .. math::
-       f(z) = 1/z
+       f(z)=\\frac{1}{z}
     """
 
     def __init__(self):
         BaseActivation.__init__(self, 'reciprocal', False)
+
+
+class SoftSignActivation(BaseActivation):
+    """
+    SoftSign Activation.
+
+    .. math::
+       f(z)=\\frac{z}{1 + |z|}
+    """
+
+    def __init__(self):
+        BaseActivation.__init__(self, 'softsign', False)
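
As a quick numeric check of the formula above, a small numpy sketch of the softsign mapping (not part of the library code):

.. code-block:: python

   import numpy as np

   # f(z) = z / (1 + |z|): smooth everywhere and bounded in (-1, 1).
   def softsign(z):
       return z / (1.0 + np.abs(z))

   print(softsign(np.array([-2.0, 0.0, 2.0])))  # [-0.6667  0.  0.6667]
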
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index d1167a234c..e6f87ce61b 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -14,7 +14,8 @@
 
 from paddle.trainer.config_parser import *
 __all__ = [
-    'ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute'
+    'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute',
+    'ExtraLayerAttribute'
 ]
 
 
@@ -55,6 +56,40 @@ def is_compatible_with(x, Type):
         return False
 
 
+class HookAttribute(object):
+    """
+    Hook Attribute object. As a member of the ParameterAttribute class, a hook is an
+    auxiliary operation applied during the training of a layer with parameters, such
+    as an img_conv layer or an fc layer.
+
+    :param  type: Hook type. Currently supported types:
+                        'pruning' : the user specifies a sparsity_ratio before training
+                            starts, and the network prunes the parameters based on it.
+                            e.g. a Hook object can be defined as hk = HookAttribute('pruning', 0.6)
+                            and used as paddle.layer.img_conv(input=img, filter_size=3,
+                                                              num_channels=3, num_filters=64,
+                                                              param_attr=ParameterAttribute(update_hooks=hk))
+                            The pruning details can be found at https://arxiv.org/pdf/1506.02626.pdf
+    :type type: string
+
+    :param sparsity_ratio: Must be specified if the hook type is 'pruning'.
+                        It is the fraction of the parameter's elements to be set to zero.
+    :type sparsity_ratio: float or None
+
+    """
+
+    def __init__(self, type, sparsity_ratio=None):
+        self.type = type
+        self.sparsity_ratio = sparsity_ratio
+        if self.sparsity_ratio is not None:
+            assert is_compatible_with(
+                self.sparsity_ratio,
+                float), 'sparsity_ratio must be float type'
+            assert 0 <= self.sparsity_ratio <= 1, 'sparsity_ratio must be a float in [0, 1]'
+
+    def __call__(self):
+        return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio)
+
+
 class ParameterAttribute(object):
     """
     Parameter Attributes object. To fine-tuning network training process, user
@@ -95,6 +130,12 @@ class ParameterAttribute(object):
     :param sparse_update: Enable sparse update for this parameter. It will
                           enable both local and remote sparse update.
     :type sparse_update: bool
+    :param update_hooks: A HookAttribute object.
+    :type update_hooks: HookAttribute
+    :param initializer: If not None, it should be a callable object which accepts
+                        a parameter name and returns a numpy array holding the
+                        initial value of the parameter.
+    :type initializer: callable object
     """
 
     def __init__(self,
@@ -109,7 +150,9 @@ class ParameterAttribute(object):
                  learning_rate=None,
                  momentum=None,
                  gradient_clipping_threshold=None,
-                 sparse_update=False):
+                 sparse_update=False,
+                 update_hooks=None,
+                 initializer=None):
         self.attr = {}
 
         if is_static:
@@ -161,6 +204,11 @@ class ParameterAttribute(object):
                 is_compatible_with(gradient_clipping_threshold, float):
             self.attr['gradient_clipping_threshold'] = \
                 gradient_clipping_threshold
+        if initializer is not None:
+            self.attr['initializer'] = initializer
+
+        if update_hooks:
+            self.attr['update_hooks'] = update_hooks
 
     def set_default_parameter_name(self, name):
         """
@@ -226,7 +274,7 @@ class ExtraLayerAttribute(object):
         for key in self.attr:
             if not hasattr(self, 'can_%s' % key) or \
                     not getattr(self, 'can_%s' % key):
-                raise NotImplementedError("Layer %s cannot support %s" %
+                raise NotImplementedError("Layer %s does not support %s" %
                                           (layer_name, key))
 
     @staticmethod
@@ -237,5 +285,6 @@ class ExtraLayerAttribute(object):
             return attr.attr
 
 
+HookAttr = HookAttribute
 ParamAttr = ParameterAttribute
 ExtraAttr = ExtraLayerAttribute
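
A minimal sketch of how the exported HookAttr alias plugs into a layer definition, mirroring the HookAttribute docstring (the layer names are illustrative):

.. code-block:: python

   from paddle.trainer_config_helpers import *

   hk = HookAttr('pruning', sparsity_ratio=0.6)
   img = data_layer(name='img', size=784)
   fc = fc_layer(input=img,
                 size=100,
                 param_attr=ParamAttr(update_hooks=hk))
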
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index a5234f3e47..0eeaf7eabb 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -16,12 +16,22 @@ from paddle.trainer.config_parser import *
 from default_decorators import *
 
 __all__ = [
-    "evaluator_base", "classification_error_evaluator", "auc_evaluator",
-    "pnpair_evaluator", "precision_recall_evaluator", "ctc_error_evaluator",
-    "chunk_evaluator", "sum_evaluator", "column_sum_evaluator",
-    "value_printer_evaluator", "gradient_printer_evaluator",
-    "maxid_printer_evaluator", "maxframe_printer_evaluator",
-    "seqtext_printer_evaluator", "classification_error_printer_evaluator"
+    "evaluator_base",
+    "classification_error_evaluator",
+    "auc_evaluator",
+    "pnpair_evaluator",
+    "precision_recall_evaluator",
+    "ctc_error_evaluator",
+    "chunk_evaluator",
+    "sum_evaluator",
+    "column_sum_evaluator",
+    "value_printer_evaluator",
+    "gradient_printer_evaluator",
+    "maxid_printer_evaluator",
+    "maxframe_printer_evaluator",
+    "seqtext_printer_evaluator",
+    "classification_error_printer_evaluator",
+    "detection_map_evaluator",
 ]
 
 
@@ -31,10 +41,11 @@ class EvaluatorAttribute(object):
     FOR_RANK = 1 << 2
     FOR_PRINT = 1 << 3
     FOR_UTILS = 1 << 4
+    FOR_DETECTION = 1 << 5
 
     KEYS = [
         "for_classification", "for_regression", "for_rank", "for_print",
-        "for_utils"
+        "for_utils", "for_detection"
     ]
 
     @staticmethod
@@ -57,22 +68,25 @@ def evaluator(*attrs):
     return impl
 
 
-def evaluator_base(
-        input,
-        type,
-        label=None,
-        weight=None,
-        name=None,
-        chunk_scheme=None,
-        num_chunk_types=None,
-        classification_threshold=None,
-        positive_label=None,
-        dict_file=None,
-        result_file=None,
-        num_results=None,
-        delimited=None,
-        top_k=None,
-        excluded_chunk_types=None, ):
+def evaluator_base(input,
+                   type,
+                   label=None,
+                   weight=None,
+                   name=None,
+                   chunk_scheme=None,
+                   num_chunk_types=None,
+                   classification_threshold=None,
+                   positive_label=None,
+                   dict_file=None,
+                   result_file=None,
+                   num_results=None,
+                   delimited=None,
+                   top_k=None,
+                   excluded_chunk_types=None,
+                   overlap_threshold=None,
+                   background_id=None,
+                   evaluate_difficult=None,
+                   ap_type=None):
     """
     Evaluator will evaluate the network status while training/testing.
 
@@ -107,6 +121,14 @@ def evaluator_base(
     :type weight: LayerOutput.
     :param top_k: number k in top-k error rate
     :type top_k: int
+    :param overlap_threshold: The overlap threshold used in detection tasks to
+                              filter detection results.
+    :type overlap_threshold: float
+    :param background_id: The index of the background class.
+    :type background_id: int
+    :param evaluate_difficult: Whether to evaluate difficult objects.
+    :type evaluate_difficult: bool
+    :param ap_type: How to calculate the average precision.
+    :type ap_type: str
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
@@ -136,7 +158,61 @@ def evaluator_base(
         delimited=delimited,
         num_results=num_results,
         top_k=top_k,
-        excluded_chunk_types=excluded_chunk_types, )
+        excluded_chunk_types=excluded_chunk_types,
+        overlap_threshold=overlap_threshold,
+        background_id=background_id,
+        evaluate_difficult=evaluate_difficult,
+        ap_type=ap_type)
+
+
+@evaluator(EvaluatorAttribute.FOR_DETECTION)
+@wrap_name_default()
+def detection_map_evaluator(input,
+                            label,
+                            overlap_threshold=0.5,
+                            background_id=0,
+                            evaluate_difficult=False,
+                            ap_type="11point",
+                            name=None):
+    """
+    Detection mAP Evaluator. It prints the mean Average Precision (mAP) for detection.
+
+    The detection mAP Evaluator takes the output of the detection_output layer,
+    counts the true positive and the false positive bounding boxes, and integrates
+    them to get the mAP.
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       eval = detection_map_evaluator(input=det_output, label=lbl)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param label: Label layer.
+    :type label: LayerOutput
+    :param overlap_threshold: The bbox overlap threshold of a true positive.
+    :type overlap_threshold: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :param evaluate_difficult: Whether to evaluate difficult ground truth objects.
+    :type evaluate_difficult: bool
+    :param ap_type: How to calculate the average precision, e.g. "11point".
+    :type ap_type: str
+    """
+    if not isinstance(input, list):
+        input = [input]
+
+    if label:
+        input.append(label)
+
+    evaluator_base(
+        name=name,
+        type="detection_map",
+        input=input,
+        label=label,
+        overlap_threshold=overlap_threshold,
+        background_id=background_id,
+        evaluate_difficult=evaluate_difficult,
+        ap_type=ap_type)
 
 
 @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
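
For reference, a small numpy sketch of the 11-point interpolated average precision that ap_type="11point" refers to (the recall/precision arrays are assumed inputs here; the evaluator computes them internally):

.. code-block:: python

   import numpy as np

   def eleven_point_ap(recall, precision):
       # Mean of the max precision at recall thresholds 0.0, 0.1, ..., 1.0.
       ap = 0.0
       for t in np.arange(0.0, 1.1, 0.1):
           above = precision[recall >= t]
           ap += (above.max() if above.size else 0.0) / 11.0
       return ap
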
@@ -230,9 +306,9 @@ def auc_evaluator(
 def pnpair_evaluator(
         input,
         label,
-        info,
-        name=None,
-        weight=None, ):
+        query_id,
+        weight=None,
+        name=None, ):
     """
     Positive-negative pair rate Evaluator which adapts to rank task like
     learning to rank. This evaluator must contain at least three layers.
@@ -241,27 +317,35 @@ def pnpair_evaluator(
 
     .. code-block:: python
 
-       eval = pnpair_evaluator(input, info, label)
+       eval = pnpair_evaluator(input, label, query_id)
 
-    :param name: Evaluator name.
-    :type name: None|basestring
     :param input: Input Layer name. The output prediction of network.
     :type input: LayerOutput
     :param label: Label layer name.
     :type label: LayerOutput
-    :param info: Label layer name. (TODO, explaination)
-    :type info: LayerOutput
+    :param query_id: Query_id layer name. Query_id indicates which query each
+     sample belongs to. Its shape should be the same as the output of the
+     Label layer.
+    :type query_id: LayerOutput
     :param weight: Weight Layer name. It should be a matrix with size
-                  [sample_num, 1]. (TODO, explaination)
+                  [sample_num, 1] which indicates the weight of each sample.
+                  Each sample's default weight is 1 if the weight layer is None,
+                  and the pair weight is the mean of the two samples' weights.
     :type weight: LayerOutput
+    :param name: Evaluator name.
+    :type name: None|basestring
     """
+    if not isinstance(input, list):
+        input = [input]
+    if label:
+        input.append(label)
+    if query_id:
+        input.append(query_id)
     evaluator_base(
-        name=name,
-        type="pnpair",
         input=input,
-        label=label,
-        info=info,
-        weight=weight)
+        type="pnpair",
+        weight=weight,
+        name=name, )
 
 
 @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
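
A rough numpy sketch of the pair statistic this evaluator tracks, with pairs formed only within the same query_id as described above (one common formulation; the score/label/qid arrays are assumed inputs):

.. code-block:: python

   import itertools
   import numpy as np

   def pn_pair_counts(score, label, qid):
       pos = neg = 0
       for q in np.unique(qid):
           idx = np.flatnonzero(qid == q)
           for i, j in itertools.combinations(idx, 2):
               if label[i] == label[j]:
                   continue  # only pairs with different labels count
               if (score[i] - score[j]) * (label[i] - label[j]) > 0:
                   pos += 1  # correctly ordered pair
               else:
                   neg += 1
       return pos, neg
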
@@ -362,12 +446,12 @@ def chunk_evaluator(
 
     .. code-block:: text
 
-        Scheme    Description                                                                                  
+        Scheme    Description
         plain    Use the same label for the whole chunk.
-        IOB      Two labels for chunk type X, B-X for chunk begining and I-X for chunk inside. 
+        IOB      Two labels for chunk type X, B-X for chunk beginning and I-X for chunk inside.
         IOE      Two labels for chunk type X, E-X for chunk ending and I-X for chunk inside.
-        IOBES    Four labels for chunk type X, B-X for chunk begining, I-X for chunk inside, E-X for chunk end and S-X for single word chunk. 
-   
+        IOBES    Four labels for chunk type X, B-X for chunk beginning, I-X for chunk inside, E-X for chunk end and S-X for single word chunk.
+
     To make it clear, let's illustrate by an NER example.
     Assuming that there are three named entity types including ORG, PER and LOC which are called 'chunk type' here,
     if 'IOB' scheme were used, the label set will be extended to a set including B-ORG, I-ORG, B-PER, I-PER, B-LOC, I-LOC and O,
@@ -384,7 +468,7 @@ def chunk_evaluator(
         tagType = label % numTagType
         chunkType = label / numTagType
         otherChunkType = numChunkTypes
-    
+
     The following table shows the mapping rule between tagType and tag type in each scheme.
 
     .. code-block:: text
@@ -408,7 +492,7 @@ def chunk_evaluator(
         O      6
 
     In this example, chunkType has three values: 0 for ORG, 1 for PER, 2 for LOC, because the scheme is
-    "IOB" so tagType has two values: 0 for B and 1 for I. 
+    "IOB" so tagType has two values: 0 for B and 1 for I.
     Here we will use I-LOC to explain the above mapping rules in detail.
     For I-LOC, the label id is 5, so we can get tagType=1 and chunkType=2, which means I-LOC is a part of NER chunk LOC
     and the tag is I.
@@ -419,7 +503,7 @@ def chunk_evaluator(
 
        eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
-    
+
     :param input: The input layers.
     :type input: LayerOutput
     :param label: An input layer containing the ground truth label.
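
The labelId decoding rules quoted above can be checked mechanically; a small sketch for the IOB/NER example (numTagType=2, numChunkTypes=3):

.. code-block:: python

   num_tag_types = 2   # B and I under the IOB scheme

   def decode(label_id):
       # tagType = label % numTagType, chunkType = label / numTagType
       return label_id % num_tag_types, label_id // num_tag_types

   # I-LOC has label id 5: tagType=1 (I), chunkType=2 (LOC)
   assert decode(5) == (1, 2)
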
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
old mode 100755
new mode 100644
index 67aeb94def..eac2cb3168
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -11,16 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import functools
 import collections
 import inspect
 
+import paddle.trainer.config_parser as cp
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
     ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
-from .poolings import MaxPooling, AvgPooling, BasePoolingType
+from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \
+    CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling
 from .attrs import *
 from .default_decorators import *
 
@@ -31,31 +32,32 @@ except ImportError:
 import copy
 
 __all__ = [
-    "full_matrix_projection",
-    "AggregateLevel",
-    "ExpandLevel",
-    "identity_projection",
-    "dotmul_projection",
-    "dotmul_operator",
-    "repeat_layer",
-    "seq_reshape_layer",
-    "table_projection",
-    "mixed_layer",
-    "data_layer",
-    "embedding_layer",
-    "fc_layer",
-    "grumemory",
-    "pooling_layer",
-    "lstmemory",
-    "last_seq",
-    "first_seq",
-    "cos_sim",
-    "hsigmoid",
-    "conv_projection",
-    "mse_cost",
-    "regression_cost",
+    'full_matrix_projection',
+    'AggregateLevel',
+    'ExpandLevel',
+    'identity_projection',
+    'dotmul_projection',
+    'dotmul_operator',
+    'repeat_layer',
+    'seq_reshape_layer',
+    'table_projection',
+    'mixed_layer',
+    'data_layer',
+    'embedding_layer',
+    'fc_layer',
+    'grumemory',
+    'pooling_layer',
+    'lstmemory',
+    'last_seq',
+    'first_seq',
+    'cos_sim',
+    'l2_distance_layer',
+    'hsigmoid',
+    'conv_projection',
+    'square_error_cost',
+    'regression_cost',
     'classification_cost',
-    "LayerOutput",
+    'LayerOutput',
     'img_conv_layer',
     'img_pool_layer',
     'batch_norm_layer',
@@ -76,6 +78,7 @@ __all__ = [
     'trans_layer',
     'rotate_layer',
     'sum_to_one_norm_layer',
+    'row_l2_norm_layer',
     'get_output_layer',
     'LayerType',
     'context_projection',
@@ -103,24 +106,49 @@ __all__ = [
     'nce_layer',
     'cross_entropy_with_selfnorm',
     'cross_entropy',
+    'BeamInput',
+    'cross_entropy_over_beam',
     'multi_binary_label_cross_entropy',
     'sum_cost',
     'rank_cost',
     'lambda_cost',
-    'huber_cost',
+    'huber_regression_cost',
+    'huber_classification_cost',
     'block_expand_layer',
     'maxout_layer',
+    'dot_prod_layer',
     'out_prod_layer',
     'printer_layer',
     'print_layer',
     'priorbox_layer',
     'cross_channel_norm_layer',
+    'multibox_loss_layer',
+    'detection_output_layer',
+    'roi_pool_layer',
     'spp_layer',
     'pad_layer',
     'eos_layer',
     'smooth_l1_cost',
     'layer_support',
     'multiplex_layer',
+    'row_conv_layer',
+    'dropout_layer',
+    'prelu_layer',
+    'switch_order_layer',
+    'gated_unit_layer',
+    'crop_layer',
+    'sub_nested_seq_layer',
+    'clip_layer',
+    'slice_projection',
+    'seq_slice_layer',
+    'kmax_seq_score_layer',
+    'img_pool3d_layer',
+    'scale_shift_layer',
+    'img_conv3d_layer',
+    'resize_layer',
+    'sub_seq_layer',
+    'scale_sub_region_layer',
+    'factorization_machine',
 ]
 
 
@@ -129,29 +157,33 @@ class LayerType(object):
     Layer type enumerations.
     """
 
-    DATA = "data"
-    MIXED_LAYER = "mixed"
-    LSTMEMORY = "lstmemory"
-    GRUMEMORY = "gated_recurrent"
-    SEQUENCE_LAST_INSTANCE = "seqlastins"
-    SEQUENCE_FIRST_INSTANCE = "seqfirstins"
-    SEQUENCE_RESHAPE = "seqreshape"
-    POOLING_MAX = "max"
+    DATA = 'data'
+    MIXED_LAYER = 'mixed'
+    LSTMEMORY = 'lstmemory'
+    GRUMEMORY = 'gated_recurrent'
+    SEQUENCE_LAST_INSTANCE = 'seqlastins'
+    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
+    SEQUENCE_RESHAPE = 'seqreshape'
+    POOLING_MAX = 'max'
     POOLING_AVG = 'average'
-    FC_LAYER = "fc"
+    FC_LAYER = 'fc'
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
     COSINE_SIM = 'cos'
+    L2_DISTANCE = 'l2_distance'
     HSIGMOID = 'hsigmoid'
-    CONV_LAYER = "conv"
-    CONVTRANS_LAYER = "convt"
-    EXCONV_LAYER = "exconv"
-    EXCONVTRANS_LAYER = "exconvt"
-    CUDNNCONV_LAYER = "cudnn_conv"
-    POOL_LAYER = "pool"
+    CONV_LAYER = 'conv'
+    CONVTRANS_LAYER = 'convt'
+    EXCONV_LAYER = 'exconv'
+    EXCONVTRANS_LAYER = 'exconvt'
+    CUDNNCONV_LAYER = 'cudnn_conv'
+    CUDNNCONVTRANS_LAYER = 'cudnn_convt'
+    POOL_LAYER = 'pool'
+    POOL3D_LAYER = 'pool3d'
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
+    ROW_L2_NORM_LAYER = 'row_l2_norm'
     ADDTO_LAYER = 'addto'
 
     CONCAT_LAYER = 'concat'
@@ -169,6 +201,7 @@ class LayerType(object):
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
     ROTATE_LAYER = 'rotate'
+    DOT_PROD_LAYER = 'dot_prod'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -188,30 +221,56 @@ class LayerType(object):
     SPP_LAYER = "spp"
     PAD_LAYER = "pad"
     MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"
+
+    PRINT_LAYER = 'print'
+    PRIORBOX_LAYER = 'priorbox'
+    MULTIBOX_LOSS_LAYER = 'multibox_loss'
+    DETECTION_OUTPUT_LAYER = 'detection_output'
+    ROI_POOL_LAYER = 'roi_pool'
+
+    CTC_LAYER = 'ctc'
+    WARP_CTC_LAYER = 'warp_ctc'
+    CRF_LAYER = 'crf'
+    CRF_DECODING_LAYER = 'crf_decoding'
+    NCE_LAYER = 'nce'
 
-    PRINT_LAYER = "print"
-    PRIORBOX_LAYER = "priorbox"
+    CONV3D_LAYER = 'conv3d'
+    DECONV3D_LAYER = 'deconv3d'
 
-    CTC_LAYER = "ctc"
-    WARP_CTC_LAYER = "warp_ctc"
-    CRF_LAYER = "crf"
-    CRF_DECODING_LAYER = "crf_decoding"
-    NCE_LAYER = 'nce'
+    RANK_COST = 'rank-cost'
+    LAMBDA_COST = 'lambda_cost'
+    HUBER_REGRESSION = 'huber_regression'
+    HUBER_CLASSIFICATION = 'huber_classification'
+    CROSS_ENTROPY = 'multi-class-cross-entropy'
+    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
+    CROSS_ENTROPY_OVER_BEAM = 'cross_entropy_over_beam'
+    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
+    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
+    SUM_COST = 'sum_cost'
+    SMOOTH_L1 = 'smooth_l1'
+
+    PRELU = 'prelu'
+    SWITCH_ORDER_LAYER = 'switch_order'
+    CROP_LAYER = 'crop'
+    SUB_NESTED_SEQ = 'sub_nested_seq'
+    CLIP_LAYER = 'clip'
+    SEQ_SLICE = 'seq_slice'
 
-    RANK_COST = "rank-cost"
-    LAMBDA_COST = "lambda_cost"
-    HUBER = "huber"
-    CROSS_ENTROPY = "multi-class-cross-entropy"
-    CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm"
-    SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
-    MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
-    SUM_COST = "sum_cost"
-    SMOOTH_L1 = "smooth_l1"
+    KMAX_SEQ_SCORE = 'kmax_seq_score'
+    SCALE_SHIFT_LAYER = 'scale_shift'
+
+    RESIZE = 'resize'
+    SUB_SEQ_LAYER = 'subseq'
+
+    SCALE_SUB_REGION_LAYER = 'scale_sub_region'
+
+    FACTORIZATION_MACHINE = 'factorization_machine'
 
     @staticmethod
     def is_layer_type(type_name):
         """
-        If type_name is a layer type.
+        Whether type_name is a layer type.
 
         :param type_name: layer type name. Because layer type enumerations are
                           strings.
@@ -273,7 +332,7 @@ class LayerOutput(object):
     :param activation: Layer Activation.
     :type activation: BaseActivation.
     :param parents: Layer's parents.
-    :type parents: list|tuple|collections.Sequence
+    :type parents: list | tuple | collections.Sequence
     """
 
     def __init__(self,
@@ -305,17 +364,17 @@ class LayerOutput(object):
         self.outputs = outputs
         self.reverse = reverse
 
-    def __repr__(self):
-        """
-        Disable __repr__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
+    @property
+    def width(self):
+        return cp.g_layer_map[self.full_name].width
 
-    def __str__(self):
-        """
-        Disable __str__ for debug reason. Will be implemented when release
-        """
-        assert False, "this method should not be invoked"
+    @property
+    def height(self):
+        return cp.g_layer_map[self.full_name].height
+
+    @property
+    def depth(self):
+        return cp.g_layer_map[self.full_name].depth
 
     def set_input(self, input):
         """
@@ -382,7 +441,7 @@ def full_matrix_projection(input, size=0, param_attr=None):
        with mixed_layer(size=100) as m:
            m += full_matrix_projection(input=layer)
 
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
 
     .. code-block:: python
 
@@ -390,13 +449,13 @@ def full_matrix_projection(input, size=0, param_attr=None):
                                      size=100,
                                      param_attr=ParamAttr(name='_proj'))
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of this layer.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A FullMatrixProjection Object.
+    :return: FullMatrixProjection Object.
     :rtype: FullMatrixProjection
     """
     proj = FullMatrixProjection(
@@ -409,12 +468,12 @@ def full_matrix_projection(input, size=0, param_attr=None):
 def trans_full_matrix_projection(input, size=0, param_attr=None):
     """
     Different from full_matrix_projection, this projection performs matrix
-    multiplication, using transpose of weight.
+    multiplication, using the transpose of weight.
 
     ..  math::
         out.row[i] += in.row[i] * w^\mathrm{T}
 
-    :math:`w^\mathrm{T}` means transpose of weight.
+    :math:`w^\mathrm{T}` means the transpose of weight.
     The simple usage is:
 
     .. code-block:: python
@@ -426,13 +485,13 @@ def trans_full_matrix_projection(input, size=0, param_attr=None):
                                                 initial_mean=0.0,
                                                 initial_std=0.01))
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: The parameter size. Means the width of parameter.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A TransposedFullMatrixProjection Object.
+    :return: TransposedFullMatrixProjection Object.
     :rtype: TransposedFullMatrixProjection
     """
     proj = TransposedFullMatrixProjection(
@@ -462,7 +521,7 @@ def table_projection(input, size=0, param_attr=None):
        with mixed_layer(size=100) as m:
            m += table_projection(input=layer)
 
-    2. When used as an independant object like this, you must set the size:
+    2. When used as an independent object like this, you must set the size:
 
     .. code-block:: python
 
@@ -471,13 +530,13 @@ def table_projection(input, size=0, param_attr=None):
                                param_attr=ParamAttr(name='_proj'))
 
 
-    :param input: Input layer, which must contains id fields.
+    :param input: The input of this layer, which must contain id fields.
     :type input: LayerOutput
-    :param size: The parameter size. Means the width of parameter.
+    :param size: The dimension of the output.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A TableProjection Object.
+    :return: TableProjection Object.
     :rtype: TableProjection
     """
     proj = TableProjection(
@@ -488,7 +547,7 @@ def table_projection(input, size=0, param_attr=None):
 
 def identity_projection(input, offset=None, size=None):
     """
-    1. IdentityProjection if offset=None. It performs:
+    1. If offset=None, it performs IdentityProjection as follows:
 
     .. math::
        out.row[i] += in.row[i]
@@ -500,9 +559,8 @@ def identity_projection(input, offset=None, size=None):
        proj = identity_projection(input=layer)
 
 
-    2. IdentityOffsetProjection if offset!=None. It likes IdentityProjection,
-    but layer size may be smaller than input size.
-    It select dimesions [offset, offset+layer_size) from input:
+    2. If offset!=None, it executes IdentityOffsetProjection and takes the
+       elements of the input in the range [offset, offset+size) as output.
 
     .. math::
        out.row[i] += in.row[i + \\textrm{offset}]
@@ -514,14 +572,20 @@ def identity_projection(input, offset=None, size=None):
        proj = identity_projection(input=layer,
                                   offset=10)
 
-    Note that both of two projections should not have any parameter.
+    Note that neither of the projections has trainable parameters.
 
-    :param input: Input Layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param offset: Offset, None if use default.
+    :param offset: The offset from the start of the input. The input's
+                   elements in the range [offset, offset+size) will be
+                   taken as output. If this parameter is not set or set
+                   to None, the output will be the same as the input.
     :type offset: int
-    :return: A IdentityProjection or IdentityOffsetProjection object
-    :rtype: IdentityProjection or IdentityOffsetProjection
+    :param size: The dimension of this layer. It will be ignored
+                 when offset is None or not set.
+    :type size: int
+    :return: IdentityProjection or IdentityOffsetProjection object
+    :rtype: IdentityProjection | IdentityOffsetProjection
     """
     if offset is None:
         proj = IdentityProjection(input_layer_name=input.name)
@@ -535,11 +599,47 @@ def identity_projection(input, offset=None, size=None):
     return proj
 
 
+def slice_projection(input, slices):
+    """
+    slice_projection slices the input value into multiple parts,
+    then selects and merges some of them into a new output.
+
+    .. math::
+       output = [input.slices()]
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
+
+    Note that slice_projection has no trainable parameter.
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param slices: A list of start and end offsets of each slice.
+    :type slices: list of tuple
+    :return: SliceProjection object.
+    :rtype: SliceProjection
+    """
+    assert len(slices) >= 1
+    start = 0
+    for i in xrange(len(slices)):
+        assert len(slices[i]) == 2
+        # The start position of the next slice needs to be greater than
+        # or equal to the end position of the previous slice.
+        assert slices[i][0] >= start
+        assert slices[i][1] >= slices[i][0]
+        start = slices[i][1]
+    proj = SliceProjection(input_layer_name=input.name, slices=slices)
+    proj.origin = input
+    return proj
+
+
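
A minimal usage sketch: the projection's width is the sum of the slice widths, so the enclosing mixed_layer must be sized accordingly (the layer names are illustrative):

.. code-block:: python

   layer = data_layer(name='in', size=100)
   # slices (0, 10) and (20, 30) keep 10 + 10 = 20 dimensions.
   with mixed_layer(size=20) as m:
       m += slice_projection(input=layer, slices=[(0, 10), (20, 30)])
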
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
     """
-    scaling_projection multiplies the input with a scalar parameter and add to
-    the output.
+    scaling_projection multiplies the input with a scalar parameter.
 
     .. math::
        out += w * in
@@ -550,11 +650,11 @@ def scaling_projection(input, param_attr=None):
 
        proj = scaling_projection(input=layer)
 
-    :param input: Input Layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A ScalingProjection object
+    :return: ScalingProjection object.
     :rtype: ScalingProjection
     """
     proj = ScalingProjection(input_layer_name=input.name, **param_attr.attr)
@@ -565,8 +665,8 @@ def scaling_projection(input, param_attr=None):
 @wrap_param_attr_default()
 def dotmul_projection(input, param_attr=None):
     """
-    DotMulProjection with a layer as input.
-    It performs element-wise multiplication with weight.
+    DotMulProjection takes a layer as input and performs
+    element-wise multiplication with weight.
 
     ..  math::
         out.row[i] += in.row[i] .* weight
@@ -579,11 +679,11 @@ def dotmul_projection(input, param_attr=None):
 
        proj = dotmul_projection(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: A DotMulProjection Object.
+    :return: DotMulProjection object.
     :rtype: DotMulProjection
     """
     proj = DotMulProjection(
@@ -600,7 +700,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
        out.row[i] += scale * (a.row[i] .* b.row[i])
 
     where :math:`.*` means element-wise multiplication, and
-    scale is a config scalar, its default value is one.
+    scale is a config scalar whose default value is 1.
 
     The example usage is:
 
@@ -608,13 +708,13 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
 
        op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
 
-    :param a: Input layer1
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: Input layer2
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: config scalar, default value is one.
+    :param scale: A scalar to scale the product. Its default value is 1.
     :type scale: float
-    :return: A DotMulOperator Object.
+    :return: DotMulOperator object.
     :rtype: DotMulOperator
     """
     if 'x' in kwargs or 'y' in kwargs:
@@ -640,28 +740,29 @@ def context_projection(input,
     """
     Context Projection.
 
-    It just simply reorganizes input sequence, combines "context_len" sequence
-    to one context from context_start. "context_start" will be set to
-    -(context_len - 1) / 2 by default. If context position out of sequence
+    It just reorganizes the input sequence, combining "context_len" elements
+    of the sequence into one context starting from context_start. "context_start"
+    is set to -(context_len - 1) / 2 by default. When a context position is out of the sequence
     length, padding will be filled as zero if padding_attr = False, otherwise
     it is trainable.
 
-    For example, origin sequence is [A B C D E F G], context len is 3, then
-    after context projection and not set padding_attr, sequence will
+    For example, if the original sequence is [A B C D E F G], the context length
+    is 3, and padding_attr is not set, then after context projection the sequence will
     be [ 0AB ABC BCD CDE DEF EFG FG0 ].
 
-    :param input: Input Sequence.
+    :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
-    :param context_len: context length.
+    :param context_len: The length of the context.
     :type context_len: int
-    :param context_start: context start position. Default is
+    :param context_start: The start position of the context. The default value is
                           -(context_len - 1)/2
     :type context_start: int
-    :param padding_attr: Padding Parameter Attribute. If false, it means padding
-                         always be zero. Otherwise Padding is learnable, and
-                         parameter attribute is set by this parameter.
-    :type padding_attr: bool|ParameterAttribute
-    :return: Projection
+    :param padding_attr: Parameter attribute of the padding. If the parameter is
+                         set to False, padding will be zero. In other cases, the
+                         padding is trainable, and its parameter attribute is set
+                         by this parameter.
+    :type padding_attr: bool | ParameterAttribute
+    :return: Projection object.
     :rtype: Projection
     """
     context_start = -(
@@ -693,20 +794,19 @@ class MixedLayerType(LayerOutput):
 
     def __init__(self, name, size, act, bias_attr, layer_attr, parents=None):
         """
-        Ctor.
-        :param name: layer name.
+        :param name: The name of this layer.
         :type name: basestring
-        :param size: layer size.
+        :param size: The dimension of this layer.
         :type size: int
-        :param act: activation type.
+        :param act: Activation type.
         :type act: BaseActivation
-        :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                          something not type of ParameterAttribute. None will
-                          get a default Bias.
-        :type bias_attr: ParameterAttribute or None means has bias. Any other
-                         type means no bias.
-        :param layer_attr: Extra Layer Attribute.
-        :type layer_attr: ExtraLayerAttribute or None
+        :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                          whose type is not ParameterAttribute, no bias is defined. If the
+                          parameter is set to True, the bias is initialized to zero.
+        :type bias_attr: ParameterAttribute | None | bool | Any
+        :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                           details.
+        :type layer_attr: ExtraLayerAttribute | None
         """
         LayerOutput.__init__(
             self,
@@ -771,12 +871,12 @@ def mixed_layer(size=0,
                 bias_attr=False,
                 layer_attr=None):
     """
-    Mixed Layer. A mixed layer will add all inputs together, then activate.
-    Each inputs is a projection or operator.
+    Mixed Layer. A mixed layer will add all inputs together, then activate the sum.
+    Each input is a projection or operator.
 
     There are two styles of usages.
 
-    1. When not set inputs parameter, use mixed_layer like this:
+    1. When the parameter input is not set, use mixed_layer like this:
 
     .. code-block:: python
 
@@ -792,21 +892,21 @@ def mixed_layer(size=0,
                        input=[full_matrix_projection(input=layer1),
                               full_matrix_projection(input=layer2)])
 
-    :param name: mixed layer name. Can be referenced by other layer.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param size: layer size.
+    :param size: The dimension of this layer.
     :type size: int
-    :param input: inputs layer. It is an optional parameter. If set,
-                  then this function will just return layer's name.
-    :param act: Activation Type.
+    :param input: The input of this layer. It is an optional parameter.
+    :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute or None or bool
-    :param layer_attr: The extra layer config. Default is None.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :return: MixedLayerType object can add inputs or layer name.
+    :return: MixedLayerType object.
     :rtype: MixedLayerType
     """
 
@@ -828,7 +928,8 @@ def mixed_layer(size=0,
 
 
 @layer_support()
-def data_layer(name, size, height=None, width=None, layer_attr=None):
+def data_layer(name, size, depth=None, height=None, width=None,
+               layer_attr=None):
     """
     Define DataLayer For NeuralNetwork.
 
@@ -838,16 +939,17 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
 
         data = data_layer(name="input", size=1000)
 
-    :param name: Name of this data layer.
+    :param name: The name of this layer.
     :type name: basestring
-    :param size: Size of this data layer.
+    :param size: The dimension of this data layer.
     :type size: int
-    :param height: Height of this data layer, used for image
-    :type height: int|None
-    :param width: Width of this data layer, used for image
-    :type width: int|None
-    :param layer_attr: Extra Layer Attribute.
-    :type layer_attr: ExtraLayerAttribute.
+    :param height: The height of the input image data.
+    :type height: int | None
+    :param width: The width of the input image data.
+    :type width: int | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -855,31 +957,41 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
         type=LayerType.DATA,
         name=name,
         size=size,
+        depth=depth,
         height=height,
         width=width,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
-    return LayerOutput(name, LayerType.DATA, size=size)
+    if depth is None:
+        depth = 1
+    num_filters = None
+    if height is not None and width is not None:
+        num_filters = size / (width * height * depth)
+        assert num_filters * width * height * depth == size, \
+                "size=%s width=%s height=%s depth=%s" % (size, width, height, depth)
+
+    return LayerOutput(name, LayerType.DATA, size=size, num_filters=num_filters)
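
A small sketch of the channel inference added above (the sizes are illustrative):

.. code-block:: python

   # size must factor as channels * depth * height * width; the inferred
   # channel count is exposed as num_filters for downstream conv layers.
   img = data_layer(name='image', size=3 * 32 * 32, height=32, width=32)
   assert img.num_filters == 3
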
 
 
 @wrap_name_default("embedding")
 @wrap_param_attr_default()
-@layer_support(ERROR_CLIPPING)
+@layer_support(ERROR_CLIPPING, DROPOUT)
 def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
     """
     Define a embedding Layer.
 
-    :param name: Name of this embedding layer.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer for this embedding. NOTE: must be Index Data.
+    :param input: The input of this layer, whose type must be Index Data.
     :type input: LayerOutput
-    :param size: The embedding dimension.
+    :param size: The dimension of the embedding vector.
     :type size: int
     :param param_attr: The embedding parameter attribute. See ParameterAttribute
                       for details.
-    :type param_attr: ParameterAttribute|None
-    :param layer_attr: Extra layer Config. Default is None.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -906,7 +1018,7 @@ def fc_layer(input,
              bias_attr=None,
              layer_attr=None):
     """
-    Helper for declare fully connected layer.
+    The fully connected layer.
 
     The example usage is:
 
@@ -924,22 +1036,23 @@ def fc_layer(input,
        with mixed_layer(size=1024) as fc:
            fc += full_matrix_projection(input=layer)
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer. Could be a list/tuple of input layer.
-    :type input: LayerOutput|list|tuple
-    :param size: The layer dimension.
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
+    :param size: The dimension of this layer.
     :type size: int
-    :param act: Activation Type. Default is tanh.
+    :param act: Activation Type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute|None|Any
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -951,6 +1064,13 @@ def fc_layer(input,
         if isinstance(param_attr, collections.Sequence):
             assert len(input) == len(param_attr)
         else:
+            if "parameter_name" in param_attr.attr and len(input) > 1:
+                logger.fatal(
+                    "When the name field of param_attr is manually specified "
+                    "and the input is a list, the param_attr should also be a "
+                    "list with each item being the param_attr for each input "
+                    "item. If only one named param_attr is provided, all the "
+                    "input items would share this parameter.")
             param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
 
     assert isinstance(input, collections.Sequence)
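
Given the warning above, a sketch of the intended multi-input form with one named param_attr per input (layer1 and layer2 are assumed upstream layers):

.. code-block:: python

   fc = fc_layer(input=[layer1, layer2],
                 size=1024,
                 param_attr=[ParamAttr(name='_fc.w1'),
                             ParamAttr(name='_fc.w2')])
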
@@ -970,15 +1090,17 @@ def fc_layer(input,
 
 
 @wrap_name_default("print")
-def printer_layer(input, name=None):
+def printer_layer(input, format=None, name=None):
     """
-    Print the output value of input layers. This layer is useful for debugging.
+    Print the output value of the layers specified by the parameter input.
+    This layer is useful for debugging.
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer. Could be a list/tuple of input layer.
-    :type input: LayerOutput|list|tuple
-    :return: LayerOutput
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if isinstance(input, LayerOutput):
         input = [input]
@@ -988,6 +1110,7 @@ def printer_layer(input, name=None):
 
     Layer(
         name=name,
+        format=format,
         type=LayerType.PRINT_LAYER,
         inputs=[l.name for l in input], )
     # this layer don't return anything, can not be input of other layer.
@@ -1011,20 +1134,21 @@ def priorbox_layer(input,
     """
     Compute the priorbox and set the variance. This layer is necessary for ssd.
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param image: The network input image.
     :type image: LayerOutput
     :param aspect_ratio: The aspect ratio.
     :type aspect_ratio: list
     :param variance: The bounding box variance.
-    :type min_size: The min size of the priorbox width/height.
-    :param min_size: list
+    :param min_size: The minimum size of the priorbox width/height.
+    :type min_size: list
-    :type max_size: The max size of the priorbox width/height. Could be NULL.
-    :param max_size: list
+    :param max_size: The maximum size of the priorbox width/height. It could be NULL.
+    :type max_size: list
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     # plus one for ratio 1.
     num_filters = (len(aspect_ratio) * 2 + 1 + len(max_size)) * 4
@@ -1046,21 +1170,223 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("multibox_loss")
+def multibox_loss_layer(input_loc,
+                        input_conf,
+                        priorbox,
+                        label,
+                        num_classes,
+                        overlap_threshold=0.5,
+                        neg_pos_ratio=3.0,
+                        neg_overlap=0.5,
+                        background_id=0,
+                        name=None):
+    """
+    Compute the location loss and the confidence loss for ssd.
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput | List of LayerOutput
+    :param input_conf: The input priorbox confidence.
+    :type input_conf: LayerOutput | List of LayerOutput
+    :param priorbox: The input priorbox location and the variance.
+    :type priorbox: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param num_classes: The number of the classification.
+    :type num_classes: int
+    :param overlap_threshold: The threshold of the overlap.
+    :type overlap_threshold: float
+    :param neg_pos_ratio: The ratio of the negative bounding box to
+                          the positive bounding box.
+    :type neg_pos_ratio: float
+    :param neg_overlap: The negative bounding box overlap threshold.
+    :type neg_overlap: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+    input_loc_num = len(input_loc)
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+    input_conf_num = len(input_conf)
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name, label.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox, label]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    Layer(
+        name=name,
+        type=LayerType.MULTIBOX_LOSS_LAYER,
+        inputs=inputs,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        overlap_threshold=overlap_threshold,
+        neg_pos_ratio=neg_pos_ratio,
+        neg_overlap=neg_overlap,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.MULTIBOX_LOSS_LAYER, parents=parents, size=1)
+
+
+@wrap_name_default("detection_output")
+def detection_output_layer(input_loc,
+                           input_conf,
+                           priorbox,
+                           num_classes,
+                           nms_threshold=0.45,
+                           nms_top_k=400,
+                           keep_top_k=200,
+                           confidence_threshold=0.01,
+                           background_id=0,
+                           name=None):
+    """
+    Apply NMS to the output of the network and compute the predicted bounding
+    box locations. The output of this layer could be empty if there is no
+    valid bounding box.
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input_loc: The input predicted locations.
+    :type input_loc: LayerOutput | List of LayerOutput.
+    :param input_conf: The input priorbox confidence.
+    :type input_conf: LayerOutput | List of LayerOutput.
+    :param priorbox: The input priorbox location and the variance.
+    :type priorbox: LayerOutput
+    :param num_classes: The number of the classes.
+    :type num_classes: int
+    :param nms_threshold: The Non-maximum suppression threshold.
+    :type nms_threshold: float
+    :param nms_top_k: The number of bounding boxes kept in the NMS's output.
+    :type nms_top_k: int
+    :param keep_top_k: The number of bounding boxes kept in the layer's output.
+    :type keep_top_k: int
+    :param confidence_threshold: The classification confidence threshold.
+    :type confidence_threshold: float
+    :param background_id: The background class index.
+    :type background_id: int
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if isinstance(input_loc, LayerOutput):
+        input_loc = [input_loc]
+    assert isinstance(input_loc, collections.Sequence)  # list or tuple
+    for each in input_loc:
+        assert isinstance(each, LayerOutput)
+    input_loc_num = len(input_loc)
+
+    if isinstance(input_conf, LayerOutput):
+        input_conf = [input_conf]
+    assert isinstance(input_conf, collections.Sequence)  # list or tuple
+    for each in input_conf:
+        assert isinstance(each, LayerOutput)
+    input_conf_num = len(input_conf)
+
+    # Check the input layer number.
+    assert input_loc_num == input_conf_num
+
+    inputs = [priorbox.name]
+    inputs.extend([l.name for l in input_loc])
+    inputs.extend([l.name for l in input_conf])
+    parents = [priorbox]
+    parents.extend(input_loc)
+    parents.extend(input_conf)
+
+    size = keep_top_k * 7
+
+    Layer(
+        name=name,
+        type=LayerType.DETECTION_OUTPUT_LAYER,
+        inputs=inputs,
+        size=size,
+        input_num=input_loc_num,
+        num_classes=num_classes,
+        nms_threshold=nms_threshold,
+        nms_top_k=nms_top_k,
+        keep_top_k=keep_top_k,
+        confidence_threshold=confidence_threshold,
+        background_id=background_id)
+    return LayerOutput(
+        name, LayerType.DETECTION_OUTPUT_LAYER, parents=parents, size=size)
+
+
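
A rough sketch of wiring the two new SSD layers together (loc, conf, priorbox and label are assumed upstream layers; 21 classes = 20 object classes plus background):

.. code-block:: python

   loss = multibox_loss_layer(input_loc=loc, input_conf=conf,
                              priorbox=priorbox, label=label,
                              num_classes=21)
   det = detection_output_layer(input_loc=loc, input_conf=conf,
                                priorbox=priorbox, num_classes=21,
                                keep_top_k=200)
   # det.size == keep_top_k * 7, i.e. at most 200 boxes of 7 values each.
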
+@wrap_name_default("roi_pool")
+def roi_pool_layer(input,
+                   rois,
+                   pooled_width,
+                   pooled_height,
+                   spatial_scale,
+                   num_channels=None,
+                   name=None):
+    """
+    A layer used by Fast R-CNN to extract feature maps of ROIs from the last
+    feature map.
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param rois: The input ROIs' data.
+    :type rois: LayerOutput.
+    :param pooled_width: The width after pooling.
+    :type pooled_width: int
+    :param pooled_height: The height after pooling.
+    :type pooled_height: int
+    :param spatial_scale: The spatial scale between the image and feature map.
+    :type spatial_scale: float
+    :param num_channels: The number of the input channels.
+    :type num_channels: int
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    size = num_channels * pooled_width * pooled_height
+    Layer(
+        name=name,
+        type=LayerType.ROI_POOL_LAYER,
+        inputs=[input.name, rois.name],
+        pooled_width=pooled_width,
+        pooled_height=pooled_height,
+        spatial_scale=spatial_scale,
+        num_channels=num_channels)
+    return LayerOutput(
+        name, LayerType.ROI_POOL_LAYER, parents=[input, rois], size=size)
+
+
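
A usage sketch; the output width follows the size computation above (conv_feat and rois are assumed upstream layers):

.. code-block:: python

   roi_feat = roi_pool_layer(input=conv_feat, rois=rois,
                             pooled_width=7, pooled_height=7,
                             spatial_scale=1.0 / 16)
   # with 256 input channels: roi_feat.size == 256 * 7 * 7
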
 @wrap_name_default("cross_channel_norm")
 def cross_channel_norm_layer(input, name=None, param_attr=None):
     """
-    Normalize a layer's output. This layer is necessary for ssd.
-    This layer applys normalize across the channels of each sample to
-    a conv layer's output and scale the output by a group of trainable
-    factors which dimensions equal to the channel's number.
+    Normalize a layer's output. This layer is necessary for ssd. This
+    layer applys normalization across the channels of each sample to
+    a convolutional layer's output and scales the output by a group of
+    trainable factors whose dimensions equal to the channel's number.
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
     :type param_attr: ParameterAttribute
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     assert input.num_filters is not None
     Layer(
@@ -1095,10 +1421,16 @@ def pooling_layer(input,
                   name=None,
                   bias_attr=None,
                   agg_level=AggregateLevel.TO_NO_SEQUENCE,
+                  stride=-1,
                   layer_attr=None):
     """
     Pooling layer for sequence inputs, not used for Image.
 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and returns the pooling value of the sequence in the window as the output. Thus,
+    a long sequence will be shortened. Note that for sequences with sub-sequences,
+    stride must take its default value of -1.
+
     The example usage is:
 
     .. code-block:: python
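+
+       # A hedged sketch of the new stride option: pool over non-overlapping
+       # windows of 5 timesteps, shortening a long sequence.
+       seq_pool_stride = pooling_layer(input=layer,
+                                       pooling_type=MaxPooling(),
+                                       stride=5)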
@@ -1110,17 +1442,21 @@ def pooling_layer(input,
     :param agg_level: AggregateLevel.TO_NO_SEQUENCE or
                       AggregateLevel.TO_SEQUENCE
     :type agg_level: AggregateLevel
-    :param name: layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
-                         SumPooling, SquareRootNPooling.
-    :type pooling_type: BasePoolingType|None
-    :param bias_attr: Bias parameter attribute. False if no bias.
-    :type bias_attr: ParameterAttribute|None|False
-    :param layer_attr: The Extra Attributes for layer, such as dropout.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param pooling_type: Type of pooling. MaxPooling is the default pooling.
+    :type pooling_type: BasePoolingType | None
+    :param stride: The step size between successive pooling regions.
+    :type stride: int
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1134,12 +1470,16 @@ def pooling_layer(input,
         extra_dict['output_max_index'] = pooling_type.output_max_index
     extra_dict.update(ExtraLayerAttribute.to_kwargs(layer_attr))
 
+    if agg_level == AggregateLevel.TO_SEQUENCE:
+        assert stride == -1
+
     Layer(
         name=name,
         type=pooling_type.name,
         inputs=[Input(input.name)],
         bias=ParamAttr.to_bias(bias_attr),
         trans_type=agg_level,
+        stride=stride,
         **extra_dict)
 
     return LayerOutput(
@@ -1151,13 +1491,13 @@ def pooling_layer(input,
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation())
 @wrap_name_default("lstmemory")
-@layer_support(DROPOUT)
+@layer_support()
 def lstmemory(input,
               name=None,
+              size=None,
               reverse=False,
               act=None,
               gate_act=None,
-              size=None,
               state_act=None,
               bias_attr=None,
               param_attr=None,
@@ -1190,33 +1530,34 @@ def lstmemory(input,
     NOTE: This is a low level user interface. You can use network.simple_lstm
     to config a simple plain lstm layer.
 
-    Please refer to **Generating Sequences With Recurrent Neural Networks** for
-    more details about LSTM.
+    Reference:
+        `Generating Sequences With Recurrent Neural Networks
+        <https://arxiv.org/pdf/1308.0850.pdf>`_
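+
+    The example usage is:
+
+    .. code-block:: python
+
+       # A hedged sketch: the input's dimension must be 4 * the lstm cell
+       # dimension, e.g. produced by a preceding fc_layer or a mixed_layer
+       # with full_matrix_projection.
+       lstm = lstmemory(input=layer)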
 
-    Link_ goes as below.
-
-    .. _Link: http://arxiv.org/abs/1308.0850
-
-    :param name: The lstmemory layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layer name.
+    :param size: DEPRECATED. The dimension of the lstm cell.
+    :type size: int
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param reverse: is sequence process reversed or not.
+    :param reverse: Whether the input sequence is processed in a reverse order.
     :type reverse: bool
-    :param act: activation type, TanhActivation by default. :math:`h_t`
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param gate_act: gate activation type, SigmoidActivation by default.
+    :param gate_act: Activation type of this layer's gates. SigmoidActivation is the
+                     default activation.
     :type gate_act: BaseActivation
-    :param state_act: state activation type, TanhActivation by default.
+    :param state_act: Activation type of the state. TanhActivation is the default activation.
     :type state_act: BaseActivation
-
-    :param bias_attr: Bias attribute. None means default bias. False means no
-                      bias.
-    :type bias_attr: ParameterAttribute|None|False
-    :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute|None|False
-    :param layer_attr: Extra Layer attribute
-    :type layer_attr: ExtraLayerAttribute|None
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1225,15 +1566,15 @@ def lstmemory(input,
     assert state_act.support_hppl
     assert act.support_hppl
     assert input.size is not None and input.size % 4 == 0
+
     if size is not None:
         if input.size / 4 == size:
             plog = logger.warning
         else:
             plog = logger.fatal
-
-        plog("NOTE: The lstmemory layer[%s]'s size is set by previous input "
-             "layer. The lstm size should be equal with input layer size/4. The"
-             " size which is set explicitly will be ignored." % name)
+        plog("size of lstmemory layer: %s is automatically set to "
+             "size of input layer / 4. The parameter size passing to "
+             "this layer is ignored." % (name))
 
     Layer(
         name=name,
@@ -1258,13 +1599,13 @@ def lstmemory(input,
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(param_names=["act"], act=TanhActivation())
 @wrap_name_default("gru")
-@layer_support(DROPOUT)
+@layer_support()
 def grumemory(input,
+              size=None,
               name=None,
               reverse=False,
               act=None,
               gate_act=None,
-              size=None,
               bias_attr=None,
               param_attr=None,
               layer_attr=None):
@@ -1304,14 +1645,14 @@ def grumemory(input,
         h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}}
 
     NOTE: In PaddlePaddle's implementation, the multiplication operations
-    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in
-    gate_recurrent layer. Consequently, an additional mixed_layer with
+    :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed
+    in gate_recurrent layer. Consequently, an additional mixed_layer with
     full_matrix_projection or a fc_layer must be included before grumemory
     is called.
 
-    More details can be found by referring to `Empirical Evaluation of Gated
-    Recurrent Neural Networks on Sequence Modeling.
-    <https://arxiv.org/abs/1412.3555>`_
+    Reference:
+        `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling
+        <https://arxiv.org/abs/1412.3555>`_
 
     The simple usage is:
 
@@ -1319,29 +1660,30 @@ def grumemory(input,
 
        gru = grumemory(input)
 
-    :param name: The gru layer name.
-    :type name: None|basestring
-    :param input: input layer.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
     :type input: LayerOutput.
-    :param reverse: Whether sequence process is reversed or not.
+    :param size: DEPRECATED. The dimension of the gru cell.
+    :type size: int
+    :param reverse: Whether the input sequence is processed in a reverse order.
     :type reverse: bool
-    :param act: activation type, TanhActivation by default. This activation
+    :param act: Activation type. TanhActivation is the default activation. This activation
                 affects the :math:`{\\tilde{h_t}}`.
     :type act: BaseActivation
-    :param gate_act: gate activation type, SigmoidActivation by default.
-                     This activation affects the :math:`z_t` and :math:`r_t`. It is the
-                     :math:`\\sigma` in the above formula.
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
+                     the default activation. This activation affects the :math:`z_t`
+                     and :math:`r_t`. It is the :math:`\\sigma` in the above formula.
     :type gate_act: BaseActivation
-    :param bias_attr: Bias attribute. None means default bias. False means no
-                      bias.
-    :type bias_attr: ParameterAttribute|None|False
-    :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute|None|False
-    :param layer_attr: Extra Layer attribute
-    :type layer_attr: ExtraLayerAttribute|None
-    :param size: Stub parameter of size, but actually not used. If set this size
-                 will get a warning.
-    :type size: None
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1353,9 +1695,9 @@ def grumemory(input,
             plog = logger.warning
         else:
             plog = logger.fatal
-        plog("NOTE: the gru memory layer's size is set by previous input layer,"
-             " and should be input size / 3. Set size explicitly will be "
-             "ignored.")
+        plog("size of grumemory layer: %s is automatically set to "
+             "size of input layer / 3. The parameter size passing to this "
+             "layer is ignored." % (name))
 
     Layer(
         name=name,
@@ -1384,10 +1726,10 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the last value of the window as the output. Thus, a long sequence
-    will be shorten. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
+    If stride > 0, this layer will slide a window whose size is determined by stride,
+    and return the last value of the sequence in the window as the output. Thus, a
+    long sequence will be shortened. Note that for sequences with sub-sequences,
+    stride must take its default value of -1.
 
     The simple usage is:
 
@@ -1396,14 +1738,16 @@ def last_seq(input,
        seq = last_seq(input=layer)
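+       # A hedged sketch of the stride option: take the last element of every
+       # window of 5 timesteps.
+       seq_stride = last_seq(input=layer, stride=5)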
 
     :param agg_level: Aggregated level
-    :param name: Layer name.
+    :type agg_level: AggregateLevel
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param stride: window size.
-    :type stride: Int
-    :param layer_attr: extra layer attributes.
-    :type layer_attr: ExtraLayerAttribute.
+    :param stride: The step size between successive pooling regions.
+    :type stride: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1440,10 +1784,10 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
-    If stride > 0, this layer slides a window whose size is determined by stride,
-    and return the first value of the window as the output. Thus, a long sequence
-    will be shorten. Note that for sequence with sub-sequence, the default value
-    of stride is -1.
+    If stride > 0, this layer will slide a window whose size is determined by stride,
+    and return the first value of the sequence in the window as the output. Thus, a
+    long sequence will be shortened. Note that for sequences with sub-sequences,
+    stride must take its default value of -1.
 
     The simple usage is:
 
@@ -1452,13 +1796,15 @@ def first_seq(input,
        seq = first_seq(input=layer)
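+       # A hedged sketch of the stride option: take the first element of every
+       # window of 5 timesteps.
+       seq_stride = first_seq(input=layer, stride=5)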
 
     :param agg_level: aggregation level
-    :param name: Layer name.
+    :type agg_level: AggregateLevel
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param stride: window size.
-    :type stride: Int
-    :param layer_attr: extra layer attributes.
+    :param stride: The step size between successive pooling regions.
+    :type stride: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1516,8 +1862,8 @@ def expand_layer(input,
                  expand_level=ExpandLevel.FROM_NO_SEQUENCE,
                  layer_attr=None):
     """
-    A layer for "Expand Dense data or (sequence data where the length of each
-    sequence is one) to sequence data."
+    A layer for expanding dense data or sequence data (where the length of each
+    sequence is one) to sequence data.
 
     The example usage is:
 
@@ -1527,18 +1873,22 @@ def expand_layer(input,
                              expand_as=layer2,
                              expand_level=ExpandLevel.FROM_NO_SEQUENCE)
 
-    :param input: Input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param expand_as: Expand as this layer's sequence info.
+    :param expand_as: Expand the input according to this layer's sequence information.
+                      After the operation, the expanded input will have the same number
+                      of elements as this layer.
     :type expand_as: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param bias_attr: Bias attribute. None means default bias. False means no
-                      bias.
-    :type bias_attr: ParameterAttribute|None|False
-    :param expand_level: whether input layer is timestep(default) or sequence.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param expand_level: Whether the input layer is a sequence or the element of a sequence.
     :type expand_level: ExpandLevel
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1559,14 +1909,27 @@ def expand_layer(input,
 
 
 @wrap_name_default()
+@wrap_act_default(act=IdentityActivation())
 @layer_support()
-def repeat_layer(input, num_repeats, name=None, layer_attr=None):
+def repeat_layer(input,
+                 num_repeats,
+                 as_row_vector=True,
+                 act=None,
+                 name=None,
+                 layer_attr=None):
     """
-    A layer for repeating the input for num_repeats times. This is equivalent
-    to apply concat_layer() with num_repeats same input.
+    A layer for repeating the input num_repeats times.
+
+    If as_row_vector:
+
+    .. math::
+       y  = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n]
+
+    If not as_row_vector:
 
     .. math::
-       y  = [x, x, \cdots, x]
+       y  = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n]
+
 
     The example usage is:
 
@@ -1574,13 +1937,21 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
 
        expand = repeat_layer(input=layer, num_repeats=4)
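+       # A hedged sketch of the new as_row_vector switch: repeat each element
+       # in place (the second formula above) instead of tiling the sequence.
+       expand2 = repeat_layer(input=layer, num_repeats=4, as_row_vector=False)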
 
-    :param input: Input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_repeats: Repeat the input so many times
+    :param num_repeats: The number of times to repeat the input.
     :type num_repeats: int
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param as_row_vector: Whether to treat the input as row vectors or not. If
+                          the parameter is set to True, the repeating operation
+                          will be performed in the column direction. Otherwise,
+                          it will be performed in the row direction.
+    :type as_row_vector: bool
+    :param act: Activation type. IdentityActivation is the default activation.
+    :type act: BaseActivation
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1589,20 +1960,23 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
     l = Layer(
         inputs=[input.name],
         name=name,
+        active_type=act.name,
         num_filters=num_repeats,
+        as_row_vector=as_row_vector,
         type=LayerType.FEATURE_MAP_EXPAND_LAYER,
         **ExtraAttr.to_kwargs(layer_attr))
     return LayerOutput(
         name=name,
         size=l.config.size,
         layer_type=LayerType.FEATURE_MAP_EXPAND_LAYER,
+        activation=act,
         parents=[input])
 
 
 @wrap_name_default("seqreshape")
 @wrap_act_default(act=IdentityActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support()
+@layer_support(ERROR_CLIPPING, DROPOUT)
 def seq_reshape_layer(input,
                       reshape_size,
                       act=None,
@@ -1622,20 +1996,21 @@ def seq_reshape_layer(input,
 
        reshape = seq_reshape_layer(input=layer, reshape_size=4)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param reshape_size: the size of reshaped sequence.
+    :param reshape_size: The dimension of the reshaped sequence.
     :type reshape_size: int
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param act: Activation type.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute or None or bool
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1658,7 +2033,7 @@ def seq_reshape_layer(input,
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
     """
-    This layer is for linear interpolation with two inputs,
+    This layer performs linear interpolation on two inputs,
     which is used in NEURAL TURING MACHINE.
 
     .. math::
@@ -1674,13 +2049,14 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
 
        interpolation = interpolation_layer(input=[layer1, layer2], weight=layer3)
 
-    :param input: Input layer.
-    :type input: list|tuple
+    :param input: The input of this layer.
+    :type input: list | tuple
     :param weight: Weight layer.
     :type weight: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1714,7 +2090,7 @@ def bilinear_interp_layer(input,
                           name=None,
                           layer_attr=None):
     """
-    This layer is to implement bilinear interpolation on conv layer output.
+    This layer implements bilinear interpolation on the output of a convolutional layer.
 
     Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation
 
@@ -1724,18 +2100,19 @@ def bilinear_interp_layer(input,
 
        bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64)
 
-    :param   input:        A input layer.
-    :type    input:        LayerOutput.
-    :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int|None
-    :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int|None
-    :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None|basestring
-    :param   layer_attr:   Extra Layer attribute.
-    :type    layer_attr:   ExtraLayerAttribute
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param out_size_x: The width of the output.
+    :type out_size_x: int
+    :param out_size_y: The height of the output.
+    :type out_size_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype:  LayerOutput
+    :rtype: LayerOutput
     """
     assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
@@ -1770,8 +2147,8 @@ def power_layer(input, weight, name=None, layer_attr=None):
     .. math::
        y = x^w
 
-    where :math:`x` is a input vector, :math:`w` is scalar weight,
-    and :math:`y` is a output vector.
+    where :math:`x` is an input vector, :math:`w` is a scalar exponent,
+    and :math:`y` is an output vector.
 
     The example usage is:
 
@@ -1779,13 +2156,14 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
        power = power_layer(input=layer1, weight=layer2)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The exponent of the power.
     :type weight: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1823,13 +2201,14 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
        scale = scaling_layer(input=layer1, weight=layer2)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param weight: Weight layer.
+    :param weight: The weight of each sample.
     :type weight: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1863,11 +2242,12 @@ def trans_layer(input, name=None, layer_attr=None):
 
        trans = trans_layer(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1901,13 +2281,16 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
                           height=100,
                           width=100)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param height: The height of the sample matrix
+    :param height: The height of the sample matrix.
     :type height: int
-    :param name: Layer name.
+    :param width: The width of the sample matrix.
+    :type width: int
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1950,17 +2333,17 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
 
        cos = cos_sim(a=layer1, b=layer2, size=3)
 
-    :param name: layer name
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input layer a
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param scale: scale for cosine value. default is 5.
+    :param scale: The scale of the cosine similarity. 1 is the default value.
     :type scale: float
-    :param size: layer size. NOTE size_a * size should equal size_b.
+    :param size: The dimension of this layer. NOTE: size_a * size should equal size_b.
     :type size: int
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1986,6 +2369,51 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COSINE_SIM, parents=[a, b], size=size)
 
 
+@wrap_name_default()
+@layer_support()
+def l2_distance_layer(x, y, name=None, layer_attr=None):
+    """
+    This layer calculates and returns the Euclidean distance between two input
+    vectors x and y. The equation is as follows:
+
+    ..  math::
+        l2_distance(\\mathbf{x}, \\mathbf{y}) = \\sqrt{\\sum_{i=1}^D(x_i - y_i)^2}
+
+    The output size of this layer is fixed to be 1. Note that the above
+    computation is for one sample. Multiple samples are processed in one batch.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       l2_sim = l2_distance_layer(x=layer1, y=layer2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param x: The first input x for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of x's output.
+    :type x: LayerOutput
+    :param y: The second input y for this layer, whose output is a matrix with
+              dimensionality N x D. N is the sample number in a mini-batch.
+              D is the dimensionality of y's output.
+    :type y: LayerOutput
+    :param layer_attr: The extra layer attributes, for example, drop rate.
+                       See ExtraLayerAttribute for more details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: The returned LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(x, LayerOutput) and isinstance(y, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.L2_DISTANCE,
+        inputs=[x.name, y.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.L2_DISTANCE, parents=[x, y], size=1)
+
+
 @wrap_name_default()
 @wrap_bias_attr_default(has_bias=True)
 @wrap_param_attr_default()
@@ -2000,8 +2428,10 @@ def hsigmoid(input,
     """
     Organize the classes into a binary tree. At each node, a sigmoid function
     is used to calculate the probability of belonging to the right branch.
-    This idea is from "F. Morin, Y. Bengio (AISTATS 05):
-    Hierarchical Probabilistic Neural Network Language Model."
+
+    Reference:
+        `Hierarchical Probabilistic Neural Network Language Model
+        <http://www.gatsby.ucl.ac.uk/aistats/fullpapers/208.pdf>`_
 
     The example usage is:
 
@@ -2010,21 +2440,23 @@ def hsigmoid(input,
         cost = hsigmoid(input=[layer1, layer2],
                         label=data_layer)
 
-    :param input: Input layers. It could be a LayerOutput or list/tuple of
-                 LayerOutput.
-    :type input: LayerOutput|list|tuple
-    :param label: Label layer.
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
+    :param label: The input label.
     :type label: LayerOutput
-    :param num_classes: number of classes.
-    :type num_classes: int|None
-    :param name: layer name
+    :param num_classes: The number of classes, which should be larger than 2. If the
+                        parameter is not set or set to None, its actual value will be
+                        automatically set to the number of labels.
+    :type num_classes: int
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param bias_attr: Bias attribute. None means default bias.
-                      False means no bias.
-    :type bias_attr: ParameterAttribute|False
-    :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute|None
-    :param layer_attr: Extra Layer Attribute.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2082,6 +2514,7 @@ def img_conv_layer(input,
                    groups=1,
                    stride=1,
                    padding=0,
+                   dilation=1,
                    bias_attr=None,
                    param_attr=None,
                    shared_biases=True,
@@ -2089,6 +2522,7 @@ def img_conv_layer(input,
                    filter_size_y=None,
                    stride_y=None,
                    padding_y=None,
+                   dilation_y=None,
                    trans=False,
                    layer_type=None):
     """
@@ -2108,15 +2542,21 @@ def img_conv_layer(input,
     what-are-deconvolutional-layers/>`_ .
     The num_channel means input image's channel number. It may be 1 or 3 when
     input is raw pixels of image(mono or RGB), or it may be the previous layer's
-    num_filters * num_group.
-
-    There are several group of filter in PaddlePaddle implementation.
-    Each group will process some channel of the inputs. For example, if an input
-    num_channel = 256, group = 4, num_filter=32, the PaddlePaddle will create
-    32*4 = 128 filters to process inputs. The channels will be split into 4
-    pieces. First 256/4 = 64 channels will process by first 32 filters. The
-    rest channels will be processed by rest group of filters.
-
+    num_filters.
+
+    There are several groups of filters in the PaddlePaddle implementation.
+    If the groups attribute is greater than 1, for example groups=2,
+    the input will be split into 2 parts along the channel axis, and
+    the filters will also be split into 2 parts. The first half of the filters
+    is only connected to the first half of the input channels, while the second
+    half of the filters is only connected to the second half of the input. After
+    the convolution has been computed for each part of the input,
+    the output is obtained by concatenating the two results. A sketch of a
+    grouped convolution is also included in the example below.
+
+    For the details of grouped convolution, please refer to:
+    `ImageNet Classification with Deep Convolutional Neural Networks
+    <http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
+
     The example usage is:
 
     ..  code-block:: python
@@ -2127,51 +2567,73 @@ def img_conv_layer(input,
                               bias_attr=False,
                               act=ReluActivation())
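+
+        # A hedged sketch of a grouped convolution: with groups=2, the 8 input
+        # channels are split into two halves, and each half is convolved by
+        # half of the 64 filters (see the explanation above).
+        conv_grouped = img_conv_layer(input=data, filter_size=3,
+                                      num_filters=64, num_channels=8,
+                                      groups=2, act=ReluActivation())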
 
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Layer Input.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel. Or input a tuple for
-                        two image dimension.
-    :type filter_size: int|tuple|list
-    :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
-                        currently supports rectangular filters, the filter's
-                        shape will be (filter_size, filter_size_y).
-    :type filter_size_y: int|None
-    :param num_filters: Each filter group's number of filter
-    :param act: Activation type. Default is tanh
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on the x and y axes
+                        will be the same when filter_size_y is not set. If it is set
+                        to a list, the first element indicates the dimension on
+                        the x axis, and the second is used to specify the dimension
+                        on the y axis when filter_size_y is not provided.
+    :type filter_size: int | tuple | list
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
+    :type filter_size_y: int
+    :param num_filters: The number of filters. It is the same as the number of output channels.
+    :type num_filters: int
+    :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param groups: Group size of filters.
+    :param groups: The group number. 1 is the default group number.
     :type groups: int
-    :param stride: The x dimension of the stride. Or input a tuple for two image
-                   dimension.
-    :type stride: int|tuple|list
-    :param stride_y: The y dimension of the stride.
+    :param stride: The strides. If the parameter is set to one integer, the strides
+                   on the x and y axes will be the same when stride_y is not set. If it is
+                   set to a list, the first element indicates the stride on the x axis,
+                   and the second is used to specify the stride on the y axis when
+                   stride_y is not provided. 1 is the default value.
+    :type stride: int | tuple | list
+    :param stride_y: The stride on the y axis.
     :type stride_y: int
-    :param padding: The x dimension of the padding. Or input a tuple for two
-                    image dimension
-    :type padding: int|tuple|list
-    :param padding_y: The y dimension of the padding.
+    :param padding: The padding sizes. If the parameter is set to one integer, the padding
+                    sizes on the x and y axes will be the same when padding_y is not set. If it
+                    is set to a list, the first element indicates the padding size on the
+                    x axis, and the second is used to specify the padding size on the y axis
+                    when padding_y is not provided. 0 is the default padding size.
+    :type padding: int | tuple | list
+    :param padding_y: The padding size on the y axis.
     :type padding_y: int
-    :param bias_attr: Convolution bias attribute. None means default bias.
-                      False means no bias.
-    :type bias_attr: ParameterAttribute|False
-    :param num_channels: number of input channels. If None will be set
-                        automatically from previous output.
+    :param dilation: The dimensions of the dilation. If the parameter is set to one integer,
+                     the two dimensions on the x and y axes will be the same when dilation_y is not
+                     set. If it is set to a list, the first element indicates the dimension
+                     on the x axis, and the second is used to specify the dimension on the y
+                     axis when dilation_y is not provided. 1 is the default dimension.
+    :type dilation: int | tuple | list
+    :param dilation_y: The dimension of the dilation on the y axis.
+    :type dilation_y: int
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
     :type num_channels: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param shared_biases: Is biases will be shared between filters or not.
+    :param shared_biases: Whether biases will be shared between filters or not.
     :type shared_biases: bool
-    :param layer_attr: Layer Extra Attribute.
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param trans: true if it is a convTransLayer, false if it is a convLayer
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
     :type trans: bool
-    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt" or "cudnn_convt",
-                       otherwise layer_type has to be either "exconv" or
-                       "cudnn_conv"
-    :type layer_type: String
+    :param layer_type: Specify the layer type. If the dilation's dimension on one axis is
+                       larger than 1, layer_type has to be "cudnn_conv" or "cudnn_convt".
+                       If trans=True, layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or "cudnn_conv".
+    :type layer_type: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2200,6 +2662,13 @@ def img_conv_layer(input,
         else:
             padding_y = padding
 
+    if dilation_y is None:
+        if isinstance(dilation, collections.Sequence):
+            assert len(dilation) == 2
+            dilation, dilation_y = dilation
+        else:
+            dilation_y = dilation
+
     if param_attr.attr.get('initial_smart'):
         # special initial for conv layers.
         init_w = (2.0 / (filter_size**2 * num_channels))**0.5
@@ -2209,6 +2678,10 @@ def img_conv_layer(input,
         param_attr.attr["initial_smart"] = False
 
     if layer_type:
+        if dilation > 1 or dilation_y > 1:
+            assert layer_type in [
+                "cudnn_conv", "cudnn_convt", "exconv", "exconvt"
+            ]
         if trans:
             assert layer_type in ["exconvt", "cudnn_convt"]
         else:
@@ -2224,11 +2697,13 @@ def img_conv_layer(input,
             conv=Conv(
                 filter_size=filter_size,
                 padding=padding,
+                dilation=dilation,
                 stride=stride,
                 channels=num_channels,
                 groups=groups,
                 filter_size_y=filter_size_y,
                 padding_y=padding_y,
+                dilation_y=dilation_y,
                 stride_y=stride_y),
             **param_attr.attr),
         active_type=act.name,
@@ -2259,11 +2734,12 @@ def img_pool_layer(input,
                    pool_size_y=None,
                    stride_y=None,
                    padding_y=None,
-                   ceil_mode=True):
+                   ceil_mode=True,
+                   exclude_mode=None):
     """
     Image pooling Layer.
 
-    The details of pooling layer, please refer ufldl's pooling_ .
+    For the details of the pooling layer, please refer to ufldl's pooling_ .
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
@@ -2271,15 +2747,17 @@ def img_pool_layer(input,
 
     ..  math::
 
-        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w & = 1 + \\left \\lceil \\frac{input\_width + 2 * padding - pool\_size}{stride} \\right \\rceil
+
+        h & = 1 + \\left \\lceil \\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y} \\right \\rceil
 
     - ceil_mode=False:
 
     ..  math::
 
-        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
-        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+        w & = 1 + \\left \\lfloor \\frac{input\_width + 2 * padding - pool\_size}{stride} \\right \\rfloor
+
+        h & = 1 + \\left \\lfloor \\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y} \\right \\rfloor
 
     The example usage is:
 
@@ -2295,33 +2773,43 @@ def img_pool_layer(input,
                                  padding_y=2,
                                  pool_type=MaxPooling())
 
-    :param padding: pooling padding width.
+    :param padding: The padding size on the x axis. 0 is the default padding size.
     :type padding: int
-    :param padding_y: pooling padding height. It's equal to padding by default.
-    :type padding_y: int|None
-    :param name: name of pooling layer
-    :type name: basestring.
-    :param input: layer's input
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
+    :type padding_y: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param pool_size: pooling window width
+    :param pool_size: The pooling window length on the x axis.
     :type pool_size: int
-    :param pool_size_y: pooling window height. It's eaqual to pool_size by default.
-    :type pool_size_y: int|None
-    :param num_channels: number of input channel.
+    :param pool_size_y: The pooling window length on the y axis. If the parameter is
+                        not set or set to None, its actual value will be automatically
+                        set to pool_size.
+    :type pool_size_y: int
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
     :type num_channels: int
-    :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
-                      MaxPooling.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
     :type pool_type: BasePoolingType
-    :param stride: stride width of pooling.
+    :param stride: The stride on the x axis. 1 is the default value.
     :type stride: int
-    :param stride_y: stride height of pooling. It is equal to stride by default.
-    :type stride_y: int|None
-    :param layer_attr: Extra Layer attribute.
+    :param stride_y: The stride on the y axis. If the parameter is not set or set to
+                     None, its actual value will be automatically set to 'stride'.
+    :type stride_y: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
-
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
     :type ceil_mode: bool
+    :param exclude_mode: Whether to exclude the padding cells when calculating the
+                         average. It only works when pool_type is AvgPooling. If it is
+                         set to None, the padding cells are also excluded. When using
+                         cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling as
+                         pool_type to identify the mode instead.
+    :type exclude_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2334,11 +2822,14 @@ def img_pool_layer(input,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
+    assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling,
+                               CudnnMaxPooling, CudnnAvgInclPadPooling], \
+        "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported"
+
     type_name = pool_type.name + '-projection' \
         if (
         isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name
-
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
     stride_y = stride if stride_y is None else stride_y
     padding_y = padding if padding_y is None else padding_y
@@ -2361,6 +2852,7 @@ def img_pool_layer(input,
                     padding_y=padding_y))
         ],
         ceil_mode=ceil_mode,
+        exclude_mode=exclude_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2370,40 +2862,90 @@ def img_pool_layer(input,
         size=l.config.size)
 
 
-@wrap_name_default("spp")
+@wrap_name_default("pool3d")
 @layer_support()
-def spp_layer(input,
-              name=None,
-              num_channels=None,
-              pool_type=None,
-              pyramid_height=None,
-              layer_attr=None):
+def img_pool3d_layer(input,
+                     pool_size,
+                     name=None,
+                     num_channels=None,
+                     pool_type=None,
+                     stride=1,
+                     padding=0,
+                     layer_attr=None,
+                     pool_size_y=None,
+                     stride_y=None,
+                     padding_y=None,
+                     pool_size_z=None,
+                     stride_z=None,
+                     padding_z=None,
+                     ceil_mode=True):
     """
-    Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition.
-    The details please refer to
-    `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
+    Image pooling Layer.
+
+    For the details of the pooling layer, please refer to ufldl's pooling_ .
+
+    .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
+
+    - ceil_mode=True:
+
+    ..  math::
+
+        w & = 1 + \\left \\lceil \\frac{input\_width + 2 * padding - pool\_size}{stride} \\right \\rceil
+
+        h & = 1 + \\left \\lceil \\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y} \\right \\rceil
+
+        d & = 1 + \\left \\lceil \\frac{input\_depth + 2 * padding\_z - pool\_size\_z}{stride\_z} \\right \\rceil
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w & = 1 + \\left \\lfloor \\frac{input\_width + 2 * padding - pool\_size}{stride} \\right \\rfloor
+
+        h & = 1 + \\left \\lfloor \\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y} \\right \\rfloor
+
+        d & = 1 + \\left \\lfloor \\frac{input\_depth + 2 * padding\_z - pool\_size\_z}{stride\_z} \\right \\rfloor
 
     The example usage is:
 
     ..  code-block:: python
 
-        spp = spp_layer(input=data,
-                        pyramid_height=2,
-                        num_channels=16,
-                        pool_type=MaxPooling())
+        maxpool = img_pool3d_layer(input=conv,
+                                 pool_size=3,
+                                 num_channels=8,
+                                 stride=1,
+                                 padding=1,
+                                 pool_type=MaxPooling())
 
-    :param name: layer name.
-    :type name: basestring
-    :param input: layer's input.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: number of input channel.
+    :param pool_size: The pooling window lengths along the three axes. If the parameter
+                      is set to one integer, the three lengths will be the same.
+    :type pool_size: int | tuple | list
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
     :type num_channels: int
-    :param pool_type: Pooling type. MaxPooling or AveragePooling. Default is MaxPooling.
-    :type scale: BasePoolingType
-    :param pyramid_height: pyramid height.
-    :type pyramid_height: int
-    :param layer_attr: Extra Layer Attribute.
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
+    :type pool_type: BasePoolingType
+    :param stride: The strides of the pooling along the three axes. If the parameter
+                   is set to one integer, the three strides will be the same. 1 is the
+                   default value.
+    :type stride: int | tuple | list
+    :param padding: The sizes of padding along the three axes. If the parameter is set
+                    to one integer, they will be the same. 0 is the default padding size.
+    :type padding: int | tuple | list
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use the ceil function to calculate output height and width.
+                      True is the default. If it is set to False, the floor function will
+                      be used.
+    :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2416,11 +2958,118 @@ def spp_layer(input,
     elif isinstance(pool_type, AvgPooling):
         pool_type.name = 'avg'
 
-    type_name = pool_type.name
-    if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)):
-        type_name += '-projection'
-
-    l = Layer(
+    type_name = pool_type.name + '-projection' \
+        if (
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
+
+    if isinstance(pool_size, collections.Sequence):
+        assert len(pool_size) == 3
+        pool_size, pool_size_y, pool_size_z = pool_size
+    else:
+        pool_size_y = pool_size
+        pool_size_z = pool_size
+
+    if isinstance(stride, collections.Sequence):
+        assert len(stride) == 3
+        stride, stride_y, stride_z = stride
+    else:
+        stride_y = stride
+        stride_z = stride
+
+    if isinstance(padding, collections.Sequence):
+        assert len(padding) == 3
+        padding, padding_y, padding_z = padding
+    else:
+        padding_y = padding
+        padding_z = padding
+
+    l = Layer(
+        name=name,
+        type=LayerType.POOL3D_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                pool=Pool3d(
+                    pool_type=type_name,
+                    channels=num_channels,
+                    size_x=pool_size,
+                    start=None,
+                    stride=stride,
+                    padding=padding,
+                    size_y=pool_size_y,
+                    stride_y=stride_y,
+                    padding_y=padding_y,
+                    size_z=pool_size_z,
+                    stride_z=stride_z,
+                    padding_z=padding_z))
+        ],
+        ceil_mode=ceil_mode,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        LayerType.POOL_LAYER,
+        parents=[input],
+        num_filters=num_channels,
+        size=l.config.size)
+
+
+@wrap_name_default("spp")
+@layer_support()
+def spp_layer(input,
+              name=None,
+              num_channels=None,
+              pool_type=None,
+              pyramid_height=None,
+              layer_attr=None):
+    """
+    A layer that performs spatial pyramid pooling.
+
+    Reference:
+        `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+        <https://arxiv.org/abs/1406.4729>`_
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        spp = spp_layer(input=data,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=MaxPooling())
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channel number of the input.
+    :type num_channels: int
+    :param pool_type: Pooling type. MaxPooling is the default pooling.
+    :type pool_type: BasePoolingType
+    :param pyramid_height: The pyramid height of this pooling.
+    :type pyramid_height: int
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
+    if pool_type is None:
+        pool_type = MaxPooling()
+    elif isinstance(pool_type, AvgPooling):
+        pool_type.name = 'avg'
+
+    type_name = pool_type.name
+    if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)):
+        type_name += '-projection'
+
+    l = Layer(
         name=name,
         type=LayerType.SPP_LAYER,
         inputs=Input(
@@ -2477,8 +3126,10 @@ def img_cmrnorm_layer(input,
                       layer_attr=None):
     """
     Response normalization across feature maps.
-    The details please refer to
-    `Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_.
+
+    Reference:
+        `ImageNet Classification with Deep Convolutional Neural Networks
+        <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
 
     The example usage is:
 
@@ -2486,9 +3137,9 @@ def img_cmrnorm_layer(input,
 
         norm = img_cmrnorm_layer(input=net, size=5)
 
-    :param name: layer name.
-    :type name: None|basestring
-    :param input: layer's input.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: Normalize in number of :math:`size` feature maps.
     :type size: int
@@ -2496,9 +3147,11 @@ def img_cmrnorm_layer(input,
     :type scale: float
     :param power: The hyper-parameter.
     :type power: float
-    :param num_channels: input layer's filers number or channels. If
-                         num_channels is None, it will be set automatically.
-    :param layer_attr: Extra Layer Attribute.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2508,23 +3161,26 @@ def img_cmrnorm_layer(input,
 
 
 @wrap_bias_attr_default()
-@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0,
-                                                             initial_std=0.))
+@wrap_param_attr_default(
+    default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
-@layer_support(DROPOUT)
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def batch_norm_layer(input,
                      act=None,
                      name=None,
+                     img3D=False,
                      num_channels=None,
                      bias_attr=None,
                      param_attr=None,
                      layer_attr=None,
                      batch_norm_type=None,
+                     epsilon=1e-5,
                      moving_average_fraction=0.9,
-                     use_global_stats=None):
+                     use_global_stats=None,
+                     mean_var_names=None):
     """
-    Batch Normalization Layer. The notation of this layer as follow.
+    Batch Normalization Layer. The notation of this layer is as follows.
 
     :math:`x` is the input features over a mini-batch.
 
@@ -2538,8 +3194,10 @@ def batch_norm_layer(input,
         \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\
         y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift
 
-    The details of batch normalization please refer to this
-    `paper <http://arxiv.org/abs/1502.03167>`_.
+    Reference:
+        `Batch Normalization: Accelerating Deep Network Training by Reducing
+        Internal Covariate Shift
+        <http://arxiv.org/abs/1502.03167>`_
 
     The example usage is:
 
@@ -2547,60 +3205,57 @@ def batch_norm_layer(input,
 
         norm = batch_norm_layer(input=net, act=ReluActivation())
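+
+    A sketch using the newly added parameters (hypothetical values, shown for
+    illustration only):
+
+    ..  code-block:: python
+
+        norm3d = batch_norm_layer(input=net3d,
+                                  img3D=True,
+                                  epsilon=1e-5,
+                                  moving_average_fraction=0.9)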
 
-    :param name: layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: batch normalization input. Better be linear activation.
-                Because there is an activation inside batch_normalization.
+    :param input: The input of this layer, on which batch normalization will
+                  be performed.
     :type input: LayerOutput
-    :param batch_norm_type: We have batch_norm and cudnn_batch_norm. batch_norm
-                            supports both CPU and GPU. cudnn_batch_norm requires
-                            cuDNN version greater or equal to v4 (>=v4). But
-                            cudnn_batch_norm is faster and needs less memory
-                            than batch_norm. By default (None), we will
-                            automaticly select cudnn_batch_norm for GPU and
-                            batch_norm for CPU. Otherwise, select batch norm
-                            type based on the specified type. If you use cudnn_batch_norm,
-                            we suggested you use latest version, such as v5.1.
-    :type batch_norm_type: None|string, None or "batch_norm" or "cudnn_batch_norm"
-    :param act: Activation Type. Better be relu. Because batch
-                     normalization will normalize input near zero.
+    :param batch_norm_type: We have batch_norm, mkldnn_batch_norm and cudnn_batch_norm.
+                            batch_norm supports CPU, MKLDNN and GPU. cudnn_batch_norm
+                            requires cuDNN version greater than or equal to v4 (>=v4),
+                            but cudnn_batch_norm is faster and needs less
+                            memory than batch_norm. mkldnn_batch_norm requires that
+                            use_mkldnn is enabled. By default (None), we will
+                            automatically select cudnn_batch_norm for GPU,
+                            mkldnn_batch_norm for MKLDNN and batch_norm for CPU.
+                            Users can also specify the batch norm type explicitly.
+                            If you use cudnn_batch_norm, we suggest you use the
+                            latest version, such as v5.1.
+    :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
+                           or "mkldnn_batch_norm"
+    :param act: Activation type. ReluActivation is the default activation.
     :type act: BaseActivation
-    :param num_channels: num of image channels or previous layer's number of
-                         filters. None will automatically get from layer's
-                         input.
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
     :type num_channels: int
-    :param bias_attr: :math:`\\beta`, better be zero when initialize. So the
-                      initial_std=0, initial_mean=1 is best practice.
-    :type bias_attr: ParameterAttribute
-    :param param_attr: :math:`\\gamma`, better be one when initialize. So the
-                       initial_std=0, initial_mean=1 is best practice.
+    :param bias_attr: :math:`\\beta`. The bias attribute. If the parameter is set to
+                      False or an object whose type is not ParameterAttribute, no
+                      bias is defined. If the parameter is set to True, the bias is
+                      initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: :math:`\\gamma`. The parameter attribute. See ParameterAttribute
+                       for details.
     :type param_attr: ParameterAttribute
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param use_global_stats: whether use moving mean/variance statistics
-                             during testing peroid. If None or True,
-                             it will use moving mean/variance statistics during
-                             testing. If False, it will use the mean
-                             and variance of current batch of test data for
-                             testing.
-    :type use_global_stats: bool|None.
-    :param moving_average_fraction: Factor used in the moving average
-                                   computation, referred to as facotr,
-                                   :math:`runningMean = newMean*(1-factor)
-                                   + runningMean*factor`
+    :param use_global_stats: Whether to use moving mean/variance statistics during
+                             the testing period. If the parameter is set to None or
+                             True, it will use moving mean/variance statistics
+                             during testing. If the parameter is set to False, it
+                             will use the mean and variance of the current batch
+                             of test data.
+    :type use_global_stats: bool | None
+    :param epsilon: The small constant added to the variance to improve numeric
+                    stability.
+    :type epsilon: float
+    :param moving_average_fraction: Factor used in the moving average computation.
+                                   :math:`runningMean = newMean*(1-factor) + runningMean*factor`
     :type moving_average_fraction: float.
+    :param mean_var_names: The names of the moving mean and the moving variance,
+                           given in the order [mean name, variance name].
+    :type mean_var_names: list of string
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    if not isinstance(act, ReluActivation):
-        logger.log(logging.WARN,
-                   "%s is not recommend for batch normalization's activation, "
-                   "maybe the relu is better" % act.name)
-
-    if not isinstance(input.activation, LinearActivation):
-        logger.log(logging.WARN,
-                   "The activation should be inside batch normalization, the "
-                   "previous layer's activation may be Linear")
 
     if num_channels is None:
         if input.num_filters is not None:
@@ -2608,17 +3263,22 @@ def batch_norm_layer(input,
         else:
             num_channels = input.size
     assert (batch_norm_type is None) or (batch_norm_type == "batch_norm") or \
+           (batch_norm_type == "mkldnn_batch_norm") or \
            (batch_norm_type == "cudnn_batch_norm")
+
     l = Layer(
         name=name,
+        img3D=img3D,
         inputs=Input(
             input.name, image=Image(channels=num_channels), **param_attr.attr),
         active_type=act.name,
         type=LayerType.BATCH_NORM_LAYER,
         batch_norm_type=batch_norm_type,
         bias=ParamAttr.to_bias(bias_attr),
+        epsilon=epsilon,
         moving_average_fraction=moving_average_fraction,
         use_global_stats=use_global_stats,
+        mean_var_names=mean_var_names,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
     return LayerOutput(
@@ -2649,12 +3309,13 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
 
        sum_to_one_norm = sum_to_one_norm_layer(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
-    :type layer_attr: ExtraLayerAttribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
+                       for details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2667,10 +3328,47 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
         name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
 
 
+@wrap_name_default()
+@layer_support()
+def row_l2_norm_layer(input, name=None, layer_attr=None):
+    """
+    A layer for L2-normalization in each row.
+
+    .. math::
+       out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}}
+
+    where the size of :math:`in` is (batchSize x dataDim), and the size of
+    :math:`out` is also (batchSize x dataDim).
+
+    The example usage is:
+
+    .. code-block:: python
+
+       row_l2_norm = row_l2_norm_layer(input=layer)
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute
+                       for details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.ROW_L2_NORM_LAYER,
+        inputs=[input.name],
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
+
+
 @wrap_name_default("addto")
 @wrap_act_default(act=LinearActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT)
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
     """
     AddtoLayer.
@@ -2690,30 +3388,27 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
                             act=ReluActivation(),
                             bias_attr=False)
 
-    This layer just simply add all input layers together, then activate the sum
-    inputs. Each input of this layer should be the same size, which is also the
-    output size of this layer.
+    This layer just simply adds all input layers together, then activates the
+    sum. All inputs should share the same dimension, which is also the dimension
+    of this layer's output.
 
     There is no weight matrix for each input, because it is just a simple add
     operation. If you want a complicated operation before add, please use
     mixed_layer.
 
-    It is a very good way to set dropout outside the layers. Since not all
-    PaddlePaddle layer support dropout, you can add an add_to layer, set
-    dropout here.
-    Please refer to dropout_layer for details.
-
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layers. It could be a LayerOutput or list/tuple of
+    :param input: The input layers. It could be a LayerOutput or list/tuple of
                  LayerOutput.
-    :type input: LayerOutput|list|tuple
-    :param act: Activation Type, default is tanh.
+    :type input: LayerOutput | list | tuple
+    :param act: Activation Type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: Bias attribute. If False, means no bias. None is default
-                      bias.
-    :type bias_attr: ParameterAttribute|bool
-    :param layer_attr: Extra Layer attribute.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2749,11 +3444,11 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
 
 @wrap_act_default(act=IdentityActivation())
 @wrap_name_default("concat")
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     """
-    Concat all input vector into one huge vector.
-    Inputs can be list of LayerOutput or list of projection.
+    Concatenate all input vectors into one vector.
+    Inputs can be a list of LayerOutput or a list of projections.
 
     The example usage is:
 
@@ -2761,13 +3456,14 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
 
         concat = concat_layer(input=[layer1, layer2])
 
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layers or projections
-    :type input: list|tuple|collections.Sequence
-    :param act: Activation type.
+    :param input: The input layers or projections
+    :type input: list | tuple | collections.Sequence
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2833,18 +3529,20 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
 @wrap_name_default("seqconcat")
 @wrap_act_default(act=IdentityActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
                      bias_attr=None):
     """
-    Concat sequence a with sequence b.
+    Concatenate sequence a and sequence b.
 
     Inputs:
-      - a = [a1, a2, ..., an]
+      - a = [a1, a2, ..., am]
       - b = [b1, b2, ..., bn]
-      - Note that the length of a and b should be the same.
 
-    Output: [a1, b1, a2, b2, ..., an, bn]
+    Output: [a1, ..., am, b1, ..., bn]
+
+    Note that the above computation is for one sample. Multiple samples are
+    processed in one batch.
 
     The example usage is:
 
@@ -2852,20 +3550,21 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
 
         concat = seq_concat_layer(a=layer1, b=layer2)
 
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: input sequence layer
+    :param a: The first input sequence layer
     :type a: LayerOutput
-    :param b: input sequence layer
+    :param b: The second input sequence layer
     :type b: LayerOutput
-    :param act: Activation type.
+    :param act: Activation type. IdentityActivation is the default activation.
     :type act: BaseActivation
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute or None or bool
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2897,31 +3596,25 @@ def memory(name,
            boot_bias_active_type=None,
            boot_with_const_id=None):
     """
-    The memory layers is a layer cross each time step. Reference this output
-    as previous time step layer :code:`name` 's output.
-
-    The default memory is zero in first time step, previous time step's
-    output in the rest time steps.
+    The memory takes a layer's output at the previous time step as its own output.
 
-    If boot_bias, the first time step value is this bias and
-    with activation.
+    If boot_bias is set, the activation of the bias is the initial value of the
+    memory.
 
-    If boot_with_const_id, then the first time stop is a IndexSlot, the
-    Arguments.ids()[0] is this :code:`cost_id`.
+    If boot_with_const_id is set, then the memory's output at the first time step
+    is an IndexSlot, and Arguments.ids()[0] is this :code:`cost_id`.
 
-    If boot_layer is not null, the memory is just the boot_layer's output.
-    Set :code:`is_seq` is true boot layer is sequence.
+    If boot_layer is specified, the memory's output at the first time step will
+    be the boot_layer's output.
 
-    The same name layer in recurrent group will set memory on each time
-    step.
+    Otherwise, the memory's output at the first time step is zero.
 
     .. code-block:: python
 
        mem = memory(size=256, name='state')
        state = fc_layer(input=mem, size=256, name='state')
 
-    If you do not want to specify the name, you can equivalently use set_input()
-    to specify the layer needs to be remembered as the following:
+    If you do not want to specify the name, you can also use set_input()
+    to specify the layer to be remembered as the following:
 
     .. code-block:: python
 
@@ -2929,26 +3622,31 @@ def memory(name,
        state = fc_layer(input=mem, size=256)
       mem.set_input(state)
 
-    :param name: the name of the layer which this memory remembers.
+    :param name: The name of the layer which this memory remembers.
                  If name is None, user should call set_input() to specify the
                  name of the layer which this memory remembers.
     :type name: basestring
-    :param size: size of memory.
+    :param size: The dimensionality of memory.
     :type size: int
-    :param memory_name: the name of the memory.
-                        It is ignored when name is provided.
+    :param memory_name: The name of the memory. It is ignored when name is provided.
     :type memory_name: basestring
-    :param is_seq: is sequence for boot_layer
+    :param is_seq: DEPRECATED. Whether the boot_layer is a sequence.
     :type is_seq: bool
-    :param boot_layer: boot layer of memory.
-    :type boot_layer: LayerOutput|None
-    :param boot_bias: boot layer's bias
-    :type boot_bias: ParameterAttribute|None
-    :param boot_bias_active_type: boot layer's active type.
+    :param boot_layer: The layer whose output is used as the memory's output at
+                       the first time step.
+    :type boot_layer: LayerOutput | None
+    :param boot_bias: The bias attribute of memory's output at the first time step.
+                      If the parameter is set to False or an object whose type is not
+                      ParameterAttribute, no bias is defined. If the parameter is set
+                      to True, the bias is initialized to zero.
+    :type boot_bias: ParameterAttribute | None | bool | Any
+    :param boot_bias_active_type: Activation type for memory's bias at the first time
+                                  step. LinearActivation is the default activation.
     :type boot_bias_active_type: BaseActivation
-    :param boot_with_const_id: boot layer's id.
+    :param boot_with_const_id: A constant index. If set, the memory's output at
+                               the first time step is this index.
     :type boot_with_const_id: int
-    :return: LayerOutput object which is a memory.
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     if boot_bias_active_type is None:
@@ -2965,7 +3663,6 @@ def memory(name,
     memory_name = Memory(
         name,
         size,
-        is_sequence=is_seq,
         boot_layer=boot_layer.name if boot_layer is not None else None,
         boot_bias=boot_bias,
         boot_bias_active_type=boot_bias_active_type.name,
@@ -2981,8 +3678,8 @@ def memory(name,
 
 
 @wrap_bias_attr_default()
-@wrap_act_default(
-    param_names=['gate_act', 'state_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['state_act'], act=TanhActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('lstm_step')
 @layer_support()
@@ -2996,25 +3693,25 @@ def lstm_step_layer(input,
                     bias_attr=None,
                     layer_attr=None):
     """
-    LSTM Step Layer. It used in recurrent_group. The lstm equations are shown
-    as follow.
+    LSTM Step Layer. This function is used only in recurrent_group.
+    The lstm equations are shown as follows.
 
     ..  math::
 
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
 
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
 
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
 
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
 
         h_t & = o_t tanh(c_t)
 
 
     The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use
     :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these
-    input vector.
+    input vectors.
 
     The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do
 
@@ -3025,31 +3722,32 @@ def lstm_step_layer(input,
         ...
 
 
-    This layer contains two outputs. Default output is :math:`h_t`. The other
-    output is :math:`o_t`, which name is 'state' and can use
+    This layer has two outputs. The default output is :math:`h_t`. The other
+    output is :math:`o_t`, whose name is 'state' and users can use
     :code:`get_output_layer` to extract this output.
 
-    :param name: Layer's name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param size: Layer's size. NOTE: lstm layer's size, should be equal as
-                 :code:`input.size/4`, and should be equal as
-                 :code:`state.size`.
+    :param size: The dimension of this layer's output, which must be
+                 equal to the dimension of the state.
     :type size: int
-    :param input: input layer. :math:`Wx_t + Wh_{t-1}`
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param state: State Layer. :math:`c_{t-1}`
+    :param state: The state of the LSTM unit.
     :type state: LayerOutput
-    :param act: Activation type. Default is tanh
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param gate_act: Gate Activation Type. Default is sigmoid, and should
-                          be sigmoid only.
+    :param gate_act: Activation type of the gate. SigmoidActivation is the
+                     default activation.
     :type gate_act: BaseActivation
-    :param state_act: State Activation Type. Default is sigmoid, and should
-                           be sigmoid only.
+    :param state_act: Activation type of the state. TanhActivation is the
+                      default activation.
     :type state_act: BaseActivation
-    :param bias_attr: Bias Attribute.
-    :type bias_attr: ParameterAttribute
-    :param layer_attr: layer's extra attribute.
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3094,17 +3792,31 @@ def gru_step_layer(input,
                    layer_attr=None):
     """
 
-    :param input:
+    :param input: The input of this layer, whose dimension must be divisible by 3.
     :type input: LayerOutput
-    :param output_mem:
-    :param size:
-    :param act:
-    :param name:
-    :param gate_act:
-    :param bias_attr:
-    :param param_attr: the parameter_attribute for transforming the output_mem
-                       from previous step.
-    :param layer_attr:
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
+    :type act: BaseActivation
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is
+                     the default activation.
+    :type gate_act: BaseActivation
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3149,25 +3861,47 @@ def gru_step_naive_layer(input,
                          param_attr=None,
                          layer_attr=None):
     """
-    GRU Step Layer, but using MixedLayer to generate. It support ERROR_CLIPPING
+    GRU Step Layer, which is implemented with existing PaddlePaddle layers
+    (mixed_layer). It supports ERROR_CLIPPING
     and DROPOUT.
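+
+    A minimal usage sketch (the names `gru_input`, `hidden_dim` and `gru_out`
+    are hypothetical):
+
+    ..  code-block:: python
+
+        out_mem = memory(name="gru_out", size=hidden_dim)
+        gru_out = gru_step_naive_layer(input=gru_input,
+                                       output_mem=out_mem,
+                                       name="gru_out")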
 
-    :param input:
-    :param output_mem:
-    :param size:
-    :param name:
-    :param act:
-    :param gate_act:
-    :param bias_attr:
-    :param param_attr:
-    :param layer_attr:
-    :return:
+    :param input: The input of this layer, whose dimension must be divisible by 3.
+    :type input: LayerOutput
+    :param output_mem: A memory which memorizes the output of this layer at previous
+                       time step.
+    :type output_mem: LayerOutput
+    :param size: The dimension of this layer's output. If it is not set or set to None,
+                 it will be set to one-third of the dimension of the input automatically.
+    :type size: int
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param act: Activation type of this layer's output. TanhActivation
+                is the default activation.
+    :type act: BaseActivation
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation
+                     is the default activation.
+    :type gate_act: BaseActivation
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute, no bias
+                      is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
     """
     if input.size % 3 != 0:
         raise ValueError("GruStep input size must be divided by 3")
     if size is None:
         size = input.size / 3
 
+    if bias_attr and bias_attr.attr.get("parameter_name", None) is not None:
+        raise ValueError("You should not specify the field `name` in bias_attr."
+                         " Otherwise, the three biases, which correponding to "
+                         " the two gates and the mixed layer for computing Wx+b"
+                         ", will share the same parameter matrix unexpectedly.")
+
     def __gate__(gate_name, offset):
         with mixed_layer(
                 name=name + "_" + gate_name,
@@ -3214,14 +3948,15 @@ def get_output_layer(input, arg_name, name=None, layer_attr=None):
     output besides the default one, please use get_output_layer first to get
     the output from input.
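+
+    A minimal usage sketch (assuming `lstm_step` is a previously defined
+    lstm_step_layer, whose second output is named 'state'):
+
+    ..  code-block:: python
+
+        state = get_output_layer(input=lstm_step, arg_name='state')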
 
-    :param name: Layer's name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: get output layer's input. And this layer should contains
+    :param input: The input layer, which should contain
                    multiple outputs.
     :type input: LayerOutput
-    :param arg_name: Output name from input.
+    :param arg_name: The name of the output to be extracted from the input layer.
     :type arg_name: basestring
-    :param layer_attr: Layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3276,17 +4011,22 @@ def recurrent_layer(input,
         out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end
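+
+    A minimal usage sketch (assuming `emb` is a previously defined layer):
+
+    ..  code-block:: python
+
+        rnn = recurrent_layer(input=emb, act=TanhActivation())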
 
 
-    :param input: Input Layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param act: activation.
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param bias_attr: bias attribute.
-    :type bias_attr: ParameterAttribute
-    :param param_attr: parameter attribute.
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If the parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param name: name of the layer
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3311,20 +4051,22 @@ def recurrent_layer(input,
 class StaticInput(object):
     """
     StaticInput is only used in recurrent_group which defines a read-only memory
-    that can be a sequence or non-sequence.
+    and can be a sequence or non-sequence.
+
+    :param size: DEPRECATED
+    :param is_seq: DEPRECATED
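+
+    A minimal usage sketch (assuming `encoder_out` is a previously defined
+    layer):
+
+    ..  code-block:: python
+
+        static_in = StaticInput(input=encoder_out)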
     """
 
     def __init__(self, input, is_seq=False, size=None):
         assert isinstance(input, LayerOutput)
         self.input = input
-        self.is_seq = is_seq
-        assert input.size is not None or size is not None
+        assert input.size is not None
         if size is not None:
-            input.size = size
+            assert input.size == size
 
 
-class SubsequenceInput(object):
+def SubsequenceInput(input):
     """
+    DEPRECATED.
     Input sequence has sub-sequence, used in recurrent_group.
 
     The example usage is:
@@ -3333,26 +4075,17 @@ class SubsequenceInput(object):
 
        input = SubsequenceInput(layer)
     """
-
-    def __init__(self, input):
-        assert isinstance(input, LayerOutput)
-        assert input.size is not None
-        self.input = input
+    return input
 
 
 @wrap_name_default("recurrent_group")
-def recurrent_group(step,
-                    input,
-                    reverse=False,
-                    name=None,
-                    targetInlink=None,
-                    is_generating=False):
+def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
     """
     Recurrent layer group is an extremely flexible recurrent unit in
     PaddlePaddle. As long as the user defines the calculation done within a
     time step, PaddlePaddle will iterate such a recurrent calculation over
-    sequence input. This is extremely usefull for attention based model, or
-    Neural Turning Machine like models.
+    sequence input. This is useful for attention-based models, or Neural
+    Turing Machine-like models.
 
     The basic usage (time steps) is:
 
@@ -3374,18 +4107,17 @@ def recurrent_group(step,
                   demo/seqToseq/seqToseq_net.py
     - sequence steps: paddle/gserver/tests/sequence_nest_layer_group.conf
 
-    :param step: recurrent one time step function.The input of this function is
-                 input of the group. The return of this function will be
-                 recurrent group's return value.
+    :param step: A step function which takes the input of recurrent_group as its own
+                 input and returns values as recurrent_group's output at every
+                 time step.
 
-                 The recurrent group scatter a sequence into time steps. And
-                 for each time step, will invoke step function, and return
-                 a time step result. Then gather each time step of output into
+                 The recurrent group scatters a sequence into time steps. And
+                 for each time step, it will invoke step function, and return
+                 a time step result. Then gather outputs of each time step into
                  layer group's output.
 
     :type step: callable
 
-    :param name: recurrent_group's name.
+    :param name: The recurrent_group's name. It is optional.
     :type name: basestring
 
     :param input: Input links array.
@@ -3393,15 +4125,16 @@ def recurrent_group(step,
                   LayerOutput will be scattered into time steps.
                   SubsequenceInput will be scattered into sequence steps.
                   StaticInput will be imported to each time step, and doesn't change
-                  through time. It's a mechanism to access layer outside step function.
+                  over time. It's a mechanism to access layers outside the step function.
 
-    :type input: LayerOutput|StaticInput|SubsequenceInput|list|tuple
+    :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
 
-    :param reverse: If reverse is set true, the recurrent unit will process the
+    :param reverse: If reverse is set to True, the recurrent unit will process the
                     input sequence in a reverse order.
     :type reverse: bool
 
-    :param targetInlink: the input layer which share info with layer group's output
+    :param targetInlink: DEPRECATED.
+                         The input layer which shares info with the layer group's outputs.
 
                          Param input specifies multiple input layers. For
                          SubsequenceInput inputs, config should assign one input
@@ -3409,104 +4142,56 @@ def recurrent_group(step,
                          of words in each sentence) with all layer group's outputs.
                          targetInlink should be one of the layer group's input.
 
-    :type targetInlink: LayerOutput|SubsequenceInput
-
-    :param is_generating: If is generating, none of input type should be LayerOutput;
-                          else, for training or testing, one of the input type must
-                          be LayerOutput.
-
-    :type is_generating: bool
+    :type targetInlink: LayerOutput | SubsequenceInput
 
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
     model_type('recurrent_nn')
 
-    def is_single_input(x):
-        return isinstance(x, LayerOutput) or isinstance(x, StaticInput) \
-               or isinstance(x, SubsequenceInput)
-
-    if is_single_input(input):
+    if isinstance(input, (LayerOutput, StaticInput)):
         input = [input]
     assert isinstance(input, collections.Sequence)
 
     def is_in_links(x):
-        return isinstance(x, LayerOutput) or isinstance(x, SubsequenceInput)
+        return isinstance(x, LayerOutput)
 
     in_links = filter(is_in_links, input)
 
-    def targetInlink_in_inlinks():
-        for inlink in in_links:
-            if isinstance(inlink, SubsequenceInput):
-                if targetInlink == inlink.input:
-                    return True
-            elif targetInlink == inlink:
-                return True
-        return False
-
-    assert (targetInlink == None or targetInlink_in_inlinks())
-    targetInlinkName = None if targetInlink == None \
-        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-        else targetInlink.input.name
-
-    contains_sub_seq = [False]
-
-    def map_in_links(x):
-        if isinstance(x, SubsequenceInput):
-            contains_sub_seq[0] = True
-            return Link(name=x.input.name, has_subseq=True)
-        else:
-            return x.name
-
     RecurrentLayerGroupWithoutOutLinksBegin(
         name=name,
-        in_links=map(map_in_links, in_links),
-        seq_reversed=reverse,
-        target_inlinkname=targetInlinkName)
+        in_links=map(lambda x: x.name, in_links),
+        seq_reversed=reverse)
     in_args = []
-    has_LayerOutput = False
     for each_input in input:
-        assert is_single_input(each_input)
-        if isinstance(each_input, LayerOutput):
-            in_args.append(each_input)
-            has_LayerOutput = True
-        elif isinstance(each_input, SubsequenceInput):
-            in_args.append(each_input.input)
-            has_LayerOutput = True
-        else:
+        if isinstance(each_input, StaticInput):
+            # A StaticInput is imported as a read-only memory: the memory
+            # boots with the layer's output and then feeds itself, so its
+            # value does not change over time steps.
             mem_name = "__%s_memory__" % each_input.input.name
             mem = memory(
-                name=mem_name,
-                is_seq=each_input.is_seq,
+                name=None,
                 size=each_input.input.size,
                 boot_layer=each_input.input)
-            with mixed_layer(
-                    name=mem_name,
-                    size=each_input.input.size,
-                    act=IdentityActivation()) as mix:
-                mix += identity_projection(mem)
+            mem.set_input(mem)
             in_args.append(mem)
-
-    assert (is_generating != has_LayerOutput)
+        else:
+            in_args.append(each_input)
 
     layer_outs = step(*in_args)
 
     if isinstance(layer_outs, LayerOutput):
         layer_outs = [layer_outs]
 
-    for ot in layer_outs:
-        assert isinstance(ot, LayerOutput)
-        ot.reverse = reverse
-        if contains_sub_seq[0]:
-            RecurrentLayerGroupSetOutLink(Link(ot.name, has_subseq=True))
-        else:
-            RecurrentLayerGroupSetOutLink(ot.name)
+    for layer_out in layer_outs:
+        assert isinstance(
+            layer_out, LayerOutput
+        ), "Type of step function's return value must be LayerOutput."
+        layer_out.reverse = reverse
+        RecurrentLayerGroupSetOutLink(layer_out.name)
 
     RecurrentLayerGroupEnd(name=name)
 
     for layer_out in layer_outs:
-        # Thee previous full_name is the name is the rnn group
-        # We need a full_name outside the rnn group
+        # The previous full_name is the name inside the recurrent group.
+        # We need a full_name outside the recurrent group.
         layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
 
     if len(layer_outs) == 1:
@@ -3529,7 +4214,20 @@ class BaseGeneratedInput(object):
 
 class GeneratedInput(BaseGeneratedInput):
     def after_real_step(self, input):
-        return maxid_layer(input=input, name='__beam_search_predict__')
+        if isinstance(input, LayerOutput):
+            input = [input]
+        elif isinstance(input, collections.Sequence):
+            input = list(input)
+            if len(input) > 1:
+                logger.info(
+                    ("More than one layers inside the recurrent_group "
+                     "are returned as outputs of the entire recurrent_group "
+                     "PLEASE garantee the first output is probability of "
+                     "the predicted next word."))
+
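+        # The first output must be the probability of the next word; take its
+        # argmax as the prediction and pass any remaining outputs through
+        # unchanged.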
+        return [maxid_layer(
+            input=input[0], name='__beam_search_predict__')] + (
+                input[1:] if len(input) > 1 else [])
 
     def before_real_step(self):
         predict_id = memory(
@@ -3562,11 +4260,12 @@ def maxid_layer(input, name=None, layer_attr=None):
 
        maxid = maxid_layer(input=layer)
 
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3585,6 +4284,45 @@ def maxid_layer(input, name=None, layer_attr=None):
         size=l.config.size)
 
 
+@wrap_name_default()
+def dot_prod_layer(input1, input2, name=None, layer_attr=None):
+    """
+    A layer for computing the dot product of two vectors.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        dot_prod = dot_prod_layer(input1=vec1, input2=vec2)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input1: The first input layer.
+    :type input1: LayerOutput
+    :param input2: The second input layer.
+    :type input2: LayerOutput
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input1, LayerOutput)
+    assert isinstance(input2, LayerOutput)
+    assert input1.size == input2.size, "Two inputs should have the same size."
+
+    l = Layer(
+        name=name,
+        type=LayerType.DOT_PROD_LAYER,
+        inputs=[input1.name, input2.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.DOT_PROD_LAYER,
+        parents=[input1, input2],
+        size=l.config.size)
+
+
 @wrap_name_default()
 def out_prod_layer(input1, input2, name=None, layer_attr=None):
     """
@@ -3597,13 +4335,14 @@ def out_prod_layer(input1, input2, name=None, layer_attr=None):
 
        out_prod = out_prod_layer(input1=vec1, input2=vec2)
 
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input1: The first input layer name.
+    :param input1: The first input layer.
     :type input1: LayerOutput
-    :param input2: The second input layer name.
+    :param input2: The second input layer.
     :type input2: LayerOutput
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3638,13 +4377,14 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
 
        eos = eos_layer(input=layer, eos_id=id)
 
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param eos_id: end id of sequence
+    :param eos_id: The end id of the sequence.
     :type eos_id: int
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3705,8 +4445,9 @@ def beam_search(step,
     - machine translation : demo/seqToseq/translation/gen.conf \
                             demo/seqToseq/seqToseq_net.py
 
-    :param name: Name of the recurrent unit that generates sequences.
-    :type name: base string
+    :param name: The name of the recurrent unit that is responsible for
+                 generating sequences. It is optional.
+    :type name: basestring
     :param step: A callable function that defines the calculation in a time
                  step, and it is applied to sequences with arbitrary length by
                  sharing a same set of weights.
@@ -3716,6 +4457,7 @@ def beam_search(step,
     :type step: callable
     :param input: Input data for the recurrent unit, which should include the
                   previously generated words as a GeneratedInput object.
+                  In beam_search, none of the inputs should be of type LayerOutput.
     :type input: list
     :param bos_id: Index of the start symbol in the dictionary. The start symbol
                    is a special token for NLP task, which indicates the
@@ -3757,18 +4499,20 @@ def beam_search(step,
 
     real_input = []
     for i, each_input in enumerate(input):
-        assert isinstance(each_input, StaticInput) or isinstance(
-            each_input, BaseGeneratedInput)
+        assert not isinstance(each_input, LayerOutput), (
+            "in beam_search, "
+            "none of the input should has a type of LayerOutput.")
         if isinstance(each_input, BaseGeneratedInput):
-            assert generated_input_index == -1
+            assert generated_input_index == -1, ("recurrent_group accepts "
+                                                 "only one GeneratedInput.")
             generated_input_index = i
+
         else:
             real_input.append(each_input)
 
-    assert generated_input_index != -1
+    assert generated_input_index != -1, "No GeneratedInput is given."
 
     gipt = input[generated_input_index]
-    assert isinstance(gipt, BaseGeneratedInput)
 
     gipt.bos_id = bos_id
     gipt.eos_id = eos_id
@@ -3787,26 +4531,23 @@ def beam_search(step,
 
         predict = gipt.after_real_step(step(*args))
 
-        eos_layer(input=predict, eos_id=eos_id, name=eos_name)
-
+        eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
         return predict
 
-    tmp = recurrent_group(
-        step=__real_step__,
-        input=real_input,
-        reverse=False,
-        name=name,
-        is_generating=True)
-
-    return tmp
+    return recurrent_group(
+        step=__real_step__, input=real_input, reverse=False, name=name)
 
 
 def __cost_input__(input, label, weight=None):
     """
     inputs and parents for cost layers.
     """
-    ipts = [Input(input.name), Input(label.name)]
-    parents = [input, label]
+    if isinstance(input, LayerOutput):
+        input = [input]
+    if isinstance(label, LayerOutput):
+        label = [label]
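+    # input and label may each be a single layer or a list of layers; build
+    # one flat list of Input configs and parent layers from both.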
+    ipts = [Input(ipt.name) for ipt in (input + label)]
+    parents = input + label
     if weight is not None:
         assert weight.size == 1
         ipts.append(Input(weight.name))
@@ -3816,26 +4557,33 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None):
+def square_error_cost(input,
+                      label,
+                      weight=None,
+                      name=None,
+                      coeff=1.0,
+                      layer_attr=None):
     """
-    mean squared error cost:
+    Sum of squares error cost:
 
     ..  math::
 
-        \\frac{1}{N}\sum_{i=1}^N(t_i-y_i)^2
+        cost = \\sum_{i=1}^N(t_i-y_i)^2
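+
+    A minimal usage sketch (assuming `prediction` and `label` are previously
+    defined layers):
+
+    ..  code-block:: python
+
+        cost = square_error_cost(input=prediction, label=label)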
 
-    :param name: layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Network prediction.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: Data label.
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -3851,7 +4599,7 @@ def mse_cost(input, label, weight=None, name=None, coeff=1.0, layer_attr=None):
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
-regression_cost = mse_cost
+regression_cost = square_error_cost
 
 
 @wrap_name_default("cost")
@@ -3861,22 +4609,28 @@ def classification_cost(input,
                         weight=None,
                         name=None,
                         evaluator=classification_error_evaluator,
-                        layer_attr=None):
+                        layer_attr=None,
+                        coeff=1.):
     """
     classification cost Layer.
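+
+    A minimal usage sketch (assuming `prediction` and `label` are previously
+    defined layers):
+
+    ..  code-block:: python
+
+        cost = classification_cost(input=prediction, label=label)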
 
-    :param name: layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layer name. network output.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: label layer name. data_layer often.
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param evaluator: Evaluator method.
-    :param layer_attr: layer's extra attribute.
+    :param evaluator: Evaluator method. classification_error_evaluator is the default.
+    :type evaluator: Evaluator method
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3890,6 +4644,7 @@ def classification_cost(input,
         name=name,
         type="multi-class-cross-entropy",
         inputs=ipts,
+        coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
 
     def __add_evaluator__(e):
@@ -3927,7 +4682,7 @@ def conv_operator(img,
     Different from img_conv_layer, conv_op is an Operator, which can be used
     in mixed_layer. And conv_op takes two inputs to perform convolution.
     The first input is the image and the second is filter kernel. It only
-    support GPU mode.
+    supports GPU mode.
 
     The example usage is:
 
@@ -3939,27 +4694,31 @@ def conv_operator(img,
                           num_filters=64,
                           num_channels=64)
 
-    :param img: input image
+    :param img: The input image.
     :type img: LayerOutput
-    :param filter: input filter
+    :param filter: The input filter.
     :type filter: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
+    :param filter_size: The dimension of the filter kernel on the x axis.
     :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                        PaddlePaddle now supports rectangular filters,
-                        the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size_y: The dimension of the filter kernel on the y axis.
+                          If the parameter is not set or set to None, it will
+                          be set to 'filter_size' automatically.
     :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of the output channels.
     :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of the input channels. If the parameter is not set
+                         or set to None, it will be automatically set to the channel
+                         number of the 'img'.
     :type num_channels: int
-    :param stride: The x dimension of the stride.
+    :param stride: The stride on the x axis.
     :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride_y: The stride on the y axis. If the parameter is not set or
+                     set to None, it will be set to 'stride' automatically.
     :type stride_y: int
-    :param padding: The x dimension of padding.
+    :param padding: The padding size on the x axis.
     :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding_y: The padding size on the y axis. If the parameter is not set
+                      or set to None, it will be set to 'padding' automatically.
     :type padding_y: int
     :return: A ConvOperator Object.
     :rtype: ConvOperator
@@ -3975,8 +4734,7 @@ def conv_operator(img,
         num_channels = img.num_filters
 
     assert isinstance(filter, LayerOutput)
-    if filter.size is not None:
-        filter.size = filter_size * filter_size_y * num_filters * num_channels
+    assert filter.size is not None
 
     opCls = ConvTransOperator if trans else ConvOperator
 
@@ -4011,9 +4769,9 @@ def conv_projection(input,
                     param_attr=None,
                     trans=False):
     """
-    Different from img_conv_layer and conv_op, conv_projection is an Projection,
-    which can be used in mixed_layer and conat_layer. It use cudnn to implement
-    conv and only support GPU mode.
+    Different from img_conv_layer and conv_op, conv_projection is a Projection,
+    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
+    convolution and only supports GPU mode.
 
     The example usage is:
 
@@ -4024,34 +4782,47 @@ def conv_projection(input,
                               num_filters=64,
                               num_channels=64)
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param filter_size: The x dimension of a filter kernel.
-    :type filter_size: int
-    :param filter_size_y: The y dimension of a filter kernel. Since
-                          PaddlePaddle now supports rectangular filters,
-                          the filter's shape can be (filter_size, filter_size_y).
+    :param filter_size: The dimensions of the filter kernel. If the parameter is
+                        set to one integer, the two dimensions on the x and y
+                        axes will be the same when filter_size_y is not set. If
+                        it is set to a list, the first element indicates the
+                        dimension on the x axis, and the second is used to
+                        specify the dimension on the y axis when filter_size_y
+                        is not provided. For example, filter_size=(3, 5) means
+                        3 on the x axis and 5 on the y axis.
+    :type filter_size: int | tuple | list
+    :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
+                          is not set, it will be set automatically according to filter_size.
     :type filter_size_y: int
-    :param num_filters: channel of output data.
+    :param num_filters: The number of filters.
     :type num_filters: int
-    :param num_channels: channel of input data.
+    :param num_channels: The number of input channels.
     :type num_channels: int
-    :param stride: The x dimension of the stride.
-    :type stride: int
-    :param stride_y: The y dimension of the stride.
+    :param stride: The strides. If the parameter is set to one integer, the
+                   strides on the x and y axes will be the same when stride_y
+                   is not set. If it is set to a list, the first element
+                   indicates the stride on the x axis, and the second is used
+                   to specify the stride on the y axis when stride_y is not
+                   provided.
+    :type stride: int | tuple | list
+    :param stride_y: The stride on the y axis.
     :type stride_y: int
-    :param padding: The x dimension of padding.
-    :type padding: int
-    :param padding_y: The y dimension of padding.
+    :param padding: The padding sizes. If the parameter is set to one integer,
+                    the padding sizes on the x and y axes will be the same when
+                    padding_y is not set. If it is set to a list, the first
+                    element indicates the padding size on the x axis, and the
+                    second is used to specify the padding size on the y axis
+                    when padding_y is not provided.
+    :type padding: int | tuple | list
+    :param padding_y: The padding size on the y axis.
     :type padding_y: int
     :param groups: The group number.
     :type groups: int
-    :param param_attr: Convolution param attribute. None means default attribute
+    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param trans: whether it is convTrans or conv
-    :type trans: boolean
-    :return: A DotMulProjection Object.
-    :rtype: DotMulProjection
+    :param trans: Whether it is ConvTransProjection or ConvProjection.
+    :type trans: bool
+    :return: A Projection Object.
+    :rtype: ConvTransProjection | ConvProjection
     """
     if num_channels is None:
         assert input.num_filters is not None
@@ -4116,13 +4887,13 @@ def pad_layer(input,
               layer_attr=None):
     """
     This operation pads zeros to the input data according to pad_c, pad_h
-    and pad_w. pad_c, pad_h, pad_w specifies the which dimension and size
-    of padding. And the input data shape is NCHW.
+    and pad_w. pad_c, pad_h and pad_w specify the padding size in the
+    corresponding dimension. The input data shape is NCHW.
 
-    For example, pad_c=[2,3] means padding 2 zeros before the
-    input data and 3 zeros after the input data in channel dimension.
-    pad_h means padding zeros in height dimension. pad_w means padding zeros
-    in width dimension.
+    For example, pad_c=[2,3] means padding 2 zeros before the input data
+    and 3 zeros after the input data in the channel dimension. pad_h means
+    padding zeros in the height dimension. pad_w means padding zeros in the
+    width dimension.
 
     For example,
 
@@ -4157,17 +4928,18 @@ def pad_layer(input,
                        pad_h=[0,0],
                        pad_w=[2,2])
 
-    :param input: layer's input.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param pad_c: padding size in channel dimension.
-    :type pad_c: list|None
-    :param pad_h: padding size in height dimension.
-    :type pad_h: list|None
-    :param pad_w: padding size in width dimension.
-    :type pad_w: list|None
-    :param layer_attr: Extra Layer Attribute.
+    :param pad_c: The padding size in the channel dimension.
+    :type pad_c: list | None
+    :param pad_h: The padding size in the height dimension.
+    :type pad_h: list | None
+    :param pad_w: The padding size in the width dimension.
+    :type pad_w: list | None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :param name: layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4214,7 +4986,7 @@ def pad_layer(input,
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
     """
-    This layer performs cyclic convolution for two input. For example:
+    This layer performs cyclic convolution on two inputs. For example:
       - a[in]: contains M elements.
       - b[in]: contains N elements (N should be odd).
       - c[out]: contains M elements.
@@ -4223,7 +4995,7 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
         c[i] = \sum_{j=-(N-1)/2}^{(N-1)/2}a_{i+j} * b_{j}
 
-    In this formular:
+    In this formula:
      - a's index is computed modulo M. When it is negative, then get item from
        the right side (which is the end of array) to the left.
      - b's index is computed modulo N. When it is negative, then get item from
@@ -4235,13 +5007,14 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
        conv_shift = conv_shift_layer(a=layer1, b=layer2)
 
-    :param name: layer name
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param layer_attr: layer's extra attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4272,8 +5045,8 @@ def tensor_layer(a,
                  bias_attr=None,
                  layer_attr=None):
     """
-    This layer performs tensor operation for two input.
-    For example, each sample:
+    This layer performs a tensor operation on two inputs.
+    For example:
 
     .. math::
        y_{i} = a * W_{i} * {b^\mathrm{T}}, i=0,1,...,K-1
@@ -4291,24 +5064,27 @@ def tensor_layer(a,
 
        tensor = tensor_layer(a=layer1, b=layer2, size=1000)
 
-    :param name: layer name
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param a: Input layer a.
+    :param a: The first input of this layer.
     :type a: LayerOutput
-    :param b: input layer b.
+    :param b: The second input of this layer.
     :type b: LayerOutput
-    :param size: the layer dimension.
-    :type size: int.
-    :param act: Activation Type. Default is tanh.
+    :param size: The dimension of this layer.
+    :type size: int
+    :param act: Activation type. LinearActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute|None|Any
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4329,7 +5105,7 @@ def tensor_layer(a,
 @wrap_param_attr_default()
 @wrap_bias_attr_default()
 @wrap_act_default()
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def selective_fc_layer(input,
                        size,
                        select=None,
@@ -4343,7 +5119,7 @@ def selective_fc_layer(input,
                        layer_attr=None):
     """
     Selective fully connected layer. Different from fc_layer, the output
-    of this layer maybe sparse. It requires an additional input to indicate
+    of this layer can be sparse. It requires an additional input to indicate
     several selected columns for output. If the selected columns are not
     specified, selective_fc_layer acts exactly like fc_layer.
 
@@ -4353,26 +5129,39 @@ def selective_fc_layer(input,
 
        sel_fc = selective_fc_layer(input=input, size=128, act=TanhActivation())
 
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
-    :type input: LayerOutput|list|tuple
-    :param select: The select layer. The output of select layer should be a
-                   sparse binary matrix, and treat as the mask of selective fc.
-                   If is None, acts exactly like fc_layer.
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
+    :param select: The layer to select columns to output. It should be a sparse
+                   binary matrix, and is treated as the mask of selective fc. If
+                   it is not set or set to None, selective_fc_layer acts exactly
+                   like fc_layer.
     :type select: LayerOutput
-    :param size: The layer dimension.
+    :param size: The dimension of this layer, which should be equal to that of
+                 the layer 'select'.
     :type size: int
-    :param act: Activation Type. Default is tanh.
+    :param act: Activation type. TanhActivation is the default activation.
     :type act: BaseActivation
-    :param param_attr: The Parameter Attribute.
+    :param pass_generation: The flag which indicates whether the layer is used
+                            during generation.
+    :type pass_generation: bool
+    :param has_selected_colums: The flag which indicates whether the parameter 'select'
+                                has been set. True is the default.
+    :type has_selected_colums: bool
+    :param mul_ratio: A ratio which is used to judge how sparse the output is,
+                      so as to determine the computation method for efficiency.
+    :type mul_ratio: float
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param bias_attr: The Bias Attribute. If no bias, then pass False or
-                      something not type of ParameterAttribute. None will get a
-                      default Bias.
-    :type bias_attr: ParameterAttribute|None|Any
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4384,6 +5173,13 @@ def selective_fc_layer(input,
         if isinstance(param_attr, collections.Sequence):
             assert len(input) == len(param_attr)
         else:
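+            # A single named param_attr cannot be shared implicitly by several
+            # inputs; an explicit list of param_attr is required in that case.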
+            if "parameter_name" in param_attr.attr and len(input) > 1:
+                logger.fatal(
+                    "When the name field of param_attr is manually specified "
+                    "and the input is a list, the param_attr should also be a "
+                    "list with each item being the param_attr for each input "
+                    "item. If only one named param_attr is provided, all the "
+                    "input items would share this parameter.")
             param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
 
     assert isinstance(input, collections.Sequence)
@@ -4415,7 +5211,7 @@ def selective_fc_layer(input,
 @layer_support()
 def sampling_id_layer(input, name=None, layer_attr=None):
     """
-    A layer for sampling id from multinomial distribution from the input layer.
+    A layer for sampling an id from the multinomial distribution of the input layer.
     Sampling one id for one sample.
 
     The simple usage is:
@@ -4424,12 +5220,13 @@ def sampling_id_layer(input, name=None, layer_attr=None):
 
        samping_id = sampling_id_layer(input=input)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4450,8 +5247,7 @@ def slope_intercept_layer(input,
                           intercept=0.0,
                           layer_attr=None):
     """
-    This layer for applying a slope and an intercept to the input
-    element-wise. There is no activation and weight.
+    This layer applies a slope and an intercept to the input.
 
     ..  math::
         y = slope * x + intercept
@@ -4462,16 +5258,17 @@ def slope_intercept_layer(input,
 
        scale = slope_intercept_layer(input=input, slope=-1.0, intercept=1.0)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param slope: the scale factor.
-    :type slope: float.
-    :param intercept: the offset.
-    :type intercept: float.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param slope: The scale factor.
+    :type slope: float
+    :param intercept: The offset.
+    :type intercept: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4526,12 +5323,13 @@ def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
     :type weights: LayerOutput
     :param vectors: The vector layer.
     :type vectors: LayerOutput
-    :param size: the dimension of this layer.
+    :param size: The dimension of this layer.
     :type size: int
-    :param name: The Layer Name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4578,11 +5376,11 @@ def block_expand_layer(input,
 
        outputW = 1 + (2 * padding_x + imgSizeW - block_x + stride_x - 1) / stride_x
 
-    The expand method is the same with ExpandConvLayer, but saved the transposed
+    The expanding method is the same as that of ExpandConvLayer, but it saves the transposed
     value. After expanding, output.sequenceStartPositions will store the timeline.
-    The number of time steps are outputH * outputW and the dimension of each
+    The number of time steps is outputH * outputW and the dimension of each
     time step is block_y * block_x * num_channels. This layer can be used after
-    convolution neural network, and before recurrent neural network.
+    a convolutional neural network, and before a recurrent neural network.
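+
+    For example, with an input of shape (num_channels, imgSizeH, imgSizeW) =
+    (1, 4, 4), block_y = block_x = 2, stride_y = stride_x = 2 and no padding,
+    outputH = outputW = 1 + (4 - 2 + 2 - 1) / 2 = 2, so there are 4 time steps
+    and the dimension of each time step is block_y * block_x * num_channels = 4.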
 
     The simple usage is:
 
@@ -4595,10 +5393,12 @@ def block_expand_layer(input,
                                          block_x=1,
                                          block_x=3)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: The channel number of input layer.
-    :type num_channels: int|None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, it will be automatically set to the
+                         channel number of the input.
+    :type num_channels: int
     :param block_x: The width of sub block.
     :type block_x: int
     :param block_y: The height of sub block.
@@ -4611,10 +5411,11 @@ def block_expand_layer(input,
     :type padding_x: int
     :param padding_y: The padding size in vertical direction.
     :type padding_y: int
-    :param name: The name of this layer, which can not specify.
-    :type name: None|basestring.
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4644,18 +5445,37 @@ def block_expand_layer(input,
 @layer_support()
 def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     """
-    A layer to do max out on conv layer output.
-      - Input: output of a conv layer.
-      - Output: feature map size same as input. Channel is (input channel) / groups.
+    A layer to do max out on the output of a convolutional layer.
+      - Input: the output of a convolutional layer.
+      - Output: feature map size same as the input's, and its channel number is
+        (input channel) / groups.
 
     So groups should be larger than 1, and the num of channels should be able
-    to devided by groups.
+    to be divided by groups.
+
+    Reference:
+        `Maxout Networks
+        <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
+        `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+        <https://arxiv.org/pdf/1312.6082v4.pdf>`_
+
+
+    .. math::
+
+       & out = \max_k (in[n, k, o_c, s])
+
+       & out_{i * s + j} = \max_k in_{k * o_{c} * s + i * s + j}
+
+       & s = \\frac{input.size}{ num\_channels}
+
+       & o_{c} = \\frac{num\_channels}{groups}
+
+       & 0 \le i < o_{c}
+
+       & 0 \le j < s
+
+       & 0 \le k < groups
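+
+    A rough NumPy sketch of the computation above (an illustration only, with
+    x assumed to be a (batch, num_channels, h, w) NumPy array):
+
+    .. code-block:: python
+
+       # group the channels, then take the element-wise max over each group
+       o_c = num_channels // groups
+       out = x.reshape(batch, groups, o_c, h, w).max(axis=1)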
 
-    Please refer to Paper:
-      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
-      - Multi-digit Number Recognition from Street View \
-        Imagery using Deep Convolutional Neural Networks: \
-        https://arxiv.org/pdf/1312.6082v4.pdf
 
     The simple usage is:
 
@@ -4665,21 +5485,22 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
                              num_channels=128,
                              groups=4)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param num_channels: The channel number of input layer. If None will be set
-                     automatically from previous output.
-    :type num_channels: int|None
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, it will be automatically set to the
+                         channel number of the input.
+    :type num_channels: int
     :param groups: The group number of input layer.
     :type groups: int
-    :param name: The name of this layer, which can not specify.
-    :type name: None|basestring.
-    :param layer_attr: Extra Layer attribute.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    assert input.layer_type == LayerType.CONV_LAYER
     assert isinstance(input.activation, LinearActivation)
     assert groups > 1
     if num_channels is None:
@@ -4707,22 +5528,22 @@ def ctc_layer(input,
               layer_attr=None):
     """
     Connectionist Temporal Classification (CTC) is designed for temporal
-    classication task. That is, for sequence labeling problems where the
+    classification tasks, e.g. sequence labeling problems where the
     alignment between the inputs and the target labels is unknown.
 
-    More details can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_
+    Reference:
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
-        Considering the 'blank' label needed by CTC, you need to use
-        (num_classes + 1) as the input size. num_classes is the category number.
-        And the 'blank' is the last category index. So the size of 'input' layer, such as
-        fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
-        should also be num_classes + 1.
+        Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
+        as the size of the input, where num_classes is the category number.
+        And the 'blank' is the last category index. So the size of 'input' layer (e.g.
+        fc_layer with softmax activation) should be (num_classes + 1). The size of
+        ctc_layer should also be (num_classes + 1).
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4731,18 +5552,19 @@ def ctc_layer(input,
                       size=9055,
                       norm_by_times=True)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
     :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
     :type size: int
-    :param name: The name of this layer
-    :type name: basestring|None
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param norm_by_times: Whether to do normalization by times. False is the default.
     :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4783,33 +5605,21 @@ def warp_ctc_layer(input,
     building process, PaddlePaddle will clone the source codes, build and
     install it to :code:`third_party/install/warpctc` directory.
 
-    To use warp_ctc layer, you need to specify the path of :code:`libwarpctc.so`,
-    using following methods:
-
-    1. Set it in :code:`paddle.init` (python api) or :code:`paddle_init` (c api),
-    such as :code:`paddle.init(use_gpu=True,
-    warpctc_dir=your_paddle_source_dir/third_party/install/warpctc/lib)`.
-
-    2. Set environment variable LD_LIBRARY_PATH on Linux or DYLD_LIBRARY_PATH
-    on Mac OS. For instance, :code:`export
-    LD_LIBRARY_PATH=your_paddle_source_dir/third_party/install/warpctc/lib:$LD_LIBRARY_PATH`.
-
-    More details of CTC can be found by referring to `Connectionist Temporal
-    Classification: Labelling Unsegmented Sequence Data with Recurrent
-    Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
-    icml2006_GravesFGS06.pdf>`_.
+    Reference:
+        `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+        with Recurrent Neural Networks
+        <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
 
     Note:
-        - Let num_classes represent the category number. Considering the 'blank'
-          label needed by CTC, you need to use (num_classes + 1) as the input size.
-          Thus, the size of both warp_ctc layer and 'input' layer should be set to
-          num_classes + 1.
+        - Let num_classes represent the category number. Considering the 'blank'
+          label needed by CTC, you need to use (num_classes + 1) as the size of
+          warp_ctc layer.
         - You can set 'blank' to any value in the range [0, num_classes], which
-          should be consistent as that used in your labels.
+          should be consistent with those used in your labels.
         - As a native 'softmax' activation is integrated into the warp-ctc library,
-          'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected to be used instead in the 'input' layer.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4819,20 +5629,21 @@ def warp_ctc_layer(input,
                            blank=1000,
                            norm_by_times=False)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param label: The data layer of label with variable length.
+    :param label: The input label.
     :type label: LayerOutput
-    :param size: category numbers + 1.
+    :param size: The dimension of this layer, which must be equal to (category number + 1).
     :type size: int
-    :param name: The name of this layer, which can not specify.
-    :type name: basestring|None
-    :param blank: the 'blank' label used in ctc
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param blank: The 'blank' label used in ctc.
     :type blank: int
-    :param norm_by_times: Whether to normalization by times. False by default.
+    :param norm_by_times: Whether to do normalization by times. False is the default.
     :type norm_by_times: bool
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4870,7 +5681,7 @@ def crf_layer(input,
     A layer for calculating the cost of sequential conditional random
     field model.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4878,23 +5689,26 @@ def crf_layer(input,
                       label=label,
                       size=label_dim)
 
-    :param input: The first input layer is the feature.
+    :param input: The first input layer.
     :type input: LayerOutput
-    :param label: The second input layer is label.
+    :param label: The input label.
     :type label: LayerOutput
     :param size: The category number.
     :type size: int
-    :param weight: The third layer is "weight" of each sample, which is an
-                  optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param param_attr: Parameter attribute. None means default attribute
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4940,11 +5754,11 @@ def crf_decoding_layer(input,
     """
     A layer for calculating the decoding sequence of sequential conditional
     random field model. The decoding sequence is stored in output.ids.
-    If a second input is provided, it is treated as the ground-truth label, and
-    this layer will also calculate error. output.value[i] is 1 for incorrect
-    decoding or 0 for correct decoding.
+    If the input 'label' is provided, it is treated as the ground-truth label, and
+    this layer will also calculate the error. output.value[i] is 1 for an
+    incorrect decoding and 0 for a correct one.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4953,16 +5767,18 @@ def crf_decoding_layer(input,
 
     :param input: The first input layer.
     :type input: LayerOutput
-    :param size: size of this layer.
+    :param size: The dimension of this layer.
     :type size: int
-    :param label: None or ground-truth label.
-    :type label: LayerOutput or None
-    :param param_attr: Parameter attribute. None means default attribute
+    :param label: The input label.
+    :type label: LayerOutput | None
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4989,7 +5805,11 @@ def crf_decoding_layer(input,
     return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
 
 
-@wrap_act_default(act=SigmoidActivation())
+"""
+Following are cost Layers.
+"""
+
+
 @wrap_bias_attr_default(has_bias=True)
 @wrap_param_attr_default()
 @wrap_name_default()
@@ -4997,7 +5817,6 @@ def crf_decoding_layer(input,
 def nce_layer(input,
               label,
               num_classes=None,
-              act=None,
               param_attr=None,
               weight=None,
               num_neg_samples=10,
@@ -5007,8 +5826,10 @@ def nce_layer(input,
               layer_attr=None):
     """
     Noise-contrastive estimation.
-    Implements the method in the following paper:
-    A fast and simple algorithm for training neural probabilistic language models.
+
+    Reference:
+        `A fast and simple algorithm for training neural probabilistic language
+        models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
 
     The example usage is:
 
@@ -5018,31 +5839,42 @@ def nce_layer(input,
                         param_attr=[attr1, attr2], weight=layer3,
                         num_classes=3, neg_distribution=[0.1,0.3,0.6])
 
-    :param name: layer name
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layers. It could be a LayerOutput of list/tuple of LayerOutput.
-    :type input: LayerOutput|list|tuple|collections.Sequence
-    :param label: label layer
+    :param input: The first input of this layer.
+    :type input: LayerOutput | list | tuple | collections.Sequence
+    :param label: The input label.
     :type label: LayerOutput
-    :param weight: weight layer, can be None(default)
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param num_classes: number of classes.
+    :param num_classes: The number of classes.
     :type num_classes: int
-    :param act: Activation, default is Sigmoid.
-    :type act: BaseActivation
-    :param param_attr: The Parameter Attribute|list.
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
     :type param_attr: ParameterAttribute
-    :param num_neg_samples: number of negative samples. Default is 10.
+    :param num_neg_samples: The number of sampled negative labels. 10 is the
+                            default value.
     :type num_neg_samples: int
-    :param neg_distribution: The distribution for generating the random negative labels.
-                             A uniform distribution will be used if not provided.
-                             If not None, its length must be equal to num_classes.
-    :type neg_distribution: list|tuple|collections.Sequence|None
-    :param bias_attr: Bias parameter attribute. True if no bias.
-    :type bias_attr: ParameterAttribute|None|False
-    :param layer_attr: Extra Layer Attribute.
+    :param neg_distribution: The discrete noise distribution over the output
+                             space from which num_neg_samples negative labels
+                             are sampled. If this parameter is not set, a
+                             uniform distribution will be used. A user-defined
+                             distribution is a list whose length must be equal
+                             to num_classes. Each member of the list defines
+                             the probability of a class given input x.
+    :type neg_distribution: list | tuple | collections.Sequence | None
+    :param bias_attr: The parameter attribute for bias. If this parameter is set to
+                      False or an object whose type is not ParameterAttribute,
+                      no bias is defined. If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
-    :return: layer name.
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     if isinstance(input, LayerOutput):
@@ -5065,8 +5897,6 @@ def nce_layer(input,
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
         assert abs(sum(neg_distribution) - 1.0) < 1e-5
-    if not isinstance(act, BaseActivation):
-        raise TypeError()
 
     ipts_for_layer = []
     parents = []
@@ -5088,7 +5918,7 @@ def nce_layer(input,
         type=LayerType.NCE_LAYER,
         num_classes=num_classes,
         neg_sampling_dist=neg_distribution,
-        active_type=act.name,
+        active_type=SigmoidActivation().name,
         num_neg_samples=num_neg_samples,
         inputs=ipts_for_layer,
         bias=ParamAttr.to_bias(bias_attr),
@@ -5098,12 +5928,7 @@ def nce_layer(input,
         LayerType.NCE_LAYER,
         parents=parents,
         size=l.config.size,
-        activation=act)
-
-
-"""
-following are cost Layers.
-"""
+        activation=SigmoidActivation())
 
 
 @wrap_name_default()
@@ -5116,11 +5941,11 @@ def rank_cost(left,
               coeff=1.0,
               layer_attr=None):
     """
-    A cost Layer for learning to rank using gradient descent. Details can refer
-    to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/
-    ICML_ranking.pdf>`_.
-    This layer contains at least three inputs. The weight is an optional
-    argument, which affects the cost.
+    A cost Layer for learning to rank using gradient descent.
+
+    Reference:
+        `Learning to Rank using Gradient Descent
+        <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
 
     .. math::
 
@@ -5137,7 +5962,7 @@ def rank_cost(left,
       - :math:`o_i` and :math:`o_j`: the left output and right output.
         Their dimension is one.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5151,14 +5976,16 @@ def rank_cost(left,
     :type right: LayerOutput
     :param label: Label is 1 or 0, means positive order and reverse order.
     :type label: LayerOutput
-    :param weight: The weight affects the cost, namely the scale of cost.
-                   It is an optional argument.
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5194,7 +6021,7 @@ def lambda_cost(input,
     """
     lambdaCost for lambdaRank LTR approach.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5203,25 +6030,25 @@ def lambda_cost(input,
                          NDCG_num=8,
                          max_sort_size=-1)
 
-    :param input: Samples of the same query should be loaded as sequence.
+    :param input: The first input of this layer, which is often a list of
+                  document samples of the same query and must be a sequence.
     :type input: LayerOutput
-    :param score: The 2nd input. Score of each sample.
+    :param score: The scores of the samples.
     :type score: LayerOutput
     :param NDCG_num: The size of NDCG (Normalized Discounted Cumulative Gain),
-                     e.g., 5 for NDCG@5. It must be less than for equal to the
-                     minimum size of lists.
+                     e.g., 5 for NDCG@5. It must be less than or equal to the
+                     minimum size of the list.
     :type NDCG_num: int
-    :param max_sort_size: The size of partial sorting in calculating gradient.
-                          If max_sort_size = -1, then for each list, the
-                          algorithm will sort the entire list to get gradient.
-                          In other cases, max_sort_size must be greater than or
-                          equal to NDCG_num. And if max_sort_size is greater
-                          than the size of a list, the algorithm will sort the
-                          entire list of get gradient.
+    :param max_sort_size: The size of partial sorting in calculating gradient. If
+                          max_sort_size is equal to -1 or greater than the number
+                          of the samples in the list, then the algorithm will sort
+                          the entire list to compute the gradient. In other cases,
+                          max_sort_size must be greater than or equal to NDCG_num.
     :type max_sort_size: int
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param layer_attr: Extra Layer Attribute.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5252,6 +6079,8 @@ def cross_entropy(input,
     """
     A loss layer for multi class entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy(input=input_layer,
@@ -5260,20 +6089,20 @@ def cross_entropy(input,
     :param input: The first input layer.
     :type input: LayerOutput.
     :param label: The input label.
-    :type input: LayerOutput.
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring.
-    :param coeff: The cost is multiplied with coeff.
-                  The coefficient affects the gradient in the backward.
-    :type coeff: float.
-    :param weight: The cost of each sample is multiplied with each weight.
-                   The weight should be a layer with size=1. Note that gradient
-                   will not be calculated for weight.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param weight: The weight layer defines a weight for each sample in the
+                   mini-batch. It is optional.
     :type weight: LayerOutput
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype: LayerOutput.
+    :rtype: LayerOutput
     """
 
     ipts, parents = __cost_input__(input, label, weight)
@@ -5298,25 +6127,29 @@ def cross_entropy_with_selfnorm(input,
     A loss layer for multi class entropy with selfnorm.
     Input should be a vector of positive numbers, without normalization.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy_with_selfnorm(input=input_layer,
                                           label=label_layer)
 
     :param input: The first input layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param label: The input label.
-    :type input: LayerOutput.
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
-    :type coeff: float.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
     :param softmax_selfnorm_alpha: The scale factor that affects the cost.
-    :type softmax_selfnorm_alpha: float.
-    :param layer_attr: Extra Layer Attribute.
+    :type softmax_selfnorm_alpha: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
-    :rtype: LayerOutput.
+    :rtype: LayerOutput
     """
     Layer(
         name=name,
@@ -5337,17 +6170,20 @@ def cross_entropy_with_selfnorm(input,
 @layer_support()
 def sum_cost(input, name=None, layer_attr=None):
     """
-    A loss layer which calculate the sum of the input as loss
+    A loss layer which calculates the sum of the input as loss.
+
+    The example usage is:
 
     .. code-block:: python
 
        cost = sum_cost(input=input_layer)
 
-    :param input: The first input layer.
-    :type input: LayerOutput.
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring.
-    :param layer_attr: Extra Layer Attribute.
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput.
@@ -5364,38 +6200,110 @@ def sum_cost(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
+def huber_regression_cost(input,
+                          label,
+                          name=None,
+                          delta=1.0,
+                          coeff=1.0,
+                          layer_attr=None):
     """
-    A loss layer for huber loss.
+    In statistics, the Huber loss is a loss function used in robust regression
+    that is less sensitive to outliers in data than the squared error loss.
+    Given a prediction f(x), a label y and :math:`\delta`, the loss function
+    is defined as:
+
+    .. math::
+
+       loss = 0.5*(y-f(x))^{2}, \quad |y-f(x)| < \delta
+
+       loss = \delta*|y-f(x)| - 0.5*\delta^{2}, \quad otherwise
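+
+    A rough NumPy sketch of this loss (an illustration only, assuming numpy is
+    imported as np and y, f are arrays of the same shape):
+
+    .. code-block:: python
+
+       # quadratic near zero, linear beyond the delta threshold
+       d = np.abs(y - f)
+       loss = np.where(d < delta, 0.5 * d**2, delta * d - 0.5 * delta**2)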
+
+    The example usage is:
 
     .. code-block:: python
 
-       cost = huber_cost(input=input_layer,
-                         label=label_layer)
+       cost = huber_regression_cost(input=input_layer, label=label_layer)
 
     :param input: The first input layer.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param label: The input label.
-    :type input: LayerOutput.
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
-    :type coeff: float.
-    :param layer_attr: Extra Layer Attribute.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param delta: The threshold on the difference between the observed and
+                  predicted values at which the loss switches from quadratic
+                  to linear. 1.0 is the default value.
+    :type delta: float
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput.
     """
     assert isinstance(input, LayerOutput)
+    Layer(
+        name=name,
+        type=LayerType.HUBER_REGRESSION,
+        inputs=[input.name, label.name],
+        delta=delta,
+        coeff=coeff,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.HUBER_REGRESSION, parents=[input, label], size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def huber_classification_cost(input,
+                              label,
+                              name=None,
+                              coeff=1.0,
+                              layer_attr=None):
+    """
+    For classification purposes, a variant of the Huber loss called modified Huber
+    is sometimes used. Given a prediction f(x) (a real-valued classifier score) and
+    a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber
+    loss is defined as:
+
+    .. math::
+
+       loss = \max(0, 1-yf(x))^2, \quad yf(x) \geq -1
+
+       loss = -4yf(x), \quad otherwise
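+
+    A rough NumPy sketch of this loss (an illustration only, assuming numpy is
+    imported as np, f holds classifier scores and y holds labels in {-1, 1}):
+
+    .. code-block:: python
+
+       # squared hinge loss above the -1 margin, linear penalty below it
+       z = y * f
+       loss = np.where(z >= -1, np.maximum(0, 1 - z)**2, -4 * z)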
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = huber_classification_cost(input=input_layer, label=label_layer)
+
+    :param input: The first input layer.
+    :type input: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
+    :type coeff: float
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
     if input.size is not None:
         assert input.size == 1
     Layer(
         name=name,
-        type=LayerType.HUBER,
+        type=LayerType.HUBER_CLASSIFICATION,
         inputs=[input.name, label.name],
         coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(name, LayerType.HUBER, parents=[input, label], size=1)
+    return LayerOutput(
+        name, LayerType.HUBER_CLASSIFICATION, parents=[input, label], size=1)
 
 
 @wrap_name_default()
@@ -5408,6 +6316,8 @@ def multi_binary_label_cross_entropy(input,
     """
     A loss layer for multi binary label cross entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = multi_binary_label_cross_entropy(input=input_layer,
@@ -5417,11 +6327,13 @@ def multi_binary_label_cross_entropy(input,
     :type input: LayerOutput
     :param label: The input label.
     :type label: LayerOutput
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5429,10 +6341,10 @@ def multi_binary_label_cross_entropy(input,
 
     if input.activation is None or \
             not isinstance(input.activation, SigmoidActivation):
-        logger.log(
-            logging.WARN,
-            "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-            "maybe the sigmoid is better" % repr(input.activation))
+        logger.log(logging.WARN,
+                   ("%s is not a recommended activation for "
+                    "multi_binary_label_cross_entropy, sigmoid is better") %
+                   repr(input.activation))
 
     Layer(
         name=name,
@@ -5447,12 +6359,119 @@ def multi_binary_label_cross_entropy(input,
         size=1)
 
 
+class BeamInput(object):
+    """
+    Define the input for cross_entropy_over_beam layer.
+
+    A beam is made up of a triple: the first one is the scores over all
+    candidates; the second one is the indices of the top k selected
+    candidates; the third one is the index of the ground truth, which is
+    also called the gold.
+    """
+
+    def __init__(self, candidate_scores, selected_candidates, gold):
+        assert isinstance(candidate_scores, LayerOutput)
+        self.candidate_scores = candidate_scores
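+        # Each candidate carries one scalar score, hence the size check below.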
+        assert candidate_scores.size == 1
+
+        assert isinstance(selected_candidates, LayerOutput)
+        self.selected_candidates = selected_candidates
+
+        assert isinstance(gold, LayerOutput)
+        self.gold = gold
+
+
+@wrap_name_default()
+@layer_support()
+def cross_entropy_over_beam(input, name=None):
+    """
+    This layer is used in learning to search models, which is to solve complex
+    joint prediction problems based on learning to search through a
+    problem-defined search space.
+
+    Specifically, the learning to search process for this layer begins with
+    searching a target sequence from a nested sequence. In the first search
+    step, the top beam-size sequences with the highest scores, the indices of
+    these top k sequences in the original nested sequence, and the ground truth
+    (also called the gold) together form a triple that makes up the first beam.
+
+    Then, several special positions, for example, the start and end positions
+    that define meaningful segments, are searched. In these searches, the top k
+    positions with the highest scores are selected, and then the sub-sequences,
+    starting from the selected starts and running to the ends of the sequences
+    (or to a fixed position), are taken as the inputs of the next search.
+
+    We call the possible top k results returned in one search the beam. This
+    search process can be repeated for pre-defined turns and leads to several
+    beam expansions.
+
+    Finally, the layer cross_entropy_over_beam takes all the beam expansions
+    which contain several candidate targets found along the multi-step search.
+    cross_entropy_over_beam calculates cross entropy over the expanded beams,
+    with all the candidates in the beam serving as the normalization factor.
+
+    Note that, if gold falls off the beam at search step t, then the cost is
+    calculated over the beam at step t.
+
+    This cost layer always works together with kmax_seq_score_layer,
+    sub_nested_seq_layer, and sequence_slice_layer to trim the input to form a
+    sub-search space.
+
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = cross_entropy_over_beam(input=[
+           BeamInput(
+               candidate_scores=beam1_candidates,
+               selected_candidates=beam1_topk,
+               gold=gold1),
+           BeamInput(
+               candidate_scores=beam2_candidates,
+               selected_candidates=beam2_topk,
+               gold=gold2),
+       ])
+
+
+    :param input: Input beams for this layer.
+    :type input: BeamInput | list of BeamInput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    if isinstance(input, BeamInput):
+        input = [input]
+    else:
+        assert isinstance(input, list), (
+            'input for cross_entropy_over_beam should be a python list '
+            'of BeamInput object.')
+        for ipt in input:
+            assert isinstance(ipt, BeamInput), (
+                'input for cross_entropy_over_beam '
+                'should be a BeamInput object.')
+
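+    # Flatten each beam triple (scores, selected candidates, gold) into the inputs.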
+    ipts = []
+    parents = []
+    for beam in input:
+        parents += [beam.candidate_scores, beam.selected_candidates, beam.gold]
+        ipts += [
+            beam.candidate_scores.name, beam.selected_candidates.name,
+            beam.gold.name
+        ]
+
+    Layer(name=name, type=LayerType.CROSS_ENTROPY_OVER_BEAM, inputs=ipts)
+    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
+
+
 @wrap_name_default()
 @layer_support()
 def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     This is a L1 loss but more smooth. It requires that the
-    size of input and label are equal. The formula is as follows,
+    sizes of input and label are equal. The formula is as follows,
 
     .. math::
 
@@ -5464,8 +6483,11 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 
         smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases}
 
-    More details can be found by referring to `Fast R-CNN
-    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+    Reference:
+        `Fast R-CNN
+        <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+
+    The example usage is:
 
     .. code-block:: python
 
@@ -5476,11 +6498,13 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     :type input: LayerOutput
     :param label: The input label.
     :type input: LayerOutput
-    :param name: The name of this layers. It is not necessary.
-    :type name: None|basestring
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param coeff: The weight of the gradient in the back propagation.
+                  1.0 is the default value.
     :type coeff: float
-    :param layer_attr: Extra Layer Attribute.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5502,12 +6526,12 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
 @wrap_name_default()
 def multiplex_layer(input, name=None, layer_attr=None):
     """
-    This layer multiplex multiple layers according to the index,
-    which is provided by the first input layer.
-    inputs[0]: the index of the layer to output of size batchSize.
+    This layer multiplexes multiple layers according to the indices,
+    which are provided by the first input layer.
+    inputs[0]: the indices of the layers to form the output of size batchSize.
     inputs[1:N]; the candidate output data.
-    For each index i from 0 to batchSize -1, the output is the i-th row of the
-    (index[i] + 1)-th layer.
+    For each index i from 0 to batchSize - 1, the i-th row of the output is
+    the same as the i-th row of the (index[i] + 1)-th layer.
 
     For each i-th row of output:
     .. math::
@@ -5516,15 +6540,18 @@ def multiplex_layer(input, name=None, layer_attr=None):
     where, y is output. :math:`x_{k}` is the k-th input layer and
     :math:`k = x_{0}[i] + 1`.
 
+    The example usage is:
+
     .. code-block:: python
 
        maxid = multiplex_layer(input=layers)
 
     :param input: Input layers.
     :type input: list of LayerOutput
-    :param name: Layer name.
+    :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param layer_attr: extra layer attributes.
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -5548,3 +6575,957 @@ def multiplex_layer(input, name=None, layer_attr=None):
         layer_type=LayerType.MULTIPLEX_LAYER,
         parents=input,
         size=l.config.size)
+
+
+@wrap_name_default("dropout")
+def dropout_layer(input, dropout_rate, name=None):
+    """
+
+    The example usage is:
+
+    .. code-block:: python
+
+        dropout = dropout_layer(input=input_layer, dropout_rate=0.5)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param dropout_rate: The probability of dropout.
+    :type dropout_rate: float
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    return addto_layer(
+        name=name,
+        input=input,
+        act=LinearActivation(),
+        bias_attr=False,
+        layer_attr=ExtraAttr(drop_rate=dropout_rate))
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+
+    The row convolution is also called lookahead convolution. It was first
+    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+
+    A bidirectional RNN learns a representation for a sequence by performing
+    a forward and a backward pass through the entire sequence. However,
+    unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy
+    in an online and low-latency setting. The lookahead convolution
+    incorporates information from future subsequences in a computationally
+    efficient manner to improve unidirectional RNNs.
+
+    The connection of row convolution is different from the 1D sequence
+    convolution. Assume the future context length is k, that is to say, the
+    output at timestep t is computed from the input features from the t-th
+    timestep to the (t+k)-th timestep. Assume the hidden dimension of the
+    input activations is d; the activations r_t of the new layer at
+    timestep t are:
+
+    .. math::
+
+        r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \quad \\text{for} \quad  (1 \leq i \leq d)
+
+    Note:
+        The `context_len` is `k + 1`. That is to say, the lookahead step
+        number plus one equals context_len.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param context_len: The context length equals the lookahead step number
+                        plus one.
+    :type context_len: int
+    :param act: Activation Type. LinearActivation is the default activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
+
+
+@layer_support()
+@wrap_name_default()
+def prelu_layer(input,
+                name=None,
+                partial_sum=1,
+                channel_shared=None,
+                num_channels=None,
+                param_attr=None,
+                layer_attr=None):
+    """
+    The Parametric ReLU activation scales the negative part of the input
+    with a learnable weight.
+
+    Reference:
+        `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
+
+    .. math::
+
+       prelu(z_i) = \\begin{cases} z_i& \\text{if} \\ z_i > 0 \\\\ a_i * z_i& \\text{otherwise} \end{cases}
+
+    The example usage is:
+
+    .. code-block:: python
+
+       prelu = prelu_layer(input=layers, partial_sum=1)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param partial_sum: This parameter makes a group of inputs share the same weight.
+
+        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
+        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
+        - partial_sum = number of outputs, indicates all elements share the same weight.
+
+    :type partial_sum: int
+    :param channel_shared: Whether the parameters are shared across channels.
+
+        - channel_shared = True, we set the partial_sum to the number of outputs.
+        - channel_shared = False, we set the partial_sum to the number of elements in one channel.
+
+    :type channel_shared: bool
+    :param num_channels: The number of input channels.
+    :type num_channels: int
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
+
+    if not param_attr:
+        param_attr = ParamAttr(initial_mean=0.25, initial_std=0.0)
+    else:
+        assert isinstance(param_attr, ParameterAttribute)
+
+    if num_channels is None:
+        assert input.num_filters is not None, \
+                'the input channel cannot be detected, please specify the num_channels parameter'
+        num_channels = input.num_filters
+
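+    # When channel_shared is given, partial_sum is derived from the input's spatial geometry.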
+    if channel_shared is not None:
+        assert isinstance(channel_shared, bool)
+        assert (input.height != 0 and input.width != 0), \
+            'input height and width must be set'
+        if channel_shared:
+            partial_sum = input.height * input.width * num_channels
+        else:
+            partial_sum = input.height * input.width
+
+    l = Layer(
+        name=name,
+        type=LayerType.PRELU,
+        inputs=Input(input.name, **param_attr.attr),
+        partial_sum=partial_sum,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.PRELU,
+        parents=input,
+        num_filters=num_channels,
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support(ERROR_CLIPPING, DROPOUT)
+@wrap_act_default(act=LinearActivation())
+def gated_unit_layer(input,
+                     size,
+                     act=None,
+                     name=None,
+                     gate_attr=None,
+                     gate_param_attr=None,
+                     gate_bias_attr=True,
+                     inproj_attr=None,
+                     inproj_param_attr=None,
+                     inproj_bias_attr=True,
+                     layer_attr=None):
+    """
+    The gated unit layer implements a simple gating mechanism over the input.
+    The input :math:`X` is first projected into a new space :math:`X'`, and
+    it is also used to produce a gate weight :math:`\sigma`. Element-wise
+    product between :math:`X'` and :math:`\sigma` is finally returned.
+
+    Reference:
+        `Language Modeling with Gated Convolutional Networks
+        <https://arxiv.org/abs/1612.08083>`_
+
+    .. math::
+       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gated_unit = gated_unit_layer(size=128, input=input_layer)
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param size: The dimension of this layer's output.
+    :type size: int
+    :param act: Activation type of the projection. LinearActivation is the default
+                activation.
+    :type act: BaseActivation
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param gate_attr: The extra layer attribute of the gate. See ExtraLayerAttribute for
+                      details.
+    :type gate_attr: ExtraLayerAttribute | None
+    :param gate_param_attr: The parameter attribute of the gate. See ParameterAttribute
+                            for details.
+    :type gate_param_attr: ParameterAttribute
+    :param gate_bias_attr: The bias attribute of the gate. If this parameter is set to False or
+                           an object whose type is not ParameterAttribute, no bias is defined.
+                           If this parameter is set to True, the bias is initialized to zero.
+    :type gate_bias_attr: ParameterAttribute | bool | None | Any
+    :param inproj_attr: Extra layer attributes of the projection. See ExtraLayerAttribute for
+                        details.
+    :type inproj_attr: ExtraLayerAttribute | None
+    :param inproj_param_attr: The parameter attribute of the projection. See ParameterAttribute
+                              for details.
+    :type inproj_param_attr: ParameterAttribute
+    :param inproj_bias_attr: The bias attribute of the projection. If this parameter is set to False
+                             or an object whose type is not ParameterAttribute, no bias is defined.
+                             If this parameter is set to True, the bias is initialized to zero.
+    :type inproj_bias_attr: ParameterAttribute | bool | None | Any
+    :param layer_attr: Extra layer attribute of the product. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(
+        input, LayerOutput), 'The gated linear unit accepts only one input.'
+
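+    # Two parallel fully connected layers: one projects the input, the other computes the gate.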
+    input_proj = fc_layer(
+        input=input,
+        name="%s_input_proj" % name,
+        size=size,
+        act=act,
+        layer_attr=inproj_attr,
+        param_attr=inproj_param_attr,
+        bias_attr=inproj_bias_attr)
+
+    gate = fc_layer(
+        size=size,
+        name="%s_gate" % name,
+        act=SigmoidActivation(),
+        input=input,
+        layer_attr=gate_attr,
+        param_attr=gate_param_attr,
+        bias_attr=gate_bias_attr)
+    return mixed_layer(
+        name="%s_gated_act" % name,
+        input=dotmul_operator(input_proj, gate),
+        layer_attr=layer_attr)
+
+
+@layer_support()
+@wrap_name_default('switch_order')
+def switch_order_layer(input,
+                       name=None,
+                       reshape_axis=None,
+                       act=None,
+                       layer_attr=None):
+    """
+    This layer switches the dimension order of the image input.
+    From order "batchSize, channels, height, width"
+    to order "batchSize, height, width, channels".
+
+    The example usage is:
+
+    .. code-block:: python
+
+       reshape_axis = 3
+       switch = switch_order_layer(input=layer, name='switch', reshape_axis=reshape_axis)
+       # the layer reshapes with {'height': [0, 1, 2], 'width': [3]}
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param reshape_axis: Specify the axes that form 'height'. Its value should
+                         be positive and less than 4.
+    :type reshape_axis: int
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert reshape_axis is not None and 0 < reshape_axis < 4
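+    # Axes [0, reshape_axis) form 'height'; axes [reshape_axis, 4) form 'width'.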
+    height = [ele for ele in range(reshape_axis)]
+    width = [ele for ele in range(reshape_axis, 4)]
+    reshape = {'height': height, 'width': width}
+
+    l = Layer(
+        name=name,
+        inputs=input.name,
+        reshape=reshape,
+        type=LayerType.SWITCH_ORDER_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.SWITCH_ORDER_LAYER,
+        activation=act,
+        parents=input,
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
+    """
+    This layer crops images according to the offset and shape. Users can set
+    the crop shape through the argument 'shape' explicitly or by specifying a
+    reference input layer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
+
+    :param input: The input of this layer. If two inputs are given, the second
+                  one will be regarded as the reference. The input must be
+                  4-D and in NCHW order.
+    :type input: LayerOutput | Sequence
+    :param offset: The crop offset.
+    :type offset: Sequence
+    :param axis: The start axis to be cropped. For image input layer:
+        - 0: batch size
+        - 1: channels
+        - 2: height
+        - 3: width
+    :type axis: int
+    :param shape: The shape to be cropped to. Default is None.
+    :type shape: Sequence | None
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if isinstance(input, LayerOutput):
+        input = [input]
+    else:
+        assert isinstance(input, collections.Sequence)
+    l = Layer(
+        inputs=[x.name for x in input],
+        axis=axis,
+        offset=offset,
+        shape=shape,
+        name=name,
+        type=LayerType.CROP_LAYER,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.CROP_LAYER,
+        parents=input,
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def sub_nested_seq_layer(input, selected_indices, name=None):
+    """
+    The sub_nested_seq_layer accepts two inputs: the first one is a nested
+    sequence; the second one is a set of selected indices in the nested sequence.
+
+    Then sub_nested_seq_layer trims the first nested sequence input according
+    to the selected indices to form a new output. This layer is useful in
+    beam training.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
+
+
+    :param input: The input of this layer. It is a nested sequence.
+    :type input: LayerOutput
+    :param selected_indices: A set of sequence indices in the nested sequence.
+    :type selected_indices: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+    assert isinstance(selected_indices, LayerOutput), (
+        'The second input of '
+        'sub_nested_seq_layer must be a Paddle layer.')
+
+    l = Layer(
+        inputs=input.name,
+        selected_indices=selected_indices.name,
+        name=name,
+        type=LayerType.SUB_NESTED_SEQ)
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.SUB_NESTED_SEQ,
+        parents=input,
+        size=l.config.size)
+
+
+@wrap_name_default("clip")
+def clip_layer(input, min, max, name=None):
+    """
+    A layer for clipping the input values to the range [min, max].
+
+    .. math::
+
+        out[i] = \min (\max (in[i],p_{1} ),p_{2} )
+
+    The example usage is:
+
+    .. code-block:: python
+
+        clip = clip_layer(input=input_layer, min=-10, max=10)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param min: The lower threshold for clipping.
+    :type min: float
+    :param max: The upper threshold for clipping.
+    :type max: float
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.CLIP_LAYER,
+        inputs=[input.name],
+        min=min,
+        max=max)
+    return LayerOutput(
+        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+def seq_slice_layer(input, starts, ends, name=None):
+    """
+    seq_slice_layer will return one or several sub-sequences from the
+    input sequence layer given start and end indices.
+
+        - If only start indices are given, and end indices are set to None,
+          this layer slices the input sequence from the given start indices
+          to its end.
+        - If only end indices are given, and start indices are set to None,
+          this layer slices the input sequence from its beginning to the
+          given end indices.
+        - If start and end indices are both given, they should have the same
+          number of elements.
+
+    If the start or end indices contain more than one element, the input
+    sequence will be sliced multiple times.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        seq_slice = seq_slice_layer(input=input_seq,
+                                    starts=start_pos, ends=end_pos)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be a sequence.
+    :type input: LayerOutput
+    :param starts: The start indices to slice the input sequence.
+    :type starts: LayerOutput | None
+    :param ends: The end indices to slice the input sequence.
+    :type ends: LayerOutput | None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of seq_slice layer must be a PaddlePaddle layer.')
+
+    if starts is not None:
+        assert isinstance(starts, LayerOutput), (
+            'The start indices for seq_slice layer '
+            'must be a PaddlePaddle layer.')
+    if ends is not None:
+        assert isinstance(ends, LayerOutput), (
+            'The end indices for seq_slice layer must be a PaddlePaddle layer.')
+    assert starts is not None or ends is not None, (
+        'start and end indices '
+        'cannot be set to None at the same time, at least one of '
+        'them should be given.')
+    if starts is not None and ends is not None:
+        assert starts.size == ends.size, (
+            'If start and end indices are both given to seq_slice_layer, '
+            'they should have the same width.')
+
+    Layer(
+        name=name,
+        type=LayerType.SEQ_SLICE,
+        inputs=input.name,
+        starts=starts.name if starts is not None else None,
+        ends=ends.name if ends is not None else None)
+    return LayerOutput(
+        name, LayerType.SEQ_SLICE, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+@layer_support()
+def kmax_seq_score_layer(input, name=None, beam_size=1):
+    """
+    This layer accepts one input, which is the scores over a sequence or a
+    nested sequence, and returns the indices of the beam_size sequences with
+    the highest scores.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        kmax_indices = kmax_seq_score_layer(input=input_layer, beam_size=5)
+
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer. It stores scores over a sequence or
+                  a nested sequence and its size must be 1.
+    :type input: LayerOutput
+    :param beam_size: The indices of the sequences with top beam_size scores are returned.
+    :type beam_size: int
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), ("kmax_seq_score_layer "
+                                            "accepts only one input.")
+    assert input.size == 1, (
+        "input of kmax_seq_score_layer is a score "
+        "over a sequence or a nested sequence, so its width must be 1.")
+
+    Layer(
+        name=name,
+        type=LayerType.KMAX_SEQ_SCORE,
+        inputs=[input.name],
+        beam_size=beam_size)
+
+    return LayerOutput(
+        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
+
+
+@wrap_name_default("conv3d")
+@wrap_param_attr_default()
+@wrap_bias_attr_default()
+@wrap_act_default(act=ReluActivation())
+@layer_support(DROPOUT)
+def img_conv3d_layer(input,
+                     filter_size,
+                     num_filters,
+                     name=None,
+                     num_channels=None,
+                     act=None,
+                     groups=1,
+                     stride=1,
+                     padding=0,
+                     bias_attr=None,
+                     param_attr=None,
+                     shared_biases=True,
+                     layer_attr=None,
+                     trans=False,
+                     layer_type=None):
+    """
+    A layer for 3D convolution, which convolves the input over three spatial
+    dimensions (depth, height and width).
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv3d_layer(input=data, filter_size=1,
+                              num_channels=8,
+                              num_filters=16, stride=1,
+                              bias_attr=False,
+                              act=ReluActivation())
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param filter_size: The dimensions of the filter kernel along three axes.
+                        If the parameter is set to one integer, the three
+                        dimensions will be the same.
+    :type filter_size: int | tuple | list
+    :param num_filters: The number of filters. It is the same as the number of
+                        output image channels.
+    :type num_filters: int
+    :param act: Activation type. ReluActivation is the default activation.
+    :type act: BaseActivation
+    :param groups: The number of the filter groups.
+    :type groups: int
+    :param stride: The strides of the convolution along three axes. If the
+                   parameter is set to one integer, the three strides will be
+                   the same.
+    :type stride: int | tuple | list
+    :param padding: The padding sizes along three axes. If the parameter is set
+                    to one integer, they will be the same.
+    :type padding: int | tuple | list
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :param num_channels: The number of input channels. If the parameter is not set or
+                         set to None, its actual value will be automatically set to
+                         the channels number of the input.
+    :type num_channels: int
+    :param param_attr: The parameter attribute of the convolution. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param shared_biases: Whether biases will be shared between filters or not.
+    :type shared_biases: bool
+    :param layer_attr: The extra layer attributes. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute
+    :param trans: True if it is a convTransLayer, False if it is a convLayer
+    :type trans: bool
+    :param layer_type: Specify the layer type. If the parameter is set, it must be "deconv3d"
+                       when trans=True. If not set, it will be automatically set to "deconv3d"
+                       when trans=True and "conv3d" when trans=False.
+    :type layer_type: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+
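+    # filter_size, stride and padding accept either a scalar or a 3-element sequence.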
+    if isinstance(filter_size, collections.Sequence):
+        assert len(filter_size) == 3
+        filter_size, filter_size_y, filter_size_z = filter_size
+    else:
+        filter_size_y = filter_size
+        filter_size_z = filter_size
+
+    if isinstance(stride, collections.Sequence):
+        assert len(stride) == 3
+        stride, stride_y, stride_z = stride
+    else:
+        stride_y = stride
+        stride_z = stride
+
+    if isinstance(padding, collections.Sequence):
+        assert len(padding) == 3
+        padding, padding_y, padding_z = padding
+    else:
+        padding_y = padding
+        padding_z = padding
+
+    if param_attr.attr.get('initial_smart'):
+        # special initial for conv layers.
+        init_w = (2.0 / (filter_size**2 * num_channels))**0.5
+        param_attr.attr["initial_mean"] = 0.0
+        param_attr.attr["initial_std"] = init_w
+        param_attr.attr["initial_strategy"] = 0
+        param_attr.attr["initial_smart"] = False
+
+    if layer_type:
+        if trans:
+            assert layer_type in ["deconv3d"]
+        lt = layer_type
+    else:
+        lt = LayerType.DECONV3D_LAYER if trans else LayerType.CONV3D_LAYER
+
+    l = Layer(
+        name=name,
+        inputs=Input(
+            input.name,
+            conv=Conv3D(
+                filter_size=filter_size,
+                padding=padding,
+                stride=stride,
+                channels=num_channels,
+                groups=groups,
+                filter_size_y=filter_size_y,
+                padding_y=padding_y,
+                stride_y=stride_y,
+                filter_size_z=filter_size_z,
+                padding_z=padding_z,
+                stride_z=stride_z),
+            **param_attr.attr),
+        active_type=act.name,
+        num_filters=num_filters,
+        bias=ParamAttr.to_bias(bias_attr),
+        shared_biases=shared_biases,
+        type=lt,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        lt,
+        parents=[input],
+        activation=act,
+        num_filters=num_filters,
+        size=l.config.size)
+
+
+@wrap_name_default("scale_shift")
+@wrap_param_attr_default()
+@wrap_bias_attr_default()
+def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
+    """
+    A layer that applies a linear transformation to each element in each row
+    of the input matrix. For each element, the layer first re-scales it and
+    then adds a bias to it.
+
+    This layer is very similar to the SlopeInterceptLayer, except that the
+    scale and bias are trainable.
+
+    .. math::
+
+        y = w * x + b
+
+    The example usage is:
+
+    .. code-block:: python
+
+        scale_shift = scale_shift_layer(input=input_layer, bias_attr=False)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param param_attr: The parameter attribute of scaling. See ParameterAttribute for
+                      details.
+    :type param_attr: ParameterAttribute
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SHIFT_LAYER,
+        inputs=Input(input.name, **param_attr.attr),
+        bias=ParamAttr.to_bias(bias_attr))
+    return LayerOutput(
+        name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default("resize")
+def resize_layer(input, size, name=None):
+    """
+    The resize layer resizes the input matrix with a shape of [Height, Width]
+    into the output matrix with a shape of [Height x Width / size, size],
+    where size is the parameter of this layer indicating the output dimension.
+
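+    The example usage is:
+
+    .. code-block:: python
+
+        resize = resize_layer(input=input_layer, size=4)
+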
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param size: The resized output dimension of this layer.
+    :type size: int
+    :return: A LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
+    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size)
+
+
+@wrap_act_default(act=LinearActivation())
+@wrap_name_default('sub_seq')
+def sub_seq_layer(input, offsets, sizes, act=None, bias_attr=None, name=None):
+    """
+    sub_seq_layer will return sub-sequences from the input sequences. For each
+    sequence in the input sequence layer, sub_seq_layer will slice it by the
+    given offset and size. Please note that the numbers of offset values and
+    size values both equal the number of sequences in the input layer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+        sub_seq = sub_seq_layer(input=input_seq, offsets=offsets, sizes=sizes)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should be a sequence.
+    :type input: LayerOutput
+    :param offsets: The offset indices to slice the input sequence, which
+                    should be of sequence type.
+    :type offsets: LayerOutput
+    :param sizes: The sizes of the sub-sequences, which should be of sequence
+                  type.
+    :type sizes: LayerOutput
+    :param act: Activation type, LinearActivation is the default activation.
+    :type act: BaseActivation
+    :param bias_attr: The bias attribute. If the parameter is set to False or an object
+                      whose type is not ParameterAttribute, no bias is defined. If the
+                      parameter is set to True, the bias is initialized to zero.
+    :type bias_attr: ParameterAttribute | None | bool | Any
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of sub_seq_layer must be a PaddlePaddle layer.')
+    assert isinstance(offsets, LayerOutput), (
+        'The offset indices for sub_seq_layer '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(sizes, LayerOutput), (
+        'The sizes of the sub-sequences must be a PaddlePaddle layer.')
+
+    Layer(
+        name=name,
+        type=LayerType.SUB_SEQ_LAYER,
+        inputs=[input.name, offsets.name, sizes.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr))
+
+    return LayerOutput(
+        name,
+        LayerType.SUB_SEQ_LAYER,
+        parents=[input, offsets, sizes],
+        size=input.size)
+
+
+@wrap_name_default('scale_sub_region')
+def scale_sub_region_layer(input, indices, value, name=None):
+    """
+    Given an image or a feature map with CHW information,
+    scale_sub_region_layer can be used to multiply the values of a continuous
+    sub-region by a real value. You can provide the start and end indices of
+    C, H and W for each instance. Please note that all start indices count
+    from 1. The shape of indices should be [batch_size, 6] and the layout of
+    each row is [C_Start, C_End, H_Start, H_End, W_Start, W_End].
+
+    The example usage is:
+
+    .. code-block:: python
+
+        scale_sub_region = scale_sub_region_layer(input=input,
+                                                  indices=indices,
+                                                  value=value)
+
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param input: The input of this layer, which should contain CHW
+                  information.
+    :type input: LayerOutput
+    :param indices: The start and end indices for C, H and W. The input should
+                    be a 2-D matrix with shape [batch_size, 6].
+    :type indices: LayerOutput
+    :param value: The value to multiply.
+    :type value: float
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), (
+        'The first input of scale_sub_region_layer '
+        'must be a PaddlePaddle layer.')
+    assert isinstance(indices, LayerOutput), (
+        'The start and end indices for CHW must be a PaddlePaddle layer.')
+    assert isinstance(value, float), (
+        'The value to multiply must be a real value.')
+
+    Layer(
+        name=name,
+        type=LayerType.SCALE_SUB_REGION_LAYER,
+        inputs=[input.name, indices.name],
+        value=value)
+
+    return LayerOutput(
+        name,
+        LayerType.SCALE_SUB_REGION_LAYER,
+        parents=[input, indices],
+        num_filters=input.num_filters,
+        size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse.
+
+    This implementation only considers the second-order feature interactions
+    with the formula:
+
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \\rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of
+        V is the latent vector corresponding to one input dimension. The size
+        of each latent vector is k.
+
+    For details of the Factorization Machine, please refer to the paper
+    Factorization Machines (Rendle, 2010).
+
+    The example usage is:
+
+    .. code-block:: python
+
+        first_order = paddle.layer.fc(input=input,
+                                      size=1,
+                                      act=paddle.activation.Linear())
+        second_order = paddle.layer.factorization_machine(input=input,
+                                                          factor_size=10)
+        fm = paddle.layer.addto(input=[first_order, second_order],
+                                act=paddle.activation.Linear(),
+                                bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vectors.
+    :type factor_size: int
+    :param act: Activation Type. LinearActivation is the default activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+                       details.
+    :type layer_attr: ExtraLayerAttribute | None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
+
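+    # Each input dimension learns a latent vector of length factor_size; the
+    # pairwise inner products are computed inside the layer.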
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        factor_size=factor_size,
+        type=LayerType.FACTORIZATION_MACHINE,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
old mode 100755
new mode 100644
index fb533a47e0..b5cde7bac7
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -11,10 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""
+import math
 
-"""
-# from activations import *
 from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
     IdentityActivation, TanhActivation, SequenceSoftmaxActivation
 from attrs import ExtraAttr
@@ -26,9 +24,10 @@ from paddle.trainer.config_parser import *
 
 __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
-    "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit',
-    'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group',
-    'simple_gru', 'simple_attention', 'simple_gru2', 'bidirectional_gru',
+    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
+    'img_conv_group', 'img_separable_conv', 'vgg_16_network', 'gru_unit',
+    'gru_group', 'simple_gru', 'simple_attention', 'dot_product_attention',
+    'multi_head_attention', 'simple_gru2', 'bidirectional_gru',
     'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
 ]
 
@@ -55,49 +54,49 @@ def sequence_conv_pool(input,
                        context_attr=None,
                        pool_attr=None):
     """
-    Text convolution pooling layers helper.
+    Text convolution pooling group.
 
     Text input => Context Projection => FC Layer => Pooling => Output.
 
-    :param name: name of output layer(pooling layer name)
+    :param name: group name.
     :type name: basestring
-    :param input: name of input layer
+    :param input: input layer.
     :type input: LayerOutput
     :param context_len: context projection length. See
                         context_projection's document.
     :type context_len: int
     :param hidden_size: FC Layer size.
     :type hidden_size: int
-    :param context_start: context projection length. See
+    :param context_start: context start position. See
                           context_projection's context_start.
-    :type context_start: int or None
+    :type context_start: int|None
     :param pool_type: pooling layer type. See pooling_layer's document.
-    :type pool_type: BasePoolingType.
+    :type pool_type: BasePoolingType
     :param context_proj_layer_name: context projection layer name.
                                     None if user don't care.
     :type context_proj_layer_name: basestring
-    :param context_proj_param_attr: context projection parameter attribute.
-                                    None if user don't care.
-    :type context_proj_param_attr: ParameterAttribute or None.
+    :param context_proj_param_attr: padding parameter attribute of context
+                                    projection layer. If set to False, the
+                                    padding is always zero.
+    :type context_proj_param_attr: ParameterAttribute|None
     :param fc_layer_name: fc layer name. None if user don't care.
     :type fc_layer_name: basestring
     :param fc_param_attr: fc layer parameter attribute. None if user don't care.
-    :type fc_param_attr: ParameterAttribute or None
+    :type fc_param_attr: ParameterAttribute|None
     :param fc_bias_attr: fc bias parameter attribute. False if no bias,
                          None if user don't care.
-    :type fc_bias_attr: ParameterAttribute or None
-    :param fc_act: fc layer activation type. None means tanh
+    :type fc_bias_attr: ParameterAttribute|False|None
+    :param fc_act: fc layer activation type. None means tanh.
     :type fc_act: BaseActivation
-    :param pool_bias_attr: pooling layer bias attr. None if don't care.
-                           False if no bias.
-    :type pool_bias_attr: ParameterAttribute or None.
+    :param pool_bias_attr: pooling layer bias attribute. False if no bias.
+                           None if the user doesn't care.
+    :type pool_bias_attr: ParameterAttribute|False|None
     :param fc_attr: fc layer extra attribute.
     :type fc_attr: ExtraLayerAttribute
     :param context_attr: context projection layer extra attribute.
     :type context_attr: ExtraLayerAttribute
     :param pool_attr: pooling layer extra attribute.
     :type pool_attr: ExtraLayerAttribute
-    :return: output layer name.
+    :return: layer's output.
     :rtype: LayerOutput
     """
     # Set Default Value to param
@@ -163,45 +162,45 @@ def simple_img_conv_pool(input,
     """
     Simple image convolution and pooling group.
 
-    Input => conv => pooling
+    Img input => Conv => Pooling => Output.
 
-    :param name: group name
+    :param name: group name.
     :type name: basestring
-    :param input: input layer name.
+    :param input: input layer.
     :type input: LayerOutput
-    :param filter_size: see img_conv_layer for details
+    :param filter_size: see img_conv_layer for details.
     :type filter_size: int
-    :param num_filters: see img_conv_layer for details
+    :param num_filters: see img_conv_layer for details.
     :type num_filters: int
-    :param pool_size: see img_pool_layer for details
+    :param pool_size: see img_pool_layer for details.
     :type pool_size: int
-    :param pool_type: see img_pool_layer for details
+    :param pool_type: see img_pool_layer for details.
     :type pool_type: BasePoolingType
-    :param act: see img_conv_layer for details
+    :param act: see img_conv_layer for details.
     :type act: BaseActivation
-    :param groups: see img_conv_layer for details
+    :param groups: see img_conv_layer for details.
     :type groups: int
-    :param conv_stride: see img_conv_layer for details
+    :param conv_stride: see img_conv_layer for details.
     :type conv_stride: int
-    :param conv_padding: see img_conv_layer for details
+    :param conv_padding: see img_conv_layer for details.
     :type conv_padding: int
-    :param bias_attr: see img_conv_layer for details
+    :param bias_attr: see img_conv_layer for details.
     :type bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer for details
+    :param num_channel: see img_conv_layer for details.
     :type num_channel: int
-    :param param_attr: see img_conv_layer for details
+    :param param_attr: see img_conv_layer for details.
     :type param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer for details
+    :param shared_bias: see img_conv_layer for details.
     :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer for details
+    :param conv_layer_attr: see img_conv_layer for details.
     :type conv_layer_attr: ExtraLayerAttribute
-    :param pool_stride: see img_pool_layer for details
+    :param pool_stride: see img_pool_layer for details.
     :type pool_stride: int
-    :param pool_padding: see img_pool_layer for details
+    :param pool_padding: see img_pool_layer for details.
     :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer for details
+    :param pool_layer_attr: see img_pool_layer for details.
     :type pool_layer_attr: ExtraLayerAttribute
-    :return: Layer's output
+    :return: layer's output.
     :rtype: LayerOutput
     """
     _conv_ = img_conv_layer(
@@ -253,47 +252,51 @@ def img_conv_bn_pool(input,
     """
     Convolution, batch normalization, pooling group.
 
-    :param name: group name
+    Img input => Conv => BN => Pooling => Output.
+
+    :param name: group name.
     :type name: basestring
-    :param input: layer's input
+    :param input: input layer.
     :type input: LayerOutput
-    :param filter_size: see img_conv_layer's document
+    :param filter_size: see img_conv_layer for details.
     :type filter_size: int
-    :param num_filters: see img_conv_layer's document
+    :param num_filters: see img_conv_layer for details.
     :type num_filters: int
-    :param pool_size: see img_pool_layer's document.
+    :param pool_size: see img_pool_layer for details.
     :type pool_size: int
-    :param pool_type: see img_pool_layer's document.
+    :param pool_type: see img_pool_layer for details.
     :type pool_type: BasePoolingType
-    :param act: see batch_norm_layer's document.
+    :param act: see batch_norm_layer for details.
     :type act: BaseActivation
-    :param groups: see img_conv_layer's document
+    :param groups: see img_conv_layer for details.
     :type groups: int
-    :param conv_stride: see img_conv_layer's document.
+    :param conv_stride: see img_conv_layer for details.
     :type conv_stride: int
-    :param conv_padding: see img_conv_layer's document.
+    :param conv_padding: see img_conv_layer for details.
     :type conv_padding: int
-    :param conv_bias_attr: see img_conv_layer's document.
+    :param conv_bias_attr: see img_conv_layer for details.
     :type conv_bias_attr: ParameterAttribute
-    :param num_channel: see img_conv_layer's document.
+    :param num_channel: see img_conv_layer for details.
     :type num_channel: int
-    :param conv_param_attr: see img_conv_layer's document.
+    :param conv_param_attr: see img_conv_layer for details.
     :type conv_param_attr: ParameterAttribute
-    :param shared_bias: see img_conv_layer's document.
+    :param shared_bias: see img_conv_layer for details.
     :type shared_bias: bool
-    :param conv_layer_attr: see img_conv_layer's document.
+    :param conv_layer_attr: see img_conv_layer for details.
     :type conv_layer_attr: ExtraLayerOutput
-    :param bn_param_attr: see batch_norm_layer's document.
-    :type bn_param_attr: ParameterAttribute.
-    :param bn_bias_attr: see batch_norm_layer's document.
-    :param bn_layer_attr: ParameterAttribute.
-    :param pool_stride: see img_pool_layer's document.
+    :param bn_param_attr: see batch_norm_layer for details.
+    :type bn_param_attr: ParameterAttribute
+    :param bn_bias_attr: see batch_norm_layer for details.
+    :type bn_bias_attr: ParameterAttribute
+    :param bn_layer_attr: see batch_norm_layer for details.
+    :type bn_layer_attr: ExtraLayerAttribute
+    :param pool_stride: see img_pool_layer for details.
     :type pool_stride: int
-    :param pool_padding: see img_pool_layer's document.
+    :param pool_padding: see img_pool_layer for details.
     :type pool_padding: int
-    :param pool_layer_attr: see img_pool_layer's document.
+    :param pool_layer_attr: see img_pool_layer for details.
     :type pool_layer_attr: ExtraLayerAttribute
-    :return: Layer groups output
+    :return: layer's output
     :rtype: LayerOutput
     """
     __conv__ = img_conv_layer(
@@ -340,24 +343,40 @@ def img_conv_group(input,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=0,
                    pool_stride=1,
-                   pool_type=None):
+                   pool_type=None,
+                   param_attr=None):
     """
     Image Convolution Group, Used for vgg net.
 
-    TODO(yuyang18): Complete docs
-
-    :param conv_batchnorm_drop_rate:
-    :param input:
-    :param conv_num_filter:
-    :param pool_size:
-    :param num_channels:
-    :param conv_padding:
-    :param conv_filter_size:
-    :param conv_act:
-    :param conv_with_batchnorm:
-    :param pool_stride:
-    :param pool_type:
-    :return:
+    :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
+        conv_batchnorm_drop_rate[i] represents the drop rate of the batch norm
+        after the i-th convolution.
+    :type conv_batchnorm_drop_rate: list
+    :param input: input layer.
+    :type input: LayerOutput
+    :param conv_num_filter: list of the numbers of output channels.
+    :type conv_num_filter: list|tuple
+    :param pool_size: pooling filter size.
+    :type pool_size: int
+    :param num_channels: the number of input channels.
+    :type num_channels: int
+    :param conv_padding: convolution padding size.
+    :type conv_padding: int
+    :param conv_filter_size: convolution filter size.
+    :type conv_filter_size: int
+    :param conv_act: activation function after the convolution.
+    :type conv_act: BaseActivation
+    :param conv_with_batchnorm: if conv_with_batchnorm[i] is true,
+        there is a batch normalization operation after the i-th convolution.
+    :type conv_with_batchnorm: list
+    :param pool_stride: pooling stride size.
+    :type pool_stride: int
+    :param pool_type: pooling type.
+    :type pool_type: BasePoolingType
+    :param param_attr: param attribute of convolution layer,
+                       None means default attribute.
+    :type param_attr: ParameterAttribute
+    :return: layer's output.
+    :rtype: LayerOutput
     """
     tmp = input
 
@@ -397,6 +416,7 @@ def img_conv_group(input,
             padding=conv_padding[i],
             filter_size=conv_filter_size[i],
             num_filters=conv_num_filter[i],
+            param_attr=param_attr,
             **extra_kwargs)
 
         # logger.debug("tmp.num_filters = %d" % tmp.num_filters)
@@ -415,6 +435,85 @@ def img_conv_group(input,
         input=tmp, stride=pool_stride, pool_size=pool_size, pool_type=pool_type)
 
 
+@wrap_name_default("separable_conv")
+def img_separable_conv(input,
+                       num_channels,
+                       num_out_channels,
+                       filter_size,
+                       stride=1,
+                       padding=0,
+                       depth_multiplier=1,
+                       act=None,
+                       bias_attr=None,
+                       param_attr=None,
+                       shared_bias=True,
+                       layer_type='exconv',
+                       name=None):
+    """
+    Separable Convolution.
+
+    The separable convolution module consists of a depthwise convolution
+    that acts separately on input channels, followed by a pointwise convolution
+    with 1*1 kernels that mixes channels. It is used in Xception:
+    https://arxiv.org/pdf/1610.02357.pdf
+
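+    The example usage is:
+
+    .. code-block:: python
+
+        conv = img_separable_conv(input=input_layer,
+                                  num_channels=32,
+                                  num_out_channels=64,
+                                  filter_size=3,
+                                  act=ReluActivation())
+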
+    :param input: input layer.
+    :type input: LayerOutput
+    :param num_channels: the number of input channels.
+    :type num_channels: int
+    :param num_out_channels: the number of output channels.
+    :type num_out_channels: int
+    :param filter_size: the filter size for the depthwise convolution.
+    :type filter_size: int|tuple
+    :param stride: the stride size for the depthwise convolution.
+    :type stride: int|tuple
+    :param padding: the padding size for the depthwise convolution.
+    :type padding: int|tuple
+    :param depth_multiplier: the number of filters for each input channel in
+                             the depthwise convolution.
+    :type depth_multiplier: int
+    :param act: the activation function for the output.
+    :type act: BaseActivation
+    :param bias_attr: see img_conv_layer for details.
+    :type bias_attr: ParameterAttribute
+    :param param_attr: see img_conv_layer for details.
+    :type param_attr: ParameterAttribute
+    :param shared_bias: see img_conv_layer for details.
+    :type shared_bias: bool
+    :param layer_type: see img_conv_layer for details.
+    :type layer_type: basestring
+    :return: layer's output.
+    :rtype: LayerOutput
+    """
+    __depthwise_conv__ = img_conv_layer(
+        name="%s_depthwise_conv" % name,
+        input=input,
+        num_channels=num_channels,
+        num_filters=num_channels * depth_multiplier,
+        groups=num_channels,
+        filter_size=filter_size,
+        stride=stride,
+        padding=padding,
+        act=LinearActivation(),
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias,
+        layer_type=layer_type)
+    __pointwise_conv__ = img_conv_layer(
+        name="%s_pointwise_conv" % name,
+        input=__depthwise_conv__,
+        num_channels=num_channels * depth_multiplier,
+        num_filters=num_out_channels,
+        filter_size=1,
+        stride=1,
+        padding=0,
+        act=act,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        shared_biases=shared_bias)
+    return __pointwise_conv__
+
+
 def small_vgg(input_image, num_channels, num_classes):
     def __vgg__(ipt, num_filter, times, dropouts, num_channels_=None):
         return img_conv_group(
@@ -449,12 +548,14 @@ def vgg_16_network(input_image, num_channels, num_classes=1000):
     """
     Same model from https://gist.github.com/ksimonyan/211839e770f7b538e2d8
 
-    :param num_classes:
-    :param input_image:
+    :param num_classes: the number of classes.
+    :type num_classes: int
+    :param input_image: input layer.
     :type input_image: LayerOutput
-    :param num_channels:
+    :param num_channels: the number of input channels.
     :type num_channels: int
-    :return:
+    :return: layer's output
+    :rtype: LayerOutput
     """
 
     tmp = img_conv_group(
@@ -543,8 +644,8 @@ def simple_lstm(input,
     """
     Simple LSTM Cell.
 
-    It just combine a mixed layer with fully_matrix_projection and a lstmemory
-    layer. The simple lstm cell was implemented as follow equations.
+    It just combines a mixed layer with fully_matrix_projection and an lstmemory
+    layer. The simple lstm cell is implemented with the following equations.
 
     ..  math::
 
@@ -558,37 +659,37 @@ def simple_lstm(input,
 
         h_t & = o_t tanh(c_t)
 
-    Please refer **Generating Sequences With Recurrent Neural Networks** if you
-    want to know what lstm is. Link_ is here.
+    Please refer to **Generating Sequences With Recurrent Neural Networks** for more
+    details about lstm. Link_ is here.
 
     .. _Link: http://arxiv.org/abs/1308.0850
 
     :param name: lstm layer name.
     :type name: basestring
-    :param input: input layer name.
+    :param input: layer's input.
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param reverse: whether to process the input data in a reverse order
+    :param reverse: process the input in a reverse order or not.
     :type reverse: bool
-    :param mat_param_attr: mixed layer's matrix projection parameter attribute.
+    :param mat_param_attr: parameter attribute of matrix projection in mixed layer.
     :type mat_param_attr: ParameterAttribute
     :param bias_param_attr: bias parameter attribute. False means no bias, None
                             means default bias.
     :type bias_param_attr: ParameterAttribute|False
-    :param inner_param_attr: lstm cell parameter attribute.
+    :param inner_param_attr: parameter attribute of lstm cell.
     :type inner_param_attr: ParameterAttribute
-    :param act: lstm final activiation type
+    :param act: the last activation type of lstm.
     :type act: BaseActivation
-    :param gate_act: lstm gate activiation type
+    :param gate_act: the gate activation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: lstm state activiation type.
+    :param state_act: the state activation type of lstm.
     :type state_act: BaseActivation
-    :param mixed_layer_attr: mixed layer's extra attribute.
+    :param mixed_layer_attr: extra attribute of mixed layer.
     :type mixed_layer_attr: ExtraLayerAttribute
-    :param lstm_cell_attr: lstm layer's extra attribute.
+    :param lstm_cell_attr: extra attribute of lstm.
     :type lstm_cell_attr: ExtraLayerAttribute
-    :return: lstm layer name.
+    :return: layer's output.
     :rtype: LayerOutput
     """
     fc_name = 'lstm_transform_%s' % name
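
The equations above correspond to a single call; a hedged sketch (the embedding layer below is illustrative):

..  code-block:: python

    emb = data_layer(name='word_vectors', size=128)
    # simple_lstm performs the W*x projection internally via
    # full_matrix_projection, so the raw input can be passed directly
    lstm = simple_lstm(input=emb,
                       size=256,
                       act=TanhActivation(),
                       gate_act=SigmoidActivation(),
                       state_act=TanhActivation())
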
@@ -614,21 +715,21 @@ def simple_lstm(input,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
+                   out_memory=None,
                    name=None,
                    size=None,
                    param_attr=None,
                    act=None,
                    gate_act=None,
                    state_act=None,
-                   mixed_bias_attr=None,
+                   input_proj_bias_attr=None,
+                   input_proj_layer_attr=None,
                    lstm_bias_attr=None,
-                   mixed_layer_attr=None,
-                   lstm_layer_attr=None,
-                   get_output_layer_attr=None):
+                   lstm_layer_attr=None):
     """
-    Define calculations that a LSTM unit performs in a single time step.
-    This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is always used in
+    lstmemory_unit defines the calculation process of an LSTM unit during a
+    single time step. This function is not a recurrent layer, so it cannot be
+    directly used to process sequence input. It is always used in
     recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
@@ -638,13 +739,13 @@ def lstmemory_unit(input,
 
     ..  math::
 
-        i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i)
+        i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
 
-        f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f)
+        f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
 
-        c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c)
+        c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c)
 
-        o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o)
+        o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
 
         h_t & = o_t tanh(c_t)
 
@@ -659,46 +760,59 @@ def lstmemory_unit(input,
                                    state_act=TanhActivation())
 
 
-    :param input: input layer name.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param name: lstmemory unit name.
+    :param out_memory: The output of the previous time step.
+    :type out_memory: LayerOutput | None
+    :param name: The lstmemory unit name.
     :type name: basestring
-    :param size: lstmemory unit size.
+    :param size: The lstmemory unit size.
     :type size: int
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute for the weights in the
+                     input-to-hidden projection.
+                     None means the default attribute.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activiation type
+    :param act: The last activation type of lstm.
     :type act: BaseActivation
-    :param gate_act: lstm gate activiation type
+    :param gate_act: The gate activation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: lstm state activiation type.
+    :param state_act: The state activation type of lstm.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
-                            False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param input_proj_bias_attr: The parameter attribute for the bias in the
+                      input-to-hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for the
+                     input-to-hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
+    :param lstm_bias_attr: The parameter attribute for the bias in the lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|bool|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
-    :return: lstmemory unit name.
+    :return: The output of the lstmemory unit.
     :rtype: LayerOutput
     """
     if size is None:
         assert input.size % 4 == 0
         size = input.size / 4
-    out_mem = memory(name=name, size=size)
+    if out_memory is None:
+        out_mem = memory(name=name, size=size)
+    else:
+        out_mem = out_memory
+
     state_mem = memory(name="%s_state" % name, size=size)
 
     with mixed_layer(
             name="%s_input_recurrent" % name,
             size=size * 4,
-            bias_attr=mixed_bias_attr,
-            layer_attr=mixed_layer_attr,
+            bias_attr=input_proj_bias_attr,
+            layer_attr=input_proj_layer_attr,
             act=IdentityActivation()) as m:
         m += identity_projection(input=input)
         m += full_matrix_projection(input=out_mem, param_attr=param_attr)
@@ -713,11 +827,7 @@ def lstmemory_unit(input,
         gate_act=gate_act,
         state_act=state_act,
         layer_attr=lstm_layer_attr)
-    get_output_layer(
-        name='%s_state' % name,
-        input=lstm_out,
-        arg_name='state',
-        layer_attr=get_output_layer_attr)
+    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')
 
     return lstm_out
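
The new out_memory argument lets the caller supply the previous-step output explicitly instead of letting the unit declare its own memory. A hedged decoder-style sketch (names and sizes are illustrative; projected_input is assumed to already have size 4 * 256, as the unit requires):

..  code-block:: python

    dec_mem = memory(name='decoder', size=256)
    out = lstmemory_unit(input=projected_input,
                         name='decoder',
                         size=256,
                         out_memory=dec_mem)
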
 
@@ -726,30 +836,30 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                     size=None,
                     name=None,
+                    out_memory=None,
                     reverse=False,
                     param_attr=None,
                     act=None,
                     gate_act=None,
                     state_act=None,
-                    mixed_bias_attr=None,
+                    input_proj_bias_attr=None,
+                    input_proj_layer_attr=None,
                     lstm_bias_attr=None,
-                    mixed_layer_attr=None,
-                    lstm_layer_attr=None,
-                    get_output_layer_attr=None):
+                    lstm_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version of Long Short Term Memory. It
+    lstm_group is a recurrent_group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states, or hidden states in every time step are accessible to the
+    cell states (or hidden states) in every time step are accessible to the
     user. This is especially useful in attention model. If you do not need to
-    access the internal states of the lstm, but merely use its outputs,
+    access the internal states of the lstm and merely use its outputs,
     it is recommended to use the lstmemory, which is relatively faster than
     lstmemory_group.
 
     NOTE: In PaddlePaddle's implementation, the following input-to-hidden
     multiplications:
-    :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`,
-    :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to
+    :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`,
+    :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to
     speed up the calculations. Consequently, an additional mixed_layer with
     full_matrix_projection must be included before lstmemory_unit is called.
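
A sketch of that required projection step, assuming emb is an upstream layer and size is the LSTM size (illustrative, not from the patch):

..  code-block:: python

    # project the input to 4 * size before entering the group
    with mixed_layer(size=size * 4) as proj:
        proj += full_matrix_projection(input=emb)
    lstm_seq = lstmemory_group(input=proj, size=size)
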
 
@@ -763,34 +873,43 @@ def lstmemory_group(input,
                                     gate_act=SigmoidActivation(),
                                     state_act=TanhActivation())
 
-    :param input: input layer name.
+    :param input: Input layer.
     :type input: LayerOutput
-    :param name: lstmemory group name.
-    :type name: basestring
-    :param size: lstmemory group size.
+    :param size: The lstmemory group size.
     :type size: int
-    :param reverse: is lstm reversed
+    :param name: The name of lstmemory group.
+    :type name: basestring
+    :param out_memory: The output of the previous time step.
+    :type out_memory: LayerOutput | None
+    :param reverse: Process the input in a reverse order or not.
     :type reverse: bool
-    :param param_attr: Parameter config, None if use default.
+    :param param_attr: The parameter attribute for the weights in the
+                     input-to-hidden projection.
+                     None means the default attribute.
     :type param_attr: ParameterAttribute
-    :param act: lstm final activiation type
+    :param act: The last activation type of lstm.
     :type act: BaseActivation
-    :param gate_act: lstm gate activiation type
+    :param gate_act: The gate activation type of lstm.
     :type gate_act: BaseActivation
-    :param state_act: lstm state activiation type.
+    :param state_act: The state activation type of lstm.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
-                            False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
-    :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
-    :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
-    :param lstm_layer_attr: lstm layer's extra attribute.
+    :param input_proj_bias_attr: The parameter attribute for the bias in the
+                      input-to-hidden projection.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type input_proj_bias_attr: ParameterAttribute|bool|None
+    :param input_proj_layer_attr: The extra layer attribute for the
+                     input-to-hidden projection of the LSTM unit,
+                     such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
+    :param lstm_bias_attr: The parameter attribute for the bias in the lstm layer.
+                      False or None means no bias.
+                      If this parameter is set to True,
+                      the bias is initialized to zero.
+    :type lstm_bias_attr: ParameterAttribute|bool|None
+    :param lstm_layer_attr: The extra attribute of lstm layer.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
     :return: the lstmemory group.
     :rtype: LayerOutput
     """
@@ -800,15 +919,15 @@ def lstmemory_group(input,
             input=ipt,
             name=name,
             size=size,
-            mixed_bias_attr=mixed_bias_attr,
-            mixed_layer_attr=mixed_layer_attr,
-            param_attr=param_attr,
-            lstm_bias_attr=lstm_bias_attr,
             act=act,
             gate_act=gate_act,
             state_act=state_act,
+            out_memory=out_memory,
+            input_proj_bias_attr=input_proj_bias_attr,
+            input_proj_layer_attr=input_proj_layer_attr,
+            param_attr=param_attr,
             lstm_layer_attr=lstm_layer_attr,
-            get_output_layer_attr=get_output_layer_attr)
+            lstm_bias_attr=lstm_bias_attr)
 
     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -819,6 +938,7 @@ def lstmemory_group(input,
 
 @wrap_name_default('gru_unit')
 def gru_unit(input,
+             memory_boot=None,
              size=None,
              name=None,
              gru_bias_attr=None,
@@ -828,26 +948,28 @@ def gru_unit(input,
              gru_layer_attr=None,
              naive=False):
     """
-    Define calculations that a gated recurrent unit performs in a single time
-    step. This function itself is not a recurrent layer, so that it can not be
-    directly applied to sequence input. This function is almost always used in
+    gru_unit defines the calculation process of a gated recurrent unit during a single
+    time step. This function is not a recurrent layer, so it cannot be
+    directly used to process sequence input. It is always used in
     the recurrent_group (see layers.py for more details) to implement attention
     mechanism.
 
     Please see grumemory in layers.py for the details about the maths.
 
-    :param input: input layer name.
+    :param input: input layer.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
     :type size: int
-    :param act: type of the activation
+    :param act: the activation type of gru.
     :type act: BaseActivation
-    :param gate_act: type of the gate activation
+    :param gate_act: the gate activation type of gru.
     :type gate_act: BaseActivation
-    :param gru_layer_attr: Extra parameter attribute of the gru layer.
-    :type gru_layer_attr: ParameterAttribute|False
+    :param gru_layer_attr: Extra attribute of the gru layer.
+    :type gru_layer_attr: ExtraLayerAttribute
     :return: the gru output layer.
     :rtype: LayerOutput
     """
@@ -856,7 +978,7 @@ def gru_unit(input,
     if size is None:
         size = input.size / 3
 
-    out_mem = memory(name=name, size=size)
+    out_mem = memory(name=name, size=size, boot_layer=memory_boot)
 
     if naive:
         __step__ = gru_step_naive_layer
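
With boot_layer wired through, the GRU state can be initialized from, say, an encoder summary instead of zeros. A hedged sketch (assumes projected_seq has already been projected to 3 * size, as the GRU step requires):

..  code-block:: python

    encoder_last = last_seq(input=encoded_sequence)
    gru_seq = gru_group(input=[projected_seq],
                        size=128,
                        memory_boot=encoder_last)
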
@@ -878,6 +1000,7 @@ def gru_unit(input,
 
 @wrap_name_default('gru_group')
 def gru_group(input,
+              memory_boot=None,
               size=None,
               name=None,
               reverse=False,
@@ -888,11 +1011,11 @@ def gru_group(input,
               gru_layer_attr=None,
               naive=False):
     """
-    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
+    gru_group is a recurrent_group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
     benefit is that gru hidden states are accessible to the user. This is
     especially useful in attention model. If you do not need to access
-    any internal state, but merely use the outputs of a GRU, it is recommended
+    any internal state and merely use the outputs of a GRU, it is recommended
     to use the grumemory, which is relatively faster.
 
     Please see grumemory in layers.py for more detail about the maths.
@@ -901,27 +1024,30 @@ def gru_group(input,
 
     ..  code-block:: python
 
-        gru = gur_group(input=[layer1],
+        gru = gru_group(input=[layer1],
                         size=256,
                         act=TanhActivation(),
                         gate_act=SigmoidActivation())
 
-    :param input: input layer name.
+    :param input: input layer.
     :type input: LayerOutput
+    :param memory_boot: the initialization state of the GRU cell.
+    :type memory_boot: LayerOutput | None
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
     :type size: int
-    :param reverse: whether to process the input data in a reverse order
+    :param reverse: process the input in a reverse order or not.
     :type reverse: bool
-    :param act: type of the activiation
+    :param act: the activation type of gru.
     :type act: BaseActivation
-    :param gate_act: type of the gate activiation
+    :param gate_act: the gate activation type of gru.
     :type gate_act: BaseActivation
-    :param gru_bias_attr: bias. False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False
-    :param gru_layer_attr: Extra parameter attribute of the gru layer.
-    :type gru_layer_attr: ParameterAttribute|False
+    :param gru_bias_attr: bias parameter attribute of gru layer,
+                          False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False|None
+    :param gru_layer_attr: Extra attribute of the gru layer.
+    :type gru_layer_attr: ExtraLayerAttribute
     :return: the gru group.
     :rtype: LayerOutput
     """
@@ -929,6 +1055,7 @@ def gru_group(input,
     def __gru_step__(ipt):
         return gru_unit(
             input=ipt,
+            memory_boot=memory_boot,
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
@@ -960,11 +1087,11 @@ def simple_gru(input,
                gru_layer_attr=None,
                naive=False):
     """
-    You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
+    You may see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py. The reason why there are so many interfaces is
     that we have two ways to implement recurrent neural network. One way is to
     use one complete layer to implement rnn (including simple rnn, gru and lstm)
-    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But,
+    with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But
     the multiplication operation :math:`W x_t` is not computed in these layers.
     See details in their interfaces in layers.py.
     The other implementation is to use an recurrent group which can ensemble a
@@ -992,22 +1119,23 @@ def simple_gru(input,
 
         gru = simple_gru(input=[layer1], size=256)
 
-    :param input: input layer name.
+    :param input: input layer.
     :type input: LayerOutput
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
     :type size: int
-    :param reverse: whether to process the input data in a reverse order
+    :param reverse: process the input in a reverse order or not.
     :type reverse: bool
-    :param act: type of the activiation
+    :param act: the activation type of gru.
     :type act: BaseActivation
-    :param gate_act: type of the gate activiation
+    :param gate_act: the gate activation type of gru.
     :type gate_act: BaseActivation
-    :param gru_bias_attr: bias. False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False
-    :param gru_layer_attr: Extra parameter attribute of the gru layer.
-    :type gru_layer_attr: ParameterAttribute|False
+    :param gru_bias_attr: bias parameter attribute of gru layer,
+                          False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False|None
+    :param gru_layer_attr: Extra attribute of the gru layer.
+    :type gru_layer_attr: ExtraLayerAttribute
     :return: the gru group.
     :rtype: LayerOutput
     """
@@ -1045,8 +1173,8 @@ def simple_gru2(input,
                 mixed_layer_attr=None,
                 gru_cell_attr=None):
     """
-    simple_gru2 is the same with simple_gru, but using grumemory instead
-    Please see grumemory in layers.py for more detail about the maths.
+    simple_gru2 is the same as simple_gru, but uses grumemory instead.
+    Please refer to grumemory in layers.py for more details about the math.
     simple_gru2 is faster than simple_gru.
 
     The example usage is:
@@ -1055,22 +1183,24 @@ def simple_gru2(input,
 
         gru = simple_gru2(input=[layer1], size=256)
 
-    :param input: input layer name.
+    :param input: input layer.
     :type input: LayerOutput
     :param name: name of the gru group.
     :type name: basestring
     :param size: hidden size of the gru.
     :type size: int
-    :param reverse: whether to process the input data in a reverse order
+    :param reverse: process the input in a reverse order or not.
     :type reverse: bool
-    :param act: type of the activiation
+    :param act: the activation type of gru.
     :type act: BaseActivation
-    :param gate_act: type of the gate activiation
+    :param gate_act: the gate activation type of gru.
     :type gate_act: BaseActivation
-    :param gru_bias_attr: bias. False means no bias, None means default bias.
-    :type gru_bias_attr: ParameterAttribute|False
-    :param gru_layer_attr: Extra parameter attribute of the gru layer.
-    :type gru_layer_attr: ParameterAttribute|False
+    :param gru_bias_attr: bias parameter attribute of gru layer,
+                          False means no bias, None means default bias.
+    :type gru_bias_attr: ParameterAttribute|False|None
+    :param gru_param_attr: the weight parameter attribute of the gru layer,
+                          None means the default parameter attribute.
+    :type gru_param_attr: ParameterAttribute|None
     :return: the gru group.
     :rtype: LayerOutput
     """
@@ -1083,7 +1213,6 @@ def simple_gru2(input,
 
     return grumemory(
         name=name,
-        size=size,
         input=m,
         reverse=reverse,
         bias_attr=gru_bias_attr,
@@ -1120,7 +1249,7 @@ def bidirectional_gru(input,
                       concat_act=None):
     """
     A bidirectional_gru is a recurrent unit that iterates over the input
-    sequence both in forward and bardward orders, and then concatenate two
+    sequence both in forward and backward orders, and then concatenates two
     outputs to form a final output. However, concatenation of two outputs
     is not the only way to form the final output, you can also, for example,
     just add them together.
@@ -1137,11 +1266,10 @@ def bidirectional_gru(input,
     :type input: LayerOutput
     :param size: gru layer size.
     :type size: int
-    :param return_seq: If set False, outputs of the last time step are
-                       concatenated and returned.
-                       If set True, the entire output sequences that are
-                       processed in forward and backward directions are
+    :param return_seq: If set False, the outputs of the last time step are
                        concatenated and returned.
+                       If set True, the entire output sequences in forward
+                       and backward directions are concatenated and returned.
     :type return_seq: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
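
The two return modes side by side, in a hedged sketch (the input layer emb is illustrative):

..  code-block:: python

    seq = bidirectional_gru(input=emb, size=128, return_seq=True)   # full sequences, concatenated
    vec = bidirectional_gru(input=emb, size=128, return_seq=False)  # last time step only
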
@@ -1205,8 +1333,8 @@ def bidirectional_lstm(input,
                        concat_act=None):
     """
     A bidirectional_lstm is a recurrent unit that iterates over the input
-    sequence both in forward and bardward orders, and then concatenate two
-    outputs form a final output. However, concatenation of two outputs
+    sequence both in forward and backward orders, and then concatenates two
+    outputs to form a final output. However, concatenation of two outputs
     is not the only way to form the final output, you can also, for example,
     just add them together.
 
@@ -1227,13 +1355,12 @@ def bidirectional_lstm(input,
     :type input: LayerOutput
     :param size: lstm layer size.
     :type size: int
-    :param return_seq: If set False, outputs of the last time step are
-                       concatenated and returned.
-                       If set True, the entire output sequences that are
-                       processed in forward and backward directions are
+    :param return_seq: If set False, the outputs of the last time step are
                        concatenated and returned.
+                       If set True, the entire output sequences in forward
+                       and backward directions are concatenated and returned.
     :type return_seq: bool
-    :return: LayerOutput object accroding to the return_seq.
+    :return: LayerOutput object.
     :rtype: LayerOutput
     """
     args = locals()
@@ -1278,7 +1405,7 @@ def simple_attention(encoded_sequence,
                      weight_act=None,
                      name=None):
     """
-    Calculate and then return a context vector by attention machanism.
+    Calculate and return a context vector with the attention mechanism.
     Size of the context vector equals to size of the encoded_sequence.
 
     ..  math::
@@ -1311,10 +1438,10 @@ def simple_attention(encoded_sequence,
     :param name: name of the attention model.
     :type name: basestring
     :param softmax_param_attr: parameter attribute of sequence softmax
-                               that is used to produce attention weight
+                               that is used to produce attention weight.
     :type softmax_param_attr: ParameterAttribute
-    :param weight_act: activation of the attention model
-    :type weight_act: Activation
+    :param weight_act: activation of the attention model.
+    :type weight_act: BaseActivation
     :param encoded_sequence: output of the encoder
     :type encoded_sequence: LayerOutput
     :param encoded_proj: attention weight is computed by a feed forward neural
@@ -1331,6 +1458,7 @@ def simple_attention(encoded_sequence,
                                 compute attention weight.
     :type transform_param_attr: ParameterAttribute
     :return: a context vector
+    :rtype: LayerOutput
     """
     assert encoded_proj.size == decoder_state.size
     proj_size = encoded_proj.size
@@ -1366,27 +1494,214 @@ def simple_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
-############################################################################
-#                         Miscs                                            #
-############################################################################
+@wrap_name_default()
+def dot_product_attention(encoded_sequence,
+                          attended_sequence,
+                          transformed_state,
+                          softmax_param_attr=None,
+                          name=None):
+    """
+    Calculate and return a context vector with the dot-product attention mechanism.
+    The dimension of the context vector equals that of the attended_sequence.
+
+    ..  math::
+
+        a(s_{i-1},h_{j}) & = s_{i-1}^\mathrm{T} h_{j}
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
 
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
 
-@wrap_name_default("dropout")
-def dropout_layer(input, dropout_rate, name=None):
+    where :math:`h_{j}` is the jth element of encoded_sequence,
+    :math:`z_{j}` is the jth element of attended_sequence,
+    :math:`s_{i-1}` is transformed_state.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = dot_product_attention(encoded_sequence=enc_seq,
+                                        attended_sequence=att_seq,
+                                        transformed_state=state,)
+
+    :param name: A prefix attached to the name of each layer defined inside
+                 dot_product_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param encoded_sequence: The output hidden vectors of the encoder.
+    :type encoded_sequence: LayerOutput
+    :param attended_sequence: The sequence to be attended. Its elements are
+                              weighted by the attention weights, which are computed
+                              from the dot product between transformed_state and each
+                              element of encoded_sequence, and then summed to form
+                              the context vector.
+    :type attended_sequence: LayerOutput
+    :param transformed_state: The transformed hidden state of decoder in previous time step.
+                              Since the dot-product operation will be performed on it and the
+                              encoded_sequence, their dimensions must be equal. For flexibility,
+                              we assume transformations of the decoder's hidden state are
+                              done outside dot_product_attention, and none are performed
+                              inside. Users can thus pass either the original or the
+                              transformed state.
+    :type transformed_state: LayerOutput
+    :return: The context vector.
+    :rtype: LayerOutput
     """
-    @TODO(yuyang18): Add comments.
+    assert transformed_state.size == encoded_sequence.size
 
-    :param name:
-    :param input:
-    :param dropout_rate:
-    :return:
+    expanded = expand_layer(
+        input=transformed_state,
+        expand_as=encoded_sequence,
+        name='%s_expand' % name)
+
+    m = dot_prod_layer(
+        input1=expanded, input2=encoded_sequence, name='%s_dot-product' % name)
+
+    attention_weight = fc_layer(
+        input=m,
+        size=1,
+        act=SequenceSoftmaxActivation(),
+        param_attr=softmax_param_attr,
+        name="%s_softmax" % name,
+        bias_attr=False)
+
+    scaled = scaling_layer(
+        weight=attention_weight,
+        input=attended_sequence,
+        name='%s_scaling' % name)
+
+    return pooling_layer(
+        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
+
+
+@wrap_name_default()
+def multi_head_attention(query,
+                         key,
+                         value,
+                         key_proj_size,
+                         value_proj_size,
+                         head_num,
+                         attention_type,
+                         softmax_param_attr=None,
+                         name=None):
     """
-    return addto_layer(
-        name=name,
-        input=input,
-        act=LinearActivation(),
-        bias_attr=False,
-        layer_attr=ExtraAttr(drop_rate=dropout_rate))
+    Calculate and return a context vector with the multi-head attention mechanism.
+    The dimension of the context vector equals value_proj_size * head_num.
+
+    Please refer to **Attention Is All You Need** for more details. The link is
+    as follows:
+    https://arxiv.org/abs/1706.03762.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = multi_head_attention(query=decoder_state,
+                                       key=enc_seq,
+                                       value=enc_seq,
+                                       key_proj_size=64,
+                                       value_proj_size=64,
+                                       head_num=8,
+                                       attention_type='dot-product attention')
+
+    :param name: A prefix attached to the name of each layer defined inside
+                 multi_head_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param query: query is used to calculate attention weights over values at the current step.
+    :type query: LayerOutput
+    :param key: key is used to calculate the attention weight of the corresponding value.
+    :type key: LayerOutput
+    :param value: value is the sequence to be attended.
+    :type value: LayerOutput
+    :param key_proj_size: The dimension of the linear projection performed on key and query.
+    :type key_proj_size: int
+    :param value_proj_size: The dimension of the linear projection performed on value.
+    :type value_proj_size: int
+    :param head_num: The number of attention heads.
+    :type head_num: int
+    :param attention_type: The type of the attention mechanism used in each attention
+                           head. Currently, only scaled dot-product attention and
+                           additive attention are supported.
+    :type attention_type: basestring
+    :return: The context vector.
+    :rtype: LayerOutput
+    """
+    assert attention_type in ['dot-product attention', 'additive attention']
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_query_proj' % name) as query_proj:
+        query_proj += full_matrix_projection(query)
+    query_proj = expand_layer(input=query_proj, expand_as=key)
+
+    with mixed_layer(
+            size=key_proj_size * head_num,
+            name='%s_key_proj' % name) as key_proj:
+        key_proj += full_matrix_projection(key)
+
+    with mixed_layer(
+            size=value_proj_size * head_num,
+            name='%s_value_proj' % name) as value_proj:
+        value_proj += full_matrix_projection(value)
+
+    head_list = []
+    for i in range(head_num):
+        with mixed_layer(size=key_proj_size) as sub_query_proj:
+            sub_query_proj += identity_projection(
+                query_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=key_proj_size) as sub_key_proj:
+            sub_key_proj += identity_projection(
+                key_proj, offset=key_proj_size * i, size=key_proj_size)
+
+        with mixed_layer(size=value_proj_size) as sub_value_proj:
+            sub_value_proj += identity_projection(
+                value_proj, offset=value_proj_size * i, size=value_proj_size)
+
+        if attention_type == 'dot-product attention':
+            m = dot_prod_layer(
+                input1=sub_query_proj,
+                input2=sub_key_proj,
+                name='%s_dot-product_%d' % (name, i))
+            m = slope_intercept_layer(
+                input=m,
+                slope=math.sqrt(1.0 / key_proj_size),
+                name='%s_dot-product_scaling_%d' % (name, i))
+        else:
+            with mixed_layer(
+                    size=key_proj_size,
+                    act=TanhActivation(),
+                    name='%s_combine_%d' % (name, i)) as m:
+                m += identity_projection(sub_query_proj)
+                m += identity_projection(sub_key_proj)
+
+        attention_weight = fc_layer(
+            input=m,
+            size=1,
+            act=SequenceSoftmaxActivation(),
+            param_attr=softmax_param_attr,
+            name="%s_softmax_%d" % (name, i),
+            bias_attr=False)
+
+        scaled = scaling_layer(
+            weight=attention_weight,
+            input=sub_value_proj,
+            name='%s_scaling_%d' % (name, i))
+        head = pooling_layer(
+            input=scaled,
+            pooling_type=SumPooling(),
+            name="%s_pooling_%d" % (name, i))
+
+        head_list.append(head)
+
+    attended = concat_layer(head_list)
+
+    return attended
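
A quick sanity check of the per-head slicing arithmetic used above (plain Python, purely illustrative):

..  code-block:: python

    key_proj_size, head_num = 64, 8
    # head i reads the slice [i * key_proj_size, (i + 1) * key_proj_size)
    # of the concatenated projection, whose total width is key_proj_size * head_num
    offsets = [i * key_proj_size for i in range(head_num)]
    assert offsets[-1] + key_proj_size == key_proj_size * head_num
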
 
 
 def inputs(layers, *args):
@@ -1409,7 +1724,7 @@ def inputs(layers, *args):
 
 def outputs(layers, *args):
     """
-    Declare the outputs of network. If user have not defined the inputs of
+    Declare the outputs of the network. If the user has not defined the inputs of the
     network, this method will calculate the input order by dfs travel.
 
     :param layers: Output layers.
@@ -1417,6 +1732,8 @@ def outputs(layers, *args):
     :return:
     """
 
+    traveled = set()
+
     def __dfs_travel__(layer,
                        predicate=lambda x: x.layer_type == LayerType.DATA):
         """
@@ -1428,6 +1745,11 @@ def outputs(layers, *args):
         :type layer: LayerOutput
         :return:
         """
+        if layer in traveled:
+            return []
+        else:
+            traveled.add(layer)
+
         assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
         retv = []
         if layer.parents is not None:
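
The traveled set turns the DFS into a memoized walk, so layers shared by several outputs are visited only once. The same pattern in isolation (a standalone sketch, not PaddlePaddle API; assumes each node exposes a parents list):

..  code-block:: python

    def dfs_inputs(layer, visited=None):
        # collect leaf (data) layers reachable from `layer`, skipping repeats
        if visited is None:
            visited = set()
        if id(layer) in visited:
            return []
        visited.add(id(layer))
        if not layer.parents:
            return [layer]
        found = []
        for parent in layer.parents:
            found.extend(dfs_inputs(parent, visited))
        return found
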
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index c3495ee110..c3cd4cf8c3 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -116,7 +116,7 @@ class AdamOptimizer(BaseSGDOptimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py
index 0c38a8dce5..e0aeb311b3 100644
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -15,8 +15,9 @@
 """
 
 __all__ = [
-    "BasePoolingType", "MaxPooling", "AvgPooling", "CudnnMaxPooling",
-    "CudnnAvgPooling", "SumPooling", "SquareRootNPooling"
+    "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling",
+    "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling",
+    "SumPooling", "SquareRootNPooling"
 ]
 
 
@@ -55,6 +56,19 @@ class MaxPooling(BasePoolingType):
         self.output_max_index = output_max_index
 
 
+class MaxWithMaskPooling(BasePoolingType):
+    """
+    MaxWithMask pooling.
+
+    Return not only the maximum values for each dimension in the sequence or time steps,
+    but also the location indices of the found maximum values.
+
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "max-pool-with-mask")
+
+
 class CudnnMaxPooling(BasePoolingType):
     """
     Cudnn max pooling only support GPU. Return the maxinum value in the
@@ -75,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType):
         BasePoolingType.__init__(self, "cudnn-avg-pool")
 
 
+class CudnnAvgInclPadPooling(BasePoolingType):
+    """
+    Cudnn average pooling only supports GPU. Return the average value in the
+    pooling window, taking the padding cells into account.
+    """
+
+    def __init__(self):
+        BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool")
+
+
 class AvgPooling(BasePoolingType):
     """
     Average pooling.
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 6c860fd497..580aef935b 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,17 +1,17 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_test(NAME test_reset_hook
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
   COMMAND
-  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
 )
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 981ccbf248..10c941f707 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -1,10 +1,17 @@
 #!/bin/bash
-export configs=(test_fc layer_activations projections test_print_layer
+export configs=(test_repeat_layer test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
+test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
+test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer
+test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer
+test_factorization_machine)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
index 9fda16a540..93b505a602 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-3, batch_size=1000)
@@ -12,6 +26,7 @@ img_conv = img_conv_layer(
     num_filters=64,
     filter_size=(32, 32),
     padding=(1, 1),
+    dilation=(1, 1),
     stride=(1, 1),
     act=LinearActivation())
 img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
diff --git a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
index 91849b40a0..745f060fa5 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_trans_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-3, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
index f87237f9b5..b6fc8f70f9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
index 7012dbf6a0..6edc03bba0 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
@@ -1,3 +1,16 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 '''
 Test all activations.
 '''
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index a607a62c99..59a71e1cd1 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py
index dc8975cb31..96f06b4018 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
@@ -1,3 +1,16 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 '''
 Test mixed layer, projections and operators.
 '''
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
index 1a577b8d9b..3e0f957648 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -62,6 +64,8 @@ layers {
   moving_average_fraction: 0.9
   height: 227
   width: 227
+  depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index 2818389b16..a18a4652e1 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 227
       img_size_y: 256
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -62,6 +64,8 @@ layers {
   moving_average_fraction: 0.9
   height: 256
   width: 256
+  depth: 1
+  epsilon: 1e-05
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
index 12b2255f3a..fee0f8e462 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
@@ -9,7 +9,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -21,7 +21,7 @@ layers {
   name: "__first_seq_1__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -33,7 +33,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -44,7 +44,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -55,7 +55,7 @@ layers {
   name: "__first_seq_2__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
@@ -67,7 +67,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 30
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
index 64530146a1..7254deb368 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -123,7 +123,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__simple_gru_0__"
   }
@@ -134,7 +134,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__simple_gru_1__"
   }
@@ -256,19 +256,15 @@ sub_models {
   memories {
     layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
     link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__simple_gru_0___transform"
     link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
     link_name: "__simple_gru_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__simple_gru_1___recurrent_group"
@@ -280,18 +276,14 @@ sub_models {
   memories {
     layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
     link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__simple_gru_1___transform"
     link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
     link_name: "__simple_gru_1__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
index 79fa4c74f0..75cf231203 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -104,7 +104,7 @@ layers {
   }
   bias_parameter_name: "lstm_bias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
@@ -183,7 +183,7 @@ layers {
   }
   bias_parameter_name: "lstm_bias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
@@ -205,7 +205,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_0__"
   }
@@ -216,7 +216,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_1__"
   }
@@ -341,24 +341,19 @@ sub_models {
   memories {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_0__"
     link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__lstm_group_1___recurrent_group"
@@ -373,23 +368,18 @@ sub_models {
   memories {
     layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_1__"
     link_name: "__mixed_1__@__lstm_group_1___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group"
     link_name: "__lstm_group_1__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
index 68fa881b4f..0d51f70ee0 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
@@ -138,7 +138,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__recurrent_layer_0__"
   }
@@ -149,7 +149,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__recurrent_layer_1__"
   }
@@ -161,7 +161,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstmemory_0__"
   }
@@ -172,7 +172,7 @@ layers {
   name: "__first_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstmemory_1__"
   }
@@ -184,7 +184,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_0__"
   }
@@ -195,7 +195,7 @@ layers {
   name: "__first_seq_2__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_1__"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
new file mode 100644
index 0000000000..9b69ae4a3b
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_BatchNorm3D.protostr
@@ -0,0 +1,93 @@
+type: "nn"
+layers {
+  name: "data3D"
+  type: "data"
+  size: 360
+  active_type: ""
+  height: 6
+  width: 20
+  depth: 3
+}
+layers {
+  name: "__batch_norm_0__"
+  type: "batch_norm"
+  size: 360
+  active_type: "relu"
+  inputs {
+    input_layer_name: "data3D"
+    input_parameter_name: "___batch_norm_0__.w0"
+    image_conf {
+      channels: 1
+      img_size: 20
+      img_size_y: 6
+      img_size_z: 3
+    }
+  }
+  inputs {
+    input_layer_name: "data3D"
+    input_parameter_name: "___batch_norm_0__.w1"
+  }
+  inputs {
+    input_layer_name: "data3D"
+    input_parameter_name: "___batch_norm_0__.w2"
+  }
+  bias_parameter_name: "___batch_norm_0__.wbias"
+  moving_average_fraction: 0.9
+  height: 6
+  width: 20
+  depth: 3
+  epsilon: 1e-05
+}
+parameters {
+  name: "___batch_norm_0__.w0"
+  size: 1
+  initial_mean: 1.0
+  initial_std: 0.0
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___batch_norm_0__.w1"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+  is_static: true
+  is_shared: true
+}
+parameters {
+  name: "___batch_norm_0__.w2"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+  is_static: true
+  is_shared: true
+}
+parameters {
+  name: "___batch_norm_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data3D"
+output_layer_names: "__batch_norm_0__"
+sub_models {
+  name: "root"
+  layer_names: "data3D"
+  layer_names: "__batch_norm_0__"
+  input_layer_names: "data3D"
+  output_layer_names: "__batch_norm_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
index b110e91498..8a1399efad 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr
@@ -74,6 +74,9 @@ layers {
   inputs {
     input_layer_name: "__bidirectional_gru_0___bw"
   }
+  height: 0
+  width: 0
+  depth: 1
 }
 parameters {
   name: "___bidirectional_gru_0___fw_transform.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
index fd5224ca55..25ec632375 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -28,6 +28,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
new file mode 100644
index 0000000000..4b9578a0c0
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
@@ -0,0 +1,31 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__clip_0__"
+  type: "clip"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    clip_conf {
+      min: -10
+      max: 10
+    }
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__clip_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__clip_0__"
+  input_layer_names: "input"
+  output_layer_names: "__clip_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
new file mode 100644
index 0000000000..9fe2bc29d3
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_conv3d_layer.protostr
@@ -0,0 +1,132 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 36288
+  active_type: ""
+  height: 48
+  width: 42
+  depth: 6
+}
+layers {
+  name: "conv3d_1"
+  type: "conv3d"
+  size: 24192
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_conv3d_1.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 21
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 3
+      img_size_z: 6
+    }
+  }
+  bias_parameter_name: "_conv3d_1.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 24
+  width: 21
+  depth: 3
+}
+layers {
+  name: "conv3d_2"
+  type: "conv3d"
+  size: 24192
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_conv3d_2.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 21
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 3
+      img_size_z: 6
+    }
+  }
+  bias_parameter_name: "_conv3d_2.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 24
+  width: 21
+  depth: 3
+}
+parameters {
+  name: "_conv3d_1.w0"
+  size: 1296
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_1.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_2.w0"
+  size: 1296
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_conv3d_2.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "conv3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "conv3d_1"
+  layer_names: "conv3d_2"
+  input_layer_names: "data"
+  output_layer_names: "conv3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 05847344be..55ab464ddf 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -167,6 +167,20 @@ layers {
   softmax_selfnorm_alpha: 0.1
   coeff: 1.0
 }
+layers {
+  name: "__huber_regression_cost_0__"
+  type: "huber_regression"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  coeff: 1.0
+  delta: 1.0
+}
 layers {
   name: "huber_probs"
   type: "data"
@@ -180,8 +194,8 @@ layers {
   active_type: ""
 }
 layers {
-  name: "__huber_cost_0__"
-  type: "huber"
+  name: "__huber_classification_cost_0__"
+  type: "huber_classification"
   size: 1
   active_type: ""
   inputs {
@@ -300,7 +314,8 @@ output_layer_names: "__rank_cost_0__"
 output_layer_names: "__lambda_cost_0__"
 output_layer_names: "__cross_entropy_0__"
 output_layer_names: "__cross_entropy_with_selfnorm_0__"
-output_layer_names: "__huber_cost_0__"
+output_layer_names: "__huber_regression_cost_0__"
+output_layer_names: "__huber_classification_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
 output_layer_names: "__nce_layer_0__"
@@ -324,9 +339,10 @@ sub_models {
   layer_names: "__lambda_cost_0__"
   layer_names: "__cross_entropy_0__"
   layer_names: "__cross_entropy_with_selfnorm_0__"
+  layer_names: "__huber_regression_cost_0__"
   layer_names: "huber_probs"
   layer_names: "huber_label"
-  layer_names: "__huber_cost_0__"
+  layer_names: "__huber_classification_cost_0__"
   layer_names: "__multi_binary_label_cross_entropy_0__"
   layer_names: "__sum_cost_0__"
   layer_names: "__nce_layer_0__"
@@ -349,7 +365,8 @@ sub_models {
   output_layer_names: "__lambda_cost_0__"
   output_layer_names: "__cross_entropy_0__"
   output_layer_names: "__cross_entropy_with_selfnorm_0__"
-  output_layer_names: "__huber_cost_0__"
+  output_layer_names: "__huber_regression_cost_0__"
+  output_layer_names: "__huber_classification_cost_0__"
   output_layer_names: "__multi_binary_label_cross_entropy_0__"
   output_layer_names: "__sum_cost_0__"
   output_layer_names: "__nce_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index b7d74f85ab..cec8a73db6 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -45,7 +45,7 @@ layers {
   coeff: 1.0
 }
 layers {
-  name: "__mse_cost_0__"
+  name: "__square_error_cost_0__"
   type: "square_error"
   size: 1
   active_type: ""
@@ -130,7 +130,8 @@ input_layer_names: "label"
 input_layer_names: "weight"
 input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
-output_layer_names: "__mse_cost_0__"
+output_layer_names: "__square_error_cost_0__"
+output_layer_names: "__nce_layer_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -145,7 +146,7 @@ sub_models {
   layer_names: "weight"
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
-  layer_names: "__mse_cost_0__"
+  layer_names: "__square_error_cost_0__"
   layer_names: "multi_class_label"
   layer_names: "__nce_layer_0__"
   input_layer_names: "input"
@@ -153,7 +154,8 @@ sub_models {
   input_layer_names: "weight"
   input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
-  output_layer_names: "__mse_cost_0__"
+  output_layer_names: "__square_error_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
new file mode 100644
index 0000000000..a602569697
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cross_entropy_over_beam.protostr
@@ -0,0 +1,207 @@
+type: "nn"
+layers {
+  name: "sentence_states"
+  type: "data"
+  size: 32
+  active_type: ""
+}
+layers {
+  name: "sentence_scores"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "__kmax_seq_score_layer_0__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  beam_size: 5
+}
+layers {
+  name: "__sub_nested_seq_layer_0__"
+  type: "sub_nested_seq"
+  size: 32
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_states"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_0__"
+  }
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__sub_nested_seq_layer_0__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__kmax_seq_score_layer_1__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  beam_size: 5
+}
+layers {
+  name: "__seq_slice_layer_0__"
+  type: "seq_slice"
+  size: 32
+  active_type: ""
+  inputs {
+    input_layer_name: "__sub_nested_seq_layer_0__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_1__"
+  }
+  select_first: true
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__seq_slice_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+layers {
+  name: "__kmax_seq_score_layer_2__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_1__"
+  }
+  beam_size: 5
+}
+layers {
+  name: "sentences_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "start_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "end_ids"
+  type: "data"
+  size: 1
+  active_type: ""
+}
+layers {
+  name: "__cross_entropy_over_beam_0__"
+  type: "cross_entropy_over_beam"
+  active_type: ""
+  inputs {
+    input_layer_name: "sentence_scores"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_0__"
+  }
+  inputs {
+    input_layer_name: "sentences_ids"
+  }
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_1__"
+  }
+  inputs {
+    input_layer_name: "start_ids"
+  }
+  inputs {
+    input_layer_name: "__fc_layer_1__"
+  }
+  inputs {
+    input_layer_name: "__kmax_seq_score_layer_2__"
+  }
+  inputs {
+    input_layer_name: "end_ids"
+  }
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "sentence_scores"
+input_layer_names: "sentences_ids"
+input_layer_names: "sentence_states"
+input_layer_names: "start_ids"
+input_layer_names: "end_ids"
+output_layer_names: "__cross_entropy_over_beam_0__"
+sub_models {
+  name: "root"
+  layer_names: "sentence_states"
+  layer_names: "sentence_scores"
+  layer_names: "__kmax_seq_score_layer_0__"
+  layer_names: "__sub_nested_seq_layer_0__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__kmax_seq_score_layer_1__"
+  layer_names: "__seq_slice_layer_0__"
+  layer_names: "__fc_layer_1__"
+  layer_names: "__kmax_seq_score_layer_2__"
+  layer_names: "sentences_ids"
+  layer_names: "start_ids"
+  layer_names: "end_ids"
+  layer_names: "__cross_entropy_over_beam_0__"
+  input_layer_names: "sentence_scores"
+  input_layer_names: "sentences_ids"
+  input_layer_names: "sentence_states"
+  input_layer_names: "start_ids"
+  input_layer_names: "end_ids"
+  output_layer_names: "__cross_entropy_over_beam_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
new file mode 100644
index 0000000000..7bf409731c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_deconv3d_layer.protostr
@@ -0,0 +1,132 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 36288
+  active_type: ""
+  height: 48
+  width: 42
+  depth: 6
+}
+layers {
+  name: "deconv3d_1"
+  type: "deconv3d"
+  size: 1387760
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_deconv3d_1.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 16
+      output_x: 42
+      img_size: 83
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 48
+      img_size_y: 95
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 6
+      img_size_z: 11
+    }
+  }
+  bias_parameter_name: "_deconv3d_1.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 95
+  width: 83
+  depth: 11
+}
+layers {
+  name: "deconv3d_2"
+  type: "deconv3d"
+  size: 1387760
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "_deconv3d_2.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 16
+      output_x: 42
+      img_size: 83
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 48
+      img_size_y: 95
+      filter_size_z: 3
+      padding_z: 1
+      stride_z: 2
+      output_z: 6
+      img_size_z: 11
+    }
+  }
+  bias_parameter_name: "_deconv3d_2.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 95
+  width: 83
+  depth: 11
+}
+parameters {
+  name: "_deconv3d_1.w0"
+  size: 6912
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_1.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_2.w0"
+  size: 6912
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "_deconv3d_2.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "deconv3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "deconv3d_1"
+  layer_names: "deconv3d_2"
+  input_layer_names: "data"
+  output_layer_names: "deconv3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
new file mode 100644
index 0000000000..6690f9852a
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
@@ -0,0 +1,66 @@
+type: "nn"
+layers {
+  name: "input_loc"
+  type: "data"
+  size: 16
+  active_type: ""
+  height: 16
+  width: 1
+}
+layers {
+  name: "input_conf"
+  type: "data"
+  size: 8
+  active_type: ""
+  height: 1
+  width: 8
+}
+layers {
+  name: "priorbox"
+  type: "data"
+  size: 32
+  active_type: ""
+  height: 4
+  width: 8
+}
+layers {
+  name: "test_detection_output"
+  type: "detection_output"
+  size: 1400
+  active_type: ""
+  inputs {
+    input_layer_name: "priorbox"
+    detection_output_conf {
+      num_classes: 21
+      nms_threshold: 0.45
+      nms_top_k: 400
+      background_id: 0
+      input_num: 1
+      keep_top_k: 200
+      confidence_threshold: 0.01
+    }
+  }
+  inputs {
+    input_layer_name: "input_loc"
+  }
+  inputs {
+    input_layer_name: "input_conf"
+  }
+}
+input_layer_names: "priorbox"
+input_layer_names: "input_loc"
+input_layer_names: "input_conf"
+output_layer_names: "test_detection_output"
+sub_models {
+  name: "root"
+  layer_names: "input_loc"
+  layer_names: "input_conf"
+  layer_names: "priorbox"
+  layer_names: "test_detection_output"
+  input_layer_names: "priorbox"
+  input_layer_names: "input_loc"
+  input_layer_names: "input_conf"
+  output_layer_names: "test_detection_output"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
new file mode 100644
index 0000000000..f1530c382c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_dot_prod_layer.protostr
@@ -0,0 +1,38 @@
+type: "nn"
+layers {
+  name: "vector1"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "vector2"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__dot_prod_layer_0__"
+  type: "dot_prod"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "vector1"
+  }
+  inputs {
+    input_layer_name: "vector2"
+  }
+}
+input_layer_names: "vector1"
+input_layer_names: "vector2"
+output_layer_names: "__dot_prod_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "vector1"
+  layer_names: "vector2"
+  layer_names: "__dot_prod_layer_0__"
+  input_layer_names: "vector1"
+  input_layer_names: "vector2"
+  output_layer_names: "__dot_prod_layer_0__"
+  is_recurrent_layer_group: false
+}
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
new file mode 100644
index 0000000000..4f3002b199
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 1024
+  active_type: ""
+}
+layers {
+  name: "__factorization_machine_0__"
+  type: "factorization_machine"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___factorization_machine_0__.w0"
+  }
+  factor_size: 10
+}
+parameters {
+  name: "___factorization_machine_0__.w0"
+  size: 10240
+  initial_mean: 0.0
+  initial_std: 0.03125
+  dims: 1024
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__factorization_machine_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__factorization_machine_0__"
+  input_layer_names: "data"
+  output_layer_names: "__factorization_machine_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
new file mode 100644
index 0000000000..f1e4d894a5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
@@ -0,0 +1,106 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 256
+  active_type: ""
+}
+layers {
+  name: "__gated_unit_layer_0___input_proj"
+  type: "fc"
+  size: 512
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___input_proj.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gate"
+  type: "fc"
+  size: 512
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___gate.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___gate.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gated_act"
+  type: "mixed"
+  size: 512
+  active_type: ""
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___input_proj"
+  }
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___gate"
+  }
+  error_clipping_threshold: 100.0
+  operator_confs {
+    type: "dot_mul"
+    input_indices: 0
+    input_indices: 1
+    input_sizes: 512
+    input_sizes: 512
+    output_size: 512
+    dotmul_scale: 1
+  }
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "input"
+output_layer_names: "__gated_unit_layer_0___gated_act"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__gated_unit_layer_0___input_proj"
+  layer_names: "__gated_unit_layer_0___gate"
+  layer_names: "__gated_unit_layer_0___gated_act"
+  input_layer_names: "input"
+  output_layer_names: "__gated_unit_layer_0___gated_act"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
new file mode 100644
index 0000000000..f93d368c86
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
@@ -0,0 +1,59 @@
+type: "nn"
+layers {
+  name: "input_seq"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 1
+  active_type: "exponential"
+  inputs {
+    input_layer_name: "input_seq"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__kmax_seq_score_layer_0__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  beam_size: 5
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 128
+  initial_mean: 0.0
+  initial_std: 0.0883883476483
+  dims: 128
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "input_seq"
+output_layer_names: "__kmax_seq_score_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input_seq"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__kmax_seq_score_layer_0__"
+  input_layer_names: "input_seq"
+  output_layer_names: "__kmax_seq_score_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
new file mode 100644
index 0000000000..9ba33689ed
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_l2_distance_layer.protostr
@@ -0,0 +1,39 @@
+type: "nn"
+layers {
+  name: "x"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "y"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__l2_distance_layer_0__"
+  type: "l2_distance"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "x"
+  }
+  inputs {
+    input_layer_name: "y"
+  }
+}
+input_layer_names: "x"
+input_layer_names: "y"
+output_layer_names: "__l2_distance_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "x"
+  layer_names: "y"
+  layer_names: "__l2_distance_layer_0__"
+  input_layer_names: "x"
+  input_layer_names: "y"
+  output_layer_names: "__l2_distance_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
index 03f4f3a31d..39dc487146 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
@@ -105,6 +107,8 @@ layers {
       stride_y: 1
       output_y: 24
       img_size_y: 24
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_1__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
new file mode 100644
index 0000000000..0ba84dcc6d
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
@@ -0,0 +1,79 @@
+type: "nn"
+layers {
+  name: "input_loc"
+  type: "data"
+  size: 16
+  active_type: ""
+  height: 16
+  width: 1
+}
+layers {
+  name: "input_conf"
+  type: "data"
+  size: 8
+  active_type: ""
+  height: 1
+  width: 8
+}
+layers {
+  name: "priorbox"
+  type: "data"
+  size: 32
+  active_type: ""
+  height: 4
+  width: 8
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 24
+  active_type: ""
+  height: 4
+  width: 6
+}
+layers {
+  name: "test_multibox_loss"
+  type: "multibox_loss"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "priorbox"
+    multibox_loss_conf {
+      num_classes: 21
+      overlap_threshold: 0.5
+      neg_pos_ratio: 3.0
+      neg_overlap: 0.5
+      background_id: 0
+      input_num: 1
+    }
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  inputs {
+    input_layer_name: "input_loc"
+  }
+  inputs {
+    input_layer_name: "input_conf"
+  }
+}
+input_layer_names: "priorbox"
+input_layer_names: "label"
+input_layer_names: "input_loc"
+input_layer_names: "input_conf"
+output_layer_names: "test_multibox_loss"
+sub_models {
+  name: "root"
+  layer_names: "input_loc"
+  layer_names: "input_conf"
+  layer_names: "priorbox"
+  layer_names: "label"
+  layer_names: "test_multibox_loss"
+  input_layer_names: "priorbox"
+  input_layer_names: "label"
+  input_layer_names: "input_loc"
+  input_layer_names: "input_conf"
+  output_layer_names: "test_multibox_loss"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
index 15c6ab4dc8..d5d6d31a17 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -30,6 +30,8 @@ layers {
       stride_y: 1
       output_y: 48
       img_size_y: 48
+      dilation: 1
+      dilation_y: 1
     }
   }
   bias_parameter_name: "___conv_0__.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
new file mode 100644
index 0000000000..8eb98593f6
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pooling3D_layer.protostr
@@ -0,0 +1,123 @@
+type: "nn"
+layers {
+  name: "data_2d"
+  type: "data"
+  size: 6000
+  active_type: ""
+  height: 20
+  width: 10
+}
+layers {
+  name: "pool___2d"
+  type: "pool"
+  size: 840
+  active_type: ""
+  inputs {
+    input_layer_name: "data_2d"
+    pool_conf {
+      pool_type: "avg-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+    }
+  }
+  height: 7
+  width: 4
+}
+layers {
+  name: "data_3d_1"
+  type: "data"
+  size: 60000
+  active_type: ""
+  height: 20
+  width: 10
+  depth: 10
+}
+layers {
+  name: "pool_3d_1"
+  type: "pool3d"
+  size: 3360
+  active_type: ""
+  inputs {
+    input_layer_name: "data_3d_1"
+    pool_conf {
+      pool_type: "avg-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+      size_z: 5
+      stride_z: 3
+      output_z: 4
+      img_size_z: 10
+      padding_z: 1
+    }
+  }
+  height: 7
+  width: 4
+  depth: 4
+}
+layers {
+  name: "pool_3d_2"
+  type: "pool3d"
+  size: 3360
+  active_type: ""
+  inputs {
+    input_layer_name: "data_3d_1"
+    pool_conf {
+      pool_type: "max-projection"
+      channels: 30
+      size_x: 5
+      stride: 3
+      output_x: 4
+      img_size: 10
+      padding: 1
+      size_y: 5
+      stride_y: 3
+      output_y: 7
+      img_size_y: 20
+      padding_y: 1
+      size_z: 5
+      stride_z: 3
+      output_z: 4
+      img_size_z: 10
+      padding_z: 1
+    }
+  }
+  height: 7
+  width: 4
+  depth: 4
+}
+input_layer_names: "data_2d"
+output_layer_names: "pool___2d"
+output_layer_names: "pool_3d_1"
+output_layer_names: "pool_3d_2"
+sub_models {
+  name: "root"
+  layer_names: "data_2d"
+  layer_names: "pool___2d"
+  layer_names: "data_3d_1"
+  layer_names: "pool_3d_1"
+  layer_names: "pool_3d_2"
+  input_layer_names: "data_2d"
+  output_layer_names: "pool___2d"
+  output_layer_names: "pool_3d_1"
+  output_layer_names: "pool_3d_2"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
new file mode 100644
index 0000000000..63fb38c650
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -0,0 +1,144 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+  height: 10
+  width: 10
+}
+layers {
+  name: "__prelu_layer_0__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_0__.w0"
+  }
+  partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_1__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_1__.w0"
+  }
+  partial_sum: 1
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_2__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_2__.w0"
+  }
+  partial_sum: 5
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_3__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_3__.w0"
+  }
+  partial_sum: 300
+  height: 10
+  width: 10
+  depth: 1
+}
+layers {
+  name: "__prelu_layer_4__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_4__.w0"
+  }
+  partial_sum: 100
+  height: 10
+  width: 10
+  depth: 1
+}
+parameters {
+  name: "___prelu_layer_0__.w0"
+  size: 300
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_1__.w0"
+  size: 300
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 300
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_2__.w0"
+  size: 60
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 60
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_3__.w0"
+  size: 1
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___prelu_layer_4__.w0"
+  size: 3
+  initial_mean: 0.25
+  initial_std: 0.0
+  dims: 1
+  dims: 3
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "input"
+output_layer_names: "__prelu_layer_4__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__prelu_layer_0__"
+  layer_names: "__prelu_layer_1__"
+  layer_names: "__prelu_layer_2__"
+  layer_names: "__prelu_layer_3__"
+  layer_names: "__prelu_layer_4__"
+  input_layer_names: "input"
+  output_layer_names: "__prelu_layer_4__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
index c402aff174..f4cc492dfb 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr
@@ -12,6 +12,7 @@ layers {
   inputs {
     input_layer_name: "input"
   }
+  user_arg: "layer=input %s"
 }
 input_layer_names: "input"
 output_layer_names: "input"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
new file mode 100644
index 0000000000..046037936a
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
@@ -0,0 +1,593 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__addto_0__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  inputs {
+    input_layer_name: "data"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_1__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_2__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_3__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_4__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_5__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_6__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_7__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_8__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_9__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_10__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_11__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_12__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_13__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_14__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_15__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_16__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_17__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_18__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_19__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_20__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_21__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_22__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_23__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_24__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_25__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_26__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_27__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_28__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_29__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_30__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__addto_31__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+  height: 0
+  width: 0
+  depth: 1
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 32
+  active_type: "relu"
+  inputs {
+    input_layer_name: "__addto_31__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 3200
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 32
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 32
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 320
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 10
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 10
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__fc_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__addto_0__"
+  layer_names: "__addto_1__"
+  layer_names: "__addto_2__"
+  layer_names: "__addto_3__"
+  layer_names: "__addto_4__"
+  layer_names: "__addto_5__"
+  layer_names: "__addto_6__"
+  layer_names: "__addto_7__"
+  layer_names: "__addto_8__"
+  layer_names: "__addto_9__"
+  layer_names: "__addto_10__"
+  layer_names: "__addto_11__"
+  layer_names: "__addto_12__"
+  layer_names: "__addto_13__"
+  layer_names: "__addto_14__"
+  layer_names: "__addto_15__"
+  layer_names: "__addto_16__"
+  layer_names: "__addto_17__"
+  layer_names: "__addto_18__"
+  layer_names: "__addto_19__"
+  layer_names: "__addto_20__"
+  layer_names: "__addto_21__"
+  layer_names: "__addto_22__"
+  layer_names: "__addto_23__"
+  layer_names: "__addto_24__"
+  layer_names: "__addto_25__"
+  layer_names: "__addto_26__"
+  layer_names: "__addto_27__"
+  layer_names: "__addto_28__"
+  layer_names: "__addto_29__"
+  layer_names: "__addto_30__"
+  layer_names: "__addto_31__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__fc_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__fc_layer_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
new file mode 100644
index 0000000000..e012386ff9
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_repeat_layer.protostr
@@ -0,0 +1,42 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__repeat_layer_0__"
+  type: "featmap_expand"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+}
+layers {
+  name: "__repeat_layer_1__"
+  type: "featmap_expand"
+  size: 300
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "data"
+  }
+  num_filters: 10
+  user_arg: "as_col_vec"
+}
+input_layer_names: "data"
+output_layer_names: "__repeat_layer_0__"
+output_layer_names: "__repeat_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__repeat_layer_0__"
+  layer_names: "__repeat_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__repeat_layer_0__"
+  output_layer_names: "__repeat_layer_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
new file mode 100644
index 0000000000..9399252b23
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__resize_0__"
+  type: "resize"
+  size: 150
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__resize_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__resize_0__"
+  input_layer_names: "input"
+  output_layer_names: "__resize_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index 77b447aa9d..711785be37 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -91,7 +91,7 @@ layers {
   name: "__last_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_forward"
   }
@@ -140,7 +140,7 @@ layers {
   name: "__first_seq_0__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_back"
   }
@@ -155,7 +155,7 @@ layers {
 }
 layers {
   name: "sub_seq_input@__recurrent_group_2__"
-  type: "sequence_scatter_agent"
+  type: "scatter_agent"
   size: 100
   active_type: ""
 }
@@ -182,7 +182,7 @@ layers {
 }
 layers {
   name: "rnn_subseq_forward"
-  type: "sequence_gather_agent"
+  type: "gather_agent"
   size: 200
   active_type: ""
 }
@@ -190,7 +190,7 @@ layers {
   name: "__last_seq_1__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "rnn_subseq_forward"
   }
@@ -258,7 +258,7 @@ layers {
   }
   bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
@@ -280,7 +280,7 @@ layers {
   name: "__last_seq_2__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__lstm_group_0__"
   }
@@ -329,7 +329,7 @@ layers {
   name: "__last_seq_3__"
   type: "seqlastins"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__gru_group_0__"
   }
@@ -378,7 +378,7 @@ layers {
   name: "__last_seq_4__"
   type: "seqlastins"
   size: 200
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "__fc_layer_0__"
   }
@@ -618,19 +618,15 @@ sub_models {
   memories {
     layer_name: "rnn_forward@__recurrent_group_0__"
     link_name: "rnn_forward+delay1@__recurrent_group_0__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_0__"
-    has_subseq: false
   }
   out_links {
     layer_name: "rnn_forward@__recurrent_group_0__"
     link_name: "rnn_forward"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_1__"
@@ -642,19 +638,15 @@ sub_models {
   memories {
     layer_name: "rnn_back@__recurrent_group_1__"
     link_name: "rnn_back+delay1@__recurrent_group_1__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_1__"
-    has_subseq: false
   }
   out_links {
     layer_name: "rnn_back@__recurrent_group_1__"
     link_name: "rnn_back"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_2__"
@@ -666,19 +658,15 @@ sub_models {
   memories {
     layer_name: "rnn_subseq_forward@__recurrent_group_2__"
     link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__"
-    is_sequence: false
   }
   in_links {
     layer_name: "sub_seq_input"
     link_name: "sub_seq_input@__recurrent_group_2__"
-    has_subseq: true
   }
   out_links {
     layer_name: "rnn_subseq_forward@__recurrent_group_2__"
     link_name: "rnn_subseq_forward"
-    has_subseq: true
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__lstm_group_0___recurrent_group"
@@ -693,24 +681,19 @@ sub_models {
   memories {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   memories {
     layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_0__"
     link_name: "__mixed_0__@__lstm_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group"
     link_name: "__lstm_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__gru_group_0___recurrent_group"
@@ -722,19 +705,15 @@ sub_models {
   memories {
     layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
     link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group"
-    is_sequence: false
   }
   in_links {
     layer_name: "__mixed_1__"
     link_name: "__mixed_1__@__gru_group_0___recurrent_group"
-    has_subseq: false
   }
   out_links {
     layer_name: "__gru_group_0__@__gru_group_0___recurrent_group"
     link_name: "__gru_group_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 sub_models {
   name: "__recurrent_group_3__"
@@ -746,18 +725,14 @@ sub_models {
   memories {
     layer_name: "__fc_layer_0__@__recurrent_group_3__"
     link_name: "__memory_6__@__recurrent_group_3__"
-    is_sequence: false
   }
   in_links {
     layer_name: "seq_input"
     link_name: "seq_input@__recurrent_group_3__"
-    has_subseq: false
   }
   out_links {
     layer_name: "__fc_layer_0__@__recurrent_group_3__"
     link_name: "__fc_layer_0__"
-    has_subseq: false
   }
-  target_inlinkid: -1
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
new file mode 100644
index 0000000000..0ec88aa998
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_roi_pool_layer.protostr
@@ -0,0 +1,100 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 588
+  active_type: ""
+  height: 14
+  width: 14
+}
+layers {
+  name: "rois"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 3136
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 3
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 3
+      output_x: 14
+      img_size: 14
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 14
+      img_size_y: 14
+      dilation: 1
+      dilation_y: 1
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 14
+  width: 14
+}
+layers {
+  name: "__roi_pool_0__"
+  type: "roi_pool"
+  size: 784
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    roi_pool_conf {
+      pooled_width: 7
+      pooled_height: 7
+      spatial_scale: 0.0625
+    }
+  }
+  inputs {
+    input_layer_name: "rois"
+  }
+  height: 7
+  width: 7
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 432
+  initial_mean: 0.0
+  initial_std: 0.272165526976
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+input_layer_names: "rois"
+output_layer_names: "__roi_pool_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "rois"
+  layer_names: "__conv_0__"
+  layer_names: "__roi_pool_0__"
+  input_layer_names: "data"
+  input_layer_names: "rois"
+  output_layer_names: "__roi_pool_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
new file mode 100644
index 0000000000..19c9f16574
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
@@ -0,0 +1,41 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2560
+  active_type: ""
+}
+layers {
+  name: "__row_conv_layer_0__"
+  type: "row_conv"
+  size: 2560
+  active_type: "relu"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___row_conv_layer_0__.w0"
+    row_conv_conf {
+      context_length: 19
+    }
+  }
+}
+parameters {
+  name: "___row_conv_layer_0__.w0"
+  size: 48640
+  initial_mean: 0.0
+  initial_std: 0.229415733871
+  dims: 19
+  dims: 2560
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__row_conv_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__row_conv_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__row_conv_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
new file mode 100644
index 0000000000..c2786ff55c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__row_l2_norm_layer_0__"
+  type: "row_l2_norm"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__row_l2_norm_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__row_l2_norm_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__row_l2_norm_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
new file mode 100644
index 0000000000..35ade126a2
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_shift_layer.protostr
@@ -0,0 +1,72 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__scale_shift_0__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_0__.w0"
+  }
+}
+layers {
+  name: "__scale_shift_1__"
+  type: "scale_shift"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___scale_shift_1__.w0"
+  }
+  bias_parameter_name: "___scale_shift_1__.wbias"
+}
+parameters {
+  name: "___scale_shift_0__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.w0"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 1.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___scale_shift_1__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__scale_shift_0__"
+output_layer_names: "__scale_shift_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__scale_shift_0__"
+  layer_names: "__scale_shift_1__"
+  input_layer_names: "data"
+  output_layer_names: "__scale_shift_0__"
+  output_layer_names: "__scale_shift_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
new file mode 100644
index 0000000000..d20133a10e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_scale_sub_region_layer.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "indices"
+  type: "data"
+  size: 6
+  active_type: ""
+}
+layers {
+  name: "__scale_sub_region_0__"
+  type: "scale_sub_region"
+  size: 2016
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    scale_sub_region_conf {
+      image_conf {
+        channels: 1
+        img_size: 42
+        img_size_y: 48
+      }
+      value: 0.0
+    }
+  }
+  inputs {
+    input_layer_name: "indices"
+  }
+  height: 48
+  width: 42
+}
+input_layer_names: "data"
+input_layer_names: "indices"
+output_layer_names: "__scale_sub_region_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "indices"
+  layer_names: "__scale_sub_region_0__"
+  input_layer_names: "data"
+  input_layer_names: "indices"
+  output_layer_names: "__scale_sub_region_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
index 91284b4fb3..9d1b41c9d5 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -27,7 +27,7 @@ layers {
   name: "__seqreshape_0__"
   type: "seqreshape"
   size: 5
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "data1"
   }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
new file mode 100644
index 0000000000..5b73d614fe
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_slice_layer.protostr
@@ -0,0 +1,79 @@
+type: "nn"
+layers {
+  name: "word"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "starts"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "ends"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__seq_slice_layer_0__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+}
+layers {
+  name: "__seq_slice_layer_1__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "starts"
+  }
+  select_first: true
+}
+layers {
+  name: "__seq_slice_layer_2__"
+  type: "seq_slice"
+  size: 128
+  active_type: ""
+  inputs {
+    input_layer_name: "word"
+  }
+  inputs {
+    input_layer_name: "ends"
+  }
+  select_first: false
+}
+input_layer_names: "word"
+output_layer_names: "__seq_slice_layer_0__"
+output_layer_names: "__seq_slice_layer_1__"
+output_layer_names: "__seq_slice_layer_2__"
+sub_models {
+  name: "root"
+  layer_names: "word"
+  layer_names: "starts"
+  layer_names: "ends"
+  layer_names: "__seq_slice_layer_0__"
+  layer_names: "__seq_slice_layer_1__"
+  layer_names: "__seq_slice_layer_2__"
+  input_layer_names: "word"
+  output_layer_names: "__seq_slice_layer_0__"
+  output_layer_names: "__seq_slice_layer_1__"
+  output_layer_names: "__seq_slice_layer_2__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
index 1999c006d2..8989561df0 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
@@ -9,76 +9,118 @@ layers {
   name: "__seq_pooling_0__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_1__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_2__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   average_strategy: "average"
   trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_3__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   average_strategy: "average"
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_4__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   average_strategy: "sum"
   trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_5__"
   type: "average"
   size: 100
-  active_type: "linear"
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   average_strategy: "sum"
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__seq_pooling_6__"
   type: "max"
   size: 100
-  active_type: "linear"
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_7__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "average"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_8__"
+  type: "average"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "dat_in"
+  }
+  average_strategy: "sum"
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__seq_pooling_9__"
+  type: "max"
+  size: 100
+  active_type: ""
   inputs {
     input_layer_name: "dat_in"
   }
   output_max_index: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 input_layer_names: "dat_in"
 output_layer_names: "__seq_pooling_0__"
@@ -88,6 +130,9 @@ output_layer_names: "__seq_pooling_3__"
 output_layer_names: "__seq_pooling_4__"
 output_layer_names: "__seq_pooling_5__"
 output_layer_names: "__seq_pooling_6__"
+output_layer_names: "__seq_pooling_7__"
+output_layer_names: "__seq_pooling_8__"
+output_layer_names: "__seq_pooling_9__"
 sub_models {
   name: "root"
   layer_names: "dat_in"
@@ -98,6 +143,9 @@ sub_models {
   layer_names: "__seq_pooling_4__"
   layer_names: "__seq_pooling_5__"
   layer_names: "__seq_pooling_6__"
+  layer_names: "__seq_pooling_7__"
+  layer_names: "__seq_pooling_8__"
+  layer_names: "__seq_pooling_9__"
   input_layer_names: "dat_in"
   output_layer_names: "__seq_pooling_0__"
   output_layer_names: "__seq_pooling_1__"
@@ -106,6 +154,9 @@ sub_models {
   output_layer_names: "__seq_pooling_4__"
   output_layer_names: "__seq_pooling_5__"
   output_layer_names: "__seq_pooling_6__"
+  output_layer_names: "__seq_pooling_7__"
+  output_layer_names: "__seq_pooling_8__"
+  output_layer_names: "__seq_pooling_9__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
new file mode 100644
index 0000000000..4b906b113e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sub_nested_seq_select_layer.protostr
@@ -0,0 +1,37 @@
+type: "nn"
+layers {
+  name: "input_seq"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "input"
+  type: "data"
+  size: 5
+  active_type: ""
+}
+layers {
+  name: "__sub_nested_seq_layer_0__"
+  type: "sub_nested_seq"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input_seq"
+  }
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input_seq"
+output_layer_names: "__sub_nested_seq_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input_seq"
+  layer_names: "input"
+  layer_names: "__sub_nested_seq_layer_0__"
+  input_layer_names: "input_seq"
+  output_layer_names: "__sub_nested_seq_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
index d0ad388165..7a2f3eab38 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr
@@ -22,6 +22,9 @@ layers {
   inputs {
     input_layer_name: "b"
   }
+  height: 0
+  width: 0
+  depth: 1
 }
 layers {
   name: "__concat_0__"
@@ -34,6 +37,9 @@ layers {
   inputs {
     input_layer_name: "b"
   }
+  height: 0
+  width: 0
+  depth: 1
 }
 layers {
   name: "__concat_1__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
index 7c848ef3fc..69a0a5b8ff 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
index c19bb9685a..97b41fb372 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
index 05810597b3..4e653dedb9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
@@ -20,12 +34,13 @@ lstm1 = lstmemory_group(
     input=m1,
     param_attr=lstm_param,
     lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)
+
 lstm2 = lstmemory_group(
     input=m2,
     param_attr=lstm_param,
     lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)
 
 softmax_param = ParamAttr(name='softmax_param')
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
index a5b5bb30b1..dc418325f8 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-4)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
new file mode 100644
index 0000000000..5b98e3fb34
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_BatchNorm3D.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-4)
+
+#data = data_layer(name='data', size=180, width=30, height=6)
+#batchNorm = batch_norm_layer(data, num_channels=1)
+#outputs(batchNorm)
+
+data3D = data_layer(name='data3D', size=120 * 3, width=20, height=6, depth=3)
+batchNorm3D = batch_norm_layer(data3D, num_channels=1, img3D=True)
+outputs(batchNorm3D)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
index cd7f609638..f3abdfe1ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-4)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
index be83f4f83c..4eb9f207e0 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_bilinear_interp.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/v1_api_demo/mnist/data/generate_list.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
similarity index 70%
rename from v1_api_demo/mnist/data/generate_list.py
rename to python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
index 49981cc7a9..24564c105f 100644
--- a/v1_api_demo/mnist/data/generate_list.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,10 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-o = open("./" + "train.list", "w")
-o.write("./data/raw_data/train" + "\n")
-o.close()
+from paddle.trainer_config_helpers import *
 
-o = open("./" + "test.list", "w")
-o.write("./data/raw_data/t10k" + "\n")
-o.close()
+data = data_layer(name='input', size=300)
+clip = clip_layer(input=data, min=-10, max=10)
+
+outputs(clip)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
new file mode 100644
index 0000000000..35087c4228
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_conv3d_layer.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+num_channels = 3
+filter_size = 3
+filter_size_y = 3
+filter_size_z = 3
+stride = 2
+stride_y = 2
+stride_z = 2
+padding = 1
+padding_y = 1
+padding_z = 1
+groups = 1
+
+data = data_layer(
+    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
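+# data volume per channel: height (48) * width (42) * depth (6) = 12096 entries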
+# first
+conv3d_1 = img_conv3d_layer(
+    input=data,
+    name='conv3d_1',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=filter_size,
+    stride=stride,
+    padding=padding,
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=False,
+    layer_type="conv3d",
+    act=LinearActivation())
+# second
+conv3d_2 = img_conv3d_layer(
+    input=data,
+    name='conv3d_2',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=[filter_size, filter_size_y, filter_size_z],
+    stride=[stride, stride_y, stride_z],
+    padding=[padding, padding_y, padding_z],
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=False,
+    layer_type="conv3d",
+    act=LinearActivation())
+outputs(conv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index d2a3b702a1..b076b89106 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
@@ -33,7 +47,9 @@ outputs(
         input=probs, label=xe_label),
     cross_entropy_with_selfnorm(
         input=probs, label=xe_label),
-    huber_cost(
+    huber_regression_cost(
+        input=seq_in, label=labels),
+    huber_classification_cost(
         input=data_layer(
             name='huber_probs', size=1),
         label=data_layer(
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index c369062930..fa7a1abe9a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
@@ -10,7 +24,7 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
 outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
-    mse_cost(
+    square_error_cost(
         input=fc, label=lbl, weight=wt),
     nce_layer(
         input=fc,
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
new file mode 100644
index 0000000000..569d747857
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2016, height=48, width=42)
+reference_data = data_layer(name='reference', size=768, height=16, width=16)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
+
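+# crop pool along the spatial axes (axis=2 starts at height) to the 16x16 reference shape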
+crop = crop_layer(input=[pool, reference_data], axis=2)
+
+outputs(crop)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
new file mode 100644
index 0000000000..4a5bdf1181
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cross_entropy_over_beam.py
@@ -0,0 +1,46 @@
+#!/usr/bin/env python
+#coding=utf-8
+
+from paddle.trainer_config_helpers import *
+beam_size = 5
+
+# the first beam expansion.
+sentence_states = data_layer(name="sentence_states", size=32)
+sentence_scores = data_layer(name="sentence_scores", size=1)
+topk_sentence_ids = kmax_seq_score_layer(
+    input=sentence_scores, beam_size=beam_size)
+
+# the second beam expansion.
+topk_sen = sub_nested_seq_layer(
+    input=sentence_states, selected_indices=topk_sentence_ids)
+start_pos_scores = fc_layer(input=topk_sen, size=1, act=LinearActivation())
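+# rank candidate start positions by their scores, matching the BeamInput pairing below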
+topk_start_pos_ids = kmax_seq_score_layer(
+    input=start_pos_scores, beam_size=beam_size)
+
+# the final beam expansion.
+topk_start_spans = seq_slice_layer(
+    input=topk_sen, starts=topk_start_pos_ids, ends=None)
+end_pos_scores = fc_layer(
+    input=topk_start_spans, size=1, act=LinearActivation())
+topk_end_pos_ids = kmax_seq_score_layer(
+    input=end_pos_scores, beam_size=beam_size)
+
+# define the cost
+sentence_idx = data_layer(name="sentences_ids", size=1)
+start_idx = data_layer(name="start_ids", size=1)
+end_idx = data_layer(name="end_ids", size=1)
+cost = cross_entropy_over_beam(input=[
+    BeamInput(
+        candidate_scores=sentence_scores,
+        selected_candidates=topk_sentence_ids,
+        gold=sentence_idx), BeamInput(
+            candidate_scores=start_pos_scores,
+            selected_candidates=topk_start_pos_ids,
+            gold=start_idx), BeamInput(
+                candidate_scores=end_pos_scores,
+                selected_candidates=topk_end_pos_ids,
+                gold=end_idx)
+])
+
+outputs(cost)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
new file mode 100644
index 0000000000..4f27d99873
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_deconv3d_layer.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+num_channels = 3
+filter_size = 3
+filter_size_y = 3
+filter_size_z = 3
+stride = 2
+stride_y = 2
+stride_z = 2
+padding = 1
+padding_y = 1
+padding_z = 1
+groups = 1
+
+data = data_layer(
+    name='data', size=12096 * num_channels, height=48, width=42, depth=6)
+
+# first
+deconv3d_1 = img_conv3d_layer(
+    input=data,
+    name='deconv3d_1',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=filter_size,
+    stride=stride,
+    padding=padding,
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=True,
+    layer_type="deconv3d",
+    act=LinearActivation())
+# second
+deconv3d_2 = img_conv3d_layer(
+    input=data,
+    name='deconv3d_2',
+    num_filters=16,
+    num_channels=num_channels,
+    filter_size=[filter_size, filter_size_y, filter_size_z],
+    stride=[stride, stride_y, stride_z],
+    padding=[padding, padding_y, padding_z],
+    groups=groups,
+    bias_attr=True,
+    shared_biases=True,
+    trans=True,
+    layer_type="deconv3d",
+    act=LinearActivation())
+outputs(deconv3d_2)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
new file mode 100644
index 0000000000..d37954222e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+detout = detection_output_layer(
+    input_loc=input_loc,
+    input_conf=input_conf,
+    priorbox=priorbox,
+    num_classes=21,
+    nms_threshold=0.45,
+    nms_top_k=400,
+    keep_top_k=200,
+    confidence_threshold=0.01,
+    background_id=0,
+    name='test_detection_output')
+
+outputs(detout)
diff --git a/v1_api_demo/sequence_tagging/data/get_data.sh b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
old mode 100755
new mode 100644
similarity index 66%
rename from v1_api_demo/sequence_tagging/data/get_data.sh
rename to python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
index 0cdb394035..63ba0a72b9
--- a/v1_api_demo/sequence_tagging/data/get_data.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_dot_prod_layer.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
+from paddle.trainer_config_helpers import *
 
-wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
-wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
+vec1 = data_layer(name='vector1', size=10)
+vec2 = data_layer(name='vector2', size=10)
+dot_product = dot_prod_layer(input1=vec1, input2=vec2)
+
+outputs(dot_product)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
index c53f10e0a4..9892bca05d 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/v1_api_demo/model_zoo/resnet/predict.sh b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
old mode 100755
new mode 100644
similarity index 68%
rename from v1_api_demo/model_zoo/resnet/predict.sh
rename to python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
index 2b67b17c48..6fb773d9f7
--- a/v1_api_demo/model_zoo/resnet/predict.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,12 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-python classify.py \
-     --job=predict \
-     --conf=resnet.py\
-     --model=model/resnet_50 \
-     --multi_crop \
-     --use_gpu=1 \
-     --data=./example/test.list
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=1024)
+
+fm = factorization_machine(input=data, factor_size=10)
+
+outputs(fm)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
index 2842d3429c..4dd37d0242 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
new file mode 100644
index 0000000000..082646b9d3
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=256)
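+# gated unit (GLU-style): act(input_proj) scaled elementwise by sigmoid(gate_proj)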
+glu = gated_unit_layer(
+    size=512,
+    input=data,
+    act=TanhActivation(),
+    gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    gate_param_attr=ParamAttr(initial_std=1e-4),
+    gate_bias_attr=ParamAttr(initial_std=1),
+    inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    inproj_param_attr=ParamAttr(initial_std=1e-4),
+    inproj_bias_attr=ParamAttr(initial_std=1),
+    layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0))
+
+outputs(glu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
index 474e4f36ba..f5271b8280 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-4)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
index dff1c535b3..ad86d7d5bd 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
new file mode 100644
index 0000000000..171da10f75
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
@@ -0,0 +1,9 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name="input_seq", size=128)
+scores = fc_layer(input=data, size=1, act=ExpActivation())
+kmax_seq_id = kmax_seq_score_layer(input=scores, beam_size=5)
+
+outputs(kmax_seq_id)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
new file mode 100644
index 0000000000..1796e1c6b6
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_l2_distance_layer.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+outputs(
+    l2_distance_layer(
+        x=data_layer(
+            name='x', size=128), y=data_layer(
+                name='y', size=128)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
index 7ca1cc2db3..7484818ab2 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
index eb14270baa..22788be2e9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
new file mode 100644
index 0000000000..0dcccc49e4
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+input_loc = data_layer(name='input_loc', size=16, height=16, width=1)
+
+input_conf = data_layer(name='input_conf', size=8, height=1, width=8)
+
+priorbox = data_layer(name='priorbox', size=32, height=4, width=8)
+
+label = data_layer(name='label', size=24, height=4, width=6)
+
+multibox_loss = multibox_loss_layer(
+    input_loc=input_loc,
+    input_conf=input_conf,
+    priorbox=priorbox,
+    label=label,
+    num_classes=21,
+    overlap_threshold=0.5,
+    neg_pos_ratio=3.0,
+    neg_overlap=0.5,
+    background_id=0,
+    name='test_multibox_loss')
+
+outputs(multibox_loss)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
index d250001932..046d38741e 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multiplex_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
index b7a15666f0..d81128c77c 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
index 491e8c8caa..44b0b34d5a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
new file mode 100644
index 0000000000..e257e735ad
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pooling3D_layer.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100, learning_rate=1e-5)
+
+data_2d = data_layer(name='data_2d', size=6000, height=20, width=10)
+
+pool_2d = img_pool_layer(
+    name="pool___2d",
+    input=data_2d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_2d)
+
+data_3d = data_layer(
+    name='data_3d_1', size=60000, depth=10, height=20, width=10)
+
+pool_3d_1 = img_pool3d_layer(
+    name="pool_3d_1",
+    input=data_3d,
+    num_channels=30,
+    pool_size=5,
+    stride=3,
+    padding=1,
+    pool_type=AvgPooling())
+outputs(pool_3d_1)
+
+pool_3d_2 = img_pool3d_layer(
+    name="pool_3d_2",
+    input=data_3d,
+    num_channels=30,
+    pool_size=[5, 5, 5],
+    stride=[3, 3, 3],
+    padding=[1, 1, 1],
+    pool_type=MaxPooling())
+outputs(pool_3d_2)
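
A quick way to sanity-check the shapes these configs produce is the usual
pooling-output formula. The sketch below assumes the ceil-mode convention
output = ceil((in + 2*pad - k) / stride) + 1; the exact rounding mode is a
backend detail, so treat the numbers as illustrative.

    import math

    def pooled_size(in_size, k, stride, pad):
        # ceil-mode pooling output size (assumed convention, see note above)
        return int(math.ceil(float(in_size + 2 * pad - k) / stride)) + 1

    # pool_2d above: 20x10 input, pool_size=5, stride=3, padding=1
    print(pooled_size(20, 5, 3, 1), pooled_size(10, 5, 3, 1))  # 7 4
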
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
new file mode 100644
index 0000000000..098e2397ec
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
@@ -0,0 +1,24 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300, height=10, width=10)
+prelu = prelu_layer(input=data, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=1, num_channels=3)
+prelu = prelu_layer(input=data, partial_sum=5, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=True, num_channels=3)
+prelu = prelu_layer(input=data, channel_shared=False, num_channels=3)
+
+outputs(prelu)
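
For reference, the activation these five variants parameterize is
f(x) = x for x > 0 and a*x otherwise, with channel_shared and partial_sum
controlling how many slopes a are learned. A minimal per-channel numpy
sketch (the slope values are made up):

    import numpy as np

    x = np.random.randn(3, 10, 10)   # (channels, H, W) toy input
    a = np.array([0.25, 0.1, 0.5])   # hypothetical learned per-channel slopes
    y = np.where(x > 0, x, a[:, None, None] * x)
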
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
index 8da26ff44b..714d8893e9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
new file mode 100644
index 0000000000..188a3d2320
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
@@ -0,0 +1,30 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din = data_layer(name='data', size=100)
+
+enc = din
+for i in range(32):
+    enc = addto_layer([enc, enc])
+
+pred = fc_layer(
+    input=fc_layer(
+        input=enc, size=32, act=ReluActivation()),
+    size=10,
+    act=SoftmaxActivation())
+outputs(pred)
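
The point of this config is that a deeply self-referential topology parses
correctly, not the numerics, but note what the loop builds: each
addto_layer([enc, enc]) doubles the activations, so the fc stack sees the
input scaled by 2**32:

    x = 1.0
    for _ in range(32):
        x = x + x          # same doubling as addto_layer([enc, enc])
    print(x == 2.0 ** 32)  # True
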
diff --git a/v1_api_demo/quick_start/data/get_data.sh b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
old mode 100755
new mode 100644
similarity index 59%
rename from v1_api_demo/quick_start/data/get_data.sh
rename to python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
index a09a18f919..93b673afee
--- a/v1_api_demo/quick_start/data/get_data.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_repeat_layer.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,16 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
+from paddle.trainer_config_helpers import *
 
-# Download the preprocessed data
-wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
+settings(batch_size=1000, learning_rate=1e-5)
 
-# Extract package
-tar zxvf preprocessed_data.tar.gz
+din = data_layer(name='data', size=30)
 
-# Remove compressed package
-rm preprocessed_data.tar.gz
+outputs(
+    repeat_layer(
+        input=din, num_repeats=10, as_row_vector=True),
+    repeat_layer(
+        input=din, num_repeats=10, act=TanhActivation(), as_row_vector=False))
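
The two outputs differ only in as_row_vector; in numpy terms they correspond
roughly to tiling the whole vector versus repeating each element in place
(my reading of the flag, offered as an illustration rather than the layer's
spec):

    import numpy as np

    din = np.arange(3)       # stand-in for the size-30 data layer
    row = np.tile(din, 4)    # as_row_vector=True:  [0 1 2 0 1 2 ...]
    col = np.repeat(din, 4)  # as_row_vector=False: [0 0 0 0 1 1 ...]
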
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
new file mode 100644
index 0000000000..3a202974e3
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
@@ -0,0 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+resized = resize_layer(input=data, size=150)
+
+outputs(resized)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
index 91010759e4..91074b8fdf 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
new file mode 100644
index 0000000000..f0a37f7e99
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_roi_pool_layer.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=3 * 14 * 14, height=14, width=14)
+
+rois = data_layer(name='rois', size=10)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=3,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+roi_pool = roi_pool_layer(
+    input=conv,
+    rois=rois,
+    pooled_width=7,
+    pooled_height=7,
+    spatial_scale=1. / 16)
+
+outputs(roi_pool)
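
roi_pool_layer scales each ROI's coordinates by spatial_scale, crops the
feature map, and max-pools the crop onto a fixed pooled_height x pooled_width
grid. A toy single-channel sketch of that core idea (rounding and edge
handling are simplified relative to the real op):

    import numpy as np

    def roi_max_pool(feat, roi, pooled_h=7, pooled_w=7, spatial_scale=1. / 16):
        x1, y1, x2, y2 = [int(round(c * spatial_scale)) for c in roi]
        region = feat[y1:y2 + 1, x1:x2 + 1]
        hs = np.linspace(0, region.shape[0], pooled_h + 1).astype(int)
        ws = np.linspace(0, region.shape[1], pooled_w + 1).astype(int)
        out = np.empty((pooled_h, pooled_w))
        for i in range(pooled_h):
            for j in range(pooled_w):
                cell = region[hs[i]:max(hs[i + 1], hs[i] + 1),
                              ws[j]:max(ws[j + 1], ws[j] + 1)]
                out[i, j] = cell.max()
        return out

    print(roi_max_pool(np.random.randn(14, 14), (0, 0, 208, 208)).shape)  # (7, 7)
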
diff --git a/v1_api_demo/quick_start/cluster/env.sh b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
similarity index 64%
rename from v1_api_demo/quick_start/cluster/env.sh
rename to python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
index a404993835..68b1a991f3 100644
--- a/v1_api_demo/quick_start/cluster/env.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,17 +11,13 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-function get_nics() {
-  machine=`uname -s`
-  local nics=""
-  if [ "$machine" == "Linux" ]; then
-    nics="lo"
-  elif [ "$machine" == "Darwin" ]; then
-    nics="lo0"
-  else
-    nics="unsupport"
-  fi
-  echo $nics
-}
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2560)
+
+row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
+
+outputs(row_conv)
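
row_conv is the lookahead convolution from the DeepSpeech2 family of models:
each timestep mixes in the next context_len frames with one learned weight
vector per offset, so the network gets bounded future context and stays
usable for streaming. A numpy sketch (weights are random stand-ins, edge
handling simplified):

    import numpy as np

    def row_conv(x, w):
        T = x.shape[0]                   # x: (time, features)
        y = np.zeros_like(x)
        for t in range(T):
            for c in range(w.shape[0]):  # w: (context_len, features)
                if t + c < T:
                    y[t] += w[c] * x[t + c]
        return y

    x = np.random.randn(30, 4)
    w = np.random.randn(19, 4)           # context_len=19, as configured above
    print(row_conv(x, w).shape)          # (30, 4)
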
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
new file mode 100644
index 0000000000..c25393f580
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
@@ -0,0 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+row_l2_norm = row_l2_norm_layer(input=data)
+
+outputs(row_l2_norm)
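
The operation under test is per-row Euclidean normalization; a real
implementation would also guard against zero rows with a small epsilon:

    import numpy as np

    x = np.random.randn(8, 300)
    y = x / np.linalg.norm(x, axis=1, keepdims=True)  # each row now has norm ~1
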
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
new file mode 100644
index 0000000000..3691e8daea
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_shift_layer.py
@@ -0,0 +1,23 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='data', size=100)
+
+scale = scale_shift_layer(input=data, bias_attr=False)
+
+scale_shift = scale_shift_layer(input=data)
+
+outputs(scale, scale_shift)
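
scale_shift_layer applies one learned scalar scale w and, unless
bias_attr=False, one learned scalar bias b to the whole input,
i.e. y = w * x + b:

    import numpy as np

    x = np.random.randn(5, 100)
    w, b = 1.5, -0.3     # stand-ins for the two learned scalars
    y_scale = w * x      # the bias_attr=False variant above
    y_shift = w * x + b  # the default variant
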
diff --git a/v1_api_demo/traffic_prediction/predict.sh b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
old mode 100755
new mode 100644
similarity index 59%
rename from v1_api_demo/traffic_prediction/predict.sh
rename to python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
index 2dbd5e8805..426afcf3a0
--- a/v1_api_demo/traffic_prediction/predict.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_scale_sub_region_layer.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,19 +11,15 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
 
-cfg=trainer_config.py
-# pass choice 
-model="output/pass-00000"
-paddle train \
-    --config=$cfg \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. 
+from paddle.trainer_config_helpers import *
 
-python gen_result.py > result.csv
+settings(batch_size=1000, learning_rate=1e-5)
 
-rm -rf rank-00000
+data = data_layer(name='data', size=2016, height=48, width=42)
+indices = data_layer(name='indices', size=6)
+
+scale_sub_region = scale_sub_region_layer(
+    input=data, indices=indices, value=0.0)
+
+outputs(scale_sub_region)
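
scale_sub_region_layer scales a per-sample rectangular sub-region of the
feature map by value, with the region given by the six-element indices
input. The numpy analogue below reads the six indices as
(c1, c2, h1, h2, w1, w2) with inclusive bounds; both points are my
assumptions for illustration:

    import numpy as np

    img = np.random.randn(1, 48, 42)              # (C, H, W), as configured above
    c1, c2, h1, h2, w1, w2 = 0, 0, 10, 20, 5, 15  # hypothetical region indices
    img[c1:c2 + 1, h1:h2 + 1, w1:w2 + 1] *= 0.0   # value=0.0 zeroes the region
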
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
index 5c161ba805..7296081857 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=1000, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
new file mode 100644
index 0000000000..510ad32208
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_slice_layer.py
@@ -0,0 +1,13 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+input_seq = data_layer("word", size=128)
+starts = data_layer("starts", size=5)
+ends = data_layer("ends", size=5)
+
+seq_slice1 = seq_slice_layer(input=input_seq, starts=starts, ends=ends)
+seq_slice2 = seq_slice_layer(input=input_seq, starts=starts, ends=None)
+seq_slice3 = seq_slice_layer(input=input_seq, starts=None, ends=ends)
+
+outputs(seq_slice1, seq_slice2, seq_slice3)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
index 3c49eb56c1..d13a5a8429 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
@@ -14,6 +28,14 @@ for pt in POOL_TYPE:
     for al in AGG_LEVEL:
         opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
 
+for pt in POOL_TYPE:
+    opts.append(
+        pooling_layer(
+            input=din,
+            agg_level=AggregateLevel.TO_NO_SEQUENCE,
+            pooling_type=pt(),
+            stride=5))
+
 opts.append(
     pooling_layer(
         input=din, pooling_type=MaxPooling(output_max_index=True)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
index 66629662dd..42225b8505 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 data = data_layer(name='input', size=300)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
index 318b4459ba..7ebdf7408d 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_split_datasource.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 define_py_data_sources2(
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
index e0b0d0d3be..1f19ea77ad 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_spp_layer.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(batch_size=100, learning_rate=1e-5)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
new file mode 100644
index 0000000000..6d1c3175ba
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sub_nested_seq_select_layer.py
@@ -0,0 +1,11 @@
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+beam_size = 5
+
+data = data_layer(name='input_seq', size=300)
+selected_ids = data_layer(name='input', size=beam_size)
+sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
+
+outputs(sub_nest_seq)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
index ebb39219bd..8581ba60ab 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 settings(batch_size=1000, learning_rate=1e-4)
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
index 27f1c8e993..a66c9515c7 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from paddle.trainer_config_helpers import *
 
 settings(learning_rate=1e-4, batch_size=1000)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test.py b/python/paddle/trainer_config_helpers/tests/layers_test.py
index 05902ea293..b3dd8f8fc7 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test.py
@@ -17,3 +17,4 @@ from paddle.trainer.config_parser import parse_config_and_serialize
 if __name__ == '__main__':
     parse_config_and_serialize(
         'trainer_config_helpers/tests/layers_test_config.py', '')
+# layers_test_config.py
diff --git a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
index 0423babdb7..81186dedd2 100644
--- a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+++ b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import unittest
 from paddle.trainer.config_parser import parse_config
 
diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py
new file mode 100644
index 0000000000..5dc2111e37
--- /dev/null
+++ b/python/paddle/utils/dump_v2_config.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+
+from paddle.trainer_config_helpers.layers import LayerOutput
+from paddle.v2.layer import parse_network
+from paddle.proto import TrainerConfig_pb2
+
+__all__ = ["dump_v2_config"]
+
+
+def dump_v2_config(topology, save_path, binary=False):
+    """ Dump the network topology to a specified file.
+
+    This function is only used to dump network defined by using PaddlePaddle V2
+    APIs. This function will NOT dump configurations related to PaddlePaddle
+    optimizer.
+
+    :param topology: The output layer(s) of the entire network; more than
+                     one layer can be given in a Python list or tuple.
+                     Using the specified layers as roots and traversing back
+                     to the data layer(s), every layer connected to the
+                     specified output layers will be dumped; layers not
+                     connected to them will not be dumped.
+    :type topology: LayerOutput|List|Tuple
+    :param save_path: The path to save the dumped network topology.
+    :type save_path: str
+    :param binary: Whether to dump the network topology in serialized form.
+                   The default value is False. NOTE that the PaddlePaddle
+                   C-API requires a serialized topology, so this flag MUST
+                   be set to True whenever the dumped topology is intended
+                   for the C-API.
+    :type binary: bool
+    """
+
+    if isinstance(topology, LayerOutput):
+        topology = [topology]
+    elif isinstance(topology, collections.Sequence):
+        for out_layer in topology:
+            assert isinstance(out_layer, LayerOutput), (
+                "The type of each element in the parameter topology "
+                "should be LayerOutput.")
+    else:
+        raise RuntimeError("Error input type for parameter topology.")
+
+    model_str = parse_network(topology)
+    with open(save_path, "w") as fout:
+        if binary:
+            fout.write(model_str.SerializeToString())
+        else:
+            fout.write(str(model_str))
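
A hedged usage sketch: dump a small v2 network as a serialized topology for
the C-API. The layer calls follow the v2 API used elsewhere in this tree;
the network itself is made up:

    import paddle.v2 as paddle
    from paddle.utils.dump_v2_config import dump_v2_config

    paddle.init(use_gpu=False)
    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(100))
    y = paddle.layer.fc(input=x, size=10, act=paddle.activation.Softmax())
    dump_v2_config(y, 'inference_topology.pb', binary=True)  # binary for C-API
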
diff --git a/python/paddle/utils/image_multiproc.py b/python/paddle/utils/image_multiproc.py
index e8db525ff5..fdbefef9ff 100644
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os, sys
 import numpy as np
 from PIL import Image
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
new file mode 100644
index 0000000000..2b10020772
--- /dev/null
+++ b/python/paddle/utils/merge_model.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gzip
+import struct
+import os
+
+from paddle.trainer_config_helpers.layers import LayerOutput
+from paddle.v2.parameters import Parameters
+from paddle.proto import ModelConfig_pb2
+from paddle.v2.topology import Topology
+
+
+def merge_v2_model(net, param_file, output_file):
+    '''Merge the model config and parameters into one file.
+
+    The model configuration file describes the model structure and ends
+    with .py. The parameters file stores the trained parameters of the
+    model and ends with .tar.gz.
+
+    @param  net            The output layer of the network for inference.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by
+                           v2 api.
+    @param  output_file    Path of the merged file which will be generated.
+
+    Usage:
+
+        from paddle.utils.merge_model import merge_v2_model
+        # import your network configuration
+        from example_net import net_conf
+
+        net = net_conf(is_predict=True)
+        param_file = './param_pass_00000.tar.gz'
+        output_file = './output.paddle'
+
+        merge_v2_model(net, param_file, output_file)
+
+    '''
+
+    assert isinstance(net, LayerOutput), \
+            "The net should be the output of the network for inference"
+    assert os.path.exists(param_file), \
+            "The model parameters file %s does not exists " % (param_file)
+
+    model_proto = Topology(net).proto()
+    assert isinstance(model_proto, ModelConfig_pb2.ModelConfig)
+
+    with gzip.open(param_file) as f:
+        params = Parameters.from_tar(f)
+
+    if os.path.exists(output_file):
+        os.remove(output_file)
+
+    with open(output_file, 'w') as f:
+        param_names = [param.name for param in model_proto.parameters]
+        conf_str = model_proto.SerializeToString()
+        f.write(struct.pack('q', len(conf_str)))
+        f.write(conf_str)
+        for pname in param_names:
+            params.serialize(pname, f)
+
+    print 'Generated %s successfully!' % (output_file)
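
The merged file is a simple record: an 8-byte struct 'q' length, then the
serialized ModelConfig, then each parameter in the order listed in the
config. Reading the header back mirrors the writer (a sketch, not an
official loader):

    import struct

    with open('./output.paddle', 'rb') as f:
        conf_len, = struct.unpack('q', f.read(8))
        model_conf = f.read(conf_len)  # serialized ModelConfig protobuf
        # the remaining bytes hold the parameters, as written by
        # Parameters.serialize, in param_names order
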
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index b9d0a7f291..df710c33d0 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -20,7 +20,6 @@ import trainer
 import event
 import data_type
 import topology
-import data_feeder
 import networks
 import evaluator
 from . import dataset
@@ -31,19 +30,102 @@ import op
 import pooling
 import inference
 import networks
-import py_paddle.swig_paddle as api
 import minibatch
 import plot
 import image
+import paddle.trainer.config_parser as cp
 
 __all__ = [
-    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
-    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
-    'topology', 'networks', 'infer', 'plot', 'evaluator', 'image'
+    'default_startup_program',
+    'default_main_program',
+    'optimizer',
+    'layer',
+    'activation',
+    'parameters',
+    'init',
+    'trainer',
+    'event',
+    'data_type',
+    'attr',
+    'pooling',
+    'dataset',
+    'reader',
+    'topology',
+    'networks',
+    'infer',
+    'plot',
+    'evaluator',
+    'image',
+    'master',
 ]
 
+cp.begin_parse()
+
+
+def set_env_vars(trainer_count):
+    '''Automatically set CPU-related environment variables if they have
+       not been set already.
+       For MKL:
+         export KMP_AFFINITY and OMP_DYNAMIC according to the Hyper-Threading status.
+         export OMP_NUM_THREADS and MKL_NUM_THREADS according to trainer_count.
+       For OpenBLAS:
+         export OPENBLAS_NUM_THREADS and OPENBLAS_MAIN_FREE according to trainer_count.
+    '''
+    import platform, paddle
+    if not platform.system() in ['Linux', 'Darwin']:
+        return
+
+    def set_env(key, value):
+        '''If the key has not been set in the environment, set it with value.'''
+        assert isinstance(key, str)
+        assert isinstance(value, str)
+        envset = os.environ.get(key)
+        if envset is None:
+            os.environ[key] = value
+
+    def num_physical_cores():
+        '''Get the number of physical cores'''
+        if platform.system() == "Linux":
+            num_sockets = int(
+                os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            num_cores_per_socket = int(
+                os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
+                .read())
+            return num_sockets * num_cores_per_socket
+        else:
+            cmds = {"Darwin": "sysctl -n hw.physicalcpu"}
+            return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    def num_logical_processors():
+        '''Get the number of logical processors'''
+        cmds = {
+            "Linux": "grep \"processor\" /proc/cpuinfo|sort -u|wc -l",
+            "Darwin": "sysctl -n hw.logicalcpu"
+        }
+        return int(os.popen(cmds.get(platform.system(), "expr 1")).read())
+
+    num_cores = num_physical_cores()
+    num_processors = num_logical_processors()
+    if paddle.version.mkl() == 'ON':
+        if num_processors > num_cores:  # Hyper Threading is enabled
+            set_env("OMP_DYNAMIC", "true")
+            set_env("KMP_AFFINITY", "granularity=fine,compact,1,0")
+        else:
+            set_env("OMP_DYNAMIC", "false")
+            set_env("KMP_AFFINITY", "granularity=fine,compact,0,0")
+    # keep threads numeric for the comparison below; set_env expects strings
+    threads = num_processors / trainer_count
+    threads = 1 if threads < 1 else threads
+    if paddle.version.mkl() == 'ON':
+        set_env("OMP_NUM_THREADS", str(threads))
+        set_env("MKL_NUM_THREADS", str(threads))
+    else:
+        set_env("OPENBLAS_NUM_THREADS", str(threads))
+        if threads > 1:
+            set_env("OPENBLAS_MAIN_FREE", '1')
+
 
 def init(**kwargs):
+    import py_paddle.swig_paddle as api
     args = []
     args_dict = {}
     # NOTE: append arguments if they are in ENV
@@ -56,6 +138,17 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))
 
+    set_env_vars(kwargs.get('trainer_count', 1))
+
+    if 'use_gpu' in kwargs:
+        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    if 'use_mkldnn' in kwargs:
+        cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
+    if 'use_mkl_packed' in kwargs:
+        cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed']
+    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
+                                         "supported in v2 APIs.")
+
     api.initPaddle(*args)
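
After this change paddle.v2.init both forwards the use_* flags to the config
parser and derives the threading environment from trainer_count (unless the
variables were exported already). A typical entry point:

    import paddle.v2 as paddle

    # OMP/MKL or OpenBLAS thread counts are auto-set from trainer_count here
    paddle.init(use_gpu=False, trainer_count=4)
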
 
 
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
index 32f78614e7..5d23894d73 100644
--- a/python/paddle/v2/attr.py
+++ b/python/paddle/v2/attr.py
@@ -17,10 +17,12 @@ import paddle.trainer_config_helpers.attrs
 __all__ = [
     "Param",
     "Extra",
+    "Hook",
 ]
 
 Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
 Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
+Hook = paddle.trainer_config_helpers.attrs.HookAttribute
 
 for each in paddle.trainer_config_helpers.attrs.__all__:
     globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index 2698251b9e..98dfb85a0e 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from py_paddle import DataProviderConverter
 import collections
 import paddle.trainer.PyDataProvider2 as pydp2
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index 26252d5bbd..c1acbecd9c 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -24,9 +24,23 @@ import conll05
 import uci_housing
 import sentiment
 import wmt14
+import wmt16
 import mq2007
+import flowers
+import voc2012
 
 __all__ = [
-    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14', 'mq2007'
+    'mnist',
+    'imikolov',
+    'imdb',
+    'cifar',
+    'movielens',
+    'conll05',
+    'sentiment',
+    'uci_housing',
+    'wmt14',
+    'wmt16',
+    'mq2007',
+    'flowers',
+    'voc2012',
 ]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
index 81af0a8e66..0a2a1ced11 100644
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -31,10 +31,10 @@ images per class.
 import cPickle
 import itertools
 import numpy
-from common import download
+import paddle.v2.dataset.common
 import tarfile
 
-__all__ = ['train100', 'test100', 'train10', 'test10']
+__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
 
 URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
 CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
@@ -75,7 +75,8 @@ def train100():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
 
 
 def test100():
@@ -88,7 +89,9 @@ def test100():
     :return: Test reader creator.
     :rtype: callable
     """
-    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
 
 
 def train10():
@@ -102,7 +105,8 @@ def train10():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
 
 
 def test10():
@@ -116,9 +120,20 @@ def test10():
     :rtype: callable
     """
     return reader_creator(
-        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
 
 
 def fetch():
-    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
-    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+    paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100")
+    paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100")
+    paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10")
+    paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10")
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 418b592a5a..9aba35a648 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -15,19 +15,42 @@
 import requests
 import hashlib
 import os
+import errno
 import shutil
 import sys
 import importlib
 import paddle.v2.dataset
 import cPickle
 import glob
+import cPickle as pickle
 
-__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']
+__all__ = [
+    'DATA_HOME',
+    'download',
+    'md5file',
+    'split',
+    'cluster_files_reader',
+    'convert',
+]
 
 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
 
-if not os.path.exists(DATA_HOME):
-    os.makedirs(DATA_HOME)
+
+# When running unit tests, multiple processes may try to create the
+# DATA_HOME directory simultaneously, so we cannot use an if condition
+# to check for the existence of the directory; instead, we use the
+# filesystem as the synchronization mechanism by catching the returned
+# errors.
+def must_mkdirs(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:
+        if exc.errno != errno.EEXIST:
+            raise
+        pass
+
+
+must_mkdirs(DATA_HOME)
 
 
 def md5file(fname):
@@ -39,13 +62,23 @@ def md5file(fname):
     return hash_md5.hexdigest()
 
 
-def download(url, module_name, md5sum):
+def download(url, module_name, md5sum, save_name=None):
     dirname = os.path.join(DATA_HOME, module_name)
     if not os.path.exists(dirname):
         os.makedirs(dirname)
 
-    filename = os.path.join(dirname, url.split('/')[-1])
-    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+    filename = os.path.join(dirname,
+                            url.split('/')[-1]
+                            if save_name is None else save_name)
+
+    retry = 0
+    retry_limit = 3
+    while not (os.path.exists(filename) and md5file(filename) == md5sum):
+        if retry < retry_limit:
+            retry += 1
+        else:
+            raise RuntimeError("Cannot download {0} within retry limit {1}".
+                               format(url, retry_limit))
         print "Cache file %s not found, downloading %s" % (filename, url)
         r = requests.get(url, stream=True)
         total_length = r.headers.get('content-length')
@@ -78,6 +111,19 @@ def fetch_all():
                 "fetch")()
 
 
+def fetch_all_recordio(path):
+    for module_name in filter(lambda x: not x.startswith("__"),
+                              dir(paddle.v2.dataset)):
+        if "convert" in dir(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \
+                not module_name == "common":
+            ds_path = os.path.join(path, module_name)
+            must_mkdirs(ds_path)
+            getattr(
+                importlib.import_module("paddle.v2.dataset.%s" % module_name),
+                "convert")(ds_path)
+
+
 def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
     """
     you can call the function as:
@@ -149,3 +195,40 @@ def cluster_files_reader(files_pattern,
                     yield line
 
     return reader
+
+
+def convert(output_path, reader, line_count, name_prefix):
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read
+                   data instances.
+    :param line_count: the number of data instances written to each
+                       output file.
+    :param name_prefix: the name prefix of generated files.
+    """
+    import recordio
+
+    assert line_count >= 1
+    indx_f = 0
+
+    def write_data(indx_f, lines):
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(cPickle.dumps(l))
+        writer.close()
+
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if (i + 1) % line_count == 0:
+            write_data(indx_f, lines)
+            lines = []
+            indx_f += 1
+
+    if lines:
+        write_data(indx_f, lines)
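
Concretely, every line_count samples become one recordio file named
<name_prefix>-00000, <name_prefix>-00001, ... under output_path (which must
already exist). For example, with illustrative paths:

    import paddle.v2.dataset.cifar as cifar
    import paddle.v2.dataset.common as common

    # writes /tmp/dataset/cifar_train10-00000, -00001, ... in 1000-sample chunks
    common.convert('/tmp/dataset', cifar.train10(), 1000, 'cifar_train10')
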
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
index 12d648bf65..23f5a24a1c 100644
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -23,9 +23,9 @@ to initialize SRL model.
 import tarfile
 import gzip
 import itertools
-from common import download
+import paddle.v2.dataset.common
 
-__all__ = ['test, get_dict', 'get_embedding']
+__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
 
 DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
@@ -182,9 +182,15 @@ def get_dict():
     """
     Get the word, verb and label dictionary of Wikipedia corpus.
     """
-    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
-    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
-    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    word_dict = load_dict(
+        paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st',
+                                          WORDDICT_MD5))
+    verb_dict = load_dict(
+        paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st',
+                                          VERBDICT_MD5))
+    label_dict = load_dict(
+        paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st',
+                                          TRGDICT_MD5))
     return word_dict, verb_dict, label_dict
 
 
@@ -192,7 +198,7 @@ def get_embedding():
     """
     Get the trained word vector based on Wikipedia corpus.
     """
-    return download(EMB_URL, 'conll05st', EMB_MD5)
+    return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
 
 
 def test():
@@ -209,15 +215,23 @@ def test():
     """
     word_dict, verb_dict, label_dict = get_dict()
     reader = corpus_reader(
-        download(DATA_URL, 'conll05st', DATA_MD5),
+        paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5),
         words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
         props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
     return reader_creator(reader, word_dict, verb_dict, label_dict)
 
 
 def fetch():
-    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
-    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
-    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
-    download(EMB_URL, 'conll05st', EMB_MD5)
-    download(DATA_URL, 'conll05st', DATA_MD5)
+    paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5)
+    paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "conll05_test")
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index 07c13cf719..7bdddeaabe 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -13,72 +13,85 @@
 # limitations under the License.
 """
 This module will download dataset from
-http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html 
+http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html
 and parse train/test set into paddle reader creators.
 
-This set contains images of flowers belonging to 102 different categories. 
+This set contains images of flowers belonging to 102 different categories.
 The images were acquired by searching the web and taking pictures. There are a
 minimum of 40 images for each category.
 
 The database was used in:
 
 Nilsback, M-E. and Zisserman, A. Automated flower classification over a large
- number of classes.Proceedings of the Indian Conference on Computer Vision, 
-Graphics and Image Processing (2008) 
+ number of classes. Proceedings of the Indian Conference on Computer Vision,
+Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 
 """
 import cPickle
 import itertools
+import functools
 from common import download
 import tarfile
 import scipy.io as scio
 from paddle.v2.image import *
+from paddle.v2.reader import *
 import os
 import numpy as np
-import paddle.v2 as paddle
 from multiprocessing import cpu_count
 __all__ = ['train', 'test', 'valid']
 
 DATA_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/102flowers.tgz'
 LABEL_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/imagelabels.mat'
 SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat'
-DATA_MD5 = '52808999861908f626f3c1f4e79d11fa'
+DATA_MD5 = '33bfc11892f1e405ca193ae9a9f2a118'
 LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d'
 SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c'
+# In the official 'readme', tstid marks the test data and trnid marks the
+# train data. But the test split is larger than the train split, so we
+# swap the two and train on the larger split.
+TRAIN_FLAG = 'tstid'
+TEST_FLAG = 'trnid'
+VALID_FLAG = 'valid'
 
 
-def default_mapper(sample):
+def default_mapper(is_train, sample):
     '''
     map image bytes data to type needed by model input layer
     '''
     img, label = sample
-    img = paddle.image.load_image_bytes(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
+    img = load_image_bytes(img)
+    img = simple_transform(
+        img, 256, 224, is_train, mean=[103.94, 116.78, 123.68])
     return img.flatten().astype('float32'), label
 
 
+train_mapper = functools.partial(default_mapper, True)
+test_mapper = functools.partial(default_mapper, False)
+
+
 def reader_creator(data_file,
                    label_file,
                    setid_file,
                    dataset_name,
-                   mapper=default_mapper,
-                   buffered_size=1024):
+                   mapper,
+                   buffered_size=1024,
+                   use_xmap=True):
     '''
-    1. read images from tar file and 
+    1. read images from tar file and
         merge images into batch files in 102flowers.tgz_batch/
     2. get a reader to read sample from batch file
-    
-    :param data_file: downloaded data file 
+
+    :param data_file: downloaded data file
     :type data_file: string
-    :param label_file: downloaded label file 
+    :param label_file: downloaded label file
     :type label_file: string
     :param setid_file: downloaded setid file containing information
                         about how to split dataset
     :type setid_file: string
     :param dataset_name: data set name (tstid|trnid|valid)
     :type dataset_name: string
-    :param mapper: a function to map image bytes data to type 
+    :param mapper: a function to map image bytes data to type
                     needed by model input layer
     :type mapper: callable
     :param buffered_size: the size of buffer used to process images
@@ -103,17 +116,19 @@ def reader_creator(data_file,
             data = batch['data']
             labels = batch['label']
             for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label)
+                yield sample, int(label) - 1
 
-    return paddle.reader.xmap_readers(mapper, reader,
-                                      cpu_count(), buffered_size)
+    if use_xmap:
+        return xmap_readers(mapper, reader, cpu_count(), buffered_size)
+    else:
+        return map_readers(mapper, reader)
 
 
-def train(mapper=default_mapper, buffered_size=1024):
+def train(mapper=train_mapper, buffered_size=1024, use_xmap=True):
     '''
-    Create flowers training set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers training set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [0, 101]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -128,15 +143,15 @@ def train(mapper=default_mapper, buffered_size=1024):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
-def test(mapper=default_mapper, buffered_size=1024):
+def test(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     '''
-    Create flowers test set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers test set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [0, 101]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -151,15 +166,15 @@ def test(mapper=default_mapper, buffered_size=1024):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
-def valid(mapper=default_mapper, buffered_size=1024):
+def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True):
     '''
-    Create flowers validation set reader. 
-    It returns a reader, each sample in the reader is   
-    image pixels in [0, 1] and label in [1, 102] 
+    Create flowers validation set reader.
+    It returns a reader, each sample in the reader is
+    image pixels in [0, 1] and label in [0, 101]
     translated from original color image by steps:
     1. resize to 256*256
     2. random crop to 224*224
@@ -174,8 +189,8 @@ def valid(mapper=default_mapper, buffered_size=1024):
     return reader_creator(
         download(DATA_URL, 'flowers', DATA_MD5),
         download(LABEL_URL, 'flowers', LABEL_MD5),
-        download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper,
-        buffered_size)
+        download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper,
+        buffered_size, use_xmap)
 
 
 def fetch():
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
index 5dc5abfe53..37c4296f9b 100644
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -23,12 +23,10 @@ Besides, this module also provides API for building dictionary.
 import paddle.v2.dataset.common
 import collections
 import tarfile
-import Queue
 import re
 import string
-import threading
 
-__all__ = ['build_dict', 'train', 'test']
+__all__ = ['build_dict', 'train', 'test', 'convert']
 
 URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
 MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
@@ -74,49 +72,22 @@ def build_dict(pattern, cutoff):
     return word_idx
 
 
-def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+def reader_creator(pos_pattern, neg_pattern, word_idx):
     UNK = word_idx['<unk>']
+    INS = []
 
-    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
-
-    def load(pattern, queue):
+    def load(pattern, out, label):
         for doc in tokenize(pattern):
-            queue.put(doc)
-        queue.put(None)
+            out.append(([word_idx.get(w, UNK) for w in doc], label))
+
+    load(pos_pattern, INS, 0)
+    load(neg_pattern, INS, 1)
 
     def reader():
-        # Creates two threads that loads positive and negative samples
-        # into qs.
-        t0 = threading.Thread(
-            target=load, args=(
-                pos_pattern,
-                qs[0], ))
-        t0.daemon = True
-        t0.start()
-
-        t1 = threading.Thread(
-            target=load, args=(
-                neg_pattern,
-                qs[1], ))
-        t1.daemon = True
-        t1.start()
-
-        # Read alternatively from qs[0] and qs[1].
-        i = 0
-        doc = qs[i].get()
-        while doc != None:
-            yield [word_idx.get(w, UNK) for w in doc], i % 2
-            i += 1
-            doc = qs[i % 2].get()
-
-        # If any queue is empty, reads from the other queue.
-        i += 1
-        doc = qs[i % 2].get()
-        while doc != None:
-            yield [word_idx.get(w, UNK) for w in doc], i % 2
-            doc = qs[i % 2].get()
-
-    return reader()
+        for doc, label in INS:
+            yield doc, label
+
+    return reader
 
 
 def train(word_idx):
@@ -133,7 +104,7 @@ def train(word_idx):
     """
     return reader_creator(
         re.compile("aclImdb/train/pos/.*\.txt$"),
-        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx)
 
 
 def test(word_idx):
@@ -150,7 +121,7 @@ def test(word_idx):
     """
     return reader_creator(
         re.compile("aclImdb/test/pos/.*\.txt$"),
-        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx)
 
 
 def word_dict():
@@ -166,3 +137,12 @@ def word_dict():
 
 def fetch():
     paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    w = word_dict()
+    paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train")
+    paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test")
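+
+
+# Usage sketch (hypothetical output directory; the shard naming pattern is the
+# one exercised in tests/common_test.py):
+#
+#     import paddle.v2.dataset.imdb as imdb
+#     imdb.convert("/tmp/imdb_recordio")
+#     # writes shard files prefixed imdb_train-* and imdb_test-*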
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
index dd3a4552d2..617c722c41 100644
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -22,7 +22,7 @@ import paddle.v2.dataset.common
 import collections
 import tarfile
 
-__all__ = ['train', 'test', 'build_dict']
+__all__ = ['train', 'test', 'build_dict', 'convert']
 
 URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
 MD5 = '30177ea32e27c525793142b6bf2c8e2d'
@@ -146,3 +146,16 @@ def test(word_idx, n, data_type=DataType.NGRAM):
 
 def fetch():
     paddle.v2.dataset.common.download(URL, "imikolov", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    N = 5
+    word_dict = build_dict()
+    paddle.v2.dataset.common.convert(path,
+                                     train(word_dict, N), 1000,
+                                     "imikolov_train")
+    paddle.v2.dataset.common.convert(path,
+                                     test(word_dict, N), 1000, "imikolov_test")
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
index 435556b292..9f675bed89 100644
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -21,7 +21,7 @@ import paddle.v2.dataset.common
 import subprocess
 import numpy
 import platform
-__all__ = ['train', 'test']
+__all__ = ['train', 'test', 'convert']
 
 URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
 TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
@@ -113,3 +113,11 @@ def fetch():
     paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
     paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
     paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "mnist_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "mnist_test")
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
index 837a859126..5b61a9420a 100644
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -23,14 +23,15 @@ set and test set into paddle reader creators.
 """
 
 import zipfile
-from common import download
+import paddle.v2.dataset.common
 import re
 import random
 import functools
 
 __all__ = [
     'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
-    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info',
+    'convert'
 ]
 
 age_table = [1, 18, 25, 35, 45, 50, 56]
@@ -99,7 +100,7 @@ USER_INFO = None
 
 
 def __initialize_meta_info__():
-    fn = download(URL, "movielens", MD5)
+    fn = paddle.v2.dataset.common.download(URL, "movielens", MD5)
     global MOVIE_INFO
     if MOVIE_INFO is None:
         pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -246,7 +247,15 @@ def unittest():
 
 
 def fetch():
-    download(URL, "movielens", MD5)
+    paddle.v2.dataset.common.download(URL, "movielens", MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test")
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
index fd71b34166..d3b3dd524c 100644
--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -24,7 +24,6 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
 """
 
 import os
-import random
 import functools
 import rarfile
 from common import download
@@ -212,19 +211,19 @@ def gen_pair(querylist, partial_order="full"):
         for j in range(i + 1, len(querylist)):
             query_right = querylist[j]
             if query_left.relevance_score > query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_left.feature_vector),
                     np.array(query_right.feature_vector)
                 ])
             elif query_left.relevance_score < query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_right.feature_vector),
                     np.array(query_left.feature_vector)
                 ])
     for label, pair in zip(labels, docpairs):
-        yield label, pair[0], pair[1]
+        yield np.array(label), pair[0], pair[1]
 
 
 def gen_list(querylist):
@@ -242,9 +241,9 @@ def gen_list(querylist):
     if not isinstance(querylist, QueryList):
         querylist = QueryList(querylist)
     querylist._correct_ranking_()
-    relevance_score_list = [query.relevance_score for query in querylist]
+    relevance_score_list = [[query.relevance_score] for query in querylist]
     feature_vector_list = [query.feature_vector for query in querylist]
-    yield np.array(relevance_score_list).T, np.array(feature_vector_list)
+    yield np.array(relevance_score_list), np.array(feature_vector_list)
 
 
 def query_filter(querylists):
@@ -265,7 +264,7 @@ def query_filter(querylists):
     return filter_query
 
 
-def load_from_text(filepath, shuffle=True, fill_missing=-1):
+def load_from_text(filepath, shuffle=False, fill_missing=-1):
     """
   parse data file into querys
   """
@@ -287,17 +286,14 @@ def load_from_text(filepath, shuffle=True, fill_missing=-1):
             querylist._add_query(query)
     if querylist is not None:
         querylists.append(querylist)
-    if shuffle == True:
-        random.shuffle(querylists)
     return querylists
 
 
-def __reader__(filepath, format="pairwise", shuffle=True, fill_missing=-1):
+def __reader__(filepath, format="pairwise", shuffle=False, fill_missing=-1):
     """
   Parameters
   --------
   filename : string
-  shuffle : shuffle query-doc pair under the same query
   fill_missing : fill the missing value. default in MQ2007 is -1
   
   Returns
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
index 4dd34e7383..b0b9757c1a 100644
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -26,9 +26,9 @@ from itertools import chain
 import nltk
 from nltk.corpus import movie_reviews
 
-import common
+import paddle.v2.dataset.common
 
-__all__ = ['train', 'test', 'get_word_dict']
+__all__ = ['train', 'test', 'get_word_dict', 'convert']
 NUM_TRAINING_INSTANCES = 1600
 NUM_TOTAL_INSTANCES = 2000
 
@@ -39,12 +39,13 @@ def download_data_if_not_yet():
     """
     try:
         # make sure that nltk can find the data
-        if common.DATA_HOME not in nltk.data.path:
-            nltk.data.path.append(common.DATA_HOME)
+        if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME)
         movie_reviews.categories()
     except LookupError:
         print "Downloading movie_reviews data set, please wait....."
-        nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+        nltk.download(
+            'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
         print "Download data set success....."
         print "Path is " + nltk.data.find('corpora/movie_reviews').path
 
@@ -128,4 +129,13 @@ def test():
 
 
 def fetch():
-    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+    nltk.download(
+        'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train")
+    paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test")
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
index f9815d4f9e..cfa194eba3 100644
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -57,6 +57,38 @@ class TestCommon(unittest.TestCase):
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str("0"))
 
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        self.assertEqual(len(recs), record_num)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py
index cc0626f4fe..a8ae9a07ac 100644
--- a/python/paddle/v2/dataset/tests/flowers_test.py
+++ b/python/paddle/v2/dataset/tests/flowers_test.py
@@ -31,13 +31,13 @@ class TestFlowers(unittest.TestCase):
     def test_train(self):
         instances, max_label_value = self.check_reader(
             paddle.v2.dataset.flowers.train())
-        self.assertEqual(instances, 1020)
+        self.assertEqual(instances, 6149)
         self.assertEqual(max_label_value, 102)
 
     def test_test(self):
         instances, max_label_value = self.check_reader(
             paddle.v2.dataset.flowers.test())
-        self.assertEqual(instances, 6149)
+        self.assertEqual(instances, 1020)
         self.assertEqual(max_label_value, 102)
 
     def test_valid(self):
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
index 4e52810e6b..eed1458244 100644
--- a/python/paddle/v2/dataset/tests/imikolov_test.py
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle.v2.dataset.imikolov
 import unittest
 
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000..31e72ebf5e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        count = 0
+        for sample in reader():
+            self.assertEqual(sample[0].size, 3 * sample[1].size)
+            count += 1
+        return count
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/v2/dataset/tests/wmt16_test.py
new file mode 100644
index 0000000000..cef6c3216e
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/wmt16_test.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.wmt16
+import unittest
+
+
+class TestWMT16(unittest.TestCase):
+    def checkout_one_sample(self, sample):
+        # train data has 3 field: source language word indices,
+        # target language word indices, and target next word indices.
+        self.assertEqual(len(sample), 3)
+
+        # test start mark and end mark in source word indices.
+        self.assertEqual(sample[0][0], 0)
+        self.assertEqual(sample[0][-1], 1)
+
+        # test start mask in target word indices
+        self.assertEqual(sample[1][0], 0)
+
+        # test en mask in target next word indices
+        self.assertEqual(sample[2][-1], 1)
+
+    def test_train(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.train(
+                    src_dict_size=100000, trg_dict_size=100000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_test(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.test(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_val(self):
+        for idx, sample in enumerate(
+                paddle.v2.dataset.wmt16.validation(
+                    src_dict_size=1000, trg_dict_size=1000)()):
+            if idx >= 10: break
+            self.checkout_one_sample(sample)
+
+    def test_get_dict(self):
+        dict_size = 1000
+        word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True)
+        self.assertEqual(len(word_dict), dict_size)
+        self.assertEqual(word_dict[0], "<s>")
+        self.assertEqual(word_dict[1], "<e>")
+        self.assertEqual(word_dict[2], "<unk>")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
index 3469fd9ce1..f10bf7e42a 100644
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -21,7 +21,8 @@ parse training set and test set into paddle reader creators.
 
 import numpy as np
 import os
-from common import download
+import paddle.v2.dataset.common
+from paddle.v2.parameters import Parameters
 
-__all__ = ['train', 'test']
+__all__ = ['train', 'test', 'convert']
 
@@ -29,11 +30,13 @@ URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing
 MD5 = 'd4accdce7a25600298819f8e28e8d593'
 feature_names = [
     'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
     'PTRATIO', 'B', 'LSTAT'
 ]
 
 UCI_TRAIN_DATA = None
 UCI_TEST_DATA = None
+URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar'
+MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
 
 
 def feature_range(maximums, minimums):
@@ -82,7 +85,7 @@ def train():
     :rtype: callable
     """
     global UCI_TRAIN_DATA
-    load_data(download(URL, 'uci_housing', MD5))
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TRAIN_DATA:
@@ -102,7 +105,7 @@ def test():
     :rtype: callable
     """
     global UCI_TEST_DATA
-    load_data(download(URL, 'uci_housing', MD5))
+    load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5))
 
     def reader():
         for d in UCI_TEST_DATA:
@@ -111,5 +114,21 @@ def test():
     return reader
 
 
+def model():
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                                 MD5_MODEL)
+    with open(tar_file, 'r') as f:
+        parameters = Parameters.from_tar(f)
+    return parameters
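+
+
+# Usage sketch: load the pre-trained fit_a_line parameters published with the
+# PaddlePaddle book (see URL_MODEL above).
+#
+#     params = model()  # a paddle.v2.parameters.Parameters instance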
+
+
 def fetch():
-    download(URL, 'uci_housing', MD5)
+    paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train")
+    paddle.v2.dataset.common.convert(path, test(), 1000, "uci_housing_test")
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
new file mode 100644
index 0000000000..617e212d67
--- /dev/null
+++ b/python/paddle/v2/dataset/voc2012.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image dataset for segmentation.
+The 2012 dataset contains images from 2008-2011 for which additional
+segmentations have been prepared. As in previous years the assignment
+to training/test sets has been maintained. The total number of images
+with segmentation has been increased from 7,062 to 9,993.
+"""
+
+import tarfile
+import io
+import numpy as np
+from paddle.v2.dataset.common import download
+from paddle.v2.image import *
+from PIL import Image
+
+__all__ = ['train', 'test', 'val']
+
+VOC_URL = ('http://host.robots.ox.ac.uk/pascal/VOC/voc2012/'
+           'VOCtrainval_11-May-2012.tar')
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+
+def reader_creator(filename, sub_name):
+
+    tarobject = tarfile.open(filename)
+    name2mem = {}
+    for ele in tarobject.getmembers():
+        name2mem[ele.name] = ele
+
+    def reader():
+        set_file = SET_FILE.format(sub_name)
+        sets = tarobject.extractfile(name2mem[set_file])
+        for line in sets:
+            line = line.strip()
+            data_file = DATA_FILE.format(line)
+            label_file = LABEL_FILE.format(line)
+            data = tarobject.extractfile(name2mem[data_file]).read()
+            label = tarobject.extractfile(name2mem[label_file]).read()
+            data = Image.open(io.BytesIO(data))
+            label = Image.open(io.BytesIO(label))
+            data = np.array(data)
+            label = np.array(label)
+            yield data, label
+
+    return reader
+
+
+def train():
+    """
+    Create a train dataset reader containing 2913 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
+
+
+def test():
+    """
+    Create a test dataset reader containing 1464 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
+
+
+def val():
+    """
+    Create a val dataset reader containing 1449 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
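+
+
+# Usage sketch (hypothetical): pull one sample to inspect its layout.
+#
+#     reader = train()
+#     image, label = next(reader())
+#     # image is an HWC uint8 array; label is an HW array of class ids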
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index 0902f87741..5104e29051 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -22,19 +22,27 @@ parse training set and test set into paddle reader creators.
 import tarfile
 import gzip
 
-from paddle.v2.dataset.common import download
+import paddle.v2.dataset.common
 from paddle.v2.parameters import Parameters
 
-__all__ = ['train', 'test', 'build_dict']
+__all__ = [
+    'train',
+    'test',
+    'get_dict',
+    'convert',
+]
 
-URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
+URL_DEV_TEST = ('http://www-lium.univ-lemans.fr/~schwenk/'
+                'cslm_joint_paper/data/dev+test.tgz')
 MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
-# this is a small set of data for test. The original data is too large and will be add later.
-URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+# this is a small set of data for testing. The original data is too large and
+# will be added later.
+URL_TRAIN = ('http://paddlepaddle.cdn.bcebos.com/demo/'
+             'wmt_shrinked_data/wmt14.tgz')
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
 END = "<e>"
@@ -42,8 +50,8 @@ UNK = "<unk>"
 UNK_IDX = 2
 
 
-def __read_to_dict__(tar_file, dict_size):
-    def __to_dict__(fd, size):
+def __read_to_dict(tar_file, dict_size):
+    def __to_dict(fd, size):
         out_dict = dict()
         for line_count, line in enumerate(fd):
             if line_count < size:
@@ -58,19 +66,19 @@ def __read_to_dict__(tar_file, dict_size):
             if each_item.name.endswith("src.dict")
         ]
         assert len(names) == 1
-        src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        src_dict = __to_dict(f.extractfile(names[0]), dict_size)
         names = [
             each_item.name for each_item in f
             if each_item.name.endswith("trg.dict")
         ]
         assert len(names) == 1
-        trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        trg_dict = __to_dict(f.extractfile(names[0]), dict_size)
         return src_dict, trg_dict
 
 
 def reader_creator(tar_file, file_name, dict_size):
     def reader():
-        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
         with tarfile.open(tar_file, mode='r') as f:
             names = [
                 each_item.name for each_item in f
@@ -115,7 +123,8 @@ def train(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
 
 
 def test(dict_size):
@@ -130,16 +139,18 @@ def test(dict_size):
     :rtype: callable
     """
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
 
 
 def gen(dict_size):
     return reader_creator(
-        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size)
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'gen/gen', dict_size)
 
 
 def model():
-    tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL)
+    tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
     with gzip.open(tar_file, 'r') as f:
         parameters = Parameters.from_tar(f)
     return parameters
@@ -148,8 +159,8 @@ def model():
 def get_dict(dict_size, reverse=True):
     # if reverse = False, return dict = {'a':'001', 'b':'002', ...}
     # else reverse = true, return dict = {'001':'a', '002':'b', ...}
-    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+    tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
     if reverse:
         src_dict = {v: k for k, v in src_dict.items()}
         trg_dict = {v: k for k, v in trg_dict.items()}
@@ -157,5 +168,15 @@ def get_dict(dict_size, reverse=True):
 
 
 def fetch():
-    download(URL_TRAIN, 'wmt14', MD5_TRAIN)
-    download(URL_MODEL, 'wmt14', MD5_MODEL)
+    paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL)
+
+
+def convert(path):
+    """
+    Converts dataset to recordio format
+    """
+    dict_size = 30000
+    paddle.v2.dataset.common.convert(path,
+                                     train(dict_size), 1000, "wmt14_train")
+    paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test")
diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/v2/dataset/wmt16.py
new file mode 100644
index 0000000000..c8818f715b
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt16.py
@@ -0,0 +1,349 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+ACL2016 Multimodal Machine Translation. Please see this website for more
+details: http://www.statmt.org/wmt16/multimodal-task.html#task1
+
+If you use the dataset created for your task, please cite the following paper:
+Multi30K: Multilingual English-German Image Descriptions.
+
+@inproceedings{elliott-EtAl:2016:VL16,
+ author    = {{Elliott}, D. and {Frank}, S. and {Sima'an}, K. and {Specia}, L.},
+ title     = {Multi30K: Multilingual English-German Image Descriptions},
+ booktitle = {Proceedings of the 6th Workshop on Vision and Language},
+ year      = {2016},
+ pages     = {70--74}
+}
+"""
+
+import os
+import tarfile
+import gzip
+from collections import defaultdict
+
+import paddle.v2.dataset.common
+
+__all__ = [
+    "train",
+    "test",
+    "validation",
+    "convert",
+    "fetch",
+    "get_dict",
+]
+
+DATA_URL = ("http://cloud.dlnel.org/filepub/"
+            "?uuid=46a0808e-ddd8-427c-bacd-0dbc6d045fed")
+DATA_MD5 = "0c38be43600334966403524a40dcd81e"
+
+TOTAL_EN_WORDS = 11250
+TOTAL_DE_WORDS = 19220
+
+START_MARK = "<s>"
+END_MARK = "<e>"
+UNK_MARK = "<unk>"
+
+
+def __build_dict(tar_file, dict_size, save_path, lang):
+    word_dict = defaultdict(int)
+    with tarfile.open(tar_file, mode="r") as f:
+        for line in f.extractfile("wmt16/train"):
+            line_split = line.strip().split("\t")
+            if len(line_split) != 2: continue
+            sen = line_split[0] if lang == "en" else line_split[1]
+            for w in sen.split():
+                word_dict[w] += 1
+
+    with open(save_path, "w") as fout:
+        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
+        for idx, word in enumerate(
+                sorted(
+                    word_dict.iteritems(), key=lambda x: x[1], reverse=True)):
+            if idx + 3 == dict_size: break
+            fout.write("%s\n" % (word[0]))
+
+
+def __load_dict(tar_file, dict_size, lang, reverse=False):
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    if not os.path.exists(dict_path) or (
+            len(open(dict_path, "r").readlines()) != dict_size):
+        __build_dict(tar_file, dict_size, dict_path, lang)
+
+    word_dict = {}
+    with open(dict_path, "r") as fdict:
+        for idx, line in enumerate(fdict):
+            if reverse:
+                word_dict[idx] = line.strip()
+            else:
+                word_dict[line.strip()] = idx
+    return word_dict
+
+
+def __get_dict_size(src_dict_size, trg_dict_size, src_lang):
+    src_dict_size = min(src_dict_size, (TOTAL_EN_WORDS if src_lang == "en" else
+                                        TOTAL_DE_WORDS))
+    trg_dict_size = min(trg_dict_size, (TOTAL_DE_WORDS if src_lang == "en" else
+                                        TOTAL_EN_WORDS))
+    return src_dict_size, trg_dict_size
+
+
+def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
+    def reader():
+        src_dict = __load_dict(tar_file, src_dict_size, src_lang)
+        trg_dict = __load_dict(tar_file, trg_dict_size,
+                               ("de" if src_lang == "en" else "en"))
+
+        # The indices of the start mark, end mark, and unk token are the same
+        # in the source and target languages, so the source language
+        # dictionary is used to determine them.
+        start_id = src_dict[START_MARK]
+        end_id = src_dict[END_MARK]
+        unk_id = src_dict[UNK_MARK]
+
+        src_col = 0 if src_lang == "en" else 1
+        trg_col = 1 - src_col
+
+        with tarfile.open(tar_file, mode="r") as f:
+            for line in f.extractfile(file_name):
+                line_split = line.strip().split("\t")
+                if len(line_split) != 2:
+                    continue
+                src_words = line_split[src_col].split()
+                src_ids = [start_id] + [
+                    src_dict.get(w, unk_id) for w in src_words
+                ] + [end_id]
+
+                trg_words = line_split[trg_col].split()
+                trg_ids = [trg_dict.get(w, unk_id) for w in trg_words]
+
+                trg_ids_next = trg_ids + [end_id]
+                trg_ids = [start_id] + trg_ids
+
+                yield src_ids, trg_ids, trg_ids_next
+
+    return reader
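+
+
+# A sample yielded by the reader above is a 3-tuple; given the dictionary
+# layout (<s> = 0, <e> = 1), an "en"-source sample looks like, for example:
+#
+#     src_ids      = [0, 5, 28, ..., 1]    # <s> ... <e>
+#     trg_ids      = [0, 7, 31, ...]       # <s> ...
+#     trg_ids_next = [7, 31, ..., 1]       # ... <e>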
+
+
+def train(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 train set reader.
+
+    This function returns the reader for train data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+
+    NOTE:
+    The original link for the training data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/training.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The train reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type.  Only support: "
+                         "en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/train",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def test(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 test set reader.
+
+    This function returns the reader for test data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original link for the test data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/mmt16_task1_test.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The test reader.
+    """
+
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/test",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def validation(src_dict_size, trg_dict_size, src_lang="en"):
+    """
+    WMT16 validation set reader.
+
+    This function returns the reader for validation data. Each sample the reader
+    returns is made up of three fields: the source language word index sequence,
+    target language word index sequence and next word index sequence.
+
+    NOTE:
+    The original link for the validation data is:
+    http://www.quest.dcs.shef.ac.uk/wmt16_files_mmt/validation.tar.gz
+
+    paddle.dataset.wmt16 provides a tokenized version of the original dataset by
+    using moses's tokenization script:
+    https://github.com/moses-smt/mosesdecoder/blob/master/scripts/tokenizer/tokenizer.perl
+
+    Args:
+        src_dict_size(int): Size of the source language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        trg_dict_size(int): Size of the target language dictionary. Three
+                            special tokens will be added into the dictionary:
+                            <s> for start mark, <e> for end mark, and <unk> for
+                            unknown word.
+        src_lang(string): A string indicating which language is the source
+                          language. Available options are: "en" for English
+                          and "de" for Germany.
+
+    Returns:
+        callable: The validation reader.
+    """
+    if src_lang not in ["en", "de"]:
+        raise ValueError("An error language type. "
+                         "Only support: en (for English); de(for Germany).")
+    src_dict_size, trg_dict_size = __get_dict_size(src_dict_size, trg_dict_size,
+                                                   src_lang)
+
+    return reader_creator(
+        tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                                   "wmt16.tar.gz"),
+        file_name="wmt16/val",
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        src_lang=src_lang)
+
+
+def get_dict(lang, dict_size, reverse=False):
+    """
+    Return the word dictionary for the specified language.
+
+    Args:
+        lang(string): A string indicating which language is the source
+                      language. Available options are: "en" for English
+                      and "de" for Germany.
+        dict_size(int): Size of the specified language dictionary.
+        reverse(bool): If reverse is set to False, the returned python
+                       dictionary will use word as key and use index as value.
+                       If reverse is set to True, the returned python
+                       dictionary will use index as key and word as value.
+
+    Returns:
+        dict: The word dictionary for the specific language.
+    """
+
+    if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS)
+    else: dict_size = min(dict_size, TOTAL_DE_WORDS)
+
+    dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME,
+                             "wmt16/%s_%d.dict" % (lang, dict_size))
+    assert os.path.exists(dict_path), (
+        "Word dictionary does not exist. Please invoke "
+        "paddle.dataset.wmt16.train/test/validation first to build it.")
+    tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz")
+    return __load_dict(tar_file, dict_size, lang, reverse)
+
+
+def fetch():
+    """download the entire dataset.
+    """
+    paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5,
+                                      "wmt16.tar.gz")
+
+
+def convert(path, src_dict_size, trg_dict_size, src_lang):
+    """Converts dataset to recordio format.
+    """
+
+    paddle.v2.dataset.common.convert(
+        path,
+        train(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_train")
+    paddle.v2.dataset.common.convert(
+        path,
+        test(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_test")
+    paddle.v2.dataset.common.convert(
+        path,
+        validation(
+            src_dict_size=src_dict_size,
+            trg_dict_size=trg_dict_size,
+            src_lang=src_lang),
+        1000,
+        "wmt16_validation")
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index fd6050fa33..01067ef426 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -1,3 +1,16 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Testing and training events.
 
@@ -9,15 +22,15 @@ There are:
 * BeginPass
 * EndPass
 """
-import py_paddle.swig_paddle as api
-
 __all__ = [
-    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult',
+    'EndForwardBackward'
 ]
 
 
 class WithMetric(object):
     def __init__(self, evaluator):
+        import py_paddle.swig_paddle as api
         if not isinstance(evaluator, api.Evaluator):
             raise TypeError("Evaluator should be api.Evaluator type")
         self.__evaluator__ = evaluator
@@ -54,10 +67,13 @@ class BeginPass(object):
 class EndPass(WithMetric):
     """
     Event On One Pass Training Complete.
+    To get the output of a specific layer, call
+    event.gm.getLayerOutputs('predict_layer') in your event_handler callback.
     """
 
-    def __init__(self, pass_id, evaluator):
+    def __init__(self, pass_id, evaluator, gm):
         self.pass_id = pass_id
+        self.gm = gm
         WithMetric.__init__(self, evaluator)
 
 
@@ -71,13 +87,27 @@ class BeginIteration(object):
         self.batch_id = batch_id
 
 
+class EndForwardBackward(object):
+    """
+    Event On One Batch ForwardBackward Complete.
+    """
+
+    def __init__(self, pass_id, batch_id, gm):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.gm = gm
+
+
 class EndIteration(WithMetric):
     """
     Event On One Batch Training Complete.
+    To get the output of a specific layer, call
+    event.gm.getLayerOutputs('predict_layer') in your event_handler callback.
     """
 
-    def __init__(self, pass_id, batch_id, cost, evaluator):
+    def __init__(self, pass_id, batch_id, cost, evaluator, gm):
         self.pass_id = pass_id
         self.batch_id = batch_id
         self.cost = cost
+        self.gm = gm
         WithMetric.__init__(self, evaluator)
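+
+
+# A minimal event_handler sketch (hypothetical) showing how these events are
+# typically consumed:
+#
+#     def event_handler(event):
+#         if isinstance(event, paddle.v2.event.EndIteration):
+#             print "pass %d, batch %d, cost %f" % (
+#                 event.pass_id, event.batch_id, event.cost)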
diff --git a/python/paddle/v2/fluid/.gitignore b/python/paddle/v2/fluid/.gitignore
new file mode 100644
index 0000000000..2ff540d576
--- /dev/null
+++ b/python/paddle/v2/fluid/.gitignore
@@ -0,0 +1 @@
+proto
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
new file mode 100644
index 0000000000..18c8343d09
--- /dev/null
+++ b/python/paddle/v2/fluid/__init__.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+# import all class inside framework into fluid module
+import framework
+from framework import *
+# import all class inside executor into fluid module
+import executor
+from executor import *
+
+import io
+import evaluator
+import initializer
+import layers
+import nets
+import optimizer
+import learning_rate_decay
+import backward
+import regularizer
+from param_attr import ParamAttr
+from data_feeder import DataFeeder
+from core import LoDTensor, CPUPlace, CUDAPlace
+from distribute_transpiler import DistributeTranspiler
+from distribute_transpiler_simple import SimpleDistributeTranspiler
+import clip
+from memory_optimization_transpiler import memory_optimize
+
+Tensor = LoDTensor
+
+__all__ = framework.__all__ + executor.__all__ + [
+    'io',
+    'initializer',
+    'layers',
+    'nets',
+    'optimizer',
+    'learning_rate_decay',
+    'backward',
+    'regularizer',
+    'LoDTensor',
+    'CPUPlace',
+    'CUDAPlace',
+    'Tensor',
+    'ParamAttr',
+    'DataFeeder',
+    'clip',
+    'SimpleDistributeTranspiler',
+    'DistributeTranspiler',
+    'memory_optimize',
+]
+
+
+def __bootstrap__():
+    """
+    Enable reading gflags from environment variables.
+
+    Returns:
+        None
+    """
+    import sys
+    import core
+    import os
+
+    try:
+        num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
+    except ValueError:
+        num_threads = 1
+
+    if num_threads > 1:
+        print(
+            'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
+            'speed will not be optimized if you use data parallel. It will '
+            'fail if this PaddlePaddle binary is compiled with OpenBlas since'
+            ' OpenBlas does not support multi-threads.'.format(num_threads),
+            file=sys.stderr)
+        print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
+
+    os.environ['OMP_NUM_THREADS'] = str(num_threads)
+
+    read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
+    if core.is_compiled_with_cuda():
+        read_env_flags += ['fraction_of_gpu_memory_to_use']
+    core.init_gflags([sys.argv[0]] +
+                     ["--tryfromenv=" + ",".join(read_env_flags)])
+    core.init_glog(sys.argv[0])
+    core.init_devices()
+
+
+layers.monkey_patch_variable()
+__bootstrap__()
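+
+# Note: each flag in read_env_flags can be supplied through the environment,
+# e.g. exporting FLAGS_fraction_of_gpu_memory_to_use=0.5 before launching a
+# program (that particular flag is only registered in CUDA builds).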
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
new file mode 100644
index 0000000000..29243c90e8
--- /dev/null
+++ b/python/paddle/v2/fluid/backward.py
@@ -0,0 +1,559 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.v2.fluid import framework as framework
+from . import core
+import collections
+import copy
+
+__all__ = [
+    'append_backward',
+    'calc_gradient',
+]
+
+
+def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
+    """
+    Traverse all ops in op_descs[begin_idx : end_idx]; if any op has an
+    input/output named "old_name", rename it to "new_name".
+    """
+    if begin_idx is None:
+        begin_idx = 0
+    if end_idx is None:
+        end_idx = len(op_descs)
+    for i in range(begin_idx, end_idx):
+        op_desc = op_descs[i]
+        if isinstance(op_desc, tuple):
+            op_desc = op_desc[0]
+        op_desc.rename_input(old_name, new_name)
+        op_desc.rename_output(old_name, new_name)
+
+
+def _create_op_desc_(op_type, inputs, outputs, attrs):
+    """
+    Create a C++ OpDesc object with specified inputs, outputs and attributes.
+    """
+    op_desc = core.OpDesc()
+    op_desc.set_type(op_type)
+    for para, args in inputs.iteritems():
+        op_desc.set_input(para, args)
+    for para, args in outputs.iteritems():
+        op_desc.set_output(para, args)
+    for name, val in attrs.iteritems():
+        if isinstance(val, framework.Block):
+            op_desc.set_block_attr(name, val.desc)
+        else:
+            op_desc.set_attr(name, val)
+    return op_desc
+
+
+def _infer_var_data_type_(grad_var_name, block):
+    """
+    Infer the data type of given grad variable
+    """
+    grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
+    if block.desc.has_var_recursive(fwd_name):
+        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        grad_var.set_dtype(fwd_var.dtype())
+    else:
+        grad_var.set_dtype(core.DataType.FP32)
+
+
+def _all_in_set_(cands, s):
+    """
+    Test if all elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if not c in s:
+            return False
+    return True
+
+
+def _some_in_set_(cands, s):
+    """
+    Test if some elements of 'cands' are in set 's'
+    """
+    if len(cands) == 0:
+        return False
+    for c in cands:
+        if c in s:
+            return True
+    return False
+
+
+def _strip_grad_suffix_(name):
+    """
+    Strip the grad suffix from the given variable name
+    e.g. x@GRAD ==> x
+         y@GRAD@RENAME@1 ==> y
+    """
+    pos = name.find(core.grad_var_suffix())
+    return name[:pos] if pos != -1 else name
+
+
+def _append_grad_suffix_(name):
+    """
+    Append grad suffix to the given variable name
+    e.g. x ==> x@GRAD
+    """
+    return name + core.grad_var_suffix()
+
+
+def _addup_repetitive_outputs_(op_descs):
+    """
+    In the backward pass, a variable may be the output of more than one op.
+    In this case, the variable should be the accumulation of all those outputs.
+    `sum_op`s are added to implement the accumulation.
+    """
+    pending_sum_ops = []
+    var_rename_count = collections.defaultdict(int)
+    renamed_vars = collections.defaultdict(list)
+    for idx, op_desc in enumerate(op_descs):
+        for var_name in op_desc.input_arg_names():
+            if len(renamed_vars[var_name]) > 1:
+                pending_sum_ops.append(
+                    (_create_op_desc_("sum", {"X": renamed_vars[var_name]},
+                                      {"Out": [var_name]}, {}), idx))
+                renamed_vars[var_name] = [var_name]
+        for var_name in op_desc.output_arg_names():
+            if var_name == core.empty_var_name(
+            ) or var_name in op_desc.input_arg_names():
+                # empty variable or inplace op
+                continue
+            if len(renamed_vars[var_name]) == 0:
+                # it's the first time we get the variable
+                renamed_vars[var_name] = [var_name]
+            else:
+                if len(renamed_vars[var_name]) == 1:
+                    new_name = var_name + "@RENAME@" + \
+                        str(var_rename_count[var_name])
+                    var_rename_count[var_name] += 1
+                    # rename original var_name
+                    renamed_vars[var_name][0] = new_name
+                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    _rename_arg_(pending_sum_ops, var_name, new_name)
+
+                new_name = var_name + "@RENAME@" + \
+                    str(var_rename_count[var_name])
+                var_rename_count[var_name] += 1
+                op_desc.rename_output(var_name, new_name)
+                renamed_vars[var_name].append(new_name)
+    for var_name, inputs in renamed_vars.iteritems():
+        if len(inputs) > 1:
+            pending_sum_ops.append((_create_op_desc_(
+                "sum", {"X": inputs}, {"Out": [var_name]}, {}), len(op_descs)))
+    # sum_op descs are sorted according to their insert position
+    for p in reversed(pending_sum_ops):
+        op_descs.insert(p[1], p[0])
+
+    return op_descs
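+
+
+# Worked example: if two grad ops both write x@GRAD, the writes are renamed to
+# x@GRAD@RENAME@0 and x@GRAD@RENAME@1, and a sum_op is inserted afterwards to
+# accumulate them back into x@GRAD.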
+
+
+def _remove_no_grad_branch_(op_descs, no_grad_set):
+    """
+    Remove unnecessary grad ops
+    A grad op can be removed in two cases:
+        1. all outputs of the grad op are in 'no_grad_set'
+        2. all grad inputs of the grad op are in 'no_grad_set'
+    """
+
+    def _op_can_be_removed_(op_desc, no_grad_set):
+        out_arg_names = op_desc.output_arg_names()
+        if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set):
+            return True
+        if _all_in_set_(
+                filter(lambda name: name.find(core.grad_var_suffix()) != -1,
+                       op_desc.input_arg_names()), no_grad_set):
+            no_grad_set.update(out_arg_names)
+            return True
+        return False
+
+    # Remove ops whose outputs are all in no_grad_dict
+    op_descs = filter(
+        lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs)
+    # Insert fill_zeros_like_op
+    to_insert = []
+    for idx, op_desc in enumerate(op_descs):
+        for arg in op_desc.input_arg_names():
+            if core.grad_var_suffix() in arg and arg in no_grad_set:
+                to_insert.append((_create_op_desc_("fill_zeros_like", {
+                    "X": [_strip_grad_suffix_(arg)]
+                }, {"Out": [arg]}, {}), idx))
+
+    map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
+
+    return op_descs
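+
+
+# Example: if y@GRAD is in no_grad_set but a kept grad op still consumes it, a
+# fill_zeros_like op is inserted so that the op receives an all-zero y@GRAD
+# instead of reading an uninitialized variable.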
+
+
+def _append_backward_ops_(block,
+                          ops,
+                          target_block,
+                          no_grad_dict,
+                          grad_to_var,
+                          callback=None):
+    """
+    Create all grad ops, and insert them into given block
+
+    Args:
+        block(Block): the block where forward ops are
+        ops(list[Op]): the forward operators whose backward ops need to be added
+        target_block(Block): the block which is going to hold newly generated grad ops
+        no_grad_dict(dict):
+            key(int): block index
+            val(set): a set of variable names; these variables have no gradient
+        grad_to_var(dict)(output argument):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+        callback(callable object): a callable object used to decorate newly generated grad ops
+    """
+    if callback is None:
+
+        def empty_callback(block, context):
+            pass
+
+        callback = empty_callback
+    elif not hasattr(callback, '__call__'):
+        raise ValueError("'callback' must be a callable object.")
+
+    # grad_op_descs holds created grad_op, and will be appended to target_block
+    grad_op_descs = []
+    program = block.program
+    for op in reversed(ops):
+        grad_sub_block_list = []
+        # If the op has its own sub-block, deal with the sub-block first
+        if op.has_attr("sub_block"):
+            sub_block = program.block(op.block_attr("sub_block"))
+            grad_sub_block = program.create_block(parent_idx=sub_block.idx)
+            _append_backward_ops_(sub_block, sub_block.ops, grad_sub_block,
+                                  no_grad_dict, grad_to_var)
+            grad_sub_block_list.append(grad_sub_block.desc)
+
+        # Getting op's corresponding grad_op
+        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
+            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+
+        grad_op_descs.extend(grad_op_desc)
+        grad_to_var.update(op_grad_to_var)
+
+    grad_op_descs = _addup_repetitive_outputs_(grad_op_descs)
+
+    grad_op_descs = _remove_no_grad_branch_(grad_op_descs,
+                                            no_grad_dict[block.idx])
+
+    # append op_desc in grad_op_descs to target_block
+    for op_desc in grad_op_descs:
+        new_op_desc = target_block.desc.append_op()
+        new_op_desc.copy_from(op_desc)
+        callback(block=target_block, context=grad_to_var)
+
+
+def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
+    """
+    Create new variables required by backward pass.
+
+    Args:
+        block(Block): the block where new variables will be created
+        start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
+        grad_to_var(dict):
+            key(str): grad variable name
+            val(str): corresponding forward variable name
+            In most cases, this dict is generated by _append_backward_ops_()
+        grad_info_map(dict)(output argument):
+            key(str): forward variable name
+            val(tuple): a tuple of (str, Block), str is the corresponding grad name, Block is the block containing grad variable
+    """
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        if op_desc.has_attr("sub_block"):
+            sub_block = block.program.block(op_desc.block_attr("sub_block"))
+            _append_backward_vars_(sub_block, 0, grad_to_var, grad_info_map)
+        new_vars = set()
+        # create new gradient variables
+        for grad_var_name in op_desc.output_arg_names():
+            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(
+                    grad_var_name) or grad_var_name == core.empty_var_name():
+                continue
+            block.desc.var(grad_var_name)
+            new_vars.add(grad_var_name)
+            if not grad_to_var.has_key(grad_var_name):
+                continue
+            grad_info_map[grad_to_var[grad_var_name]] = (grad_var_name, block)
+        # infer_shape and infer_type
+        op_desc.infer_var_type(block.desc)
+        op_desc.infer_shape(block.desc)
+        for arg in op_desc.output_arg_names():
+            if arg in new_vars:
+                _infer_var_data_type_(arg, block)
+
+
+def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
+    var_map = copy.copy(target_grad_map)
+    for op_idx in range(start_op_idx, block.desc.op_size()):
+        op_desc = block.desc.op(op_idx)
+        for name in op_desc.input_arg_names():
+            if name in var_map:
+                op_desc.rename_input(name, var_map[name])
+
+        for name in op_desc.output_arg_names():
+            if block.desc.find_var(name.encode("ascii")):
+                new_name = "%s_%s" % (name, core.unique_integer(name))
+                op_desc.rename_output(name, new_name)
+                var_map[name] = new_name
+
+    for g, ng in var_map.iteritems():
+        if g in grad_to_var:
+            grad_to_var[ng] = grad_to_var[g]
+            grad_to_var.pop(g)
+
+
+def _get_stop_gradients_(program):
+    no_grad_dict = dict()
+    assert isinstance(program, framework.Program)
+    for block in program.blocks:
+        assert isinstance(block, framework.Block)
+        block_no_grad_set = set()
+        for var in block.vars.itervalues():
+            assert isinstance(var, framework.Variable)
+            if var.stop_gradient:
+                block_no_grad_set.add(_append_grad_suffix_(var.name))
+        no_grad_dict[block.idx] = block_no_grad_set
+    return no_grad_dict
+
+
+def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
+    """
+    Append backward part to main_program
+
+    Args:
+        loss(Variable): The variable generated by cost function.
+        parameter_list(list[string]): Parameters that need to be updated by
+            optimizer. If None, it means all parameters need to be updated.
+        no_grad_set(set): Variables that have no gradients in Block 0.
+            All variables with `stop_gradient=True` from all blocks will be
+            automatically added.
+
+    Return:
+        (list[(Variable,Variable)]): list of (parameter, gradient) pairs.
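+
+    Example (illustrative sketch; assumes `avg_cost` is a loss Variable
+    built with fluid layers):
+
+        param_grads = append_backward(loss=avg_cost)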
+    """
+    assert isinstance(loss, framework.Variable)
+
+    program = loss.block.program
+    if no_grad_set is None:
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(program)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+
+    grad_info_map = dict()
+    root_block = program.block(0)
+
+    fwd_op_num = root_block.desc.op_size()
+    current_block_idx = program.current_block_idx
+    grad_to_var = dict()
+
+    op_desc = _create_op_desc_("fill_constant", {}, {
+        "Out": [_append_grad_suffix_(loss.name)]
+    }, {"shape": [1],
+        "value": 1.0,
+        "dtype": loss.dtype})
+    root_block.desc.append_op().copy_from(op_desc)
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(root_block, [loss], [], block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+
+    _append_backward_ops_(root_block, op_path, root_block, no_grad_dict,
+                          grad_to_var, callback)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(root_block, fwd_op_num, grad_to_var, {})
+
+    _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
+
+    program.current_block_idx = current_block_idx
+    program.sync_with_cpp()
+
+    if parameter_list is not None:
+        parameters = parameter_list
+    else:
+        params = program.global_block().all_parameters()
+        parameters = [param.name for param in params]
+
+    params_and_grads = []
+    for param in parameters:
+        if param not in grad_info_map:
+            raise ValueError("param %s is not in map" % param)
+        grad_info = grad_info_map[param]
+        grad_block = grad_info[1]
+        if not grad_block.has_var(grad_info[0]):
+            raise ValueError("grad block[{0}] did not have grad var {1}".format(
+                grad_info[1], grad_info[0]))
+        # Get the param var from the global block
+        param_var = program.global_block().var(param)
+        grad_var = grad_block.var(grad_info[0])
+        if loss.block.has_var(grad_info[0]):
+            params_and_grads.append((param_var, grad_var))
+        else:
+            params_and_grads.append((param_var, None))
+    return params_and_grads
+
+
+def _as_list(x):
+    if x is None:
+        return []
+    return list(x) if isinstance(x, collections.Sequence) else [x]
+
+
+def _find_op_path_(block, outputs, inputs, no_grad_set):
+    """
+    NOTE: `no_grad_set` will be modified in place.
+    """
+    input_names = set([inp.name for inp in inputs])
+    output_names = set([out.name for out in outputs])
+
+    relevant_op_flags = [True] * len(block.ops)
+
+    # If `inputs` is empty, all the inputs of the block are considered used.
+    if inputs:
+        for i, op in enumerate(block.ops):
+            if _some_in_set_(op.desc.input_arg_names(), input_names):
+                for name in op.desc.output_arg_names():
+                    if name not in no_grad_set:
+                        input_names.add(name)
+            else:
+                relevant_op_flags[i] = False
+
+    for i, op in reversed(list(enumerate(block.ops))):
+        if _some_in_set_(op.desc.output_arg_names(), output_names):
+            for name in op.desc.input_arg_names():
+                if name not in no_grad_set:
+                    output_names.add(name)
+        else:
+            relevant_op_flags[i] = False
+
+    op_path = [
+        block.ops[i] for i in range(len(block.ops)) if relevant_op_flags[i]
+    ]
+
+    if inputs:
+        for op in op_path:
+            for name in op.desc.input_arg_names():
+                if name not in input_names:
+                    no_grad_set.add(name)
+
+    return op_path
+
+
+def calc_gradient(targets, inputs, target_gradients=None, no_grad_set=None):
+    """
+    Backpropagate the gradients of targets to inputs.
+
+    Args:
+        targets(Variable|list[Variable]): The target variables
+        inputs(Variable|list[Variable]): The input variables
+        no_grad_set(set[string]): The names of variables that have no gradients
+            in Block 0. All variables with `stop_gradient=True` from all blocks
+            will be automatically added.
+
+    Return:
+        (list[Variable]): list of gradients for inputs
+        If an input does not affect targets, the corresponding gradient variable
+        will be None
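+
+    Example (illustrative sketch; assumes `y` was computed from Variable `x`
+    in the current program):
+
+        dx = calc_gradient(targets=y, inputs=x)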
+    """
+    targets = _as_list(targets)
+    inputs = _as_list(inputs)
+    target_gradients = _as_list(target_gradients)
+
+    block = targets[0].block
+    prog = block.program
+    block_idx = block.idx
+
+    if not target_gradients:
+        target_gradients = [None] * len(targets)
+
+    if len(targets) != len(target_gradients):
+        raise ValueError(
+            "Should have the same number of target_gradients as targets")
+
+    if no_grad_set is None:
+        no_grad_set = set()
+    no_grad_set = copy.copy(no_grad_set)
+    no_grad_dict = _get_stop_gradients_(prog)
+    no_grad_dict[0].update(map(_append_grad_suffix_, no_grad_set))
+
+    fwd_op_num = block.desc.op_size()
+
+    target_grad_map = {}
+    for i, grad in enumerate(target_gradients):
+        target = targets[i]
+        if grad is None:
+            grad_name = _append_grad_suffix_(target.name)
+            op_desc = _create_op_desc_("fill_constant_batch_size_like",
+                                       {"Input": [target.name]},
+                                       {"Out": [grad_name]}, {
+                                           "shape": target.shape,
+                                           "value": 1.0,
+                                           "dtype": target.dtype,
+                                           'input_dim_idx': 0,
+                                           'output_dim_idx': 0
+                                       })
+            block.desc.append_op().copy_from(op_desc)
+        else:
+            if target.block.idx != block_idx or target.block.program != prog:
+                raise ValueError("all targets must be in the same block")
+            if target.shape != grad.shape:
+                raise ValueError(
+                    "The shapes of target and grad are different: %s %s" % (
+                        target.name, grad.name))
+            target_grad_map[_append_grad_suffix_(target.name)] = grad.name
+
+    for input in inputs:
+        if input.block.program != prog:
+            raise "input must be in the same program as targets"
+
+    block_no_grad_set = set(map(_strip_grad_suffix_, no_grad_dict[0]))
+    op_path = _find_op_path_(block, targets, inputs, block_no_grad_set)
+    no_grad_dict[0].update(map(_append_grad_suffix_, block_no_grad_set))
+    grad_to_var = dict()
+    grad_info_map = dict()
+    _append_backward_ops_(block, op_path, block, no_grad_dict, grad_to_var)
+
+    # Because calc_gradient may be called multiple times,
+    # we need to rename the internal gradient variables so that they have
+    # different names.
+    _rename_grad_(block, fwd_op_num, grad_to_var, target_grad_map)
+
+    _append_backward_vars_(block, fwd_op_num, grad_to_var, grad_info_map)
+    prog.sync_with_cpp()
+
+    grad_vars = []
+    for input_var in inputs:
+        if input_var.name not in grad_info_map:
+            grad_vars.append(None)
+        else:
+            grad_info = grad_info_map[input_var.name]
+            grad_block = grad_info[1]
+            grad_var = grad_block.var(grad_info[0])
+            grad_vars.append(grad_var)
+
+    if len(grad_vars) == 1:
+        return grad_vars[0]
+    else:
+        return grad_vars
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
new file mode 100644
index 0000000000..fdbc8524ab
--- /dev/null
+++ b/python/paddle/v2/fluid/clip.py
@@ -0,0 +1,236 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+
+import functools
+import layers
+import framework
+from . import core
+
+__all__ = [
+    'ErrorClipByValue',
+    'GradientClipByValue',
+    'GradientClipByNorm',
+    'GradientClipByGlobalNorm',
+    'append_gradient_clip_ops',
+    'error_clip_callback',
+]
+
+
+class BaseErrorClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
+    def append_clip_op(self, block, grad_name):
+        raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
+    def append_clip_op(self, block, grad_name):
+        clip_op_desc = block.desc.append_op()
+        clip_op_desc.set_type("clip")
+        clip_op_desc.set_input("X", [grad_name])
+        clip_op_desc.set_output("Out", [grad_name])
+        clip_op_desc.set_attr("min", self.min)
+        clip_op_desc.set_attr("max", self.max)
+
+
+def error_clip_callback(block, context):
+    # the context is a grad_to_var map
+    grad_to_var = context
+    op_desc = block.desc.op(block.desc.op_size() - 1)
+    for grad_n in filter(lambda n: grad_to_var.has_key(n),
+                         op_desc.output_arg_names()):
+        fwd_var = block.var_recursive(grad_to_var[grad_n])
+        error_clip = getattr(fwd_var, "error_clip", None)
+        if not (error_clip is None or isinstance(error_clip,
+                                                 BaseErrorClipAttr)):
+            raise TypeError(
+                "Variable's error_clip should be an instance of BaseErrorClipAttr or None."
+            )
+        if error_clip is not None:
+            error_clip.append_clip_op(block, grad_n)
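+
+# error_clip_callback is designed to be passed as the `callback` argument of
+# append_backward; a forward variable opts in by setting, e.g. (illustrative):
+#     fc_out.error_clip = ErrorClipByValue(max=5.0)
+# so a clip op is appended right after each grad op producing its gradient.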
+
+
+class BaseGradientClipAttr(object):
+    def __str__(self):
+        raise NotImplementedError()
+
+    def process_context(self, context, param, grad):
+        raise NotImplementedError()
+
+    def create_operators(self, param, grad):
+        raise NotImplementedError()
+
+
+class NullGradientClipAttr(BaseGradientClipAttr):
+    def __str__(self):
+        return "Null"
+
+    def process_context(self, context, param, grad):
+        pass
+
+    def create_operators(self, param, grad):
+        return param, grad
+
+
+class GradientClipByValue(BaseGradientClipAttr):
+    def __init__(self, max, min=None):
+        max = float(max)
+        if min is None:
+            min = -max
+        else:
+            min = float(min)
+        self.max = max
+        self.min = min
+
+    def __str__(self):
+        return "ByValue, min=%f, max=%f" % (self.min, self.max)
+
+    def process_context(self, context, param, grad):
+        pass
+
+    def create_operators(self, param, grad):
+        new_grad = layers.clip(x=grad, min=self.min, max=self.max)
+        return param, new_grad
+
+
+class GradientClipByNorm(BaseGradientClipAttr):
+    def __init__(self, clip_norm):
+        self.clip_norm = clip_norm
+
+    def __str__(self):
+        return "ByNorm, clip_norm=%f" % self.clip_norm
+
+    def process_context(self, context, param, grad):
+        pass
+
+    def create_operators(self, param, grad):
+        new_grad = layers.clip_by_norm(x=grad, max_norm=self.clip_norm)
+        return param, new_grad
+
+
+class GradientClipByGlobalNorm(BaseGradientClipAttr):
+    def __init__(self, clip_norm, group_name="default_group"):
+        if not isinstance(group_name, basestring):
+            raise TypeError("'group_name' must be a basestring.")
+
+        self.clip_norm = clip_norm
+        self.group_name = group_name
+
+    def __str__(self):
+        return "ByGlobalNorm, group_name=%s, clip_norm=%f" % (self.group_name,
+                                                              self.clip_norm)
+
+    def process_context(self, context, param, grad):
+        if self.group_name not in context:
+            context[self.group_name] = []
+            context[self.group_name + "_clip_value"] = self.clip_norm
+            context[self.group_name + "_clip"] = layers.fill_constant(
+                shape=[1], dtype="float32", value=self.clip_norm)
+        else:
+            if not self.clip_norm == context[self.group_name + "_clip_value"]:
+                raise ValueError(
+                    "All parameters' 'clip_norm' of a same group should be the same"
+                )
+
+        local_norm_var = layers.reduce_sum(input=layers.pow(x=grad, factor=2.0))
+        context[self.group_name].append(local_norm_var)
+
+        self.context = context
+
+    def create_operators(self, param, grad):
+        group_scale_name = self.group_name + "_scale"
+        if group_scale_name not in self.context:
+            group_norm_var = layers.sums(input=self.context[self.group_name])
+            layers.sqrt(x=group_norm_var, out=group_norm_var)
+            clip_var = self.context[self.group_name + "_clip"]
+            group_scale_var = layers.elementwise_div(
+                x=clip_var,
+                y=layers.elementwise_max(
+                    x=clip_var, y=group_norm_var))
+            assert group_scale_var.shape == (1L, )
+            self.context[group_scale_name] = group_scale_var
+
+        new_grad = layers.elementwise_mul(
+            x=grad, y=self.context[group_scale_name])
+        return param, new_grad
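+
+# The ops above implement the standard global-norm clipping rule:
+#     global_norm = sqrt(sum over group of reduce_sum(grad ** 2))
+#     new_grad    = grad * clip_norm / max(clip_norm, global_norm)
+# so gradients are only rescaled when global_norm exceeds clip_norm.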
+
+
+def set_gradient_clip(clip, param_list=None, program=None):
+    """
+    To specify parameters that require gradient clipping.
+    Args:
+        clip(BaseGradientClipAttr): An instance of some derived class of
+                BaseGradientClipAttr, which describes the type and detailed
+                attributes of the required gradient clipping.
+        param_list(list, None by default): Parameters that require gradient
+                clipping. It can be a list of parameters or a list of
+                parameters' names. When None, all parameters in the program
+                are included.
+        program(Program, None by default): The program the parameters belong
+                to. Defaults to the default main program when None.
+    """
+    if not isinstance(clip, BaseGradientClipAttr):
+        raise TypeError(
+            "'clip' should be an instance of BaseGradientClipAttr's derived class"
+        )
+    if program is None:
+        program = framework.default_main_program()
+    if param_list is None:
+        param_list = program.block(0).all_parameters()
+    if all(isinstance(elem, basestring) for elem in param_list):
+        param_list = [program.block(0).var(elem) for elem in param_list]
+    if not all(isinstance(elem, framework.Parameter) for elem in param_list):
+        raise TypeError(
+            "'param_list' should be a list of Parameter or basestring(parameter's name)."
+        )
+
+    for param in param_list:
+        param.gradient_clip_attr = copy.deepcopy(clip)
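+
+# Example (illustrative sketch; run before the optimizer appends its ops):
+#
+#     set_gradient_clip(GradientClipByGlobalNorm(clip_norm=5.0))
+#
+# The optimizer is then expected to pass its (param, grad) pairs through
+# append_gradient_clip_ops below so the clip ops actually get appended.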
+
+
+def append_gradient_clip_ops(param_grad):
+    context = dict()
+    create_op_callbacks = []
+    for p, g in param_grad:
+        clip_attr = getattr(p, 'gradient_clip_attr', NullGradientClipAttr())
+        if clip_attr is None:
+            clip_attr = NullGradientClipAttr()
+        if not isinstance(clip_attr, BaseGradientClipAttr):
+            raise TypeError(
+                "clip attribute should be an instance of BaseGradientClipAttr")
+
+        clip_attr.process_context(context=context, param=p, grad=g)
+        create_op_callbacks.append(
+            functools.partial(
+                clip_attr.create_operators, param=p, grad=g))
+
+    return [each_callback() for each_callback in create_op_callbacks]
+
+
+ClipByValue = GradientClipByValue
+ClipByNorm = GradientClipByNorm
+ClipByGlobalNorm = GradientClipByGlobalNorm
diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py
new file mode 100644
index 0000000000..a3b22a8633
--- /dev/null
+++ b/python/paddle/v2/fluid/data_feeder.py
@@ -0,0 +1,115 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import core
+import numpy
+import six.moves as six
+
+from framework import Variable, default_main_program
+
+__all__ = ['DataFeeder']
+
+
+class DataToLoDTensorConverter(object):
+    def __init__(self, place, lod_level, shape, dtype):
+        self.place = place
+        self.lod_level = lod_level
+        self.shape = shape
+        if dtype == core.DataType.FP32:
+            self.dtype = 'float32'
+        elif dtype == core.DataType.INT64:
+            self.dtype = 'int64'
+        elif dtype == core.DataType.FP64:
+            self.dtype = 'float64'
+        elif dtype == core.DataType.INT32:
+            self.dtype = 'int32'
+        else:
+            raise ValueError("dtype must be any of [int32, float32, int64, "
+                             "float64]")
+
+        self.data = []
+        self.lod = []
+
+        for i in six.range(lod_level):
+            self.lod.append([0])
+
+    def feed(self, data):
+        self._feed_impl_(data, self.lod, self.lod_level)
+
+    def _feed_impl_(self, data, lod, lod_level):
+        if lod_level == 0:
+            self.data.append(data)
+        else:
+            cur_lod_len = len(data)
+            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            for each_data in data:
+                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+
+    def done(self):
+        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        t = core.LoDTensor()
+        t.set(arr, self.place)
+        if self.lod_level > 0:
+            t.set_lod(self.lod)
+        return t
+
+
+class DataFeeder(object):
+    def __init__(self, feed_list, place, program=None):
+        self.feed_dtypes = []
+        self.feed_names = []
+        self.feed_shapes = []
+        self.feed_lod_level = []
+        if program is None:
+            program = default_main_program()
+        for each_var in feed_list:
+            if isinstance(each_var, basestring):
+                each_var = program.block(0).var(each_var)
+            if not isinstance(each_var, Variable):
+                raise TypeError("Feed list should contain a list of variable")
+            self.feed_dtypes.append(each_var.dtype)
+            self.feed_names.append(each_var.name)
+            shape = each_var.shape
+            batch_size_dim = -1
+            for i, s in enumerate(shape):
+                if s < 0:
+                    batch_size_dim = i
+                    break
+            if batch_size_dim == -1:
+                raise ValueError("Variable {0} must has a batch size dimension",
+                                 each_var.name)
+            self.feed_lod_level.append(each_var.lod_level)
+            self.feed_shapes.append(shape)
+
+        self.place = place
+
+    def feed(self, iterable):
+        converter = []
+        for lod_level, shape, dtype in six.zip(
+                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
+            converter.append(
+                DataToLoDTensorConverter(
+                    place=self.place,
+                    lod_level=lod_level,
+                    shape=shape,
+                    dtype=dtype))
+
+        for each_sample in iterable:
+            for each_converter, each_slot in six.zip(converter, each_sample):
+                each_converter.feed(each_slot)
+        ret_dict = {}
+        for each_name, each_converter in six.zip(self.feed_names, converter):
+            ret_dict[each_name] = each_converter.done()
+        return ret_dict
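+
+# Example (illustrative sketch; assumes `image` and `label` are data layers
+# and `exe` is an Executor created on `place`):
+#
+#     feeder = DataFeeder(feed_list=[image, label], place=place)
+#     for mini_batch in train_reader():
+#         outs = exe.run(feed=feeder.feed(mini_batch))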
diff --git a/python/paddle/v2/fluid/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py
new file mode 100644
index 0000000000..a27280208b
--- /dev/null
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Default scope functions.
+
+`Paddle` manages Scope the way a programming language manages scopes: it is
+just a thread-local stack of Scopes. The top of the stack is the current
+scope; the bottom is the parent of all scopes.
+
+Invoking `var/find_var` creates or finds a variable in the current scope.
+Invoking `enter_local_scope/leave_local_scope` creates or destroys a local
+scope.
+
+A `scoped_function` will take a `function` as input. That function will be
+invoked in a new local scope.
+"""
+
+import paddle.v2.fluid.core
+import threading
+
+__tl_scope__ = threading.local()
+
+__all__ = [
+    'get_cur_scope',
+    'enter_local_scope',
+    'leave_local_scope',
+    'var',
+    'find_var',
+    'scoped_function',
+]
+
+
+def get_cur_scope():
+    """
+    Get current scope.
+    :rtype: paddle.v2.fluid.core.Scope
+    """
+    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
+    if cur_scope_stack is None:
+        __tl_scope__.cur_scope = list()
+    if len(__tl_scope__.cur_scope) == 0:
+        __tl_scope__.cur_scope.append(paddle.v2.fluid.core.Scope())
+    return __tl_scope__.cur_scope[-1]
+
+
+def enter_local_scope():
+    """
+    Enter a new local scope
+    """
+    cur_scope = get_cur_scope()
+    new_scope = cur_scope.new_scope()
+    __tl_scope__.cur_scope.append(new_scope)
+
+
+def leave_local_scope():
+    """
+    Leave local scope
+    """
+    __tl_scope__.cur_scope.pop()
+    get_cur_scope().drop_kids()
+
+
+def var(name):
+    """
+    Create a variable in the current scope.
+    """
+    return get_cur_scope().var(name)
+
+
+def find_var(name):
+    """
+    Find a variable in the current scope.
+    """
+    return get_cur_scope().find_var(name)
+
+
+def scoped_function(func):
+    """
+    Invoke `func` in a new local scope.
+
+    :param func: a callable function that will be run in new scope.
+    :type func: callable
+    """
+    enter_local_scope()
+    try:
+        func()
+    finally:
+        leave_local_scope()
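+
+# Example (illustrative): run `fn` inside a fresh local scope so variables it
+# creates are dropped on exit:
+#
+#     def fn():
+#         var("tmp")  # lives only in the local scope
+#
+#     scoped_function(fn)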
diff --git a/python/paddle/v2/fluid/distribute_transpiler.py b/python/paddle/v2/fluid/distribute_transpiler.py
new file mode 100644
index 0000000000..a4464a281a
--- /dev/null
+++ b/python/paddle/v2/fluid/distribute_transpiler.py
@@ -0,0 +1,554 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import framework
+from framework import Program, default_main_program, Parameter, Variable
+import optimizer
+from layer_helper import LayerHelper
+from distributed_spliter import *
+import math
+from . import core
+
+
+class VarBlock:
+    def __init__(self, varname, offset, size):
+        self.varname = varname
+        # NOTE: real offset is offset * size
+        self.offset = offset
+        self.size = size
+
+    def __str__(self):
+        return "%s:%d:%d" % (self.varname, self.offset, self.size)
+
+
+def same_or_split_var(p_name, var_name):
+    return p_name == var_name or p_name.startswith(var_name + ".block")
+
+
+def split_dense_variable(var_list,
+                         pserver_count,
+                         min_block_size=1024,
+                         max_block_size=1048576):
+    """
+        We may need to split a dense tensor into one or more blocks and put
+        them equally onto parameter servers. One block is a sub-tensor
+        aligned by dim[0] of the tensor.
+
+        We need a minimal block size so that the calculations on the
+        parameter server side can gain better performance. By default the
+        minimum block size is 1024. The max block size is used to prevent
+        blocks so large that they may cause send errors.
+    """
+    blocks = []
+    for var in var_list:
+        split_count = pserver_count
+        var_numel = reduce(lambda x, y: x * y, var.shape)
+        max_pserver_count = int(math.floor(var_numel / float(min_block_size)))
+        if max_pserver_count == 0:
+            max_pserver_count = 1
+        if max_pserver_count < pserver_count:
+            split_count = max_pserver_count
+        block_size = int(math.ceil(var_numel / float(split_count)))
+
+        if len(var.shape) >= 2:
+            # align by dim1(width)
+            dim1 = reduce(lambda x, y: x * y, var.shape[1:])
+            remains = block_size % dim1
+            if remains != 0:
+                block_size += dim1 - remains
+        # update split_count after aligning
+        split_count = int(math.ceil(var_numel / float(block_size)))
+        for block_id in xrange(split_count):
+            curr_block_size = min(block_size, var_numel - (
+                (block_id) * block_size))
+            block = VarBlock(var.name, block_id, curr_block_size)
+            blocks.append(str(block))
+    return blocks
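+
+# Worked example (illustrative): a var of shape [1000, 10] has numel 10000.
+# With 4 pservers and min_block_size=1024, max_pserver_count =
+# floor(10000 / 1024) = 9 >= 4, so split_count stays 4 and block_size =
+# ceil(10000 / 4) = 2500, already a multiple of dim1 = 10; the result is
+# four blocks "name:0:2500" ... "name:3:2500".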
+
+
+class DistributeTranspiler:
+    def transpile(self,
+                  optimize_ops,
+                  params_grads,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  split_method=round_robin):
+        """
+            Transpile the program to distributed data-parallelism programs.
+            The main_program will be transformed to use a remote parameter server
+            to do parameter optimization, and the optimization graph will be put
+            into a parameter server program.
+
+            Use different methods to split trainable variables to different
+            parameter servers.
+
+            :param optimize_ops: op list of optimization, should be the
+                                 return value of Optimizer.minimize
+            :type optimize_ops: list
+            :param program: program to optimize, default is default_main_program
+            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+            :type pservers: string
+            :return: a list of programs
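+
+            Example (illustrative; `optimize_ops` and `params_grads` are the
+            return values of Optimizer.minimize):
+
+                t = DistributeTranspiler()
+                t.transpile(optimize_ops, params_grads,
+                            pservers="127.0.0.1:6174", trainers=1)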
+        """
+        assert (callable(split_method))
+        if program is None:
+            program = default_main_program()
+        self.program = program
+        self.trainers = trainers
+        self.optimize_ops = optimize_ops
+        # steps to transpile:
+        # 1. split variable to multiple blocks, aligned by product(dim[1:]) (width).
+        # 2. modify trainer program: add split_op to each Grad.
+        # 3. append send_op to trainer.
+        # 4. append concat_op to trainer to update local weights.
+        # 5. create new program for parameter server.
+        # 6. create parameter server program by split_method generated endpoint->VarBlock
+
+        pserver_endpoints = pservers.split(",")
+
+        # step1
+        param_list = [pg[0] for pg in params_grads]
+        grad_list = [pg[1] for pg in params_grads]
+        # TODO: add split selected rows support
+        grad_blocks = split_dense_variable(grad_list, len(pserver_endpoints))
+        param_blocks = split_dense_variable(param_list, len(pserver_endpoints))
+        # step2
+        grad_var_mapping = self._append_split_op(program, grad_blocks)
+
+        # step3
+        send_inputs = []
+        send_outputs = []
+        for b in grad_blocks:  # append by order
+            varname, block_id, _ = b.split(":")
+            send_inputs.append(grad_var_mapping[varname][int(block_id)])
+
+        param_var_mapping = self._create_vars_from_blocklist(program,
+                                                             param_blocks)
+        for b in param_blocks:
+            varname, block_id, _ = b.split(":")
+            send_outputs.append(param_var_mapping[varname][int(block_id)])
+        # let send_op know which endpoint to send which var to; eplist has the same
+        # order as send_inputs.
+        eplist = split_method(send_inputs, pserver_endpoints)
+        # create mapping of endpoint -> split var to create pserver side program
+        self.param_grad_ep_mapping = dict()
+        for i, ep in enumerate(eplist):
+            param = send_outputs[i]
+            grad = send_inputs[i]
+            if not self.param_grad_ep_mapping.has_key(ep):
+                self.param_grad_ep_mapping[ep] = {"params": [], "grads": []}
+            self.param_grad_ep_mapping[ep]["params"].append(param)
+            self.param_grad_ep_mapping[ep]["grads"].append(grad)
+
+        rpc_client_var = program.global_block().create_var(
+            name="RPC_CLIENT_VAR",
+            persistable=True,
+            dtype='float32',  # dtype and shape are not actually used
+            shape=[0])
+
+        # create send_op
+        send_op = program.global_block().append_op(
+            type="send",
+            inputs={"X": send_inputs},
+            outputs={"Out": send_outputs,
+                     "RPCClient": rpc_client_var},
+            attrs={"endpoints": pserver_endpoints,
+                   "epmap": eplist})
+        # step4
+        for varname, splited_var in param_var_mapping.iteritems():
+            if len(splited_var) <= 1:
+                continue
+            orig_param = program.global_block().vars[varname]
+            concat = program.global_block().append_op(
+                type="concat",
+                inputs={"X": splited_var},
+                outputs={"Out": [orig_param]},
+                attrs={"axis": 0})
+
+    def _create_vars_from_blocklist(self, program, block_list):
+        # Create respective variables using the block_list
+        block_map = dict()
+        var_mapping = dict()
+        for block_str in block_list:
+            varname, offset, size = block_str.split(":")
+            if not block_map.has_key(varname):
+                block_map[varname] = []
+            block_map[varname].append((long(offset), long(size)))
+        for varname, splited in block_map.iteritems():
+            orig_var = program.global_block().vars[varname]
+            var_mapping[varname] = []
+            if len(splited) == 1:
+                var_mapping[varname] = [orig_var]
+                continue
+            orig_shape = orig_var.shape
+            orig_dim1_flatten = 1
+            if len(orig_shape) >= 2:
+                orig_dim1_flatten = reduce(lambda x, y: x * y, orig_shape[1:])
+
+            for i, block in enumerate(splited):
+                size = block[1]
+                rows = size / orig_dim1_flatten
+                splited_shape = [rows]
+                if len(orig_shape) >= 2:
+                    splited_shape.extend(orig_shape[1:])
+                var = program.global_block().create_var(
+                    name="%s.block%d" % (varname, i),
+                    persistable=False,
+                    dtype=orig_var.dtype,
+                    shape=splited_shape)  # flattened split var
+                var_mapping[varname].append(var)
+        return var_mapping
+
+    def _clone_var(self, block, var):
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            # HACK: let all param in pserver be persistable so the child
+            # program in recv can get them
+            persistable=True)
+
+    def _append_split_op(self, program, gradblocks):
+        # Split variables that need to be split and append respective ops
+        var_mapping = self._create_vars_from_blocklist(program, gradblocks)
+        for varname, splited_vars in var_mapping.iteritems():
+            # a variable that doesn't need splitting maps to a single original var
+            if len(splited_vars) <= 1:
+                continue
+            orig_var = program.global_block().vars[varname]
+            if orig_var.type == core.VarDesc.VarType.SELECTED_ROWS:
+                height_sections = []
+                for v in splited_vars:
+                    height_sections.append(v.shape[0])
+                program.global_block().append_op(
+                    type="split_selected_rows",
+                    inputs={"X": orig_var},
+                    outputs={"Out": splited_vars},
+                    attrs={"height_sections": height_sections})
+            elif orig_var.type == core.VarDesc.VarType.LOD_TENSOR:
+                sections = []
+                for v in splited_vars:
+                    sections.append(v.shape[0])
+                program.global_block().append_op(
+                    type="split",
+                    inputs={"X": orig_var},
+                    outputs={"Out": splited_vars},
+                    attrs={"sections": sections}  # assume split evenly
+                )
+            else:
+                raise AssertionError("Variable type should be in set "
+                                     "[LOD_TENSOR, SELECTED_ROWS]")
+        return var_mapping
+
+    def get_trainer_program(self):
+        # remove optimize ops and add a send op to main_program
+        self.program.global_block().delete_ops(self.optimize_ops)
+        return self.program
+
+    def _create_var_for_trainers(self, block, var, trainers):
+        # For each trainer, create the necessary variables
+        var_list = []
+        for i in xrange(trainers):
+            var_each = block.create_var(
+                name="%s.trainer_%d" % (var.name, i),
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            var_list.append(var_each)
+        return var_list
+
+    def _get_optimizer_input_shape(self, op_type, varkey, orig_shape,
+                                   param_shape):
+        """
+        Returns the shape for optimizer inputs that need to be reshaped when
+        Param and Grad is split to multiple servers.
+        """
+        # HACK(typhoonzero): Should use functions of corresponding optimizer in
+        # optimizer.py to get the shape, do not  bind this in the transpiler.
+        if op_type == "adam":
+            if varkey in ["Moment1", "Moment2"]:
+                return param_shape
+        elif op_type == "adagrad":
+            if varkey == "Moment":
+                return param_shape
+        elif op_type == "adamax":
+            if varkey in ["Moment", "InfNorm"]:
+                return param_shape
+        elif op_type == "momentum":
+            if varkey == "Velocity":
+                return param_shape
+        elif op_type == "":
+            if varkey == "Moment":
+                return param_shape
+        elif op_type == "sgd":
+            pass
+        return orig_shape
+
+    def _is_op_on_pserver(self, endpoint, all_ops, idx):
+        """
+        Recursively check whether the op needs to run on the current pserver.
+        Assume that ops are in the execution order.
+        """
+        param_names = [
+            p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
+        ]
+        op = all_ops[idx]
+        if op.inputs.has_key("Param"):
+            if op.inputs["Param"].name in param_names:
+                return True
+            else:
+                for n in param_names:
+                    if same_or_split_var(n, op.inputs[
+                            "Param"].name) and n != op.inputs["Param"].name:
+                        return True
+                return False
+        else:
+            j = idx - 1
+            while j >= 0:
+                prev_op = all_ops[j]
+                prev_output_names = [o.name for o in prev_op.outputs.values()]
+                prev_input_names = [o.name for o in prev_op.inputs.values()]
+                found1 = False
+                found2 = False
+                for _, v in op.inputs.iteritems():
+                    if v.name in prev_output_names:
+                        found1 = self._is_op_on_pserver(endpoint, all_ops, j)
+                # later ops may produce output for prev op's next batch use.
+                for _, v in op.outputs.iteritems():
+                    if v.name in prev_input_names:
+                        found2 = self._is_op_on_pserver(endpoint, all_ops, j)
+                if found1 or found2:
+                    return True
+                j -= 1
+            return False
+
+    def _append_pserver_ops(self, program, pserver_program, opt_op, endpoint):
+        new_inputs = dict()
+        # update param/grad shape first, then other inputs like
+        # moment can use the updated shape
+        for key, var in opt_op.inputs.iteritems():
+            if key == "Grad":
+                grad_block = None
+                for g in self.param_grad_ep_mapping[endpoint]["grads"]:
+                    if same_or_split_var(g.name, var.name):
+                        grad_block = g
+                        break
+                if not grad_block:
+                    # do not append this op if current endpoint
+                    # is not dealing with this grad block
+                    return
+                merged_var = program.global_block().create_var(
+                    name=grad_block.name,
+                    persistable=grad_block.persistable,
+                    dtype=grad_block.dtype,
+                    shape=grad_block.shape)
+                # append merging ops if trainers > 1
+                if self.trainers > 1:
+                    vars2merge = self._create_var_for_trainers(
+                        program.global_block(), grad_block, self.trainers)
+                    program.global_block().append_op(
+                        type="sum",
+                        inputs={"X": vars2merge},
+                        outputs={"Out": merged_var})
+                    program.global_block().append_op(
+                        type="scale",
+                        inputs={"X": merged_var},
+                        outputs={"Out": merged_var},
+                        attrs={"scale": 1.0 / float(self.trainers)})
+                new_inputs[key] = merged_var
+            elif key == "Param":
+                # param is already created on global program
+                param_block = None
+                for p in self.param_grad_ep_mapping[endpoint]["params"]:
+                    if same_or_split_var(p.name, var.name):
+                        param_block = p
+                        break
+                if not param_block:
+                    return
+                tmpvar = program.global_block().create_var(
+                    name=param_block.name,
+                    persistable=True,
+                    dtype=param_block.dtype,
+                    shape=param_block.shape)
+
+                new_inputs[key] = tmpvar
+
+        for key, var in opt_op.inputs.iteritems():
+            if key in ["Param", "Grad"]:
+                continue
+            # update accumulator variable shape
+            param_shape = new_inputs["Param"].shape
+            new_shape = self._get_optimizer_input_shape(opt_op.type, key,
+                                                        var.shape, param_shape)
+            tmpvar = program.global_block().create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=new_shape)
+            new_inputs[key] = tmpvar
+            # create var in pserver program global block.
+            # TODO(typhoonzero): put blocks in one program to avoid create two
+            # variables.
+            pserver_program.global_block().create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=new_shape)
+
+        # change output's ParamOut variable
+        opt_op.outputs["ParamOut"] = new_inputs["Param"]
+        program.global_block().append_op(
+            type=opt_op.type,
+            inputs=new_inputs,
+            outputs=opt_op.outputs,
+            attrs=opt_op.attrs)
+
+    def _append_pserver_non_opt_ops(self, program, pserver_program, opt_op):
+        # Append the ops for parameters that do not need to be optimized/updated
+        for _, var in opt_op.inputs.iteritems():
+            program.global_block().create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            pserver_program.global_block().create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+        program.global_block().append_op(
+            type=opt_op.type,
+            inputs=opt_op.inputs,
+            outputs=opt_op.outputs,
+            attrs=opt_op.attrs)
+
+    def get_pserver_program(self, endpoint):
+        """
+        Get pserver side program using the endpoint
+
+        NOTE: assume blocks of the same variable are not distributed
+        on the same pserver; only param/grad varnames are changed for
+        trainers to fetch. For each pserver endpoint, the server side
+        program must be a sub-set of the original optimization program.
+        """
+        # step5
+        pserver_program = Program()
+        for v in self.param_grad_ep_mapping[endpoint]["params"]:
+            self._clone_var(pserver_program.global_block(), v)
+        for v in self.param_grad_ep_mapping[endpoint]["grads"]:
+            # create vars for each trainer in global scope, so
+            # we don't need to create them when grad arrives.
+            pserver_program.global_block().create_var(
+                name=v.name, persistable=True, dtype=v.dtype, shape=v.shape)
+            for trainer_id in xrange(self.trainers):
+                print("create variable for program: %s.trainer_%d" %
+                      (v.name, trainer_id))
+                pserver_program.global_block().create_var(
+                    name="%s.trainer_%d" % (v.name, trainer_id),
+                    persistable=True,
+                    dtype=v.dtype,
+                    shape=v.shape)
+        # step6
+        optimize_sub_program = Program()
+        # Iterate through the ops and append ops as needed
+        for idx, opt_op in enumerate(self.optimize_ops):
+            is_op_on_pserver = self._is_op_on_pserver(endpoint,
+                                                      self.optimize_ops, idx)
+            if not is_op_on_pserver:
+                continue
+            if opt_op.inputs.has_key("Grad"):
+                self._append_pserver_ops(optimize_sub_program, pserver_program,
+                                         opt_op, endpoint)
+            else:
+                self._append_pserver_non_opt_ops(optimize_sub_program,
+                                                 pserver_program, opt_op)
+        # Append the recv op
+        pserver_program.global_block().append_op(
+            type="recv",
+            inputs={},
+            outputs={},
+            attrs={
+                "OptimizeBlock": optimize_sub_program.global_block(),
+                "endpoint": endpoint,
+                "ParamList": [
+                    p.name
+                    for p in self.param_grad_ep_mapping[endpoint]["params"]
+                ],
+                "GradList": [
+                    p.name
+                    for p in self.param_grad_ep_mapping[endpoint]["grads"]
+                ],
+                "Fanin": self.trainers
+            })
+        pserver_program.sync_with_cpp()
+        return pserver_program
+
+    def get_startup_program(self, endpoint, pserver_program):
+        """
+        Get startup program for current parameter server.
+        Modify operator input variables if there are variables that
+        were split into several blocks.
+        """
+        s_prog = Program()
+        orig_s_prog = framework.default_startup_program()
+        params = self.param_grad_ep_mapping[endpoint]["params"]
+
+        def _get_splited_name_and_shape(varname):
+            for idx, splited_param in enumerate(params):
+                pname = splited_param.name
+                if same_or_split_var(pname, varname) and varname != pname:
+                    return pname, splited_param.shape
+            return "", []
+
+        # 1. create vars in the startup program for every var in the pserver program
+        pserver_vars = pserver_program.global_block().vars
+        created_var_map = dict()
+        for _, var in pserver_vars.iteritems():
+            tmpvar = s_prog.global_block().create_var(
+                name=var.name,
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            created_var_map[var.name] = tmpvar
+
+        # 2. rename op outputs
+        for op in orig_s_prog.global_block().ops:
+            new_outputs = dict()
+            # do not append startup op if var is not on this pserver
+            op_on_pserver = False
+            for key, var in op.outputs.iteritems():
+                newname, _ = _get_splited_name_and_shape(var.name)
+                if newname:
+                    op_on_pserver = True
+                    new_outputs[key] = created_var_map[newname]
+                elif var.name in pserver_vars:
+                    op_on_pserver = True
+                    new_outputs[key] = pserver_vars[var.name]
+
+            if op_on_pserver:
+                if op.type in [
+                        "gaussian_random", "fill_constant", "uniform_random"
+                ]:
+                    op.attrs["shape"] = new_outputs["Out"].shape
+                s_prog.global_block().append_op(
+                    type=op.type,
+                    inputs=op.inputs,
+                    outputs=new_outputs,
+                    attrs=op.attrs)
+        return s_prog
diff --git a/python/paddle/v2/fluid/distribute_transpiler_simple.py b/python/paddle/v2/fluid/distribute_transpiler_simple.py
new file mode 100644
index 0000000000..73d9bed1ae
--- /dev/null
+++ b/python/paddle/v2/fluid/distribute_transpiler_simple.py
@@ -0,0 +1,256 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import framework
+from framework import Program, default_main_program, Parameter, Variable
+import optimizer
+from layer_helper import LayerHelper
+
+
+def hash_name_to_server(params_grads, pserver_endpoints):
+    """
+    :param params_grads: a list of (parameter, gradient) pairs
+    :return: a map of pserver endpoint ->
+                    params -> [param list]
+                    grads  -> [grad list]
+    """
+
+    def _hash_param(param_name, total):
+        return hash(param_name) % total
+
+    param_grad_map = dict()
+    for param, grad in params_grads:
+        if param.trainable is True and grad is not None:
+            server_id = _hash_param(param.name, len(pserver_endpoints))
+            server_for_param = pserver_endpoints[server_id]
+            if not param_grad_map.has_key(server_for_param):
+                param_grad_map[server_for_param] = {"params": [], "grads": []}
+            param_grad_map[server_for_param]["params"].append(param)
+            param_grad_map[server_for_param]["grads"].append(grad)
+
+    return param_grad_map
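+
+# Example (illustrative): with pserver_endpoints = ["m1:6174", "m2:6174"],
+# each trainable param goes to endpoints[hash(param.name) % 2], so a given
+# parameter name always maps to the same pserver on every trainer.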
+
+
+def round_robin(params_grads, pserver_endpoints):
+    assert (len(params_grads) > len(pserver_endpoints))
+
+    param_grad_map = dict()
+    pserver_idx = 0
+    for param, grad in params_grads:
+        if param.trainable is True:
+            server_for_param = pserver_endpoints[pserver_idx]
+            if not param_grad_map.has_key(server_for_param):
+                param_grad_map[server_for_param] = {"params": [], "grads": []}
+
+            param_grad_map[server_for_param]["params"].append(param)
+            param_grad_map[server_for_param]["grads"].append(grad)
+
+            pserver_idx += 1
+            if pserver_idx >= len(pserver_endpoints):
+                pserver_idx = 0
+    return param_grad_map
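+
+# Example (illustrative): with endpoints [ep0, ep1] and trainable params
+# [p0, p1, p2], round_robin assigns p0 -> ep0, p1 -> ep1, p2 -> ep0.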
+
+
+class SimpleDistributeTranspiler:
+    def transpile(self,
+                  optimize_ops,
+                  params_grads,
+                  program=None,
+                  pservers="127.0.0.1:6174",
+                  trainers=1,
+                  split_method=round_robin):
+        """
+            Transpile the program into distributed data-parallel programs.
+
+            The main_program will be transformed to use a remote parameter
+            server for parameter optimization, and the optimization graph
+            will be put into a parameter server program.
+
+            Use different methods to split trainable variables across
+            parameter servers.
+
+            Example to run:
+
+            exe = fluid.Executor(place)
+            t = fluid.SimpleDistributeTranspiler()
+            t.transpile(optimize_ops, params_grads, pservers="127.0.0.1:6174", trainers=1)
+
+            pserver_endpoint = os.getenv("PSERVER")
+            if pserver_endpoint:
+                pserver_prog = t.get_pserver_program(pserver_endpoint, optimize_ops)
+                exe.run(fluid.default_startup_program())
+                exe.run(pserver_prog)
+            else:
+                feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+                exe.run(fluid.default_startup_program())
+
+                for pass_id in range(PASS_NUM):
+                    ...
+
+            :param optimize_ops: op list of optimization, should be the
+                                 return value of Optimizer.minimize
+            :type optimize_ops: list
+            :param program: program to optimize, default default_main_program
+            :param pservers: parameter server endpoints like "m1:6174,m2:6174"
+            :type pservers: string
+
+            :return: None; the program is transpiled in place
+        """
+        if program is None:
+            program = default_main_program()
+        self.program = program
+        self.trainers = trainers
+        self.optimize_ops = optimize_ops
+        self._optimize_distributed(
+            optimize_ops,
+            program,
+            params_grads,
+            pservers=pservers,
+            trainers=trainers,
+            split_method=split_method)
+
+    def _clone_param(self, block, v):
+        assert isinstance(v, Parameter)
+        new_p = Parameter(
+            block=block,
+            shape=v.shape,
+            dtype=v.dtype,
+            type=v.type,
+            lod_level=v.lod_level,
+            stop_gradient=v.stop_gradient,
+            trainable=v.trainable,
+            optimize_attr=v.optimize_attr,
+            regularizer=v.regularizer,
+            name=v.name)
+        block.vars[new_p.name] = new_p
+
+    def _clone_var(self, block, var):
+        assert isinstance(var, Variable)
+        return block.create_var(
+            name=var.name,
+            shape=var.shape,
+            dtype=var.dtype,
+            type=var.type,
+            lod_level=var.lod_level,
+            persistable=var.persistable)
+
+    def _optimize_distributed(self, optimize_ops, program, params_and_grads,
+                              **kwargs):
+        if "split_method" in kwargs:
+            split_method = kwargs["split_method"]
+        else:
+            split_method = round_robin
+
+        assert (callable(split_method))
+        pserver_endpoints = kwargs["pservers"].split(",")
+        self.param_grad_map = split_method(params_and_grads, pserver_endpoints)
+
+        send_op_ordered_inputs = []
+        send_op_ordered_outputs = []
+        epmap = []
+        for ep, v in self.param_grad_map.iteritems():
+            send_op_ordered_inputs.extend(v["grads"])
+            send_op_ordered_outputs.extend(v["params"])
+            for i in v["grads"]:
+                epmap.append(ep)
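+        # The three lists built above are index-aligned: the gradient at
+        # send_op_ordered_inputs[i] is sent to endpoint epmap[i], and the
+        # updated parameter is received back into send_op_ordered_outputs[i].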
+        send_op = program.global_block().append_op(
+            type="send",
+            inputs={"X": send_op_ordered_inputs
+                    },  # inputs is a list of tensors to be send
+            outputs={"Out": send_op_ordered_outputs},
+            attrs={"endpoints": pserver_endpoints,
+                   "epmap": epmap})
+
+    def get_trainer_program(self):
+        # remove optimize ops and add a send op to main_program
+        self.program.global_block().delete_ops(self.optimize_ops)
+        return self.program
+
+    def _create_var_for_trainers(self, block, var, trainers):
+        var_list = []
+        for i in xrange(trainers):
+            var_each = block.create_var(
+                name="%s.trainer_%d" % (var.name, i),
+                persistable=var.persistable,
+                dtype=var.dtype,
+                shape=var.shape)
+            var_list.append(var_each)
+        return var_list
+
+    def get_pserver_program(self, endpoint, optimize_ops):
+        pserver_program = Program()
+        for v in self.param_grad_map[endpoint]["params"]:
+            self._clone_param(pserver_program.global_block(), v)
+
+        optimize_sub_program = Program()
+        grad_var_names = [
+            var.name for var in self.param_grad_map[endpoint]["grads"]
+        ]
+        for opt_op in optimize_ops:
+            for _, var in opt_op.inputs.iteritems():
+                # NOTE: append operators to merge gradients from multiple
+                # trainers. If trainers == 1, this is not needed.
+                if self.trainers > 1 and var.name in grad_var_names:
+                    vars2merge = self._create_var_for_trainers(
+                        optimize_sub_program.global_block(), var, self.trainers)
+                    merged_var = optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+                    optimize_sub_program.global_block().append_op(
+                        type="sum",
+                        inputs={"X": vars2merge},
+                        outputs={"Out": merged_var})
+                    optimize_sub_program.global_block().append_op(
+                        type="scale",
+                        inputs={"X": merged_var},
+                        outputs={"Out": merged_var},
+                        attrs={"scale": 1.0 / float(self.trainers)})
+                else:
+                    optimize_sub_program.global_block().create_var(
+                        name=var.name,
+                        persistable=var.persistable,
+                        dtype=var.dtype,
+                        shape=var.shape)
+
+            if "Grad" in opt_op.inputs:
+                if opt_op.inputs["Grad"].name in grad_var_names:
+                    optimize_sub_program.global_block().append_op(
+                        type=opt_op.type,
+                        inputs=opt_op.inputs,
+                        outputs=opt_op.outputs,
+                        attrs=opt_op.attrs)
+            else:
+                optimize_sub_program.global_block().append_op(
+                    type=opt_op.type,
+                    inputs=opt_op.inputs,
+                    outputs=opt_op.outputs,
+                    attrs=opt_op.attrs)
+        pserver_program.global_block().append_op(
+            type="recv",
+            inputs={"RX":
+                    self.param_grad_map[endpoint]["grads"]},  # grads to recv
+            outputs={},
+            attrs={
+                "OptimizeBlock": optimize_sub_program.global_block(),
+                "endpoint": endpoint,
+                "ParamList":
+                [p.name for p in self.param_grad_map[endpoint]["params"]],
+                "GradList":
+                [p.name for p in self.param_grad_map[endpoint]["grads"]],
+                "Trainers": self.trainers
+            })
+        pserver_program.sync_with_cpp()
+        return pserver_program
diff --git a/python/paddle/v2/fluid/distributed_spliter.py b/python/paddle/v2/fluid/distributed_spliter.py
new file mode 100644
index 0000000000..8cf0b06786
--- /dev/null
+++ b/python/paddle/v2/fluid/distributed_spliter.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def hash_name(varlist, pserver_endpoints):
+    """
+    hash variable names to several endpoints.
+
+    :param varlist: a list of Variables
+    :return: a list of endpoints, one per variable in varlist
+    """
+
+    def _hash_block(block_str, total):
+        return hash(block_str) % total
+
+    eplist = []
+    for var in varlist:
+        server_id = _hash_block(var.name(), len(pserver_endpoints))
+        server_for_param = pserver_endpoints[server_id]
+        eplist.append(server_for_param)
+    return eplist
+
+
+def round_robin(varlist, pserver_endpoints):
+    """
+    Distribute variables to several endpoints in round-robin order.
+    """
+    assert (len(varlist) > len(pserver_endpoints))
+
+    eplist = []
+    pserver_idx = 0
+    for var in varlist:
+        server_for_param = pserver_endpoints[pserver_idx]
+        eplist.append(server_for_param)
+
+        pserver_idx += 1
+        if pserver_idx >= len(pserver_endpoints):
+            pserver_idx = 0
+    return eplist
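+
+# A minimal usage sketch (the endpoints below are hypothetical):
+#
+#   eplist = round_robin(varlist, ["127.0.0.1:6174", "127.0.0.1:6175"])
+#   # varlist[0] -> 127.0.0.1:6174, varlist[1] -> 127.0.0.1:6175,
+#   # varlist[2] -> 127.0.0.1:6174, and so on. hash_name has the same
+#   # signature but picks the endpoint by hash(var.name()) instead.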
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
new file mode 100644
index 0000000000..2686a5bdfc
--- /dev/null
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -0,0 +1,267 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+import layers
+from framework import Program, unique_name, Variable, program_guard
+from layer_helper import LayerHelper
+
+__all__ = [
+    'Accuracy',
+    'ChunkEvaluator',
+]
+
+
+def _clone_var_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.dtype,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
+
+
+class Evaluator(object):
+    """
+    Base Class for all evaluators
+
+    Args:
+        name(str): The name of the evaluator, e.g. "accuracy". Used to
+            generate temporary variable names.
+        main_program(Program, optional): The evaluator should be added to this
+            main_program. Default default_main_program()
+        startup_program(Program, optional): The parameter should be added to this
+            startup_program. Default default_startup_program()
+
+    Attributes:
+        states(list): The list of state variables. states will be reset to zero
+            when `reset` is invoked.
+        metrics(list): The list of metrics variables. They will be calculated
+            every mini-batch.
+    """
+
+    def __init__(self, name, **kwargs):
+        self.states = []
+        self.metrics = []
+        self.helper = LayerHelper(name, **kwargs)
+
+    def reset(self, executor, reset_program=None):
+        """
+        Reset metric states at the beginning of each pass or user-specified
+        batch.
+        """
+        if reset_program is None:
+            reset_program = Program()
+
+        with program_guard(main_program=reset_program):
+            for var in self.states:
+                assert isinstance(var, Variable)
+                g_var = _clone_var_(reset_program.current_block(), var)
+                layers.fill_constant(
+                    shape=g_var.shape, value=0.0, dtype=g_var.dtype, out=g_var)
+
+        executor.run(reset_program)
+
+    def eval(self, executor, eval_program=None):
+        """
+        Evaluate the statistics merged by multiple mini-batches.
+        """
+        raise NotImplementedError()
+
+    def create_state(self, suffix, dtype, shape):
+        """
+        Create state variable.
+
+        NOTE: It is not a public API.
+
+        Args:
+            suffix(str): the state suffix.
+            dtype(str|core.DataType): the state data type
+            shape(tuple|list): the shape of state
+
+        Returns: State variable
+
+        """
+        state = self.helper.create_variable(
+            name="_".join([unique_name(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        self.states.append(state)
+        return state
+
+
+class Accuracy(Evaluator):
+    """
+    Average Accuracy for multiple mini-batches.
+    """
+
+    def __init__(self, input, label, k=1, **kwargs):
+        super(Accuracy, self).__init__("accuracy", **kwargs)
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
+        self.correct = self.create_state(
+            dtype='int64', shape=[1], suffix='correct')
+        total = self.helper.create_tmp_variable(dtype='int')
+        correct = self.helper.create_tmp_variable(dtype='int')
+        acc = layers.accuracy(
+            input=input, label=label, k=k, total=total, correct=correct)
+        total = layers.cast(x=total, dtype='int64')
+        correct = layers.cast(x=correct, dtype='int64')
+        layers.sums(input=[self.total, total], out=self.total)
+        layers.sums(input=[self.correct, correct], out=self.correct)
+
+        self.metrics.append(acc)
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        with program_guard(main_program=eval_program):
+            total = _clone_var_(block, self.total)
+            correct = _clone_var_(block, self.correct)
+            total = layers.cast(total, dtype='float32')
+            correct = layers.cast(correct, dtype='float32')
+            out = layers.elementwise_div(x=correct, y=total)
+        return np.array(executor.run(eval_program, fetch_list=[out])[0])
+
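+# A minimal usage sketch for the Accuracy evaluator above; `predict`, `label`,
+# `avg_cost`, `feeder` and `train_reader` are hypothetical:
+#
+#   accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+#   for pass_id in range(PASS_NUM):
+#       accuracy.reset(exe)
+#       for data in train_reader():
+#           loss, acc = exe.run(feed=feeder.feed(data),
+#                               fetch_list=[avg_cost] + accuracy.metrics)
+#       pass_acc = accuracy.eval(exe)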
+
+class ChunkEvaluator(Evaluator):
+    """
+    Accumulate counter numbers output by chunk_eval from mini-batches and
+    compute the precision, recall and F1-score using the accumulated counter
+    numbers.
+    """
+
+    def __init__(
+            self,
+            input,
+            label,
+            chunk_scheme,
+            num_chunk_types,
+            excluded_chunk_types=None, ):
+        super(ChunkEvaluator, self).__init__("chunk_eval")
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.num_infer_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_infer_chunks')
+        self.num_label_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_label_chunks')
+        self.num_correct_chunks = self.create_state(
+            dtype='int64', shape=[1], suffix='num_correct_chunks')
+        precision, recall, f1_score, num_infer_chunks, num_label_chunks, num_correct_chunks = layers.chunk_eval(
+            input=input,
+            label=label,
+            chunk_scheme=chunk_scheme,
+            num_chunk_types=num_chunk_types,
+            excluded_chunk_types=excluded_chunk_types, )
+        layers.sums(
+            input=[self.num_infer_chunks, num_infer_chunks],
+            out=self.num_infer_chunks)
+        layers.sums(
+            input=[self.num_label_chunks, num_label_chunks],
+            out=self.num_label_chunks)
+        layers.sums(
+            input=[self.num_correct_chunks, num_correct_chunks],
+            out=self.num_correct_chunks)
+
+        self.metrics.extend([precision, recall, f1_score])
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        num_infer_chunks, num_label_chunks, num_correct_chunks = executor.run(
+            eval_program,
+            fetch_list=[_clone_var_(block, state) for state in self.states])
+        num_infer_chunks = num_infer_chunks[0]
+        num_label_chunks = num_label_chunks[0]
+        num_correct_chunks = num_correct_chunks[0]
+        precision = float(
+            num_correct_chunks) / num_infer_chunks if num_infer_chunks else 0
+        recall = float(
+            num_correct_chunks) / num_label_chunks if num_label_chunks else 0
+        f1_score = float(2 * precision * recall) / (
+            precision + recall) if num_correct_chunks else 0
+        return (np.array([precision], dtype='float32'),
+                np.array([recall], dtype='float32'),
+                np.array([f1_score], dtype='float32'))
+
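+# A worked example of the arithmetic above: with num_infer_chunks=10,
+# num_label_chunks=8 and num_correct_chunks=6, precision = 6/10 = 0.6,
+# recall = 6/8 = 0.75, and f1_score = 2 * 0.6 * 0.75 / (0.6 + 0.75) ~= 0.667.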
+
+class EditDistance(Evaluator):
+    """
+    Accumulate edit distance sum and sequence number from mini-batches and
+    compute the average edit_distance of all batches.
+
+    Args:
+        input: the sequences predicted by the network.
+        label: the target sequences, which must have the same sequence count
+            as input.
+        ignored_tokens(list of int): Tokens that should be removed before
+            calculating edit distance.
+
+    Example:
+
+        exe = fluid.Executor(place)
+        distance_evaluator = fluid.evaluator.EditDistance(input, label)
+        for epoch in range(PASS_NUM):
+            distance_evaluator.reset(exe)
+            for data in batches:
+                loss, sum_distance = exe.run(fetch_list=[cost] + distance_evaluator.metrics)
+                avg_distance = distance_evaluator.eval(exe)
+            pass_distance = distance_evaluator.eval(exe)
+
+        In the above example:
+        'sum_distance' is the sum of edit distances within one batch.
+        'avg_distance' is the average edit distance from the first batch to
+        the current batch.
+        'pass_distance' is the average edit distance over the whole pass.
+
+    """
+
+    def __init__(self, input, label, ignored_tokens=None, **kwargs):
+        super(EditDistance, self).__init__("edit_distance", **kwargs)
+        main_program = self.helper.main_program
+        if main_program.current_block().idx != 0:
+            raise ValueError("You can only invoke Evaluator in root block")
+
+        self.total_error = self.create_state(
+            dtype='float32', shape=[1], suffix='total_error')
+        self.seq_num = self.create_state(
+            dtype='int64', shape=[1], suffix='seq_num')
+        error, seq_num = layers.edit_distance(
+            input=input, label=label, ignored_tokens=ignored_tokens)
+        #error = layers.cast(x=error, dtype='float32')
+        sum_error = layers.reduce_sum(error)
+        layers.sums(input=[self.total_error, sum_error], out=self.total_error)
+        layers.sums(input=[self.seq_num, seq_num], out=self.seq_num)
+        self.metrics.append(sum_error)
+
+    def eval(self, executor, eval_program=None):
+        if eval_program is None:
+            eval_program = Program()
+        block = eval_program.current_block()
+        with program_guard(main_program=eval_program):
+            total_error = _clone_var_(block, self.total_error)
+            seq_num = _clone_var_(block, self.seq_num)
+            seq_num = layers.cast(x=seq_num, dtype='float32')
+            out = layers.elementwise_div(x=total_error, y=seq_num)
+        return np.array(executor.run(eval_program, fetch_list=[out])[0])
diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
new file mode 100644
index 0000000000..9f48815b8b
--- /dev/null
+++ b/python/paddle/v2/fluid/executor.py
@@ -0,0 +1,281 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import contextlib
+from framework import Program, default_main_program
+from . import core
+
+__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
+
+g_scope = core.Scope()
+
+
+def global_scope():
+    return g_scope
+
+
+def switch_scope(scope):
+    global g_scope
+    ex = g_scope
+    g_scope = scope
+    return ex
+
+
+@contextlib.contextmanager
+def scope_guard(scope):
+    ex = switch_scope(scope)
+    yield
+    switch_scope(ex)
+
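+# A minimal usage sketch of scope_guard; `inference_scope` is a hypothetical
+# caller-created scope:
+#
+#   inference_scope = core.Scope()
+#   with scope_guard(inference_scope):
+#       ...  # Executor.run() calls in here default to inference_scope
+#   # the previous global scope is restored on exit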
+
+def as_numpy(tensor):
+    if isinstance(tensor, list):
+        return [as_numpy(t) for t in tensor]
+    assert isinstance(tensor, core.LoDTensor)
+    lod = tensor.lod()
+    tensor_data = np.array(tensor)
+    if len(lod) == 0:
+        ans = tensor_data
+    else:
+        raise RuntimeError("LoD Calculate lacks unit tests and buggy")
+    # elif len(lod) == 1:
+    #     ans = []
+    #     idx = 0
+    #     while idx < len(lod) - 1:
+    #         ans.append(tensor_data[lod[idx]:lod[idx + 1]])
+    #         idx += 1
+    # else:
+    #     for l in reversed(lod):
+    #         ans = []
+    #         idx = 0
+    #         while idx < len(l) - 1:
+    #             ans.append(tensor_data[l[idx]:l[idx + 1]])
+    #             idx += 1
+    #         tensor_data = ans
+    #     ans = tensor_data
+    return ans
+
+
+def has_feed_operators(block, feed_targets, feed_holder_name):
+    """ Check whether the block already has feed operators.
+
+    Return false if the block does not have any feed operators.
+    If some feed operators have been prepended to the block, check that
+    the info contained in these feed operators matches the feed_targets
+    and feed_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has feed operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        feed_targets: a dictionary of {feed_target_name: feed_target_data}
+        feed_holder_name: the name of the variable that holds the data of 
+            all feed targets. The type of this feed_holder variable is 
+            FEED_MINIBATCH, which is essentially vector<LoDTensor>.
+
+    Returns:
+        A boolean value that indicates whether a block has feed operators 
+        that match the info contained in feed_targets and feed_holder_name.
+    """
+
+    feed_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'feed':
+            feed_count += 1
+            assert op.desc.input('X')[0] == feed_holder_name
+            feed_target_name = op.desc.output('Out')[0]
+            if feed_target_name not in feed_targets:
+                raise Exception("'feed_targets' does not have {} variable".
+                                format(feed_target_name))
+        else:
+            break
+    if feed_count > 0 and feed_count != len(feed_targets):
+        raise Exception(
+            "Feed operators in program desc do not match 'feed_targets'")
+    return feed_count > 0
+
+
+def has_fetch_operators(block, fetch_targets, fetch_holder_name):
+    """ Check whether the block already has fetch operators.
+    
+    Return false if the block does not have any fetch operators.
+    If some fetch operators have been appended to the block, check that
+    the info contained in these fetch operators matches the fetch_targets
+    and fetch_holder_name. Raise exception when any mismatch is found.
+    Return true when the block has fetch operators with matching info.
+
+    Args:
+        block: a block instance (typically global block of a program)
+        fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
+        fetch_holder_name: the name of the variable that holds the data of 
+            all fetch targets. The type of this fetch_holder variable is 
+            FETCH_LIST, which is essentially vector<LoDTensor>.    
+
+    Return:    
+        A boolean value that indicates whether a block has fetch operators 
+        that match the info contained in fetch_targets and fetch_holder_name.     
+    """
+
+    fetch_count = 0
+    for op in block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_count += 1
+            assert op.desc.output('Out')[0] == fetch_holder_name
+            fetch_target_name = op.desc.input('X')[0]
+            if fetch_target_name not in [
+                    var.desc.name() for var in fetch_targets
+            ]:
+                raise Exception("'fetch_targets' does not have {} variable".
+                                format(fetch_target_name))
+            idx = op.desc.attr('col')
+            assert fetch_target_name == fetch_targets[idx].desc.name()
+    if fetch_count > 0 and fetch_count != len(fetch_targets):
+        raise Exception(
+            "Fetch operators in program desc do not match 'fetch_targets'")
+    return fetch_count > 0
+
+
+class Executor(object):
+    def __init__(self, places):
+        if not isinstance(places, (list, tuple)):
+            places = [places]
+
+        act_places = []
+        for each in places:
+            p = core.Place()
+            p.set_place(each)
+            act_places.append(p)
+
+        # TODO(dzhwinter) : only use the first place
+        self.executor = core.Executor(act_places[0])
+        self.places = places
+
+    def aslodtensor(self, data):
+        def accumulate(data):
+            if not isinstance(data, list):
+                return 1
+            return sum([accumulate(sub) for sub in data])
+
+        def parselod(data):
+            seq_lens = [accumulate(seq) for seq in data]
+            cur_len = 0
+            lod = [cur_len]
+            for l in seq_lens:
+                cur_len += l
+                lod.append(cur_len)
+            return lod
+
+        assert len(self.places) != 0
+        if not isinstance(data, list):
+            # pure tensor case
+            tensor = core.LoDTensor()
+            tensor.set(data, self.places[0])
+            return tensor
+        else:
+            raise RuntimeError("Current implementation lacks unittests")
+            # lodtensor case
+            lod = []
+            if not isinstance(data[0], list):
+                lod.append(parselod(data))
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            else:
+                while isinstance(data[0], list):
+                    lod.append(parselod(data))
+                    flattened_data = [item for seq in data for item in seq]
+                    data = flattened_data
+                flattened_data = np.concatenate(data, axis=0).astype("int64")
+            flattened_data = flattened_data.reshape([len(flattened_data), 1])
+            tensor = core.LoDTensor()
+            tensor.set(flattened_data, self.places[0])
+            tensor.set_lod(lod)
+            return tensor
+
+    def run(self,
+            program=None,
+            feed=None,
+            fetch_list=None,
+            feed_var_name='feed',
+            fetch_var_name='fetch',
+            scope=None,
+            return_numpy=True):
+        if feed is None:
+            feed = {}
+        if fetch_list is None:
+            fetch_list = []
+
+        if program is None:
+            program = default_main_program()
+
+        if not isinstance(program, Program):
+            raise TypeError(
+                "Executor requires a Program, but got %s" % type(program))
+
+        if scope is None:
+            scope = global_scope()
+
+        program = program.clone()
+        global_block = program.global_block()
+
+        if feed_var_name in global_block.vars:
+            feed_var = global_block.var(feed_var_name)
+        else:
+            feed_var = global_block.create_var(
+                name=feed_var_name,
+                type=core.VarDesc.VarType.FEED_MINIBATCH,
+                persistable=True)
+
+        if fetch_var_name in global_block.vars:
+            fetch_var = global_block.var(fetch_var_name)
+        else:
+            fetch_var = global_block.create_var(
+                name=fetch_var_name,
+                type=core.VarDesc.VarType.FETCH_LIST,
+                persistable=True)
+
+        if not has_feed_operators(global_block, feed, feed_var_name):
+            for i, name in enumerate(feed):
+                out = global_block.var(name)
+                global_block.prepend_op(
+                    type='feed',
+                    inputs={'X': [feed_var]},
+                    outputs={'Out': [out]},
+                    attrs={'col': i})
+
+        for op in global_block.ops:
+            if op.desc.type() == 'feed':
+                feed_target_name = op.desc.output('Out')[0]
+                cur_feed = feed[feed_target_name]
+                if not isinstance(cur_feed, core.LoDTensor):
+                    cur_feed = self.aslodtensor(cur_feed)
+                idx = op.desc.attr('col')
+                core.set_feed_variable(scope, cur_feed, feed_var_name, idx)
+            else:
+                break
+
+        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
+            for i, var in enumerate(fetch_list):
+                global_block.append_op(
+                    type='fetch',
+                    inputs={'X': [var]},
+                    outputs={'Out': [fetch_var]},
+                    attrs={'col': i})
+
+        self.executor.run(program.desc, scope, 0, True, True)
+        outs = [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
+
+        if return_numpy:
+            outs = as_numpy(outs)
+        return outs
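+
+# A minimal usage sketch of Executor.run; `startup_program`, `x_data`,
+# `y_data` and `avg_cost` are hypothetical:
+#
+#   exe = Executor(core.CPUPlace())
+#   exe.run(startup_program)  # run parameter initialization once
+#   loss, = exe.run(feed={"x": x_data, "y": y_data},
+#                   fetch_list=[avg_cost])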
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
new file mode 100644
index 0000000000..8bf545e2ec
--- /dev/null
+++ b/python/paddle/v2/fluid/framework.py
@@ -0,0 +1,1125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import contextlib
+import re
+
+import numpy as np
+
+import proto.framework_pb2 as framework_pb2
+from . import core
+
+__all__ = [
+    'Block',
+    'Variable',
+    'Program',
+    'Operator',
+    'default_startup_program',
+    'default_main_program',
+    'program_guard',
+    'switch_startup_program',
+    'switch_main_program',
+]
+
+EMPTY_VAR_NAME = core.kEmptyVarName()
+TEMP_VAR_NAME = core.kTempVarName()
+GRAD_VAR_SUFFIX = core.kGradVarSuffix()
+ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
+
+
+def grad_var_name(var_name):
+    """
+    return gradient name for a certain var name
+    """
+    return var_name + GRAD_VAR_SUFFIX
+
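+# e.g. grad_var_name("fc_0.w_0") returns "fc_0.w_0@GRAD", assuming the C++
+# side defines kGradVarSuffix() as "@GRAD".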
+
+def unique_name(prefix):
+    """
+    Generate unique names with prefix
+
+    Args:
+        prefix(str): The prefix of return string
+
+    Returns(str): A unique string with the prefix
+
+    """
+    uid = core.unique_integer(prefix)  # unique during whole process.
+    return "_".join([prefix, str(uid)])
+
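+# e.g. unique_name("fc") returns "fc_0" on its first call and "fc_1" on the
+# next, assuming core.unique_integer keeps one counter per prefix.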
+
+def convert_np_dtype_to_dtype_(np_dtype):
+    """
+    Convert the data type in numpy to the data type in Paddle
+    Args:
+        np_dtype(np.dtype): the data type in numpy
+
+    Returns(core.DataType): the data type in Paddle
+
+    """
+    dtype = np.dtype(np_dtype)
+    if dtype == np.float32:
+        return core.DataType.FP32
+    elif dtype == np.float64:
+        return core.DataType.FP64
+    elif dtype == np.float16:
+        return core.DataType.FP16
+    elif dtype == np.int32:
+        return core.DataType.INT32
+    elif dtype == np.int16:
+        return core.DataType.INT16
+    elif dtype == np.int64:
+        return core.DataType.INT64
+    elif dtype == np.bool:
+        return core.DataType.BOOL
+    else:
+        raise ValueError("Not supported numpy dtype " + str(dtype))
+
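+# e.g. convert_np_dtype_to_dtype_(np.float32) returns core.DataType.FP32;
+# string forms such as convert_np_dtype_to_dtype_("int64") also work because
+# np.dtype() accepts dtype names.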
+
+def dtype_is_floating(dtype):
+    """
+    Check the data type is floating or not.
+    Args:
+        dtype(np.dtype|core.DataType): data type.
+            Could be numpy format or Paddle format
+
+    Returns(bool): True if data type is a float value
+
+    """
+    if not isinstance(dtype, core.DataType):
+        dtype = convert_np_dtype_to_dtype_(dtype)
+
+    return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64]
+
+
+def _debug_string_(proto, throw_on_error=True):
+    """
+    Get the debug string of a protobuf message. The message could be not
+    initialized.
+    Args:
+        proto(google.protobuf.message.Message): The protobuf message
+        throw_on_error(bool): True if raise an error when the protobuf message
+            is not initialized.
+
+    Returns(str): The debug string of the protobuf message
+
+    """
+    error_fields = list()
+    if not proto.IsInitialized(error_fields) and throw_on_error:
+        raise ValueError("{0} are not initialized.\nThe message is {1}:\n".
+                         format(error_fields, proto))
+    return proto.__str__()
+
+
+class Variable(object):
+    """
+    Python variable. Every input and output of an operator is a variable. Every
+    variable belongs to a block. The variable has a name and two variables in
+    different blocks could have the same name.
+
+    There are many kinds of variables. Please reference the framework.proto for
+    details.
+
+    Notes: The constructor of Variable should not be invoked directly. Please
+    use `Block.create_var` to create a variable.
+
+    >>> cur_program = Program()
+    >>> cur_block = cur_program.current_block()
+    >>> new_variable = cur_block.create_var(
+    >>>                    name="X", shape=[-1, 23, 48], dtype='float32')
+
+    Args:
+        block(Block): The associated block. It will be passed by
+            `Block.create_var` automatically.
+        type(core.VarDesc.VarType): Variable type. Please reference the
+            framework.proto for details.
+        shape(tuple|list|None): The shape of variable. -1 means the batch size.
+            Some kinds of variable do not contain shape, just set it to None.
+        dtype(np.dtype|core.DataType|str): The data type of variable.
+        lod_level(int): The level of the lod tensor. 0 means the variable
+            holds no time-series data.
+        persistable(bool): True if the variable should be saved as check point.
+            Defaults to False.
+        stop_gradient(bool): True if the variable should stop calculating
+            gradients during backward. Defaults to False.
+    """
+
+    def __init__(self,
+                 block,
+                 type=core.VarDesc.VarType.LOD_TENSOR,
+                 name=None,
+                 shape=None,
+                 dtype=None,
+                 lod_level=None,
+                 persistable=None,
+                 error_clip=None,
+                 stop_gradient=False,
+                 **kwargs):
+        self.block = block
+        self.error_clip = error_clip
+
+        if name is None:
+            name = Variable._unique_var_name_()
+        is_new_var = False
+        self.desc = self.block.desc.find_var(name)
+
+        if self.desc is None:
+            self.desc = self.block.desc.var(name)
+            is_new_var = True
+
+        if is_new_var:
+            self.desc.set_type(type)
+        elif self.desc.type() != type:
+            raise ValueError("Variable {0} has been created before. The "
+                             "previous type is {1}; the new type is {2}. They"
+                             " are not matched".format(self.name,
+                                                       self.desc.type(), type))
+
+        if shape is not None:
+            if is_new_var:
+                self.desc.set_shape(shape)
+            else:
+                old_shape = self.shape
+                shape = tuple(shape)
+                if shape != old_shape:
+                    raise ValueError(
+                        "Variable {0} has been created before. the previous "
+                        "shape is {1}; the new shape is {2}. They are not "
+                        "matched.".format(self.name, old_shape, shape))
+        if dtype is not None:
+            if not isinstance(dtype, core.DataType):
+                dtype = convert_np_dtype_to_dtype_(dtype)
+            if is_new_var:
+                self.desc.set_dtype(dtype)
+            else:
+                old_dtype = self.dtype
+                if dtype != old_dtype:
+                    raise ValueError("Variable {0} has been created before. "
+                                     "The previous data type is {1}; the new "
+                                     "data type is {2}. They are not "
+                                     "matched.".format(self.name, old_dtype,
+                                                       dtype))
+
+        if lod_level is not None:
+            if is_new_var:
+                self.desc.set_lod_level(lod_level)
+            else:
+                if lod_level != self.lod_level:
+                    raise ValueError("Variable {0} has been created before. "
+                                     "The previous lod_level is {1}; the new "
+                                     "lod_level is {2}. They are not "
+                                     "matched".format(self.name, self.lod_level,
+                                                      lod_level))
+        if persistable is not None:
+            if is_new_var:
+                self.desc.set_persistable(persistable)
+            else:
+                if persistable != self.persistable:
+                    raise ValueError(
+                        "Variable {0} has been created before."
+                        "The previous persistable is {1}; the new "
+                        "persistable is {2}. They are not matched".format(
+                            self.name, self.persistable, persistable))
+
+        self.block.vars[name] = self
+        self.op = None
+        self.stop_gradient = stop_gradient
+
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        Get debug string.
+
+        Args:
+            throw_on_error(bool): True if raise an exception when self is not
+                initialized.
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.VarDesc.FromString(str(protostr))
+        res_str = _debug_string_(proto, throw_on_error)
+        if with_details:
+            additional_attr = ("error_clip", "stop_gradient")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        return res_str
+
+    __repr__ = __str__
+
+    def set_desc(self, input):
+        self.desc = input
+
+    @property
+    def persistable(self):
+        return self.desc.persistable()
+
+    @persistable.setter
+    def persistable(self, p):
+        self.desc.set_persistable(p)
+
+    @property
+    def name(self):
+        return self.desc.name()
+
+    @property
+    def shape(self):
+        # convert to tuple, make it as same as numpy API.
+        return tuple(self.desc.shape())
+
+    @property
+    def dtype(self):
+        return self.desc.dtype()
+
+    @property
+    def lod_level(self):
+        return self.desc.lod_level()
+
+    @property
+    def type(self):
+        return self.desc.type()
+
+    @staticmethod
+    def _unique_var_name_():
+        prefix = "_generated_var"
+        uid = core.unique_integer(prefix)  # unique during whole process.
+        return "_".join([prefix, str(uid)])
+
+    def set_error_clip(self, error_clip):
+        self.error_clip = error_clip
+
+
+def get_all_op_protos():
+    """
+    Get all registered op proto from PaddlePaddle C++ end.
+
+    Returns(list): list of OpProto
+
+    """
+    protostrs = core.get_all_op_protos()
+    ret_values = []
+    for pbstr in protostrs:
+        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        ret_values.append(op_proto)
+    return ret_values
+
+
+class OpProtoHolder(object):
+    """
+    A global variable to hold all OpProtos from C++ as a map
+    """
+
+    @classmethod
+    def instance(cls):
+        if not hasattr(cls, '_instance'):
+            cls._instance = cls()
+        return cls._instance
+
+    def __init__(self):
+        assert not hasattr(
+            self.__class__,
+            '_instance'), 'Please use `instance()` to get OpProtoHolder object!'
+        op_protos = get_all_op_protos()
+        self.op_proto_map = {}
+        for proto in op_protos:
+            self.op_proto_map[proto.type] = proto
+
+    def get_op_proto(self, type):
+        """
+        Get OpProto by a type string.
+        Args:
+            type(str): The type that operator registered in C++ side.
+
+        Returns(framework_pb2.OpProto): The OpProto
+
+        """
+        if type not in self.op_proto_map:
+            raise ValueError("Operator \"%s\" has not been registered." % type)
+        return self.op_proto_map[type]
+
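+# A minimal usage sketch, assuming the "sum" operator is registered:
+#
+#   proto = OpProtoHolder.instance().get_op_proto("sum")
+#   print(proto.inputs[0].name)  # e.g. "X"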
+
+class Operator(object):
+    """
+    Python Operator class. An operator represents one of the built-in
+    instructions in a Block. Users use these built-in instructions to
+    describe their neural network.
+    """
+
+    def __init__(self,
+                 block,
+                 desc,
+                 type=None,
+                 inputs=None,
+                 outputs=None,
+                 attrs=None):
+        """
+        Constructor.
+
+        Notes: The constructor of operator should not be invoked directly. Use
+        Block.append_op or Block.prepend_op instead.
+
+        >>> cur_program = Program()
+        >>> cur_block = cur_program.current_block()
+        >>> # var1 += var2 + var3
+        >>> cur_block.append_op(type="sum",
+        >>>                     inputs={"X": [var1, var2, var3]},
+        >>>                     outputs={"Out": [var1]})
+
+        Args:
+            block(Block): The block has the current operator.
+            desc(core.OpDesc): The protobuf description.
+            type(str): The type of operator.
+            inputs(dict): The input dictionary. Key is the input parameter name.
+                Value is a list of variables.
+            outputs(dict): The output dictionary which has the same format with
+                           inputs.
+            attrs(dict): The attributes dictionary. Key is attribute name. Value
+                is the attribute value. The attribute type should be as same as
+                the type registered in C++
+        """
+        self.block = block
+        self.desc = desc
+        # for clone a new operator
+        self.inputs = inputs
+        self.outputs = outputs
+        self.attrs = attrs
+        if len(self.desc.type()) != 0:
+            return
+        if type is None:
+            raise ValueError(
+                "`type` to initilized an Operator can not be None.")
+        self.desc.set_type(type)
+        proto = OpProtoHolder.instance().get_op_proto(type)
+
+        def find_name(var_list, name):
+            for var_name in var_list:
+                if var_list[var_name] is not None and var_name == name:
+                    return True
+            return False
+
+        if inputs is not None:
+            for in_proto in proto.inputs:
+                found = find_name(inputs, in_proto.name)
+                assert found or in_proto.dispensable, "Input {} not found".format(
+                    in_proto.name)
+
+                if found:
+                    in_args = inputs[in_proto.name]
+                    if not isinstance(in_args, list):
+                        in_args = [in_args]
+                    if not in_proto.duplicable and len(in_args) > 1:
+                        raise ValueError(
+                            "Input %s expects only one input, but %d are given."
+                            % (in_proto.name, len(in_args)))
+                    in_arg_names = []
+                    for arg in in_args:
+                        if isinstance(arg, basestring):
+                            in_arg_names.append(arg)
+                        else:
+                            in_arg_names.append(arg.name)
+                    self.desc.set_input(in_proto.name, in_arg_names)
+                else:
+                    self.desc.set_input(in_proto.name, [])
+
+        if outputs is not None:
+            given = set(outputs)
+            need = set(m.name for m in proto.outputs)
+            if given != need:
+                raise ValueError(("Incorrect setting for output(s) of "
+                                  "operator \"%s\". Need: [%s] Given: [%s]") %
+                                 (type, ", ".join(need), ", ".join(given)))
+
+            for out_proto in proto.outputs:
+                out_args = outputs[out_proto.name]
+                if not isinstance(out_args, list):
+                    out_args = [out_args]
+                if not out_proto.duplicable and len(out_args) > 1:
+                    raise ValueError(
+                        "Output %s expects only one output, but %d are given." %
+                        (out_proto.name, len(out_args)))
+                out_arg_names = []
+                for arg in out_args:
+                    out_arg_names.append(arg.name)
+                    arg.op = self
+                self.desc.set_output(out_proto.name, out_arg_names)
+
+        if attrs is not None:
+            if not isinstance(attrs, dict):
+                raise TypeError("'attrs' should be a dict.")
+            for attr in proto.attrs:
+                attr_name = attr.name
+                if attr_name not in attrs or attrs[attr_name] is None:
+                    continue
+                if isinstance(attrs[attr_name], Block):
+                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
+                elif isinstance(attrs[attr_name], core.BlockDesc) or \
+                   isinstance(attrs[attr_name], core.ProgramDesc):
+                    self.desc.set_serialized_attr(
+                        attr_name, attrs[attr_name].serialize_to_string())
+                else:
+                    self.desc.set_attr(attr_name, attrs[attr_name])
+
+        self.desc.check_attrs()
+        no_kernel_op_set = {
+            'feed', 'fetch', 'save', 'load', 'recurrent',
+            'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
+            'recv', 'parallel_do'
+        }
+        if type not in no_kernel_op_set:
+            self.desc.infer_var_type(self.block.desc)
+            self.desc.infer_shape(self.block.desc)
+
+    def to_string(self, throw_on_error):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+
+        Returns(str): The debug string.
+
+        """
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.OpDesc.FromString(str(protostr))
+        return _debug_string_(proto, throw_on_error)
+
+    def __str__(self):
+        return self.to_string(True)
+
+    __repr__ = __str__
+
+    @property
+    def type(self):
+        return self.desc.type()
+
+    def input(self, name):
+        """
+        Get input arguments by the input parameter name
+        Args:
+            name(str): The input parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
+        return self.desc.input(name)
+
+    @property
+    def input_names(self):
+        """
+        Get all input parameter names
+        Returns(list): return a list of input parameter names
+
+        """
+        return self.desc.input_names()
+
+    def output(self, name):
+        """
+        Get output arguments by the output parameter name
+        Args:
+            name(str): The output parameter name
+
+        Returns(list): return the list of argument names associated with the
+            specific parameter name.
+
+        """
+        return self.desc.output(name)
+
+    @property
+    def output_names(self):
+        """
+        Get all output parameter names
+        Returns(list): return a list of output parameter names
+
+        """
+        return self.desc.output_names()
+
+    @property
+    def idx(self):
+        """
+        Return the array index of current operator.
+        Returns(int): The array index in block.ops array
+        Raises:
+            ValueError: when the operator is not found.
+        """
+        for i, op in enumerate(self.block.ops):
+            if op == self:
+                return i
+        raise ValueError(
+            "Can't find op itself in it's block. It could be a bug of Paddle.")
+
+    def has_attr(self, name):
+        """
+        Whether the operator has the attribute with the given name.
+        Args:
+            name(str): the attribute name
+
+        Returns(bool): True if has this attribute.
+
+        """
+        return self.desc.has_attr(name)
+
+    def attr_type(self, name):
+        """
+        Get the type of attribute by attribute name
+        Args:
+            name(str): the attribute name
+
+        Returns(core.AttrType): the attribute type
+
+        """
+        return self.desc.attr_type(name)
+
+    @property
+    def attr_names(self):
+        """
+        Get all attribute names
+        Returns(list): The list of attribute name
+
+        """
+        return self.desc.attr_names()
+
+    def attr(self, name):
+        """
+        Get attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(bool|int|str|float|list): The attribute value. The return value
+            can be any valid attribute type.
+
+        """
+        return self.desc.attr(name)
+
+    def block_attr(self, name):
+        """
+        Get the block attribute by name
+        Args:
+            name(str): the attribute name
+
+        Returns(int): the block index
+
+        """
+        return self.desc.block_attr(name)
+
+
+class Block(object):
+    def __init__(self, program, idx):
+        self.desc = program.desc.block(idx)
+        self.vars = dict()  # var_name --> var
+        self.ops = collections.deque()  # operator list
+        self.program = program
+        self.removed_vars = dict()
+
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            re_add_indent = re.compile(r"\n(.)")
+            res_str = "blocks {\n  idx: %d\n  parent_idx: %d" % (
+                self.idx, self.parent_idx)
+            for var in self.vars.itervalues():
+                res_str += "\n  vars {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", var.to_string(throw_on_error, with_details))
+            for op in self.ops:
+                res_str += "\n  ops {\n    %s  }" % re_add_indent.sub(
+                    r"\n    \1", op.to_string(throw_on_error))
+            res_str += "\n}"
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.BlockDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
+
+    __repr__ = __str__
+
+    @property
+    def parent_idx(self):
+        return self.desc.parent
+
+    @property
+    def idx(self):
+        return self.desc.id
+
+    def var(self, name):
+        if not isinstance(name, basestring):
+            raise TypeError()
+        v = self.vars.get(name, None)
+        if v is None:
+            raise ValueError("var %s not in this block" % name)
+        return v
+
+    def var_recursive(self, name):
+        if self.has_var(name):
+            return self.var(name)
+        else:
+            if self.idx == 0:
+                raise ValueError("var %s is not in block(%d) nor its parents." %
+                                 (name, self.idx))
+            else:
+                parent_block = self.program.block(self.parent_idx)
+                return parent_block.var_recursive(name)
+
+    def all_parameters(self):
+        return list(self.iter_parameters())
+
+    def iter_parameters(self):
+        return (item[1] for item in self.vars.iteritems()
+                if isinstance(item[1], Parameter))
+
+    def create_var(self, *args, **kwargs):
+        var = Variable(self, *args, **kwargs)
+        if 'initializer' in kwargs:
+            kwargs['initializer'](var, self)
+        return var
+
+    def has_var(self, name):
+        return name in self.vars
+
+    def create_parameter(self, *args, **kwargs):
+        global_block = self.program.global_block()
+        param = Parameter(global_block, *args, **kwargs)
+        if 'initializer' in kwargs:
+            kwargs['initializer'](param, self)
+        return param
+
+    def append_op(self, *args, **kwargs):
+        op_desc = self.desc.append_op()
+        op = Operator(self, op_desc, *args, **kwargs)
+        self.ops.append(op)
+        return op
+
+    def delete_ops(self, ops):
+        # remove from cpp
+        # FIXME(typhoonzero): remove only the first occurrence.
+        try:
+            start = list(self.ops).index(ops[0])
+            end = list(self.ops).index(ops[-1])
+        except Exception as e:
+            raise e
+        self.desc.remove_op(start, end + 1)
+
+    def prepend_op(self, *args, **kwargs):
+        op_desc = self.desc.prepend_op()
+        op = Operator(self, op_desc, *args, **kwargs)
+        self.ops.appendleft(op)
+        return op
+
+    def sync_with_cpp(self):
+        # sync variables from cpp
+        for var in self.desc.all_vars():
+            if not self.has_var(var.name()):
+                self.create_var(name=var.name(), desc=var, type=var.type())
+
+        # sync operators from cpp
+        ops_in_cpp = []
+        for op_idx in range(0, self.desc.op_size()):
+            ops_in_cpp.append(self.desc.op(op_idx))
+
+        if len(self.ops) != 0:
+            first_op_in_python = self.ops[0].desc
+            last_op_in_python = self.ops[len(self.ops) - 1].desc
+            start_index = None
+            end_index = None
+            for index in range(len(ops_in_cpp)):
+                if first_op_in_python == ops_in_cpp[index]:
+                    start_index = index
+                if last_op_in_python == ops_in_cpp[index]:
+                    end_index = index
+            assert start_index is not None
+            assert end_index is not None
+            assert start_index <= end_index
+        else:
+            start_index = 0
+            end_index = -1
+
+        # sync ops prepended in cpp: prepend every op before start_index,
+        # walking in reverse so self.ops ends up in the same order as cpp
+        for index in range(start_index - 1, -1, -1):
+            op_desc = ops_in_cpp[index]
+            op = Operator(self, op_desc)
+            self.ops.appendleft(op)
+
+        # sync ops append to the end of cpp_ops
+        for index in range((end_index + 1), len(ops_in_cpp)):
+            op_desc = ops_in_cpp[index]
+            op = Operator(self, op_desc)
+            self.ops.append(op)
+
+        assert len(self.ops) == len(ops_in_cpp)
+        for index in range(len(self.ops)):
+            assert self.ops[index].desc == ops_in_cpp[index]
+
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other block
+        Args:
+            other(Block): other block
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                gradient_clip_attr=p.gradient_clip_attr,
+                error_clip=p.error_clip,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
+
+class Program(object):
+    def __init__(self):
+        self.desc = core.ProgramDesc()
+        self.blocks = [Block(self, 0)]
+        self.current_block_idx = 0
+        self._seed = 0
+
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = ""
+            for block in self.blocks:
+                res_str += block.to_string(throw_on_error, with_details)
+        else:
+            protostr = self.desc.serialize_to_string()
+            proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+            res_str = _debug_string_(proto, throw_on_error)
+        return res_str
+
+    def get_desc(self):
+        return self.desc
+
+    def clone(self):
+        p = Program()
+        p.desc = core.ProgramDesc(self.desc)
+        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+        p.sync_with_cpp()
+        p.copy_param_info_from(self)
+        return p
+
+    def prune(self, targets):
+        if not isinstance(targets, list):
+            targets = [targets]
+        targets_idx = []
+        for t in targets:
+            if not isinstance(t, Operator):
+                if isinstance(t, Variable):
+                    t = t.op
+                else:
+                    raise ValueError(("All targets of prune() can only be "
+                                      "Variable or Operator."))
+
+            targets_idx.append([t.block.idx, t.idx])
+        res = Program()
+        res.desc = core.prune(self.desc, targets_idx)
+        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.sync_with_cpp()
+        return res
+
+    def inference_optimize(self):
+        res = Program()
+        res.desc = core.inference_optimize(self.desc)
+        res.blocks = [Block(res, i) for i in xrange(res.desc.num_blocks())]
+        res.sync_with_cpp()
+        return res
+
+    @staticmethod
+    def parse_from_string(binary_str):
+        p = Program()
+        p.desc = core.ProgramDesc(binary_str)
+        p.blocks = [Block(p, i) for i in xrange(p.desc.num_blocks())]
+        p.sync_with_cpp()
+        return p
+
+    @property
+    def random_seed(self):
+        return self._seed
+
+    @random_seed.setter
+    def random_seed(self, seed):
+        if not isinstance(seed, int):
+            raise ValueError("Seed must be a integer.")
+        self._seed = seed
+
+    def __repr__(self):
+        return str(self)
+
+    def global_block(self):
+        return self.blocks[0]
+
+    def block(self, index):
+        return self.blocks[index]
+
+    def current_block(self):
+        return self.blocks[self.current_block_idx]
+
+    def append_backward(self, target, no_grad_set=None):
+        """
+        Return a map of param_name -> (grad_name, block_index, op_index).
+        """
+        assert isinstance(target, Variable)
+        if no_grad_set is None:
+            no_grad_set = set()
+        try:
+            param_to_grad_info = self.desc.append_backward(target.desc,
+                                                           no_grad_set)
+        except Exception as e:
+            raise core.EnforceNotMet(
+                str(e) + "\nCurrent protobuf is\n{0}".format(
+                    self.to_string(False)))
+
+        self.sync_with_cpp()
+        return param_to_grad_info
+
+    def create_block(self, parent_idx=None):
+        new_block_idx = len(self.blocks)
+        parent = self.current_block() if parent_idx is None else self.block(
+            parent_idx)
+        self.desc.append_block(parent.desc)
+        self.current_block_idx = new_block_idx
+        self.blocks.append(Block(self, self.current_block_idx))
+        return self.current_block()
+
+    def rollback(self):
+        self.current_block_idx = self.current_block().parent_idx
+
+    def sync_with_cpp(self):
+        for block_idx in range(len(self.blocks), self.desc.num_blocks()):
+            self.blocks.append(Block(self, block_idx))
+        for block in self.blocks:
+            block.sync_with_cpp()
+
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from other program.
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "program, with represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
+    def list_vars(self):
+        for each_block in self.blocks:
+            for each_var in each_block.vars.itervalues():
+                yield each_var
+
+
+class Parameter(Variable):
+    def __init__(self, block, shape, dtype, **kwargs):
+        if shape is None or dtype is None:
+            raise ValueError("Parameter must set shape and dtype")
+        if len(shape) == 0:
+            raise ValueError("Parameter shape cannot be empty")
+
+        for each in shape:
+            if each < 0:
+                raise ValueError("Parameter shape should not be related with "
+                                 "batch-size")
+
+        Variable.__init__(
+            self, block, persistable=True, shape=shape, dtype=dtype, **kwargs)
+        self.trainable = kwargs.get('trainable', True)
+
+        self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})
+
+        self.regularizer = kwargs.get('regularizer', None)
+
+        self.gradient_clip_attr = kwargs.get('gradient_clip_attr', None)
+
+    def __str__(self):
+        return self.to_string(True)
+
+    def to_string(self, throw_on_error, with_details=False):
+        """
+        To debug string.
+        Args:
+            throw_on_error(bool): raise exception when self is not initialized
+                when throw_on_error is True
+            with_details(bool): more details about variables and parameters
+                (e.g. trainable, optimize_attr, ...) will be printed when with_details is True
+
+        Returns(str): The debug string.
+
+        """
+        assert isinstance(throw_on_error, bool) and isinstance(with_details,
+                                                               bool)
+        if with_details:
+            res_str = Variable.to_string(self, throw_on_error, True)
+            additional_attr = ("trainable", "optimize_attr", "regularizer",
+                               "gradient_clip_attr")
+            for attr_name in additional_attr:
+                res_str += "%s: %s\n" % (attr_name,
+                                         str(getattr(self, attr_name)))
+        else:
+            res_str = Variable.to_string(self, throw_on_error, False)
+        return res_str
+
+    __repr__ = __str__
+
+
+# program is a global instance.
+_main_program_ = Program()
+_startup_program_ = Program()
+
+
+def default_startup_program():
+    """
+    Get the default startup program. In the startup program, Paddle will
+    initialize parameters, the NCCL handle, and so on.
+
+    Returns:
+        Program: startup program
+    """
+    return _startup_program_
+
+
+def default_main_program():
+    """
+    Get default main program. The main program is used for training or testing.
+
+    Returns:
+        Program: main program
+    """
+    return _main_program_
+
+
+def switch_main_program(program):
+    """
+    Switch the main program to a new program.
+
+    Args:
+        program(Program): The new main program
+
+    Returns:
+        Program: The previous main program
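+
+    Examples:
+        >>> # Illustrative sketch: `new_prog` is an assumed, user-created
+        >>> # Program; switch to it, build layers, then switch back.
+        >>> prev_prog = switch_main_program(new_prog)
+        >>> # ... fluid.layers calls now append ops to new_prog ...
+        >>> switch_main_program(prev_prog)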
+    """
+    global _main_program_
+    prev_program = _main_program_
+    _main_program_ = program
+    return prev_program
+
+
+def switch_startup_program(program):
+    """
+    Switch the startup program to a new program.
+    Args:
+        program(Program): The new startup program
+
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    prev_program = _startup_program_
+    _startup_program_ = program
+    return prev_program
+
+
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch main and startup programs within a `with` statement.
+
+    Examples:
+        >>> with program_guard(Program()):
+        >>>   data = fluid.layers.data(...)
+        >>>   hidden = fluid.layers.fc(...)
+
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement.
+            None means do not change startup program.
+
+    Returns:
+        None
+    """
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        if not isinstance(startup_program, Program):
+            raise TypeError("startup_program should be Program")
+        startup_program = switch_startup_program(startup_program)
+    yield
+    switch_main_program(main_program)
+    if startup_program is not None:
+        switch_startup_program(startup_program)
diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py
new file mode 100644
index 0000000000..b9c0d12ad6
--- /dev/null
+++ b/python/paddle/v2/fluid/initializer.py
@@ -0,0 +1,412 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import framework
+import numpy as np
+
+__all__ = [
+    'Constant',
+    'Uniform',
+    'Normal',
+    'Xavier',
+]
+
+
+class Initializer(object):
+    """Base class for variable initializers
+
+    Defines the common interface of variable initializers.
+    They add operations to the init program that are used
+    to initialize variables. Users should not use this class
+    directly, but need to use one of its implementations.
+    """
+
+    def __init__(self):
+        pass
+
+    def __call__(self, param, block):
+        """Add corresponding initialization operations to the network
+        """
+        raise NotImplementedError()
+
+    def _compute_fans(self, var):
+        """Compute the fan_in and the fan_out for layers
+
+        This method computes the fan_in and the fan_out
+        for neural network layers, if not specified. It is
+        not possible to perfectly estimate fan_in and fan_out.
+        This method will estimate it correctly for matrix multiply and
+        convolutions.
+
+        Args:
+            var: variable for which fan_in and fan_out have to be computed
+
+        Returns:
+            tuple of two integers (fan_in, fan_out)
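+
+            For example, a conv kernel of shape [64, 3, 3, 3] has a
+            receptive field of size 3 * 3 = 9, giving fan_in = 3 * 9 = 27
+            and fan_out = 64 * 9 = 576.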
+        """
+        shape = var.shape
+        if not shape:  # covers both None and an empty shape
+            fan_in = fan_out = 1
+        elif len(shape) == 1:
+            fan_in = fan_out = shape[0]
+        elif len(shape) == 2:
+            # This is the case for simple matrix multiply
+            fan_in = shape[0]
+            fan_out = shape[1]
+        else:
+            # Assume this to be a convolutional kernel
+            # In PaddlePaddle, the shape of the kernel is like:
+            # [num_filters, num_filter_channels, ...] where the remaining
+            # dimensions are the filter_size
+            receptive_field_size = np.prod(shape[2:])
+            fan_in = shape[1] * receptive_field_size
+            fan_out = shape[0] * receptive_field_size
+
+        return (fan_in, fan_out)
+
+
+class ConstantInitializer(Initializer):
+    """Implements the constant initializer
+    """
+
+    def __init__(self, value=0.0):
+        """Constructor for ConstantInitializer
+
+        Args:
+            value: constant value to initialize the variable
+        """
+        assert value is not None
+        super(ConstantInitializer, self).__init__()
+        self._value = value
+
+    def __call__(self, var, block):
+        """Add constant initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        op = block.prepend_op(
+            type="fill_constant",
+            outputs={"Out": var},
+            attrs={
+                "shape": var.shape,
+                "dtype": int(var.dtype),
+                "value": self._value
+            })
+        var.op = op
+        return op
+
+
+class UniformInitializer(Initializer):
+    """Implements the random uniform distribution initializer
+    """
+
+    def __init__(self, low=-1.0, high=1.0, seed=0):
+        """Constructor for UniformInitializer
+
+        Args:
+            low: lower boundary of the uniform distribution
+            high: upper boundary of the uniform distribution
+            seed: random seed
+        """
+        assert low is not None
+        assert high is not None
+        assert high >= low
+        assert seed is not None
+        super(UniformInitializer, self).__init__()
+        self._low = low
+        self._high = high
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add uniform distribution initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+        op = block.prepend_op(
+            type="uniform_random",
+            outputs={"Out": var},
+            attrs={
+                "shape": var.shape,
+                "dtype": int(var.dtype),
+                "min": self._low,
+                "max": self._high,
+                "seed": self._seed
+            })
+        var.op = op
+        return op
+
+
+class NormalInitializer(Initializer):
+    """Implements the  random Normal(Gaussian) distribution initializer
+    """
+
+    def __init__(self, loc=0.0, scale=1.0, seed=0):
+        """Constructor for NormalInitializer
+
+        Args:
+            loc: mean of the normal distribution
+            scale: standard deviation of the normal distribution
+            seed: random seed
+        """
+        assert loc is not None
+        assert scale is not None
+        assert seed is not None
+        super(NormalInitializer, self).__init__()
+        self._mean = loc
+        self._std_dev = scale
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add normal distribution initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        # Initialization Ops should be prepended and not appended
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+        op = block.prepend_op(
+            type="gaussian_random",
+            outputs={"Out": var},
+            attrs={
+                "shape": var.shape,
+                "dtype": int(var.dtype),
+                "mean": self._mean,
+                "std": self._std_dev,
+                "seed": self._seed
+            })
+        var.op = op
+        return op
+
+
+class XavierInitializer(Initializer):
+    """Implements the Xavier initializer
+
+    This class implements the Xavier weight initializer from the paper
+    Understanding the difficulty of training deep feedforward neural
+    networks[1] by Xavier Glorot and Yoshua Bengio.
+
+    This initializer is designed to keep the scale of the gradients
+    approximately the same in all the layers. In case of Uniform distribution,
+    the range is [-x, x], where x = sqrt(6 / (fan_in + fan_out)).
+    In case of Normal distribution, the mean is 0 and the standard deviation
+    is sqrt(2/ (fan_in + fan_out)).
+
+    References:
+        [1] Understanding the difficulty of training deep feedforward neural
+            networks. International conference on artificial intelligence and
+            statistics.
+            (http://proceedings.mlr.press/v9/glorot10a.html)
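+
+    For example, under the uniform option a layer with fan_in = 100 and
+    fan_out = 200 is initialized from [-x, x] with
+    x = sqrt(6 / (100 + 200)) ~= 0.141.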
+    """
+
+    def __init__(self, uniform=True, fan_in=None, fan_out=None, seed=0):
+        """Constructor for XavierInitializer
+
+        Args:
+            uniform: whether to use uniform or normal distribution
+            fan_in: fan_in for Xavier initialization. If None, it is
+                    inferred from the variable.
+            fan_out: fan_out for Xavier initialization. If None, it is
+                     inferred from the variable.
+            seed: random seed
+
+        Note: It is recommended to set fan_in and fan_out to None for
+              most cases.
+        """
+        assert uniform is not None
+        assert seed is not None
+        super(XavierInitializer, self).__init__()
+        self._uniform = uniform
+        self._fan_in = fan_in
+        self._fan_out = fan_out
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add xavier initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        f_in, f_out = self._compute_fans(var)
+
+        # If fan_in and fan_out are passed, use them
+        fan_in = f_in if self._fan_in is None else self._fan_in
+        fan_out = f_out if self._fan_out is None else self._fan_out
+
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+
+        if self._uniform:
+            limit = np.sqrt(6.0 / float(fan_in + fan_out))
+            op = block.prepend_op(
+                type="uniform_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "min": -limit,
+                    "max": limit,
+                    "seed": self._seed
+                })
+
+        else:
+            std = np.sqrt(2.0 / float(fan_in + fan_out))
+            op = block.prepend_op(
+                type="gaussian_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "mean": 0.0,
+                    "std": std,
+                    "seed": self._seed
+                })
+        var.op = op
+        return op
+
+
+class MSRAInitializer(Initializer):
+    """Implements the MSRA initializer a.k.a. Kaiming Initializer
+
+    This class implements the weight initialization from the paper
+    Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+    ImageNet Classification[1] by Kaiming He, Xiangyu Zhang, Shaoqing Ren
+    and Jian Sun. This is a robust initialization method that particularly
+    considers the rectifier nonlinearities. In case of Uniform distribution,
+    the range is [-x, x], where x = sqrt(6 / fan_in). In case of Normal
+    distribution, the mean is 0 and the standard deviation
+    is sqrt(2/ fan_in).
+
+    References:
+        [1] Delving Deep into Rectifiers: Surpassing Human-Level Performance
+            on ImageNet Classification
+            (https://arxiv.org/abs/1502.01852)
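+
+    For example, under the uniform option a layer with fan_in = 100 is
+    initialized from [-x, x] with x = sqrt(6 / 100) ~= 0.245.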
+    """
+
+    def __init__(self, uniform=True, fan_in=None, seed=0):
+        """Constructor for MSRAInitializer
+
+        Args:
+            uniform: whether to use uniform or normal distribution
+            fan_in: fan_in for MSRAInitializer. If None, it is
+                    inferred from the variable.
+            seed: random seed
+
+        Note: It is recommended to set fan_in to None for most cases.
+        """
+        assert uniform is not None
+        assert seed is not None
+        super(MSRAInitializer, self).__init__()
+        self._uniform = uniform
+        self._fan_in = fan_in
+        self._seed = seed
+
+    def __call__(self, var, block):
+        """Add MSRA initialization ops for a variable
+
+        Args:
+            var: Variable that needs to be initialized
+            block: The block in which initialization ops
+                   should be added
+
+        Returns:
+            the initialization op
+        """
+        assert isinstance(var, framework.Variable)
+        assert isinstance(block, framework.Block)
+        f_in, f_out = self._compute_fans(var)
+
+        # If fan_in is passed, use it
+        fan_in = f_in if self._fan_in is None else self._fan_in
+
+        if self._seed == 0:
+            self._seed = block.program.random_seed
+
+        if self._uniform:
+            limit = np.sqrt(6.0 / float(fan_in))
+            op = block.prepend_op(
+                type="uniform_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "min": -limit,
+                    "max": limit,
+                    "seed": self._seed
+                })
+
+        else:
+            std = np.sqrt(2.0 / float(fan_in))
+            op = block.prepend_op(
+                type="gaussian_random",
+                outputs={"Out": var},
+                attrs={
+                    "shape": var.shape,
+                    "dtype": int(var.dtype),
+                    "mean": 0.0,
+                    "std": std,
+                    "seed": self._seed
+                })
+        var.op = op
+        return op
+
+
+# We shorten the class names, since users will use the initializers together
+# with the package name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=ParamAttr(fluid.initializer.Xavier()))
+#
+# There is no need to add `Initializer` as the class suffix.
+Constant = ConstantInitializer
+Uniform = UniformInitializer
+Normal = NormalInitializer
+Xavier = XavierInitializer
+MSRA = MSRAInitializer
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
new file mode 100644
index 0000000000..d56ec45c53
--- /dev/null
+++ b/python/paddle/v2/fluid/io.py
@@ -0,0 +1,385 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+from paddle.v2.fluid.evaluator import Evaluator
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
+from . import core
+
+__all__ = [
+    'save_vars',
+    'save_params',
+    'save_persistables',
+    'load_vars',
+    'load_params',
+    'load_persistables',
+    'save_inference_model',
+    'load_inference_model',
+    'get_inference_program',
+]
+
+
+def is_parameter(var):
+    """Check whether the variable is a Parameter.
+
+    This function checks whether the input variable is a Parameter.
+
+    Args:
+        var : The input variable.
+
+    Returns:
+        boolean result whether the variable is a Parameter.
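+
+    For example (an illustrative sketch; "fc_0.w_0" is a hypothetical fc
+    weight name):
+
+    >>> w = default_main_program().global_block().var("fc_0.w_0")
+    >>> is_parameter(w)  # True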
+    """
+    return isinstance(var, Parameter)
+
+
+def is_persistable(var):
+    return var.persistable
+
+
+def _clone_var_in_block_(block, var):
+    assert isinstance(var, Variable)
+    return block.create_var(
+        name=var.name,
+        shape=var.shape,
+        dtype=var.dtype,
+        type=var.type,
+        lod_level=var.lod_level,
+        persistable=True)
+
+
+def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+    """
+    Save variables to a directory by executor.
+
+    :param executor: executor that saves the variables
+    :param dirname: directory path
+    :param main_program: program. If vars is None, save all variables in
+    this program that satisfy `predicate`. Defaults to
+    default_main_program().
+    :param predicate: a callable that takes a variable and returns a bool.
+    If it returns True, the variable will be saved.
+    :param vars: variables to be saved. If vars is specified, main_program
+    and predicate are ignored.
+    :return: None
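+
+    Usage sketch (illustrative; `exe` and `param_path` are assumed to be an
+    existing Executor and a writable directory):
+
+    >>> save_vars(exe, dirname=param_path, predicate=is_parameter)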
+    """
+    if vars is None:
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program should be as Program type or None")
+
+        save_vars(
+            executor,
+            dirname=dirname,
+            vars=filter(predicate, main_program.list_vars()))
+    else:
+        save_program = Program()
+        save_block = save_program.global_block()
+        for each_var in vars:
+            new_var = _clone_var_in_block_(save_block, each_var)
+            save_block.append_op(
+                type='save',
+                inputs={'X': [new_var]},
+                outputs={},
+                attrs={'file_path': os.path.join(dirname, new_var.name)})
+        executor.run(save_program)
+
+
+def save_params(executor, dirname, main_program=None):
+    """
+    Save all parameters to directory with executor.
+    """
+    save_vars(
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        vars=None,
+        predicate=is_parameter)
+
+
+def save_persistables(executor, dirname, main_program=None):
+    """
+    Save all persistables to directory with executor.
+    """
+    save_vars(
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        vars=None,
+        predicate=is_persistable)
+
+
+def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+    """
+    Load variables from a directory by executor.
+
+    :param executor: executor that loads the variables
+    :param dirname: directory path
+    :param main_program: program. If vars is None, load all variables in
+    this program that satisfy `predicate`. Defaults to
+    default_main_program().
+    :param predicate: a callable that takes a variable and returns a bool.
+    If it returns True, the variable will be loaded.
+    :param vars: variables to be loaded. If vars is specified, main_program
+    and predicate are ignored.
+    :return: None
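+
+    Usage sketch (illustrative; `exe` and `param_path` are assumed):
+
+    >>> load_vars(exe, dirname=param_path, predicate=is_persistable)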
+    """
+    if vars is None:
+        if main_program is None:
+            main_program = default_main_program()
+        if not isinstance(main_program, Program):
+            raise TypeError("program's type should be Program")
+
+        load_vars(
+            executor,
+            dirname=dirname,
+            vars=filter(predicate, main_program.list_vars()))
+    else:
+        load_prog = Program()
+        load_block = load_prog.global_block()
+        for each_var in vars:
+            assert isinstance(each_var, Variable)
+            new_var = _clone_var_in_block_(load_block, each_var)
+            load_block.append_op(
+                type='load',
+                inputs={},
+                outputs={"Out": [new_var]},
+                attrs={'file_path': os.path.join(dirname, new_var.name)})
+
+        executor.run(load_prog)
+
+
+def load_params(executor, dirname, main_program=None):
+    """
+    Load all parameters from the directory by executor.
+    """
+    load_vars(
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_parameter)
+
+
+def load_persistables(executor, dirname, main_program=None):
+    """
+    Load all persistables from the directory by executor.
+    """
+    load_vars(
+        executor,
+        dirname=dirname,
+        main_program=main_program,
+        predicate=is_persistable)
+
+
+def get_inference_program(target_vars, main_program=None):
+    if main_program is None:
+        main_program = default_main_program()
+    if not isinstance(target_vars, list):
+        target_vars = [target_vars]
+    vars = []
+    for var in target_vars:
+        if isinstance(var, Evaluator):
+            vars.extend(var.states)
+            vars.extend(var.metrics)
+        else:
+            vars.append(var)
+    pruned_program = main_program.prune(targets=vars)
+    inference_program = pruned_program.inference_optimize()
+    return inference_program
+
+
+def prepend_feed_ops(inference_program,
+                     feed_target_names,
+                     feed_holder_name='feed'):
+    global_block = inference_program.global_block()
+    feed_var = global_block.create_var(
+        name=feed_holder_name,
+        type=core.VarDesc.VarType.FEED_MINIBATCH,
+        persistable=True)
+
+    for i, name in enumerate(feed_target_names):
+        out = global_block.var(name)
+        global_block.prepend_op(
+            type='feed',
+            inputs={'X': [feed_var]},
+            outputs={'Out': [out]},
+            attrs={'col': i})
+
+
+def append_fetch_ops(inference_program,
+                     fetch_target_names,
+                     fetch_holder_name='fetch'):
+    global_block = inference_program.global_block()
+    fetch_var = global_block.create_var(
+        name=fetch_holder_name,
+        type=core.VarDesc.VarType.FETCH_LIST,
+        persistable=True)
+
+    for i, name in enumerate(fetch_target_names):
+        global_block.append_op(
+            type='fetch',
+            inputs={'X': [name]},
+            outputs={'Out': [fetch_var]},
+            attrs={'col': i})
+
+
+def save_inference_model(dirname,
+                         feeded_var_names,
+                         target_vars,
+                         executor,
+                         main_program=None):
+    """
+    Build a model especially for inference,
+    and save it to the directory by the executor.
+
+    :param dirname: directory path
+    :param feeded_var_names: names of variables that need to be fed data
+            during inference
+    :param target_vars: variables from which we can get inference results
+    :param executor: executor that saves the inference model
+    :param main_program: original program, which will be pruned to build
+            the inference model. Defaults to default_main_program().
+
+    :return: None
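+
+    Usage sketch (illustrative; `exe` is an assumed Executor, `predict` an
+    assumed target Variable, and "image" a hypothetical feed name):
+
+    >>> save_inference_model("./model_dir", ["image"], [predict], exe)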
+    """
+    if isinstance(feeded_var_names, basestring):
+        feeded_var_names = [feeded_var_names]
+    else:
+        if not (bool(feeded_var_names) and all(
+                isinstance(name, basestring) for name in feeded_var_names)):
+            raise ValueError("'feed_var_names' should be a list of str.")
+
+    if isinstance(target_vars, Variable):
+        target_vars = [target_vars]
+    else:
+        if not (bool(target_vars) and all(
+                isinstance(var, Variable) for var in target_vars)):
+            raise ValueError("'target_vars' should be a list of Variable.")
+
+    if main_program is None:
+        main_program = default_main_program()
+
+    if not os.path.isdir(dirname):
+        os.makedirs(dirname)
+
+    pruned_program = main_program.prune(targets=target_vars)
+    inference_program = pruned_program.inference_optimize()
+    fetch_var_names = [v.name for v in target_vars]
+
+    prepend_feed_ops(inference_program, feeded_var_names)
+    append_fetch_ops(inference_program, fetch_var_names)
+
+    model_file_name = dirname + "/__model__"
+    with open(model_file_name, "wb") as f:
+        f.write(inference_program.desc.serialize_to_string())
+
+    save_params(executor, dirname, main_program)
+
+
+def load_persistables_if_exist(executor, dirname, main_program=None):
+    filenames = next(os.walk(dirname))[2]
+    filenames = set(filenames)
+
+    def _is_persistable_and_exist_(var):
+        if not is_persistable(var):
+            return False
+        else:
+            return var.name in filenames
+
+    load_vars(
+        executor,
+        dirname,
+        main_program=main_program,
+        vars=None,
+        predicate=_is_persistable_and_exist_)
+
+
+def get_feed_targets_names(program):
+    feed_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'feed':
+            feed_targets_names.insert(0, op.desc.output('Out')[0])
+    return feed_targets_names
+
+
+def get_fetch_targets_names(program):
+    fetch_targets_names = []
+    global_block = program.global_block()
+    for op in global_block.ops:
+        if op.desc.type() == 'fetch':
+            fetch_targets_names.append(op.desc.input('X')[0])
+    return fetch_targets_names
+
+
+def load_inference_model(dirname, executor):
+    """
+    Load the inference model from a directory.
+
+    :param dirname: directory path
+    :param executor: executor that loads the inference model
+
+    :return: [program, feed_target_names, fetch_targets]
+             program: the program especially for inference.
+             feed_target_names: names of variables that need to be fed data.
+             fetch_targets: variables from which we can get inference results.
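+
+    Usage sketch (illustrative; `exe` is an assumed Executor):
+
+    >>> [program, feed_names, fetch_targets] = load_inference_model(
+    >>>     "./model_dir", exe)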
+    """
+    if not os.path.isdir(dirname):
+        raise ValueError("There is no directory named '%s'", dirname)
+
+    model_file_name = dirname + "/__model__"
+    with open(model_file_name, "rb") as f:
+        program_desc_str = f.read()
+
+    program = Program.parse_from_string(program_desc_str)
+    load_persistables_if_exist(executor, dirname, program)
+
+    feed_target_names = get_feed_targets_names(program)
+    fetch_target_names = get_fetch_targets_names(program)
+    fetch_targets = [
+        program.global_block().var(name) for name in fetch_target_names
+    ]
+
+    return [program, feed_target_names, fetch_targets]
+
+
+def get_parameter_value(para, executor):
+    """
+    Get the LoDTensor for the parameter
+
+    :param executor: executor for retrieving the value
+    :param para: the given parameter
+    :return: the LoDTensor for the parameter
+    """
+    assert is_parameter(para)
+
+    get_program = Program()
+    block = get_program.global_block()
+    new_var = _clone_var_in_block_(block, para)
+    return executor.run(get_program, feed={}, fetch_list=[new_var])[0]
+
+
+def get_parameter_value_by_name(name, executor, program=None):
+    """
+    Get the LoDTensor for the parameter with the given name.
+
+    :param executor: executor for retrieving the value
+    :param name: the name of the parameter
+    :param program: the program where the variable is found
+            Default default_main_program().
+    :return: the LoDTensor for the variable
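+
+    Usage sketch (illustrative; "fc_0.w_0" is a hypothetical parameter name
+    and `exe` an assumed Executor):
+
+    >>> tensor = get_parameter_value_by_name("fc_0.w_0", exe)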
+    """
+    if program is None:
+        program = default_main_program()
+    var = program.global_block().var(name)
+    return get_parameter_value(var, executor)
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
new file mode 100644
index 0000000000..2119ca12c8
--- /dev/null
+++ b/python/paddle/v2/fluid/layer_helper.py
@@ -0,0 +1,397 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import itertools
+
+from framework import Variable, Parameter, default_main_program, default_startup_program, \
+    unique_name, dtype_is_floating
+from paddle.v2.fluid.initializer import Constant, Xavier
+from param_attr import ParamAttr, WeightNormParamAttr
+
+
+class LayerHelper(object):
+    def __init__(self, layer_type, **kwargs):
+        self.kwargs = kwargs
+        self.layer_type = layer_type
+        name = self.kwargs.get('name', None)
+        if name is None:
+            self.kwargs['name'] = unique_name(self.layer_type)
+
+    @property
+    def name(self):
+        return self.kwargs['name']
+
+    @property
+    def main_program(self):
+        return default_main_program()
+
+    @property
+    def startup_program(self):
+        return default_startup_program()
+
+    def append_op(self, *args, **kwargs):
+        return self.main_program.current_block().append_op(*args, **kwargs)
+
+    def multiple_input(self, input_param_name='input'):
+        inputs = self.kwargs.get(input_param_name, [])
+        type_error = TypeError(
+            "Input of {0} layer should be Variable or sequence of Variable".
+            format(self.layer_type))
+        if isinstance(inputs, Variable):
+            inputs = [inputs]
+        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            raise type_error
+        else:
+            for each in inputs:
+                if not isinstance(each, Variable):
+                    raise type_error
+        return inputs
+
+    def input(self, input_param_name='input'):
+        inputs = self.multiple_input(input_param_name)
+        if len(inputs) != 1:
+            raise "{0} layer only takes one input".format(self.layer_type)
+        return inputs[0]
+
+    @property
+    def param_attr(self):
+        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
+
+    @property
+    def bias_attr(self):
+        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
+
+    def multiple_param_attr(self, length):
+        param_attr = self.param_attr
+        if isinstance(param_attr, ParamAttr):
+            param_attr = [param_attr]
+
+        if len(param_attr) != 1 and len(param_attr) != length:
+            raise ValueError("parameter number mismatch")
+        elif len(param_attr) == 1 and length != 1:
+            tmp = [None] * length
+            for i in xrange(length):
+                tmp[i] = copy.deepcopy(param_attr[0])
+            param_attr = tmp
+        return param_attr
+
+    def iter_inputs_and_params(self, input_param_name='input'):
+        inputs = self.multiple_input(input_param_name)
+        param_attrs = self.multiple_param_attr(len(inputs))
+        for ipt, param_attr in itertools.izip(inputs, param_attrs):
+            yield ipt, param_attr
+
+    def input_dtype(self, input_param_name='input'):
+        inputs = self.multiple_input(input_param_name)
+        dtype = None
+        for each in inputs:
+            if dtype is None:
+                dtype = each.dtype
+            elif dtype != each.dtype:
+                raise ValueError("Data Type mismatch: %d to %d" %
+                                 (dtype, each.dtype))
+        return dtype
+
+    def _create_weight_normalize(self, attr, shape, dtype):
+        from .layers import elementwise_mul, elementwise_div, reshape
+
+        # Remove these ops when LayerHelper and layers support indicating
+        # program and block.
+        def __norm_op(x,
+                      out=None,
+                      p=2,
+                      dim=None,
+                      keep_dim=False,
+                      block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            abs_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_abs'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='abs', inputs={'X': x}, outputs={'Out': abs_out})
+            pow_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_pow'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='pow',
+                inputs={'X': abs_out},
+                outputs={'Out': pow_out},
+                attrs={'factor': float(p)})
+            sum_out = block.create_var(
+                name=unique_name(".".join([self.name, 'weight_norm_sum'])),
+                dtype=dtype,
+                persistable=False)
+            block.append_op(
+                type='reduce_sum',
+                inputs={'X': pow_out},
+                outputs={'Out': sum_out},
+                attrs={
+                    'dim': dim,
+                    'keep_dim': keep_dim,
+                    'reduce_all': True if dim is None else False
+                })
+            block.append_op(
+                type='pow',
+                inputs={'X': sum_out},
+                outputs={'Out': out},
+                attrs={'factor': 1. / p})
+            return out
+
+        def __reshape_op(x,
+                         shape,
+                         out=None,
+                         block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_reshape'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='reshape',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'shape': shape})
+            return out
+
+        def __transpose_op(x,
+                           axis,
+                           out=None,
+                           block=self.startup_program.global_block()):
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join(
+                        [self.name, 'weight_norm_transpose'])),
+                    dtype=dtype,
+                    persistable=False)
+            block.append_op(
+                type='transpose',
+                inputs={'X': x},
+                outputs={'Out': out},
+                attrs={'axis': axis})
+            return out
+
+        def __norm_except_dim(x,
+                              out=None,
+                              dim=None,
+                              block=self.startup_program.global_block()):
+            """Computes the norm over all dimensions except dim"""
+            if out is None:
+                out = block.create_var(
+                    name=unique_name(".".join([self.name, 'weight_norm_norm'])),
+                    dtype=dtype,
+                    persistable=False)
+            if dim is None:
+                __norm_op(x, out, dim=dim, block=block)
+            elif dim == 0:
+                out_shape = [x.shape[0]] + [1] * (len(x.shape) - 1)
+                reshape = __reshape_op(x, shape=[x.shape[0], -1], block=block)
+                norm = __norm_op(reshape, dim=1, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            elif dim == len(x.shape) - 1:
+                out_shape = [1] * (len(x.shape) - 1) + [x.shape[-1]]
+                reshape = __reshape_op(x, shape=[-1, x.shape[-1]], block=block)
+                norm = __norm_op(reshape, dim=0, block=block)
+                __reshape_op(norm, out=out, shape=out_shape, block=block)
+            else:
+                perm = range(len(x.shape))
+                perm[0], perm[dim] = dim, 0
+                transpose = __transpose_op(x, perm, block=block)
+                norm = __norm_op(transpose, dim=0, block=block)
+                __transpose_op(norm, perm, out=out, block=block)
+            return out
+
+        def __weight_normalize(g, v, dim):
+            """Calculations for weight normalization"""
+            norm = __norm_except_dim(
+                v, dim=dim, block=self.main_program.current_block())
+            scale = elementwise_div(
+                x=g, y=norm)  # The shapes of g and norm are the same.
+            # Currently, elementwise_mul only supports broadcast when the shape
+            # of y is a subset of the shape of x. Thus, we reshape y to squeeze
+            # to achieve the subset.
+            w = elementwise_mul(
+                x=v,
+                y=scale if dim is None else reshape(
+                    x=scale, shape=[v.shape[dim]]),
+                axis=-1 if dim is None else dim)
+            # To serialize the original parameter for inference, maybe a
+            # parameter rather than a variable should be returned.
+            return w
+
+        g_param_attr = copy.deepcopy(attr)
+        g_param_attr.name = attr.name + '_g'
+        g_param_shape = [1] * len(shape)
+        if attr.dim is not None:
+            g_param_shape[attr.dim] = shape[attr.dim]
+        v_param_attr = copy.deepcopy(attr)
+        v_param_attr.name = attr.name + '_v'
+        v_param_shape = shape
+
+        # Add to startup_program to initialize g and v.
+        # Try to reconstruct the initializer of w by initializing g and v.
+        # Set the initializers of g and v as below, then the distribution
+        # of w is the same as initializing w with the given initializer.
+        # For Data-Dependent Initialization, please compute the init-values
+        # of g and v in external and then feed the values to g and v by
+        # executing an extra program.
+        g_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=g_param_shape,
+            **g_param_attr.to_kwargs(with_initializer=False))
+        v_param = self.startup_program.global_block().create_parameter(
+            dtype=dtype,
+            shape=v_param_shape,
+            **v_param_attr.to_kwargs(with_initializer=True))
+        __norm_except_dim(
+            x=v_param,
+            out=g_param,
+            dim=attr.dim,
+            block=self.startup_program.global_block())
+
+        # Add weight normalization to main_program
+        g_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=g_param_shape, **g_param_attr.to_kwargs())
+        v_param = self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=v_param_shape, **v_param_attr.to_kwargs())
+        w_param = __weight_normalize(g_param, v_param, dim=attr.dim)
+        return w_param
+
+    def create_parameter(self,
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
+        # Deepcopy the attr so that parameters can be shared in program
+        attr = copy.deepcopy(attr)
+        assert isinstance(attr, ParamAttr)
+        suffix = 'b' if is_bias else 'w'
+        if attr.name is None:
+            attr.name = unique_name(".".join([self.name, suffix]))
+
+        if default_initializer is None and attr.initializer is None:
+            if is_bias:
+                attr.set_default_bias_initializer()
+            else:
+                attr.set_default_param_initializer()
+        else:
+            attr.set_default_initializer(default_initializer)
+
+        # If weight normalization is set, insert extra parameters and ops.
+        # Refer to https://arxiv.org/pdf/1602.07868.pdf
+        if isinstance(attr, WeightNormParamAttr):
+            param = self._create_weight_normalize(attr, shape, dtype)
+            WeightNormParamAttr.params_with_weight_norm.append(param)
+            return param
+
+        self.startup_program.global_block().create_parameter(
+            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
+        return self.main_program.global_block().create_parameter(
+            dtype=dtype, shape=shape, **attr.to_kwargs())
+
+    def get_parameter(self, name):
+        param = self.main_program.global_block().var(name)
+        if not isinstance(param, Parameter):
+            raise ValueError("no Parameter name %s found" % name)
+        return param
+
+    def create_tmp_variable(self, dtype, stop_gradient=False):
+        return self.main_program.current_block().create_var(
+            name=unique_name(".".join([self.name, 'tmp'])),
+            dtype=dtype,
+            persistable=False,
+            stop_gradient=stop_gradient)
+
+    def create_variable(self, *args, **kwargs):
+        return self.main_program.current_block().create_var(*args, **kwargs)
+
+    def create_global_variable(self, persistable=False, *args, **kwargs):
+        return self.main_program.global_block().create_var(
+            *args, persistable=persistable, **kwargs)
+
+    def set_variable_initializer(self, var, initializer):
+        assert isinstance(var, Variable)
+        self.startup_program.global_block().create_var(
+            name=var.name,
+            type=var.type,
+            dtype=var.dtype,
+            shape=var.shape,
+            persistable=True,
+            initializer=initializer)
+
+    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
+        """
+        Append bias operator and return its output. If the user does not set
+        bias_attr, append_bias_op will return input_var directly.
+
+        :param input_var: the input variable. len(input_var.shape) must be
+        greater than or equal to 2.
+        :param dim_start: the first dimension of input_var covered by the
+        bias; it is also used as the broadcast axis of the add.
+        :param dim_end: the shape of the bias will be
+        input_var.shape[dim_start:dim_end]. The bias is broadcast over the
+        other dimensions and added to input_var to get the output.
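+
+        For example (illustrative), an input of shape [N, C, H, W] with the
+        default dim_start=1 and dim_end=None yields a bias of shape
+        [C, H, W], broadcast over the batch dimension N.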
+        """
+        size = list(input_var.shape[dim_start:dim_end])
+        bias_attr = self.bias_attr
+        if not bias_attr:
+            return input_var
+
+        b = self.create_parameter(
+            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+        self.append_op(
+            type='elementwise_add',
+            inputs={'X': [input_var],
+                    'Y': [b]},
+            outputs={'Out': [tmp]},
+            attrs={'axis': dim_start})
+        return tmp
+
+    def append_activation(self, input_var):
+        act = self.kwargs.get('act', None)
+        if act is None:
+            return input_var
+        if isinstance(act, basestring):
+            act = {'type': act}
+        tmp = self.create_tmp_variable(dtype=input_var.dtype)
+        act_type = act.pop('type')
+        self.append_op(
+            type=act_type,
+            inputs={"X": [input_var]},
+            outputs={"Out": [tmp]},
+            attrs=act)
+        return tmp
+
+    def _get_default_initializer(self, dtype):
+        if dtype is None or dtype_is_floating(dtype) is True:
+            return Xavier()
+        else:
+            # For integer and boolean types, initialize with all zeros
+            return Constant()
+
+    def is_instance(self, param_name, cls):
+        param = self.kwargs.get(param_name, None)
+        if not isinstance(param, cls):
+            raise TypeError("The input {0} parameter of method {1} must be {2}",
+                            param_name, self.layer_type, cls.__name__)
diff --git a/python/paddle/v2/fluid/layers/__init__.py b/python/paddle/v2/fluid/layers/__init__.py
new file mode 100644
index 0000000000..a83dd3db74
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/__init__.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ops
+from ops import *
+import nn
+from nn import *
+import io
+from io import *
+import tensor
+from tensor import *
+import control_flow
+from control_flow import *
+import device
+from device import *
+import math_op_patch
+from math_op_patch import *
+
+__all__ = []
+__all__ += nn.__all__
+__all__ += io.__all__
+__all__ += tensor.__all__
+__all__ += control_flow.__all__
+__all__ += ops.__all__
+__all__ += device.__all__
+__all__ += math_op_patch.__all__
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
new file mode 100644
index 0000000000..0fcbfe0e2f
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -0,0 +1,1493 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import contextlib
+
+from layer_function_generator import autodoc
+from tensor import assign, fill_constant
+from .. import core
+from ..framework import Program, Variable, Operator
+from ..layer_helper import LayerHelper, unique_name
+
+__all__ = [
+    'split_lod_tensor',
+    'merge_lod_tensor',
+    'BlockGuard',
+    'BlockGuardWithCompletion',
+    'StaticRNNMemoryLink',
+    'WhileGuard',
+    'While',
+    'lod_rank_table',
+    'max_sequence_len',
+    'topk',
+    'lod_tensor_to_array',
+    'array_to_lod_tensor',
+    'increment',
+    'array_write',
+    'create_array',
+    'less_than',
+    'array_read',
+    'shrink_memory',
+    'array_length',
+    'IfElse',
+    'DynamicRNN',
+    'ConditionalBlock',
+    'StaticRNN',
+    'reorder_lod_tensor_by_rank',
+    'ParallelDo',
+    'Print',
+]
+
+
+def split_lod_tensor(input, mask, level=0):
+    """
+    **split_lod_tensor**
+
+    This function takes an input that carries complete lod information and a
+    mask that selects parts of the input. The output is the true branch and
+    the false branch, obtained by applying the mask to the input at the given
+    lod level.
+
+    Args:
+        input(Variable): The input tensor that contains complete
+                         lod information needed to construct the output.
+        mask(Variable): A bool column vector which masks the input.
+        level(int): The specific lod level at which to split the input.
+
+    Returns:
+        Variable: The true branch of tensor as per the mask applied to input.
+        Variable: The false branch of tensor as per the mask applied to input.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(name='x', shape=[1])
+          x.persistable = True
+
+          y = layers.data(name='y', shape=[1])
+          y.persistable = True
+
+          level = 0
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+    """
+    helper = LayerHelper('split_lod_tensor', **locals())
+    out_true = helper.create_tmp_variable(dtype=input.dtype)
+    out_false = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='split_lod_tensor',
+        inputs={
+            'X': input,
+            'Mask': mask,
+        },
+        outputs={'OutTrue': out_true,
+                 'OutFalse': out_false},
+        attrs={'level': level})
+    return out_true, out_false
+
+
+def merge_lod_tensor(in_true, in_false, x, mask, level=0):
+    """
+    **merge_lod_tensor**
+
+    This function takes in an input :math:`x`, the True branch, the False
+    branch and a binary :math:`mask`. Using this information, this function
+    merges the True and False branches of the tensor into a single output
+    at a certain lod level indicated by :math:`level`.
+
+    Args:
+        in_true(Variable): The True branch to be merged.
+        in_false(Variable): The False branch to be merged.
+        x(Variable): The input tensor that contains complete
+                     lod information needed to construct the output.
+        mask(Variable): A bool column vector which masks the input.
+        level(int): The specific lod level at which to merge the branches.
+
+    Returns:
+        Variable: The merged output tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = layers.data(
+                      name='x', shape=[1], dtype='float32', stop_gradient=False)
+          y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+          level = 0
+
+          out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+          out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+    """
+    helper = LayerHelper('merge_lod_tensor', **locals())
+    out = helper.create_tmp_variable(dtype=in_true.dtype)
+    helper.append_op(
+        type='merge_lod_tensor',
+        inputs={'X': x,
+                'Mask': mask,
+                'InTrue': in_true,
+                'InFalse': in_false},
+        outputs={'Out': out},
+        attrs={'level': level})
+    return out
+
+
+def Print(input,
+          first_n=-1,
+          message=None,
+          summarize=-1,
+          print_tensor_name=True,
+          print_tensor_type=True,
+          print_tensor_shape=True,
+          print_tensor_lod=True,
+          print_phase='both'):
+    '''
+    **Print operator**
+
+    This creates a print op that will print when a tensor is accessed.
+
+    Wraps the tensor passed in so that whenever the tensor is accessed,
+    the message `message` is printed, along with the current value of the
+    tensor.
+
+    Args:
+        input (Variable): A Tensor to print.
+        summarize (int): Number of elements in the tensor to print; all
+                elements are printed if it is negative.
+        message (str): A string message to print as a prefix.
+        first_n (int): Only log `first_n` number of times.
+        print_tensor_name (bool): Print the tensor name.
+        print_tensor_type (bool): Print the tensor type.
+        print_tensor_shape (bool): Print the tensor shape.
+        print_tensor_lod (bool): Print the tensor lod.
+        print_phase (str): Which phase to display, including 'forward',
+                'backward' and 'both'. If set to 'backward' or 'both', the
+                gradients of the input tensor will also be printed.
+
+    Returns:
+        Variable: Output tensor with the same data as the input tensor.
+
+    Examples:
+        .. code-block:: python
+
+          value = some_layer(...)
+          Print(value, summarize=10,
+                message="The content of some_layer: ")
+    '''
+    helper = LayerHelper('print', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='print',
+        inputs={'In': input},
+        attrs={
+            'first_n': first_n,
+            'summarize': summarize,
+            'message': message or "",
+            'print_tensor_name': print_tensor_name,
+            'print_tensor_type': print_tensor_type,
+            'print_tensor_shape': print_tensor_shape,
+            'print_tensor_lod': print_tensor_lod,
+            'print_phase': print_phase.upper()
+        },
+        outputs={'Out': out})
+    return out
+
+
+class BlockGuard(object):
+    """
+    BlockGuard class.
+
+    BlockGuard class is used to create a sub-block in a program by
+    using the Python `with` keyword.
+    """
+
+    def __init__(self, main_program):
+        if not isinstance(main_program, Program):
+            raise TypeError("BlockGuard takes a program")
+        self.main_program = main_program
+
+    def __enter__(self):
+        self.main_program.create_block()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.main_program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
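+
+
+# A minimal usage sketch (an illustration only, not part of this module):
+# BlockGuard is driven by the `with` statement. Entering creates a sub-block
+# of the program and exiting rolls back to the parent block, so ops appended
+# inside the `with` body land in the sub-block.
+#
+#     program = Program()
+#     with BlockGuard(program):
+#         pass  # ops appended here go into the new sub-block
+#     # here, program.current_block() is the parent block again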
+
+
+class ParallelDo(object):
+    """
+    ParallelDo class.
+
+    ParallelDo class is used to run a block of operators on multiple
+    devices (places) in parallel.
+    """
+
+    def __init__(self, places, name=None):
+        self.helper = LayerHelper("parallel_do", name=name)
+        self.inputs = []
+        self.places = places
+        self.outputs = []
+        self.status = StaticRNN.BEFORE_RNN_BLOCK
+
+    def do(self):
+        return BlockGuardWithCompletion(self)
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError(
+                "ParallelDo output can only be retrieved after its block")
+        if len(self.outputs) == 0:
+            raise ValueError("ParallelDo has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def read_input(self, var):
+        self.inputs.append(var)
+        return var
+
+    def write_output(self, var):
+        self.outputs.append(var)
+
+    def get_parameters(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        local_inputs = set()
+
+        for op in current_block.ops:
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
+        for var in self.inputs:
+            local_inputs.add(var.name)
+
+        params = list()
+        for op in current_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in local_inputs:
+                        params.append(in_var_name)
+        params = list(set(params))
+
+        return [parent_block.var(name) for name in params]
+
+    def complete_op(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        self.outputs = [
+            parent_block.create_var(
+                name=o.name,
+                shape=o.shape,
+                dtype=o.dtype,
+                lod_level=o.lod_level,
+                persistable=o.persistable,
+                stop_gradient=o.stop_gradient) for o in self.outputs
+        ]
+
+        inputs = [parent_block.var(i.name) for i in self.inputs]
+        outputs = [parent_block.var(o.name) for o in self.outputs]
+
+        parent_block.append_op(
+            type='parallel_do',
+            inputs={
+                'inputs': inputs,
+                'parameters': self.get_parameters(),
+                'places': self.places
+            },
+            outputs={'outputs': outputs,
+                     'parallel_scopes': [step_scope]},
+            attrs={'sub_block': current_block})
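+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `x` and `some_layer` are hypothetical): ops built inside the do() block are
+# replicated across the given places, and outputs written with write_output()
+# are gathered back into the parent block.
+#
+#     pd = ParallelDo(get_places(device_count=4))
+#     with pd.do():
+#         x_ = pd.read_input(x)
+#         y_ = some_layer(x_)
+#         pd.write_output(y_)
+#     y = pd()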
+
+
+class BlockGuardWithCompletion(BlockGuard):
+    """
+    BlockGuardWithCompletion class.
+
+    BlockGuardWithCompletion class is used to create an op with a block in a program.
+    """
+
+    def __init__(self, rnn):
+        if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)):
+            raise TypeError(
+                "BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
+        super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
+        self.rnn = rnn
+
+    def __enter__(self):
+        self.rnn.status = StaticRNN.IN_RNN_BLOCK
+        return super(BlockGuardWithCompletion, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
+        self.rnn.complete_op()
+        return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
+                                                              exc_tb)
+
+
+class StaticRNNMemoryLink(object):
+    """
+    StaticRNNMemoryLink class.
+
+    Args:
+        init (Variable): the initial variable for the memory.
+        pre_mem (Variable): the memory variable in the previous time step.
+        mem (Variable): the memory variable in the current time step.
+
+    StaticRNNMemoryLink class is used to create a link between two
+    memory cells of a StaticRNN.
+    """
+
+    def __init__(self, init, pre_mem, mem=None):
+        self.init = init
+        self.pre_mem = pre_mem
+        self.mem = mem
+
+
+class StaticRNN(object):
+    """
+    StaticRNN class.
+
+    StaticRNN class is used to create a StaticRNN. The RNN will have its
+    own state, such as inputs, outputs, memories, status and sequence length.
+    """
+    BEFORE_RNN_BLOCK = 0
+    IN_RNN_BLOCK = 1
+    AFTER_RNN_BLOCK = 2
+
+    def __init__(self, name=None):
+        self.helper = LayerHelper("static_rnn", name=name)
+        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
+        self.inputs = []  # input variable list in current block
+        self.outputs = []  # output variable list in parent block
+        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
+        # sequence length, since it is a static RNN, sequence length are fixed.
+        self.seq_len = None
+
+    def step(self):
+        return BlockGuardWithCompletion(self)
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != StaticRNN.IN_RNN_BLOCK:
+            raise ValueError("You must invoke {0} in rnn block".format(method))
+
+    def memory(self,
+               init=None,
+               shape=None,
+               batch_ref=None,
+               init_value=0.0,
+               init_batch_dim_idx=0,
+               ref_batch_dim_idx=1):
+        """
+        Args:
+            init: boot memory; if not set, shape and batch_ref must be provided
+            shape: shape of the boot memory
+            batch_ref: batch size reference variable
+            init_value: the init value of boot memory
+            init_batch_dim_idx: the index of batch size in init's dimension
+            ref_batch_dim_idx: the index of batch size in batch_ref's dimension
+        """
+        self._assert_in_rnn_block_('memory')
+        if init is None:
+            if shape is None or batch_ref is None:
+                raise ValueError(
+                    "if init is None, memory at least need shape and batch_ref")
+            parent_block = self.parent_block()
+            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
+            boot_var = parent_block.create_var(
+                name=var_name,
+                shape=shape,
+                dtype=batch_ref.dtype,
+                persistable=False)
+
+            parent_block.append_op(
+                type="fill_constant_batch_size_like",
+                inputs={'Input': [batch_ref]},
+                outputs={'Out': [boot_var]},
+                attrs={
+                    'value': init_value,
+                    'shape': boot_var.shape,
+                    'dtype': boot_var.dtype,
+                    'input_dim_idx': ref_batch_dim_idx,
+                    'output_dim_idx': init_batch_dim_idx
+                })
+
+            return self.memory(init=boot_var)
+        else:
+            pre_mem = self.helper.create_variable(
+                name=unique_name("@".join([self.helper.name, "mem"])),
+                dtype=init.dtype,
+                shape=init.shape)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(
+                init=init, pre_mem=pre_mem)
+            return pre_mem
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_('step_input')
+        if not isinstance(x, Variable):
+            raise TypeError("step input takes a Variable")
+        if self.seq_len is None:
+            self.seq_len = x.shape[0]
+        elif self.seq_len != x.shape[0]:
+            raise ValueError("Static RNN only take fix seq_len input")
+
+        ipt = self.helper.create_variable(
+            name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type)
+        self.inputs.append(ipt)
+        return ipt
+
+    def step_output(self, o):
+        self._assert_in_rnn_block_('step_output')
+        if not isinstance(o, Variable):
+            raise TypeError("step output takes a Variable")
+
+        tmp_o = self.helper.create_tmp_variable(dtype=o.dtype)
+        self.helper.append_op(
+            type='rnn_memory_helper',
+            inputs={'X': [o]},
+            outputs={'Out': tmp_o},
+            attrs={'dtype': o.dtype})
+
+        out_var = self.parent_block().create_var(
+            name=tmp_o.name,
+            shape=[self.seq_len] + list(tmp_o.shape),
+            dtype=tmp_o.dtype)
+
+        self.outputs.append(out_var)
+
+    def output(self, *outputs):
+        for each in outputs:
+            self.step_output(each)
+
+    def update_memory(self, mem, var):
+        if not isinstance(mem, Variable) or not isinstance(var, Variable):
+            raise TypeError("update memory should take variables")
+        self.memories[mem.name].mem = var
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError("RNN output can only be retrieved after rnn block")
+        if len(self.outputs) == 0:
+            raise ValueError("RNN has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def complete_op(self):
+        main_program = self.helper.main_program
+        rnn_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        local_inputs = set()
+
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    local_inputs.add(out_var_name)
+
+        for var in self.inputs:
+            local_inputs.add(var.name)
+        for m in self.memories:
+            local_inputs.add(m)
+
+        params = list()
+        for op in rnn_block.ops:
+            assert isinstance(op, Operator)
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in local_inputs:
+                        params.append(in_var_name)
+
+        parameters = [parent_block.var(name) for name in params]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        inlinks = [parent_block.var(i.name) for i in self.inputs]
+        outlinks = self.outputs
+
+        boot_memories = []
+        pre_memories = []
+        memories = []
+        for _, mem in self.memories.iteritems():
+            boot_memories.append(mem.init)
+            pre_memories.append(mem.pre_mem.name)
+            mem_var = rnn_block.var(mem.mem.name)
+            assert isinstance(mem_var, Variable)
+            new_mem = self.helper.create_tmp_variable(dtype=mem_var.dtype)
+
+            rnn_block.append_op(
+                type='rnn_memory_helper',
+                inputs={'X': [mem_var]},
+                outputs={'Out': [new_mem]},
+                attrs={'dtype': mem_var.dtype})
+
+            memories.append(new_mem.name)
+
+        parent_block.append_op(
+            type='recurrent',
+            inputs={
+                'inputs': inlinks,
+                'initial_states': boot_memories,
+                'parameters': parameters
+            },
+            outputs={'outputs': outlinks,
+                     'step_scopes': [step_scope]},
+            attrs={
+                'ex_states': pre_memories,
+                'states': memories,
+                'sub_block': rnn_block
+            })
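+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `word`, `boot_state` and `fc` are hypothetical): each step consumes one time
+# step of the input, reads the previous memory, and writes the updated memory
+# and the step output.
+#
+#     rnn = StaticRNN()
+#     with rnn.step():
+#         word_step = rnn.step_input(word)
+#         prev = rnn.memory(init=boot_state)
+#         hidden = fc(input=[word_step, prev], size=200)
+#         rnn.update_memory(prev, hidden)
+#         rnn.step_output(hidden)
+#     out = rnn()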
+
+
+class WhileGuard(BlockGuard):
+    def __init__(self, while_op):
+        if not isinstance(while_op, While):
+            raise TypeError("WhileGuard takes a while op")
+        super(WhileGuard, self).__init__(while_op.helper.main_program)
+        self.while_op = while_op
+
+    def __enter__(self):
+        self.while_op.status = While.IN_WHILE_BLOCK
+        return super(WhileGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+        self.while_op.status = While.AFTER_WHILE_BLOCK
+        self.while_op.complete()
+        return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class While(object):
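+    """
+    While class.
+
+    While is used to build a while loop: the ops appended inside
+    While.block() are executed repeatedly as long as the given boolean
+    condition variable holds true. The condition must be re-evaluated
+    inside the block at the end of each iteration.
+    """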
+    BEFORE_WHILE_BLOCK = 0
+    IN_WHILE_BLOCK = 1
+    AFTER_WHILE_BLOCK = 2
+
+    def __init__(self, cond, name=None):
+        self.helper = LayerHelper("while", name=name)
+        self.status = While.BEFORE_WHILE_BLOCK
+        if not isinstance(cond, Variable):
+            raise TypeError("condition should be a variable")
+        if cond.dtype != core.DataType.BOOL:
+            raise TypeError("condition should be a bool variable")
+        if reduce(lambda a, b: a * b, cond.shape, 1) != 1:
+            raise TypeError("condition should be a bool scalar")
+        self.cond_var = cond
+
+    def block(self):
+        return WhileGuard(self)
+
+    def complete(self):
+        main_program = self.helper.main_program
+        while_block = main_program.current_block()
+        parent_block = main_program.block(main_program.current_block()
+                                          .parent_idx)
+
+        inner_outputs = {self.cond_var.name}
+        x_name_list = set()
+        for op in while_block.ops:
+            for iname in op.input_names:
+                for in_var_name in op.input(iname):
+                    if in_var_name not in inner_outputs:
+                        x_name_list.add(in_var_name)
+
+            for oname in op.output_names:
+                for out_var_name in op.output(oname):
+                    inner_outputs.add(out_var_name)
+
+        out_vars = []
+        for inner_out_name in inner_outputs:
+            if inner_out_name in parent_block.vars:
+                out_vars.append(parent_block.var(inner_out_name))
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+
+        parent_block.append_op(
+            type='while',
+            inputs={
+                'X': [parent_block.var(x_name) for x_name in x_name_list],
+                'Condition': [self.cond_var]
+            },
+            outputs={'Out': out_vars,
+                     'StepScopes': [step_scope]},
+            attrs={'sub_block': while_block})
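+
+
+# A minimal usage sketch (an illustration only, not part of this module): a
+# counted loop. The condition is re-evaluated at the end of every iteration.
+#
+#     i = fill_constant(shape=[1], dtype='int64', value=0)
+#     limit = fill_constant(shape=[1], dtype='int64', value=10)
+#     cond = less_than(x=i, y=limit)
+#     while_op = While(cond=cond)
+#     with while_op.block():
+#         i = increment(x=i, in_place=True)
+#         less_than(x=i, y=limit, cond=cond)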
+
+
+def lod_rank_table(x, level=0):
+    """LoD Rank Table Operator. Given an input variable **x** and a level number
+    of LoD, this layer creates a LodRankTable object. A LoDRankTable object
+    contains a list of bi-element tuples. Each tuple consists of an index and
+    a length, both of which are int type. Refering to specified level of LoD,
+    the index is the sequence index number and the length representes the
+    sequence length. Please note that the list is ranked in descending order by
+    the length. The following is an example:
+
+        .. code-block:: text
+
+            x is a LoDTensor:
+                x.lod = [[0,                2, 3],
+                         [0,             5, 6, 7]]
+                x.data = [a, b, c, d, e, f, g]
+
+            1. set level to 0:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=0)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 2), (1, 1)]
+
+            2. set level to 1:
+                Create lod rank table:
+                    lod_rank_table_obj = lod_rank_table(x, level=1)
+
+                Get:
+                    lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)]
+
+    Args:
+        x (Variable): Input variable, a LoDTensor based which to create the lod
+            rank table.
+        level (int): Specify the LoD level, on which to create the lod rank
+            table.
+
+    Returns:
+        Variable: The created LoDRankTable object.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            out = layers.lod_rank_table(x=x, level=0)
+    """
+    helper = LayerHelper("lod_rank_table", **locals())
+    table = helper.create_variable(
+        type=core.VarDesc.VarType.LOD_RANK_TABLE,
+        name=unique_name("lod_rank_table"))
+    helper.append_op(
+        type='lod_rank_table',
+        inputs={'X': x},
+        outputs={'Out': table},
+        attrs={'level': level})
+    return table
+
+
+def max_sequence_len(rank_table):
+    """Max Sequence Len Operator. Given a LoDRankTable object, this layer
+    returns the max length of a batch of sequences. In fact, a LoDRankTable
+    object contains a list of tuples (<sequence index, sequence length>) and
+    the list is already sorted by sequence length in descending order, so the
+    operator just returns the sequence length of the first tuple element.
+
+    Args:
+        rank_table (Variable): Input variable which is a LoDRankTable object.
+
+    Returns:
+        Variable: The max length of sequence.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10],
+                            dtype='float32', lod_level=1)
+            rank_table = layers.lod_rank_table(x=x, level=0)
+            max_seq_len = layers.max_sequence_len(rank_table)
+    """
+    helper = LayerHelper("max_seqence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k):
+    """
+    **topk**
+
+    This function performs the operation that selects the k largest entries in
+    the input vector and outputs their values and indices as vectors. Thus
+    topk_out[j] is the j-th largest entry in input, and its index is
+    topk_indices[j].
+
+    Args:
+        input (Variable|list): The input tensor that has all the data.
+        k (int): The number of top elements that the function will pick.
+
+    Returns:
+        Variable: The variable of type array that contains the k largest entries
+                  from input.
+        Variable: The variable of type array that contains the indices of k
+                  largest entries from input.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          k = 5
+          values, indices = fluid.layers.topk(x, k)
+    """
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
+def lod_tensor_to_array(x, table):
+    """ Convert a LOD_TENSOR to an LOD_TENSOR_ARRAY.
+
+    Args:
+        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
+        table (Variable): The LoDRankTable variable that stores the level of
+                          lod, which is ordered by sequence length in
+                          descending order.
+
+    Returns:
+        Variable: The variable of type array that has been converted from a
+                  tensor.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
+    """
+    helper = LayerHelper("lod_tensor_to_array", **locals())
+    array = helper.create_variable(
+        name=unique_name("lod_tensor_to_array"),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=x.dtype)
+    helper.append_op(
+        type='lod_tensor_to_array',
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': array})
+    return array
+
+
+def array_to_lod_tensor(x, table):
+    """Convert a LoD_Tensor_Aarry to an LoDTensor.
+
+    Args:
+        x (Variable|list): The lod tensor array to be converted to a tensor.
+        table (Variable): The LoDRankTable variable that stores the level of
+                          lod, which is ordered by sequence length in
+                          descending order.
+
+    Returns:
+        Variable: The variable of type tensor that has been converted
+                  from an array.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name='x', shape=[10])
+          table = fluid.layers.lod_rank_table(x, level=0)
+          array = fluid.layers.lod_tensor_to_array(x, table)
+          lod_tensor = fluid.layers.array_to_lod_tensor(array, table)
+    """
+    helper = LayerHelper("array_to_lod_tensor", **locals())
+    tmp = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="array_to_lod_tensor",
+        inputs={'X': x,
+                'RankTable': table},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def increment(x, value=1.0, in_place=True):
+    """
+    This function performs an operation that increments each value in the
+    input :math:`x` by an amount: :math:`value` as mentioned in the input
+    parameter. This operation is performed in-place by default.
+
+    Args:
+        x (Variable|list): The tensor that has the input values.
+        value (float): The amount by which the values should be incremented.
+        in_place (bool): If the increment should be performed in-place.
+
+    Returns:
+        Variable: The tensor variable storing the transformation of
+                  element-wise increment of each value in the input.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32')
+          data = fluid.layers.increment(x=data, value=3.0, in_place=True)
+    """
+    helper = LayerHelper("increment", **locals())
+    if not in_place:
+        out = helper.create_tmp_variable(dtype=x.dtype)
+    else:
+        out = x
+    helper.append_op(
+        type='increment',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'step': float(value)})
+    return out
+
+
+def array_write(x, i, array=None):
+    """
+    This function writes the given input variable to the specified position,
+    indicated by the array index, in an output LOD_TENSOR_ARRAY. If the
+    output LOD_TENSOR_ARRAY is not given (None), a new one will be created and
+    returned.
+
+    Args:
+        x (Variable|list): The input tensor from which the data will be read.
+        i (Variable|list): The index of the output LOD_TENSOR_ARRAY, pointing to
+                           the position to which the input tensor will be
+                           written.
+        array (Variable|list): The output LOD_TENSOR_ARRAY to which the input
+                               tensor will be written. If this parameter is
+                               None, a new LOD_TENSOR_ARRAY will be created and
+                               returned.
+
+    Returns:
+        Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = layers.array_write(tmp, i=i)
+    """
+    helper = LayerHelper('array_write', **locals())
+    if array is None:
+        array = helper.create_variable(
+            name="{0}.out".format(helper.name),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+    helper.append_op(
+        type='write_to_array',
+        inputs={'X': [x],
+                'I': [i]},
+        outputs={'Out': [array]})
+    return array
+
+
+def create_array(dtype):
+    """This function creates an array of type :math:`LOD_TENSOR_ARRAY` using the
+    LayerHelper.
+
+    Args:
+        dtype (int|float): The data type of the elements in the array.
+
+    Returns:
+        Variable: The tensor variable storing the elements of data type.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.create_array(dtype='float32')
+
+    """
+    helper = LayerHelper("array", **locals())
+    return helper.create_variable(
+        name="{0}.out".format(helper.name),
+        type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+        dtype=dtype)
+
+
+def less_than(x, y, cond=None, **ignored):
+    """
+    **Less than**
+
+    This layer returns the truth value of :math:`x < y` elementwise.
+
+    Args:
+        x(Variable): First operand of *less_than*
+        y(Variable): Second operand of *less_than*
+        cond(Variable|None): Optional output variable to store the result of *less_than*
+
+    Returns:
+        Variable: The tensor variable storing the output of *less_than*.
+
+    Examples:
+        .. code-block:: python
+
+          less = fluid.layers.less_than(x=label, y=limit)
+    """
+    helper = LayerHelper("less_than", **locals())
+    if cond is None:
+        cond = helper.create_tmp_variable(dtype='bool')
+        cond.stop_gradient = True
+
+    helper.append_op(
+        type='less_than', inputs={'X': [x],
+                                  'Y': [y]}, outputs={'Out': [cond]})
+    return cond
+
+
+def array_read(array, i):
+    """This function performs the operation to read the data in as an
+    LOD_TENSOR_ARRAY.
+    Args:
+        array (Variable|list): The input tensor that will be written to an array.
+        i (Variable|list): The subscript index in tensor array, that points the
+                           place where data will be written to.
+    Returns:
+        Variable: The tensor type variable that has the data written to it.
+    Examples:
+        .. code-block::python
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = layers.array_read(tmp, i=i)
+    """
+    helper = LayerHelper('array_read', **locals())
+    if not isinstance(
+            array,
+            Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY:
+        raise TypeError("array should be tensor array vairable")
+    out = helper.create_tmp_variable(dtype=array.dtype)
+    helper.append_op(
+        type='read_from_array',
+        inputs={'X': [array],
+                'I': [i]},
+        outputs={'Out': [out]})
+    return out
+
+
+def shrink_memory(x, i, table):
+    """
+    This function creates a shrink_rnn_memory operator, which shrinks the RNN
+    memory at the given step according to the given RankTable.
+    """
+    helper = LayerHelper('shrink_memory', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='shrink_rnn_memory',
+        inputs={'X': [x],
+                'I': [i],
+                'RankTable': [table]},
+        outputs={'Out': [out]},
+        attrs={})
+    return out
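+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `mem`, `step_idx` and `rank_table` are hypothetical): inside a dynamic RNN
+# step, shrink the memory so it only covers the sequences still active at the
+# current step.
+#
+#     mem = shrink_memory(x=mem, i=step_idx, table=rank_table)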
+
+
+def array_length(array):
+    """This function performs the operation to find the length of the input
+    LOD_TENSOR_ARRAY.
+
+    Args:
+        array (LOD_TENSOR_ARRAY): The input array that will be used
+                                  to compute the length.
+
+    Returns:
+        Variable: The length of the input LoDTensorArray.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          arr = fluid.layers.array_write(tmp, i=i)
+          arr_len = fluid.layers.array_length(arr)
+    """
+    helper = LayerHelper('array_length', **locals())
+    tmp = helper.create_tmp_variable(dtype='int64')
+    tmp.stop_gradient = True
+    helper.append_op(
+        type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]})
+    return tmp
+
+
+class ConditionalBlockGuard(BlockGuard):
+    def __init__(self, block):
+        if not isinstance(block, ConditionalBlock):
+            raise TypeError("block should be conditional block")
+        super(ConditionalBlockGuard, self).__init__(block.helper.main_program)
+        self.block = block
+
+    def __enter__(self):
+        return super(ConditionalBlockGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.block.complete()
+        return super(ConditionalBlockGuard, self).__exit__(exc_type, exc_val,
+                                                           exc_tb)
+
+
+class ConditionalBlock(object):
+    def __init__(self, inputs, name=None):
+        for each_input in inputs:
+            if not isinstance(each_input, Variable):
+                raise TypeError("Each input should be variable")
+        self.inputs = inputs
+        self.helper = LayerHelper('conditional_block', name=name)
+
+    def block(self):
+        return ConditionalBlockGuard(self)
+
+    def complete(self):
+        inside_block = self.helper.main_program.current_block()
+        parent_block = self.helper.main_program.block(inside_block.parent_idx)
+
+        intermediate = set()
+        params = set()
+
+        for each_op in inside_block.ops:
+            assert isinstance(each_op, Operator)
+            for iname in each_op.input_names:
+                for in_var_name in each_op.input(iname):
+                    if in_var_name not in intermediate:
+                        params.add(in_var_name)
+
+            for oname in each_op.output_names:
+                for out_var_name in each_op.output(oname):
+                    intermediate.add(out_var_name)
+        input_set = set([ipt.name for ipt in self.inputs])
+
+        param_list = [
+            parent_block.var(each_name) for each_name in params
+            if each_name not in input_set
+        ]
+
+        out_list = [
+            parent_block.var(var_name) for var_name in parent_block.vars
+            if var_name in intermediate
+        ]
+
+        step_scope = parent_block.create_var(
+            type=core.VarDesc.VarType.STEP_SCOPES)
+        parent_block.append_op(
+            type='conditional_block',
+            inputs={
+                'X': self.inputs,
+                'Params': param_list,
+            },
+            outputs={'Out': out_list,
+                     'Scope': [step_scope]},
+            attrs={'sub_block': inside_block})
+
+
+class IfElseBlockGuard(object):
+    def __init__(self, is_true, ifelse):
+        if not isinstance(ifelse, IfElse):
+            raise TypeError("ifelse must be an instance of IfElse class")
+
+        if ifelse.status != IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("You cannot invoke IfElse.block() inside a block")
+
+        self.is_true = is_true
+        self.ie = ifelse
+        if is_true:
+            self.cond_block = ifelse.conditional_true_block
+        else:
+            self.cond_block = ifelse.conditional_false_block
+
+        if not isinstance(self.cond_block, ConditionalBlock):
+            raise TypeError("Unexpected situation")
+
+        self.cond_block = self.cond_block.block()
+
+    def __enter__(self):
+        self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS
+        self.cond_block.__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if not self.cond_block.__exit__(exc_type, exc_val, exc_tb):
+            # re-raise inside exception
+            return False
+        if len(self.ie.output_table[1 if self.is_true else 0]) == 0:
+            raise ValueError("Must set output inside block")
+        self.ie.status = IfElse.OUT_IF_ELSE_BLOCKS
+
+
+class IfElse(object):
+    OUT_IF_ELSE_BLOCKS = 0
+    IN_IF_ELSE_TRUE_BLOCKS = 1
+    IN_IF_ELSE_FALSE_BLOCKS = 2
+
+    def __init__(self, cond, name=None):
+        if not isinstance(cond, Variable):
+            raise TypeError("cond must be a Variable")
+        self.helper = LayerHelper('ifelse', name=name)
+        self.cond = cond
+        self.input_table = {}
+        self.status = IfElse.OUT_IF_ELSE_BLOCKS
+        self.conditional_true_block = ConditionalBlock(inputs=[self.cond])
+        self.conditional_false_block = ConditionalBlock(inputs=[self.cond])
+        self.output_table = ([], [])  # (true_outs, false_outs)
+
+    def input(self, x):
+        if self.status == IfElse.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("input must in true/false blocks")
+        if id(x) not in self.input_table:
+            parent_block = self.parent_block()
+            out_true = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+
+            out_false = parent_block.create_var(
+                name=unique_name('ifelse_input' + self.helper.name),
+                dtype=x.dtype)
+            parent_block.append_op(
+                type='split_lod_tensor',
+                inputs={
+                    'X': x,
+                    'Mask': self.cond,
+                },
+                outputs={'OutTrue': out_true,
+                         'OutFalse': out_false},
+                attrs={'level': 0})
+            self.input_table[id(x)] = (out_true, out_false)
+        else:
+            out_true, out_false = self.input_table[id(x)]
+
+        if self.status == IfElse.IN_IF_ELSE_TRUE_BLOCKS:
+            return out_true
+        else:
+            return out_false
+
+    def parent_block(self):
+        current_block = self.helper.main_program.current_block()
+        return self.helper.main_program.block(current_block.parent_idx)
+
+    def true_block(self):
+        return IfElseBlockGuard(True, self)
+
+    def false_block(self):
+        return IfElseBlockGuard(False, self)
+
+    def output(self, *outs):
+        if self.status == self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("output can only be invoked in the sub-block")
+
+        out_table = self.output_table[1 if self.status ==
+                                      self.IN_IF_ELSE_TRUE_BLOCKS else 0]
+        parent_block = self.parent_block()
+        for each_out in outs:
+            if not isinstance(each_out, Variable):
+                raise TypeError("Each output should be a variable")
+            # create outside tensor
+            outside_out = parent_block.create_var(
+                name=unique_name("_".join([self.helper.name, 'output'])),
+                dtype=each_out.dtype)
+            out_table.append(outside_out)
+
+            # assign local var to outside
+            assign(input=each_out, output=outside_out)
+
+    def __call__(self):
+        if self.status != self.OUT_IF_ELSE_BLOCKS:
+            raise ValueError("IfElse::__call__ must be out of sub-block")
+        false_len, true_len = map(len, self.output_table)
+        if false_len == 0 and true_len == 0:
+            raise ValueError("Must invoke true_block/false_block before "
+                             "__call__")
+        elif false_len != true_len and false_len != 0 and true_len != 0:
+            raise ValueError("The output side must be same")
+        elif false_len == 0 or true_len == 0:
+            return self.output_table[0 if false_len != 0 else 1]
+
+        # else none of false_len/true_len is zero
+        # merge together
+        rlist = []
+        for false_var, true_var in zip(*self.output_table):
+            rlist.append(
+                merge_lod_tensor(
+                    in_true=true_var,
+                    in_false=false_var,
+                    mask=self.cond,
+                    x=self.cond,
+                    level=0))
+        return rlist
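+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `x`, `label`, `limit` and `fc` are hypothetical): inputs are split by the
+# condition, each branch sets its output, and calling the IfElse object
+# merges the two branches.
+#
+#     cond = less_than(x=label, y=limit)
+#     ie = IfElse(cond)
+#     with ie.true_block():
+#         t = ie.input(x)
+#         ie.output(fc(input=t, size=10))
+#     with ie.false_block():
+#         f = ie.input(x)
+#         ie.output(fc(input=f, size=10))
+#     out = ie()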
+
+
+class DynamicRNN(object):
+    BEFORE_RNN = 0
+    IN_RNN = 1
+    AFTER_RNN = 2
+
+    def __init__(self, name=None):
+        self.helper = LayerHelper('dynamic_rnn', name=name)
+        self.status = DynamicRNN.BEFORE_RNN
+        self.lod_rank_table = None
+        self.max_seq_len = None
+        self.step_idx = None
+        self.zero_idx = fill_constant(
+            shape=[1], value=0, dtype='int64', force_cpu=True)
+        self.mem_dict = dict()
+        self.output_array = []
+        self.outputs = []
+        self.cond = self.helper.create_tmp_variable(dtype='bool')
+        self.cond.stop_gradient = False
+        self.while_op = While(self.cond)
+        self.input_array = []
+        self.mem_link = []
+
+    def step_input(self, x):
+        self._assert_in_rnn_block_("step_input")
+        if not isinstance(x, Variable):
+            raise TypeError(
+                "step_input() can only take a Variable as its input.")
+        parent_block = self._parent_block_()
+        if self.lod_rank_table is None:
+            self.lod_rank_table = parent_block.create_var(
+                name=unique_name('lod_rank_table'),
+                type=core.VarDesc.VarType.LOD_RANK_TABLE)
+            self.lod_rank_table.stop_gradient = True
+            parent_block.append_op(
+                type='lod_rank_table',
+                inputs={"X": x},
+                outputs={"Out": self.lod_rank_table})
+            self.max_seq_len = parent_block.create_var(
+                name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64')
+            self.max_seq_len.stop_gradient = False
+            parent_block.append_op(
+                type='max_sequence_len',
+                inputs={'RankTable': self.lod_rank_table},
+                outputs={"Out": self.max_seq_len})
+            self.cond.stop_gradient = True
+            parent_block.append_op(
+                type='less_than',
+                inputs={'X': self.step_idx,
+                        'Y': self.max_seq_len},
+                outputs={'Out': self.cond})
+
+        input_array = parent_block.create_var(
+            name=unique_name('dynamic_rnn_input_array'),
+            type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+            dtype=x.dtype)
+        self.input_array.append((input_array, x.dtype))
+        parent_block.append_op(
+            type='lod_tensor_to_array',
+            inputs={'X': x,
+                    'RankTable': self.lod_rank_table},
+            outputs={'Out': input_array})
+        return array_read(array=input_array, i=self.step_idx)
+
+    def static_input(self, x):
+        self._assert_in_rnn_block_("static_input")
+        if not isinstance(x, Variable):
+            raise TypeError(
+                "static_input() can only take a Variable as its input")
+        if self.lod_rank_table is None:
+            raise RuntimeError(
+                "static_input() must be called after step_input().")
+        parent_block = self._parent_block_()
+        x_reordered = parent_block.create_var(
+            name=unique_name("dynamic_rnn_static_input_reordered"),
+            type=core.VarDesc.VarType.LOD_TENSOR,
+            dtype=x.dtype)
+        parent_block.append_op(
+            type='reorder_lod_tensor_by_rank',
+            inputs={'X': [x],
+                    'RankTable': [self.lod_rank_table]},
+            outputs={'Out': [x_reordered]})
+        return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table)
+
+    @contextlib.contextmanager
+    def block(self):
+        if self.status != DynamicRNN.BEFORE_RNN:
+            raise ValueError("rnn.block() can only be invoke once")
+        self.step_idx = fill_constant(
+            shape=[1], dtype='int64', value=0, force_cpu=True)
+        self.step_idx.stop_gradient = False
+        self.status = DynamicRNN.IN_RNN
+        with self.while_op.block():
+            yield
+            increment(x=self.step_idx, value=1.0, in_place=True)
+
+            for new_mem, mem_array in self.mem_link:
+                array_write(x=new_mem, i=self.step_idx, array=mem_array)
+
+            less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond)
+
+        self.status = DynamicRNN.AFTER_RNN
+        for each_array in self.output_array:
+            self.outputs.append(
+                array_to_lod_tensor(
+                    x=each_array, table=self.lod_rank_table))
+
+    def __call__(self, *args, **kwargs):
+        if self.status != DynamicRNN.AFTER_RNN:
+            raise ValueError(("Output of the dynamic RNN can only be visited "
+                              "outside the rnn block."))
+        if len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def memory(self,
+               init=None,
+               shape=None,
+               value=0.0,
+               need_reorder=False,
+               dtype='float32'):
+        self._assert_in_rnn_block_('memory')
+        if init is not None:
+            if not isinstance(init, Variable):
+                raise TypeError(
+                    "The input arg `init` of memory() must be a Variable")
+            parent_block = self._parent_block_()
+            init_tensor = init
+            if need_reorder:
+                if self.lod_rank_table is None:
+                    raise ValueError(
+                        'If need_reorder is set to True, make sure step_input '
+                        'is invoked before '
+                        'memory(init=init, need_reorder=True, ...).')
+                init_reordered = parent_block.create_var(
+                    name=unique_name('dynamic_rnn_mem_init_reordered'),
+                    type=core.VarDesc.VarType.LOD_TENSOR,
+                    dtype=init.dtype)
+                parent_block.append_op(
+                    type='reorder_lod_tensor_by_rank',
+                    inputs={
+                        'X': [init_tensor],
+                        'RankTable': [self.lod_rank_table]
+                    },
+                    outputs={'Out': [init_reordered]})
+                init_tensor = init_reordered
+            mem_array = parent_block.create_var(
+                name=unique_name('dynamic_rnn_mem_array'),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=init.dtype)
+            parent_block.append_op(
+                type='write_to_array',
+                inputs={'X': init_tensor,
+                        'I': self.zero_idx},
+                outputs={'Out': mem_array})
+            retv = array_read(array=mem_array, i=self.step_idx)
+            retv = shrink_memory(
+                x=retv, i=self.step_idx, table=self.lod_rank_table)
+            self.mem_dict[retv.name] = mem_array
+            return retv
+        else:
+            if len(self.input_array) == 0:
+                raise ValueError(
+                    "step_input should be invoked before memory(shape=..., value=...)"
+                )
+            parent_block = self._parent_block_()
+            init = parent_block.create_var(
+                name=unique_name('mem_init'), dtype=dtype)
+            arr, dtype = self.input_array[0]
+            in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype)
+            parent_block.append_op(
+                type='read_from_array',
+                inputs={'X': [arr],
+                        'I': [self.zero_idx]},
+                outputs={'Out': [in0]})
+            parent_block.append_op(
+                type='fill_constant_batch_size_like',
+                inputs={'Input': [in0]},
+                outputs={'Out': [init]},
+                attrs={
+                    'shape': [-1] + shape,
+                    'value': float(value),
+                    'dtype': init.dtype
+                })
+            return self.memory(init=init)
+
+    def update_memory(self, ex_mem, new_mem):
+        self._assert_in_rnn_block_('update_memory')
+        if not isinstance(ex_mem, Variable):
+            raise TypeError("The input arg `ex_mem` of update_memory() must "
+                            "be a Variable")
+        if not isinstance(new_mem, Variable):
+            raise TypeError("The input arg `new_mem` of update_memory() must "
+                            "be a Variable")
+
+        mem_array = self.mem_dict.get(ex_mem.name, None)
+        if mem_array is None:
+            raise ValueError("Please invoke memory before update_memory")
+        if self.lod_rank_table is None:
+            raise ValueError("Please invoke step_input before update_memory")
+
+        self.mem_link.append((new_mem, mem_array))
+
+    def output(self, *outputs):
+        self._assert_in_rnn_block_('output')
+        parent_block = self._parent_block_()
+        for each in outputs:
+            outside_array = parent_block.create_var(
+                name=unique_name("_".join(
+                    [self.helper.name, "output_array", each.name])),
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype=each.dtype)
+            array_write(x=each, i=self.step_idx, array=outside_array)
+            self.output_array.append(outside_array)
+
+    def _parent_block_(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+
+        return parent_block
+
+    def _assert_in_rnn_block_(self, method):
+        if self.status != DynamicRNN.IN_RNN:
+            raise ValueError("{0} can only be invoked inside rnn block.".format(
+                method))
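+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `sentence`, `boot_state` and `fc` are hypothetical): each iteration consumes
+# one time step of every sequence that is still active; variable-length
+# sequences are handled through the internal lod rank table.
+#
+#     drnn = DynamicRNN()
+#     with drnn.block():
+#         word = drnn.step_input(sentence)
+#         prev = drnn.memory(init=boot_state, need_reorder=True)
+#         hidden = fc(input=[word, prev], size=200)
+#         drnn.update_memory(prev, hidden)
+#         drnn.output(hidden)
+#     out = drnn()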
+
+
+@autodoc()
+def reorder_lod_tensor_by_rank(x, rank_table):
+    helper = LayerHelper('reorder_lod_tensor_by_rank', **locals())
+    helper.is_instance('x', Variable)
+    helper.is_instance('rank_table', Variable)
+
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='reorder_lod_tensor_by_rank',
+        inputs={'X': [x],
+                'RankTable': [rank_table]},
+        outputs={'Out': [out]})
+    return out
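+
+
+# A minimal usage sketch (an illustration only, not part of this module;
+# `x` and `y` are hypothetical LoD tensors): reorder the sequences of `x` to
+# match the descending-length order recorded in a rank table built from `y`.
+#
+#     table = lod_rank_table(y, level=0)
+#     x_reordered = reorder_lod_tensor_by_rank(x, table)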
diff --git a/python/paddle/v2/fluid/layers/device.py b/python/paddle/v2/fluid/layers/device.py
new file mode 100644
index 0000000000..107511b5f4
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/device.py
@@ -0,0 +1,38 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+All util layers.
+"""
+
+from layer_function_generator import autodoc
+from ..framework import unique_name
+from ..layer_helper import LayerHelper
+
+__all__ = ['get_places']
+
+
+@autodoc()
+def get_places(device_count=None, device_type=None):
+    helper = LayerHelper('get_places', **locals())
+    out_places = helper.create_variable(name=unique_name(helper.name + ".out"))
+    attrs = dict()
+    if device_count is not None:
+        attrs['device_count'] = int(device_count)
+    if device_type is not None:
+        attrs['device_type'] = str(device_type)
+
+    helper.append_op(
+        type='get_places', outputs={"Out": [out_places]}, attrs=attrs)
+
+    return out_places
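+
+
+# A minimal usage sketch (an illustration only, not part of this module; the
+# device type string is an assumption): request a fixed number of places,
+# typically to feed into ParallelDo.
+#
+#     places = get_places(device_count=4, device_type='CPU')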
diff --git a/python/paddle/v2/fluid/layers/io.py b/python/paddle/v2/fluid/layers/io.py
new file mode 100644
index 0000000000..b7b2cf2296
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/io.py
@@ -0,0 +1,198 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .. import core
+from ..layer_helper import LayerHelper
+from control_flow import BlockGuard
+
+__all__ = ['data', 'BlockGuardServ', 'ListenAndServ', 'Send']
+
+
+def data(name,
+         shape,
+         append_batch_size=True,
+         dtype='float32',
+         lod_level=0,
+         type=core.VarDesc.VarType.LOD_TENSOR,
+         stop_gradient=True):
+    """
+    **Data Layer**
+
+    This function creates a global variable to feed input data into the
+    program, optionally prepending a batch-size dimension. The global
+    variable can be accessed by all the following operators in the graph.
+
+    All the input variables of this function are passed in as local variables
+    to the LayerHelper constructor.
+
+    Args:
+       name(str): The name/alias of the variable.
+       shape(list): Tuple declaring the shape.
+       append_batch_size(bool): Whether to prepend a batch-size dimension (-1)
+                                to the shape.
+       dtype(str): The data type, e.g. float32, float16, int64.
+       type(VarType): The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
+       stop_gradient(bool): Whether gradients should stop flowing at this
+                            variable.
+
+    Returns:
+        Variable: The global variable that gives access to the data.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name='x', shape=[784], dtype='float32')
+    """
+    helper = LayerHelper('data', **locals())
+    shape = list(shape)
+    for i in xrange(len(shape)):
+        if shape[i] is None:
+            shape[i] = -1
+            append_batch_size = False
+        elif shape[i] < 0:
+            append_batch_size = False
+
+    if append_batch_size:
+        shape = [-1] + shape  # append batch size as -1
+
+    return helper.create_global_variable(
+        name=name,
+        shape=shape,
+        dtype=dtype,
+        type=type,
+        stop_gradient=stop_gradient,
+        lod_level=lod_level)
+
+
+class BlockGuardServ(BlockGuard):
+    """
+    BlockGuardServ class.
+
+    BlockGuardServ is used to create an op with a block in a program.
+    """
+
+    def __init__(self, server):
+        if not (isinstance(server, ListenAndServ)):
+            raise TypeError("BlockGuardServ takes a ListenAndServ")
+        super(BlockGuardServ, self).__init__(server.helper.main_program)
+        self.server = server
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if exc_type is not None:
+            return False
+
+        self.server.complete_op()
+        return super(BlockGuardServ, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class ListenAndServ(object):
+    """
+    ListenAndServ class.
+
+    ListenAndServ is used to wrap the listen_and_serv op to create a server
+    which can receive variables from clients and run a block.
+    """
+
+    def __init__(self, endpoint, fan_in=1, optimizer_mode=True):
+        self.helper = LayerHelper("recv")
+        self.inputs = []
+        self.outputs = []
+        self.endpoint = endpoint
+        self.fan_in = fan_in
+        # FIXME(typhoonzero): add optimizer_mode is stupid, should make it more
+        # general.
+        self.optimizer_mode = optimizer_mode
+
+    def do(self):
+        return BlockGuardServ(self)
+
+    def get_params_and_grads(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+        # params and grads in the same order.
+        params = list()
+        grads = list()
+        for op in current_block.ops:
+            # FIXME(typhoonzero): op.inputs is None if it's cloned.
+            if self.optimizer_mode:
+                if "Grad" in op.inputs and "Param" in op.inputs:
+                    # append the variables (not their names) so that
+                    # complete_op can uniformly read `.name` later.
+                    params.append(op.inputs["Param"])
+                    grads.append(op.inputs["Grad"])
+            else:
+                # simple recv mode, recv operators inputs.
+                for iname in op.input_names:
+                    for in_var_name in op.input(iname):
+                        params.append(parent_block.var(in_var_name))
+                        grads.append(parent_block.var(in_var_name))
+
+        return params, grads
+
+    def parent_block(self):
+        prog = self.helper.main_program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def complete_op(self):
+        main_program = self.helper.main_program
+        current_block = main_program.current_block()
+        parent_block = self.parent_block()
+
+        params, grads = self.get_params_and_grads()
+        param_names = [p.name for p in params]
+        grad_names = [g.name for g in grads]
+        parent_block.append_op(
+            type='recv',
+            inputs={},
+            outputs={},
+            attrs={
+                'endpoint': self.endpoint,
+                'Fanin': self.fan_in,
+                'ParamList': param_names,
+                'GradList': grad_names,
+                'OptimizeBlock': current_block
+            })
+
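+# A minimal usage sketch for ListenAndServ (hedged; the endpoint and the
+# block-building layer are illustrative):
+#
+#   serv = ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
+#   with serv.do():
+#       out = some_layer(...)  # ops recorded here form the served block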
+
+def Send(endpoints, send_vars, get_vars):
+    """
+    Send layer
+
+    Sends variables to the server side, and fetches variables back from
+    the server side once it has finished running its program.
+
+    Args:
+        endpoints: comma-separated IP:PORT pairs, in the order of the
+                   send_vars to send
+        send_vars: variables to send
+        get_vars: variables to fetch from the server after the send completes.
+    """
+    assert isinstance(send_vars, list)
+    assert isinstance(get_vars, list)
+
+    epmap = endpoints.split(",")
+    endpoints = list(set(epmap))
+
+    helper = LayerHelper("Send", **locals())
+    helper.append_op(
+        type="send",
+        inputs={"X": send_vars},
+        outputs={"Out": get_vars},
+        attrs={"endpoints": endpoints,
+               "epmap": epmap})
diff --git a/python/paddle/v2/fluid/layers/layer_function_generator.py b/python/paddle/v2/fluid/layers/layer_function_generator.py
new file mode 100644
index 0000000000..b0e4d1635f
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/layer_function_generator.py
@@ -0,0 +1,218 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import re
+import cStringIO
+import functools
+import warnings
+
+from .. import proto
+
+framework_pb2 = proto.framework_pb2
+
+from ..framework import OpProtoHolder, Variable
+from ..layer_helper import LayerHelper
+
+__all__ = [
+    'deprecated',
+    'generate_layer_fn',
+    'autodoc',
+]
+
+
+def _convert_(name):
+    """
+    Convert a CamelCase name to the snake_case format, e.g.
+    "BatchNorm" becomes "batch_norm".
+
+    Args:
+       name: The name/alias to convert.
+
+    """
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()
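+
+
+# For example (a hedged sketch of the conversion rule above):
+#
+#   _convert_("BatchNorm")    # -> "batch_norm"
+#   _convert_("L2Normalize")  # -> "l2_normalize"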
+
+
+def _generate_doc_string_(op_proto):
+    """
+    Generate docstring by OpProto
+
+    Args:
+        op_proto (framework_pb2.OpProto): a protobuf message typed OpProto
+
+    Returns:
+        str: the document string
+    """
+
+    def _type_to_str_(tp):
+        return framework_pb2.AttrType.Name(tp)
+
+    if not isinstance(op_proto, framework_pb2.OpProto):
+        raise TypeError("OpProto should be `framework_pb2.OpProto`")
+
+    buf = cStringIO.StringIO()
+    buf.write(op_proto.comment)
+    buf.write('\nArgs:\n')
+    for each_input in op_proto.inputs:
+        line_begin = '    {0}: '.format(_convert_(each_input.name))
+        buf.write(line_begin)
+        buf.write(each_input.comment)
+        buf.write('\n')
+        buf.write(' ' * len(line_begin))
+        buf.write('Duplicable: ')
+        buf.write(str(each_input.duplicable))
+        buf.write('  Optional: ')
+        buf.write(str(each_input.dispensable))
+        buf.write('\n')
+
+    for each_attr in op_proto.attrs:
+        buf.write('    ')
+        buf.write(each_attr.name)
+        buf.write(' (')
+        buf.write(_type_to_str_(each_attr.type))
+        buf.write('): ')
+        buf.write(each_attr.comment)
+        buf.write('\n')
+
+    if len(op_proto.outputs) != 0:
+        buf.write('\nReturns:\n')
+        buf.write('    ')
+        for each_opt in op_proto.outputs:
+            if not each_opt.intermediate:
+                break
+        buf.write(each_opt.comment)
+
+    return buf.getvalue()
+
+
+def generate_layer_fn(op_type):
+    """Register the Python layer for an Operator.
+
+    Args:
+       op_type: The name of the operator to be created.
+
+    This function takes in an operator type (e.g. sigmoid, mean, average) and
+    creates a Python layer function with the same functionality.
+
+    """
+    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
+    not_intermediate_outputs = \
+        filter(lambda output: not output.intermediate, op_proto.outputs)
+    intermediate_outputs = \
+        filter(lambda output: output.intermediate, op_proto.outputs)
+
+    if len(not_intermediate_outputs) != 1:
+        raise ValueError("Only one non intermediate output operator can be",
+                         "automatically generated.")
+
+    if not_intermediate_outputs[0].duplicable:
+        raise ValueError(
+            "Only non duplicable op can be automatically generated.")
+
+    for output in intermediate_outputs:
+        if output.duplicable:
+            raise ValueError("The op can be automatically generated only when ",
+                             "all intermediate ops are not duplicable.")
+
+    o_name = not_intermediate_outputs[0].name
+    intermediate_output_names = [output.name for output in intermediate_outputs]
+
+    def infer_and_check_dtype(op_proto, **kwargs):
+        """
+        This function performs the sanity check for dtype and
+        instance type.
+        """
+        dtype = None
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            for each in val:
+                if not isinstance(each, Variable):
+                    raise ValueError("input of {0} must be variable".format(
+                        op_type))
+
+                if dtype is None:
+                    dtype = each.dtype
+                elif dtype != each.dtype:
+                    raise ValueError(
+                        "operator {0} must input same dtype. {1} vs {2}".format(
+                            op_type, dtype, each.dtype))
+
+        return dtype
+
+    def func(**kwargs):
+        helper = LayerHelper(op_type, **kwargs)
+
+        dtype = infer_and_check_dtype(op_proto, **kwargs)
+
+        inputs = dict()
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            inputs[ipt.name] = val
+
+        outputs = dict()
+        out = kwargs.pop(_convert_(o_name), [])
+        if out:
+            out_var = out[0] if (isinstance(out, list) or
+                                 isinstance(out, tuple)) else out
+        else:
+            out_var = helper.create_tmp_variable(dtype=dtype)
+        outputs[o_name] = [out_var]
+        for name in intermediate_output_names:
+            outputs[name] = [helper.create_tmp_variable(dtype=dtype)]
+        helper.append_op(
+            type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs)
+        return helper.append_activation(out_var)
+
+    func.__name__ = op_type
+    func.__doc__ = _generate_doc_string_(op_proto)
+    return func
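+
+
+# A minimal usage sketch (hedged; assumes an op named 'mean' with a single
+# non-intermediate output is registered in OpProtoHolder):
+#
+#   mean = generate_layer_fn('mean')
+#   out = mean(x=some_var)  # appends a 'mean' op and returns its output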
+
+
+def deprecated(func):
+    """
+    Deprecation warning decorator. It emits a warning message when the
+    decorated callable is invoked. Apply it to classes, functions, or
+    member functions.
+    """
+
+    @functools.wraps(func)
+    def func_wrapper(*args, **kwargs):
+        """
+        Wrap func with deprecated warning
+        """
+        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
+        warnings.warn(
+            "Call to deprecated function {}.".format(func.__name__),
+            category=DeprecationWarning,
+            stacklevel=2)
+        warnings.simplefilter('default', DeprecationWarning)  # reset filter
+        return func(*args, **kwargs)
+
+    return func_wrapper
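+
+
+# A minimal usage sketch (hedged; `old_layer` is illustrative):
+#
+#   @deprecated
+#   def old_layer(x):
+#       ...
+#
+#   old_layer(v)  # emits a DeprecationWarning, then runs old_layer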
+
+
+def autodoc(comment=""):
+    def __impl__(func):
+        func.__doc__ = _generate_doc_string_(OpProtoHolder.instance(
+        ).get_op_proto(func.__name__)) + comment
+        return func
+
+    return __impl__
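+
+
+# A minimal usage sketch (hedged): the decorated function must share its name
+# with a registered operator, whose proto supplies the generated docstring.
+#
+#   @autodoc(comment="Additional notes appended to the generated doc.")
+#   def elementwise_add(x, y):
+#       ...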
diff --git a/python/paddle/v2/fluid/layers/math_op_patch.py b/python/paddle/v2/fluid/layers/math_op_patch.py
new file mode 100644
index 0000000000..79a130a3eb
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/math_op_patch.py
@@ -0,0 +1,154 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..framework import Variable, unique_name, OpProtoHolder
+
+__all__ = ['monkey_patch_variable']
+
+
+def monkey_patch_variable():
+    def unique_tmp_name():
+        return unique_name("tmp")
+
+    def safe_get_dtype(var):
+        try:
+            dtype = var.dtype
+        except Exception:
+            raise ValueError("Cannot get data type from %s" % var.name)
+        return dtype
+
+    def create_tensor(block, value, dtype, shape):
+        value = float(value)
+        tmp_name = unique_tmp_name()
+        var = block.create_var(name=tmp_name, shape=shape, dtype=dtype)
+        block.append_op(
+            type="fill_constant",
+            outputs={'Out': [var]},
+            attrs={'dtype': var.dtype,
+                   'shape': shape,
+                   'value': value})
+        return var
+
+    def create_scalar(block, value, dtype):
+        return create_tensor(block, value, dtype, shape=[1])
+
+    def create_tensor_with_batchsize(ref_var, value, dtype):
+        assert isinstance(ref_var, Variable)
+        value = float(value)
+        tmp_name = unique_tmp_name()
+        var = ref_var.block.create_var(name=tmp_name, dtype=dtype)
+        ref_var.block.append_op(
+            type='fill_constant_batch_size_like',
+            outputs={'Out': [var]},
+            inputs={'Input': [ref_var]},
+            attrs={'shape': ref_var.shape,
+                   'value': value})
+        return var
+
+    def astype(self, dtype):
+        """
+        Cast a variable to a specified data type.
+        NOTE: The variable must be a Tensor
+        Args:
+            self(Variable): The source variable
+            dtype: The target dtype
+
+        Returns:
+            Variable with new dtype
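+
+        Examples:
+            .. code-block:: python
+
+                # a minimal sketch (hedged): cast a variable to int64
+                casted = var.astype('int64')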
+        """
+        tmp_name = unique_tmp_name()
+        out = self.block.create_var(name=tmp_name, dtype=dtype)
+        self.block.append_op(
+            type="cast",
+            inputs={"X": [self]},
+            outputs={"Out": [out]},
+            attrs={"in_dtype": self.dtype,
+                   "out_dtype": out.dtype})
+        return out
+
+    def _elemwise_method_creator_(method_name, op_type, reverse=False):
+        def __impl__(self, other_var):
+            lhs_dtype = safe_get_dtype(self)
+
+            if not isinstance(other_var, Variable):
+                if reverse:
+                    has_batch_size = False
+                    for elem in self.shape:
+                        if elem < 0:
+                            has_batch_size = True
+                            break
+                    if not has_batch_size:
+                        other_var = create_tensor(
+                            self.block,
+                            other_var,
+                            dtype=lhs_dtype,
+                            shape=self.shape)
+                    else:
+                        other_var = create_tensor_with_batchsize(
+                            self, other_var, lhs_dtype)
+                else:
+                    # add fill_op to self.block
+                    other_var = create_scalar(
+                        self.block, value=other_var, dtype=lhs_dtype)
+
+            rhs_dtype = safe_get_dtype(other_var)
+            if lhs_dtype != rhs_dtype:
+                other_var = astype(other_var, lhs_dtype)
+            if reverse:
+                tmp = self
+                self = other_var
+                other_var = tmp
+
+            tmp_name = unique_tmp_name()
+            out = self.block.create_var(name=tmp_name, dtype=lhs_dtype)
+            self.block.append_op(
+                type=op_type,
+                inputs={'X': [self],
+                        'Y': [other_var]},
+                outputs={'Out': out})
+            return out
+
+        comment = OpProtoHolder.instance().get_op_proto(op_type).comment
+
+        __impl__.__doc__ = """
+        {0}
+        Args:
+            self(Variable): left hand variable
+            other_var(Variable|float|int): right hand variable 
+
+        Returns:
+            Variable
+        """.format(comment)
+        __impl__.__name__ = method_name
+        return __impl__
+
+    # inject methods
+    for method_name, op_type, reverse in (
+        ("__add__", "elementwise_add", False),
+            # a+b == b+a. Do not need to reverse explicitly
+        ("__radd__", "elementwise_add", False),
+        ("__sub__", "elementwise_sub", False),
+        ("__rsub__", "elementwise_sub", True),
+        ("__mul__", "elementwise_mul", False),
+            # a*b == b*a. Do not need to reverse explicitly
+        ("__rmul__", "elementwise_mul", False),
+        ("__div__", "elementwise_div", False),
+        ("__rdiv__", "elementwise_div", True),
+        ("__pow__", "elementwise_pow", False),
+        ("__rpow__", "elementwise_pow", True)):
+        setattr(Variable, method_name,
+                _elemwise_method_creator_(method_name, op_type, reverse))
+
+    Variable.astype = astype
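+
+
+# A minimal usage sketch (hedged): once monkey_patch_variable() has been
+# called, arithmetic between Variables and Python scalars builds elementwise
+# ops in the current block.
+#
+#   monkey_patch_variable()
+#   y = x * 2.0 + 1.0  # appends elementwise_mul and elementwise_add ops
+#   z = 1.0 - x        # __rsub__ fills a constant tensor, then subtracts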
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
new file mode 100644
index 0000000000..c38e21087d
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -0,0 +1,2984 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+All layers just related to the neural network.
+"""
+
+from ..layer_helper import LayerHelper
+from ..initializer import Normal, Constant
+from ..framework import Variable
+from ..param_attr import ParamAttr
+from layer_function_generator import autodoc
+from tensor import concat
+
+__all__ = [
+    'fc',
+    'embedding',
+    'dynamic_lstm',
+    'dynamic_lstmp',
+    'dynamic_gru',
+    'gru_unit',
+    'linear_chain_crf',
+    'crf_decoding',
+    'cos_sim',
+    'cross_entropy',
+    'square_error_cost',
+    'accuracy',
+    'chunk_eval',
+    'sequence_conv',
+    'conv2d',
+    'sequence_pool',
+    'pool2d',
+    'batch_norm',
+    'beam_search_decode',
+    'conv2d_transpose',
+    'sequence_expand',
+    'lstm_unit',
+    'reduce_sum',
+    'reduce_mean',
+    'reduce_max',
+    'reduce_min',
+    'sequence_first_step',
+    'sequence_last_step',
+    'dropout',
+    'split',
+    'ctc_greedy_decoder',
+    'edit_distance',
+    'l2_normalize',
+    'matmul',
+    'warpctc',
+    'sequence_reshape',
+    'transpose',
+    'im2sequence',
+    'nce',
+    'beam_search',
+    'row_conv',
+    'multiplex',
+]
+
+
+def fc(input,
+       size,
+       num_flatten_dims=1,
+       param_attr=None,
+       bias_attr=None,
+       act=None,
+       name=None):
+    """
+    **Fully Connected Layer**
+
+    The fully connected layer can take multiple tensors as its inputs. It
+    creates a variable (one for each input tensor) called weights for each
+    input tensor, which represents a fully connected weight matrix from
+    each input unit to each output unit. The fully connected layer
+    multiplies each input tensor with its corresponding weight to produce
+    an output Tensor. If multiple input tensors are given, the results of
+    multiple multiplications will be summed up. If bias_attr is not None,
+    a biases variable will be created and added to the output. Finally,
+    if activation is not None, it will be applied to the output as well.
+
+    This process can be formulated as follows:
+
+    .. math::
+
+        Out = Act({\sum_{i=0}^{N-1}W_iX_i + b})
+
+    In the above equation:
+
+    * :math:`N`: Number of the input.
+    * :math:`X_i`: The input tensor.
+    * :math:`W`: The weights created by this layer.
+    * :math:`b`: The bias parameter created by this layer (if needed).
+    * :math:`Act`: The activation function.
+    * :math:`Out`: The output tensor.
+
+    Args:
+       input(Variable|list): The input tensor(s) to the fully connected layer.
+       size(int): The number of output units in the fully connected layer.
+       num_flatten_dims(int): The fc layer can accept an input tensor with more
+                              than two dimensions. If this happens, the
+                              multidimensional tensor will first be flattened
+                              into a 2-dimensional matrix. The parameter
+                              `num_flatten_dims` determines how the input tensor
+                              is flattened: the first `num_flatten_dims`
+                              (inclusive, index starts from 1) dimensions will
+                              be flatten to form the first dimension of the
+                              final matrix (height of the matrix), and the rest
+                              `rank(X) - num_flatten_dims` dimensions are
+                              flattened to form the second dimension of the
+                              final matrix (width of the matrix). For example,
+                              suppose `X` is a 5-dimensional tensor with a shape
+                              [2, 3, 4, 5, 6], and `num_flatten_dims` = 3. Then,
+                              the flattened matrix will have a shape
+                              [2 x 3 x 4, 5 x 6] = [24, 30]. By default,
+                              `num_flatten_dims` is set to 1.
+       param_attr(ParamAttr|list): The parameter attribute for learnable
+                                   parameters/weights of the fully connected
+                                   layer.
+       bias_attr(ParamAttr|list): The parameter attribute for the bias parameter
+                                  for this layer. If set None, no bias will be
+                                  added to the output units.
+       act(str): Activation to be applied to the output of the fully connected
+                 layer.
+       name(str): Name/alias of the fully connected layer.
+
+
+    Returns:
+        Variable: The output tensor variable.
+
+    Raises:
+        ValueError: If rank of the input tensor is less than 2.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          fc = fluid.layers.fc(input=data, size=1000, act="tanh")
+    """
+
+    helper = LayerHelper("fc", **locals())
+
+    dtype = helper.input_dtype()
+
+    mul_results = []
+    for input_var, param_attr in helper.iter_inputs_and_params():
+        input_shape = input_var.shape
+        param_shape = [
+            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
+        ] + [size]
+
+        w = helper.create_parameter(
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
+        tmp = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="mul",
+            inputs={"X": input_var,
+                    "Y": w},
+            outputs={"Out": tmp},
+            attrs={"x_num_col_dims": num_flatten_dims,
+                   "y_num_col_dims": 1})
+        mul_results.append(tmp)
+
+    # sum
+    if len(mul_results) == 1:
+        pre_bias = mul_results[0]
+    else:
+        pre_bias = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+    # add bias
+    pre_activation = helper.append_bias_op(pre_bias)
+    # add activation
+    return helper.append_activation(pre_activation)
+
+
+def embedding(input,
+              size,
+              is_sparse=False,
+              padding_idx=None,
+              param_attr=None,
+              dtype='float32'):
+    """
+    **Embedding Layer**
+
+    This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in
+    a lookup table. The result of this lookup is the embedding of each ID in the
+    :attr:`input`.
+
+    All the input variables are passed in as local variables to the LayerHelper
+    constructor.
+
+    Args:
+        input(Variable): The tensor variable containing the IDs.
+        size(tuple|list): The shape of the lookup table parameter. It should
+            have two elements which indicate the size of the dictionary of
+            embeddings and the size of each embedding vector respectively.
+        is_sparse(bool): The flag indicating whether to use sparse update.
+        padding_idx(int|long|None): If :attr:`None`, it has no effect on the
+            lookup. Otherwise the given :attr:`padding_idx` indicates padding
+            the output with zeros whenever lookup encounters it in
+            :attr:`input`. If :math:`padding_idx < 0`, the padding_idx used in
+            the lookup is :math:`size[0] + padding_idx`; e.g. with
+            size[0] = 1000 and padding_idx = -1, index 999 is padded.
+        param_attr(ParamAttr): Parameters for this layer.
+        dtype(np.dtype|core.DataType|str): The data type, e.g. float32,
+            float16, int64.
+
+    Returns:
+        Variable: The tensor variable storing the embeddings of the \
+                  supplied inputs.
+
+    Examples:
+        .. code-block:: python
+
+          dict_size = len(dataset.ids)
+          data = fluid.layers.data(name='ids', shape=[32, 32], dtype='int64')
+          emb = fluid.layers.embedding(input=data, size=[dict_size, 16])
+    """
+
+    helper = LayerHelper('embedding', **locals())
+    w = helper.create_parameter(
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
+    tmp = helper.create_tmp_variable(dtype)
+    padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else (
+        size[0] + padding_idx)
+    helper.append_op(
+        type='lookup_table',
+        inputs={'Ids': input,
+                'W': w},
+        outputs={'Out': tmp},
+        attrs={'is_sparse': is_sparse,
+               'padding_idx': padding_idx})
+    return tmp
+
+
+# TODO(qijun): expose H0 and C0
+def dynamic_lstm(input,
+                 size,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 name=None):
+    """
+    **Dynamic LSTM Layer**
+
+    The default implementation is diagonal/peephole connection
+    (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows:
+
+    .. math::
+
+        i_t & = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
+
+        f_t & = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
+
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+
+        o_t & = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
+
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+        h_t & = o_t \odot act_h(c_t)
+
+    where the :math:`W` terms denote weight matrices (e.g. :math:`W_{xi}` is
+    the matrix of weights from the input gate to the input), :math:`W_{ic}, \
+    W_{fc}, W_{oc}` are diagonal weight matrices for peephole connections. In
+    our implementation, we use vectors to represent these diagonal weight
+    matrices. The :math:`b` terms denote bias vectors (:math:`b_i` is the input
+    gate bias vector), :math:`\sigma` is the non-linear activations, such as
+    logistic sigmoid function, and :math:`i, f, o` and :math:`c` are the input
+    gate, forget gate, output gate, and cell activation vectors, respectively,
+    all of which have the same size as the cell output activation vector :math:`h`.
+
+    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
+    and :math:`act_h` are the cell input and cell output activation functions
+    and `tanh` is usually used for them. :math:`\\tilde{c_t}` is also called
+    candidate hidden state, which is computed based on the current input and
+    the previous hidden state.
+
+    Set `use_peepholes` to `False` to disable peephole connection. The formula
+    is omitted here, please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+
+    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use a fully-connected layer before the LSTM layer.
+
+    Args:
+        input(Variable): The input of dynamic_lstm layer, which supports
+                         variable-time length input sequence. The underlying
+                         tensor in this Variable is a matrix with shape
+                         (T X 4D), where T is the total time steps in this
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weights.
+
+                               - Weights = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+
+                              1. `use_peepholes = False`
+                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                - The shape is (1 x 4D).
+                              2. `use_peepholes = True`
+                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+                                - The shape is (1 x 7D).
+        use_peepholes(bool): Whether to enable diagonal/peephole connections,
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and
+                              output gate. Choices = ["sigmoid", "tanh", "relu",
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        tuple: The hidden state and the cell state of LSTM. The shape of \
+        both is (T x D), and their LoD is the same as that of the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                           act=None, bias_attr=None)
+            forward, _ = fluid.layers.dynamic_lstm(
+                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+    """
+
+    helper = LayerHelper('lstm', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    hidden = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm',
+        inputs={'Input': input,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
+
+
+def dynamic_lstmp(input,
+                  size,
+                  proj_size,
+                  param_attr=None,
+                  bias_attr=None,
+                  use_peepholes=True,
+                  is_reverse=False,
+                  gate_activation='sigmoid',
+                  cell_activation='tanh',
+                  candidate_activation='tanh',
+                  proj_activation='tanh',
+                  dtype='float32',
+                  name=None):
+    """
+    **Dynamic LSTMP Layer**
+
+    LSTMP (LSTM with recurrent projection) layer has a separate projection
+    layer after the LSTM layer, projecting the original hidden state to a
+    lower-dimensional one. It is proposed to reduce the total number of
+    parameters and, furthermore, the computational complexity of the LSTM,
+    especially when the size of the output units is relatively large
+    (https://research.google.com/pubs/archive/43905.pdf).
+
+    The formula is as follows:
+
+    .. math::
+
+        i_t & = \sigma(W_{ix}x_{t} + W_{ir}r_{t-1} + W_{ic}c_{t-1} + b_i)
+
+        f_t & = \sigma(W_{fx}x_{t} + W_{fr}r_{t-1} + W_{fc}c_{t-1} + b_f)
+
+        \\tilde{c_t} & = act_g(W_{cx}x_t + W_{cr}r_{t-1} + b_c)
+
+        o_t & = \sigma(W_{ox}x_{t} + W_{or}r_{t-1} + W_{oc}c_t + b_o)
+
+        c_t & = f_t \odot c_{t-1} + i_t \odot \\tilde{c_t}
+
+        h_t & = o_t \odot act_h(c_t)
+
+        r_t & = \overline{act_h}(W_{rh}h_t)
+
+    In the above formula:
+
+    * :math:`W`: Denotes weight matrices (e.g. :math:`W_{xi}` is \
+          the matrix of weights from the input gate to the input).
+    * :math:`W_{ic}`, :math:`W_{fc}`, :math:`W_{oc}`: Diagonal weight \
+          matrices for peephole connections. In our implementation, \
+          we use vectors to represent these diagonal weight matrices.
+    * :math:`b`: Denotes bias vectors (e.g. :math:`b_i` is the input gate \
+          bias vector). 
+    * :math:`\sigma`: The activation, such as logistic sigmoid function.
+    * :math:`i, f, o` and :math:`c`: The input gate, forget gate, output \
+          gate, and cell activation vectors, respectively, all of which have \
+          the same size as the cell output activation vector :math:`h`. 
+    * :math:`h`: The hidden state.
+    * :math:`r`: The recurrent projection of the hidden state. 
+    * :math:`\\tilde{c_t}`: The candidate hidden state, whose \
+          computation is based on the current input and previous hidden state.
+    * :math:`\odot`: The element-wise product of the vectors. 
+    * :math:`act_g` and :math:`act_h`: The cell input and cell output \
+          activation functions and `tanh` is usually used for them. 
+    * :math:`\overline{act_h}`: The activation function for the projection \
+          output, usually using `identity` or same as :math:`act_h`.
+
+    Set `use_peepholes` to `False` to disable peephole connection. The formula
+    is omitted here, please refer to the paper
+    http://www.bioinf.jku.at/publications/older/2604.pdf for details.
+    
+    Note that these :math:`W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}`
+    operations on the input :math:`x_{t}` are NOT included in this operator.
+    Users can choose to use a fully-connected layer before the LSTMP layer.
+
+    Args:
+        input(Variable): The input of dynamic_lstmp layer, which supports
+                         variable-time length input sequence. The underlying
+                         tensor in this Variable is a matrix with shape
+                         (T X 4D), where T is the total time steps in this
+                         mini-batch, D is the hidden size.
+        size(int): 4 * hidden size.
+        proj_size(int): The size of projection output.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weight and projection weight.
+
+                               - Hidden-hidden weight = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}.
+                               - The shape of hidden-hidden weight is (P x 4D), 
+                                 where P is the projection size and D the hidden 
+                                 size.
+                               - Projection weight = {:math:`W_{rh}`}.
+                               - The shape of projection weight is (D x P).
+        bias_attr(ParamAttr|None): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+
+                              1. `use_peepholes = False`
+                                - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                - The shape is (1 x 4D).
+                              2. `use_peepholes = True`
+                                - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+                                - The shape is (1 x 7D).
+        use_peepholes(bool): Whether to enable diagonal/peephole connections,
+                             default `True`.
+        is_reverse(bool): Whether to compute reversed LSTM, default `False`.
+        gate_activation(str): The activation for input gate, forget gate and
+                              output gate. Choices = ["sigmoid", "tanh", "relu",
+                              "identity"], default "sigmoid".
+        cell_activation(str): The activation for cell output. Choices = ["sigmoid",
+                              "tanh", "relu", "identity"], default "tanh".
+        candidate_activation(str): The activation for candidate hidden state.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        proj_activation(str): The activation for projection output.
+                              Choices = ["sigmoid", "tanh", "relu", "identity"],
+                              default "tanh".
+        dtype(str): Data type. Choices = ["float32", "float64"], default "float32".
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+
+    Returns:
+        tuple: The projection of the hidden state, and the cell state of \
+               LSTMP. The shape of the projection is (T x P), that of the \
+               cell state is (T x D), and the LoD of both is the same as \
+               that of the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim, proj_dim = 512, 256
+            fc_out = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                     act=None, bias_attr=None)
+            proj_out, _ = fluid.layers.dynamic_lstmp(input=fc_out, 
+                                                     size=hidden_dim * 4, 
+                                                     proj_size=proj_dim, 
+                                                     use_peepholes=False,
+                                                     is_reverse=True,
+                                                     cell_activation="tanh",
+                                                     proj_activation="tanh")
+    """
+
+    helper = LayerHelper('lstmp', **locals())
+    size = size / 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[proj_size, 4 * size], dtype=dtype)
+    proj_weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, proj_size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    projection = helper.create_tmp_variable(dtype)
+    cell = helper.create_tmp_variable(dtype)
+    ordered_proj0 = helper.create_tmp_variable(dtype)
+    batch_hidden = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_cell_pre_act = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstmp',
+        inputs={
+            'Input': input,
+            'Weight': weight,
+            'ProjWeight': proj_weight,
+            'Bias': bias
+        },
+        outputs={
+            'Projection': projection,
+            'Cell': cell,
+            'OrderedP0': ordered_proj0,
+            'BatchHidden': batch_hidden,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation,
+            'proj_activation': proj_activation
+        })
+    return projection, cell
+
+
+def dynamic_gru(input,
+                size,
+                param_attr=None,
+                bias_attr=None,
+                is_reverse=False,
+                gate_activation='sigmoid',
+                candidate_activation='tanh',
+                h_0=None):
+    """
+    **Dynamic GRU Layer**
+
+    Refer to `Empirical Evaluation of Gated Recurrent Neural Networks on
+    Sequence Modeling <https://arxiv.org/abs/1412.3555>`_
+
+    The formula is as follows:
+
+    .. math::
+
+        u_t & = act_g(W_{ux}x_{t} + W_{uh}h_{t-1} + b_u)
+
+        r_t & = act_g(W_{rx}x_{t} + W_{rh}h_{t-1} + b_r)
+
+        \\tilde{h_t} & = act_c(W_{cx}x_{t} + W_{ch}(r_t \odot h_{t-1}) + b_c)
+
+        h_t & = (1-u_t) \odot h_{t-1} + u_t \odot \\tilde{h_t}
+
+    The :math:`\odot` is the element-wise product of the vectors. :math:`act_g`
+    is the update gate and reset gate activation function and :math:`sigmoid`
+    is usually used for it. :math:`act_c` is the activation function for
+    candidate hidden state and :math:`tanh` is usually used for it.
+
+    Note that these :math:`W_{ux}x_{t}, W_{rx}x_{t}, W_{cx}x_{t}` operations on
+    the input :math:`x_{t}` are NOT included in this operator. Users can choose
+    to use a fully-connected layer before the GRU layer.
+
+    Args:
+        input(Variable): The input of dynamic_gru layer, which supports
+            variable-time length input sequence. The underlying tensor in this
+            Variable is a matrix with shape :math:`(T \\times 3D)`, where
+            :math:`T` is the total time steps in this mini-batch, :math:`D`
+            is the hidden size.
+        size(int): The dimension of the gru cell.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            hidden-hidden weight matrix. Note:
+
+            - The shape of the weight matrix is :math:`(D \\times 3D)`, where
+              :math:`D` is the hidden size.
+            - All elements in the weight matrix can be divided into two parts.
+              The first part are weights of the update gate and reset gate with
+              shape :math:`(D \\times 2D)`, and the second part are weights for
+              candidate hidden state with shape :math:`(D \\times D)`.
+        bias_attr(ParamAttr): The parameter attribute for the learnable
+            hidden-hidden bias.
+        is_reverse(bool): Whether to compute reversed GRU, default
+            :attr:`False`.
+        gate_activation(str): The activation for update gate and reset gate.
+            Choices = ["sigmoid", "tanh", "relu", "identity"], default "sigmoid".
+        candidate_activation(str): The activation for the candidate hidden
+            state. Choices = ["sigmoid", "tanh", "relu", "identity"],
+            default "tanh".
+
+    Returns:
+        Variable: The hidden state of GRU. The shape is :math:`(T \\times D)`, \
+            and the LoD is the same as that of the input.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            x = fluid.layers.fc(input=data, size=hidden_dim * 3)
+            hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim)
+    """
+
+    helper = LayerHelper('gru', **locals())
+    dtype = helper.input_dtype()
+
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=[1, 3 * size], dtype=dtype, is_bias=True)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    if h_0 is not None:
+        assert h_0.shape == (
+            size, size), 'The shape of h0 should be (%d, %d)' % (size, size)
+        inputs['h0'] = h_0
+
+    hidden = helper.create_tmp_variable(dtype)
+    batch_gate = helper.create_tmp_variable(dtype)
+    batch_reset_hidden_prev = helper.create_tmp_variable(dtype)
+    batch_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru',
+        inputs=inputs,
+        outputs={
+            'Hidden': hidden,
+            'BatchGate': batch_gate,
+            'BatchResetHiddenPrev': batch_reset_hidden_prev,
+            'BatchHidden': batch_hidden
+        },
+        attrs={
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'activation': candidate_activation
+        })
+    return hidden
+
+
+def gru_unit(input,
+             hidden,
+             size,
+             weight=None,
+             bias=None,
+             activation='tanh',
+             gate_activation='sigmoid'):
+    """
+    GRU unit layer. The equation of a gru step is:
+
+        .. math::
+            u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)
+
+            r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)
+
+            m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)
+
+            h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
+
+    The inputs of the gru unit include :math:`z_t` and :math:`h_{t-1}`. In terms
+    of the equation above, the :math:`z_t` is split into 3 parts -
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
+    implement a full GRU unit operator for an input, a fully
+    connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
+
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
+    an intermediate candidate hidden output, which is denoted by :math:`m_t`.
+    This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
+    and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
+
+    Args:
+        input (Variable): The fc transformed input value of current step.
+        hidden (Variable): The hidden value of the gru unit from the previous step.
+        size (integer): The input dimension value.
+        weight (ParamAttr): The weight parameters for gru unit. Default: None
+        bias (ParamAttr): The bias parameters for gru unit. Default: None
+        activation (string): The activation type for cell (actNode).
+                             Default: 'tanh'
+        gate_activation (string): The activation type for gates (actGate).
+                                  Default: 'sigmoid'
+
+    Returns:
+        tuple: The hidden value, reset-hidden value and gate values.
+
+    Examples:
+
+        .. code-block:: python
+
+             # assuming we have x_t_data and prev_hidden of size=10
+             x_t = fluid.layers.fc(input=x_t_data, size=30)
+             hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t,
+                                                    hidden = prev_hidden)
+
+    """
+    activation_dict = dict(
+        identity=0,
+        sigmoid=1,
+        tanh=2,
+        relu=3, )
+    activation = activation_dict[activation]
+    gate_activation = activation_dict[gate_activation]
+
+    helper = LayerHelper('gru_unit', **locals())
+    dtype = helper.input_dtype()
+    size = size / 3
+
+    # create weight
+    if weight is None:
+        weight = helper.create_parameter(
+            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+
+    # create bias
+
+    if bias is None:
+        bias_size = [1, 3 * size]
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='gru_unit',
+        inputs={'Input': input,
+                'HiddenPrev': hidden,
+                'Weight': weight,
+                'Bias': bias},
+        outputs={
+            'Gate': gate,
+            'ResetHiddenPrev': reset_hidden_pre,
+            'Hidden': updated_hidden,
+        },
+        attrs={
+            'activation': activation,
+            'gate_activation': gate_activation,
+        })
+
+    return updated_hidden, reset_hidden_pre, gate
+
+
+def linear_chain_crf(input, label, param_attr=None):
+    helper = LayerHelper('linear_chain_crf', **locals())
+    size = input.shape[1]
+    transition = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[size + 2, size],
+        dtype=helper.input_dtype())
+    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
+    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
+    log_likelihood = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='linear_chain_crf',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={
+            "Alpha": [alpha],
+            "EmissionExps": [emission_exps],
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        })
+
+    return log_likelihood
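+
+
+# A minimal usage sketch (hedged; `emission` is a [N x num_tags] score
+# variable and `label` holds the gold tag indices):
+#
+#   crf_cost = linear_chain_crf(
+#       input=emission, label=label,
+#       param_attr=fluid.ParamAttr(name='crfw'))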
+
+
+def crf_decoding(input, param_attr, label=None):
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
+
+    return viterbi_path
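+
+
+# A minimal usage sketch (hedged; reuses the transition parameter created by
+# linear_chain_crf above through its parameter name):
+#
+#   path = crf_decoding(input=emission,
+#                       param_attr=fluid.ParamAttr(name='crfw'))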
+
+
+def cos_sim(X, Y, **kwargs):
+    """
+    This function computes the cosine similarity between two tensors
+    X and Y and returns the result as the output.
+    helper = LayerHelper('cos_sim', **kwargs)
+    out = helper.create_tmp_variable(dtype=X.dtype)
+    xnorm = helper.create_tmp_variable(dtype=X.dtype)
+    ynorm = helper.create_tmp_variable(dtype=X.dtype)
+    helper.append_op(
+        type='cos_sim',
+        inputs={'X': [X],
+                'Y': [Y]},
+        outputs={'Out': [out],
+                 'XNorm': [xnorm],
+                 'YNorm': [ynorm]})
+    return out
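+
+
+# A minimal usage sketch (hedged; `a` and `b` are variables of equal shape):
+#
+#   sim = cos_sim(X=a, Y=b)  # row-wise cosine similarity between a and b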
+
+
+def dropout(x, dropout_prob, is_test=False, seed=None, **kwargs):
+    """
+    Computes dropout.
+
+    Drops or keeps each element of `x` independently. Dropout is a
+    regularization technique for reducing overfitting by preventing neuron
+    co-adaption during training. The dropout operator randomly sets (according
+    to the given dropout probability) the outputs of some units to zero, while
+    the others remain unchanged.
+
+    Args:
+       x(variable): The input tensor.
+       dropout_prob(float): Probability of setting units to zero.
+       is_test(bool): A flag indicating whether it is in the test phase or not.
+       seed(int): A Python integer used to create random seeds. If this
+                  parameter is set to None, a random seed is used.
+                  NOTE: If an integer seed is given, always the same output
+                  units will be dropped. DO NOT use a fixed seed in training.
+
+    Returns:
+        Variable: A tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          x = fluid.layers.data(name="data", shape=[32, 32], dtype="float32")
+          dropped = fluid.layers.dropout(x=x, dropout_prob=0.5)
+    """
+
+    helper = LayerHelper('dropout', **kwargs)
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+    helper.append_op(
+        type='dropout',
+        inputs={'X': [x]},
+        outputs={'Out': [out],
+                 'Mask': [mask]},
+        attrs={
+            'dropout_prob': dropout_prob,
+            'is_test': is_test,
+            'fix_seed': seed is not None,
+            'seed': seed if seed is not None else 0
+        })
+    return out
+
+
+def cross_entropy(input, label, **kwargs):
+    """
+    **Cross Entropy Layer**
+
+    This layer computes the cross entropy between `input` and `label`. It
+    supports both standard cross-entropy and soft-label cross-entropy loss
+    computation.
+
+    1) One-hot cross-entropy:
+        `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
+
+        .. math::
+
+            Y[i] = -\log(X[i, Label[i]])
+
+    2) Soft-label cross-entropy:
+        `soft_label = True`, `Label[i, j]` indicates the soft label of class j
+        for sample i:
+
+        .. math::
+
+            Y[i] = \sum_j{-Label[i, j] * \log(X[i, j])}
+
+        Please make sure that in this case the summation of each row of `label`
+        equals one.
+
+    3) One-hot cross-entropy with vectorized `label`:
+        As a special case of 2), when each row of `label` has only one
+        non-zero element which is equal to 1, soft-label cross-entropy
+        degenerates to a one-hot cross-entropy with one-hot label
+        representation.
+
+    Args:
+        input (Variable|list):  a 2-D tensor with shape [N x D], where N is the
+                                batch size and D is the number of classes. This
+                                input is a probability computed by the previous
+                                operator, which is almost always the result of
+                                a softmax operator.
+        label (Variable|list): the ground truth which is a 2-D tensor. When
+                               `soft_label` is set to `False`, `label` is a
+                               tensor<int64> with shape [N x 1]. When
+                               `soft_label` is set to `True`, `label` is a
+                               tensor<float/double> with shape [N x D].
+        soft_label (bool, via `**kwargs`): a flag indicating whether to
+                                           interpret the given labels as soft
+                                           labels, default `False`.
+
+    Returns:
+         A 2-D tensor with shape [N x 1], the cross entropy loss.
+
+    Raises:
+        `ValueError`: 1) if the 1st dimensions of `input` and `label` are not
+                         equal;
+                      2) if `soft_label == True` and the 2nd dimensions of
+                         `input` and `label` are not equal;
+                      3) if `soft_label == False` and the 2nd dimension of
+                         `label` is not 1.
+
+    Examples:
+        .. code-block:: python
+
+          predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+          cost = fluid.layers.cross_entropy(input=predict, label=label)
+    """
+    helper = LayerHelper('cross_entropy', **kwargs)
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='cross_entropy',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs=kwargs)
+    return out
+
+
+def square_error_cost(input, label, **kwargs):
+    """
+    **Square error cost layer**
+
+    This layer accepts input predictions and target label and returns the
+    squared error cost.
+
+    For predictions, :math:`X`, and target labels, :math:`Y`, the equation is:
+
+    .. math::
+
+        Out = (X - Y)^2
+
+    In the above equation:
+
+        * :math:`X`: Input predictions, a tensor.
+        * :math:`Y`: Input labels, a tensor.
+        * :math:`Out`: Output value, same shape with :math:`X`.
+
+    Args:
+       input(Variable): Input tensor, has predictions.
+       label(Variable): Label tensor, has target labels.
+
+    Returns:
+        Variable: The tensor variable storing the element-wise squared error
+                  difference of input and label.
+
+    Examples:
+        .. code-block:: python
+
+          y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+          y_predict = fluid.layers.data(name='y_predict', shape=[1],
+                                        dtype='float32')
+          cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+
+    """
+    helper = LayerHelper('square_error_cost', **kwargs)
+    minus_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='elementwise_sub',
+        inputs={'X': [input],
+                'Y': [label]},
+        outputs={'Out': [minus_out]})
+
+    square_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='square', inputs={'X': [minus_out]},
+        outputs={'Out': [square_out]})
+    return square_out
+
+
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
+    """
+    This function computes the accuracy using the input and label.
+    Internally it takes the top-k values of the input and compares their
+    indices with the label; the output is the computed accuracy.
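+
+    Examples:
+        .. code-block:: python
+
+            # a minimal usage sketch (`feature` and `label` are illustrative):
+            predict = fluid.layers.fc(input=feature, size=10, act='softmax')
+            acc = fluid.layers.accuracy(input=predict, label=label, k=1)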
+    """
+    helper = LayerHelper("accuracy", **kwargs)
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": k})
+    acc_out = helper.create_tmp_variable(dtype="float32")
+    if correct is None:
+        correct = helper.create_tmp_variable(dtype="int64")
+    if total is None:
+        total = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="accuracy",
+        inputs={
+            "Out": [topk_out],
+            "Indices": [topk_indices],
+            "Label": [label]
+        },
+        outputs={
+            "Accuracy": [acc_out],
+            "Correct": [correct],
+            "Total": [total],
+        })
+    return acc_out
+
+
+def chunk_eval(input,
+               label,
+               chunk_scheme,
+               num_chunk_types,
+               excluded_chunk_types=None,
+               **kwargs):
+    """
+    This function computes and outputs the precision, recall and
+    F1-score of chunk detection.
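+
+    Examples:
+        .. code-block:: python
+
+            # a minimal usage sketch for an IOB tagging scheme with 3 chunk
+            # types (`crf_decode` and `chunk_label` are illustrative):
+            (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+             num_correct_chunks) = fluid.layers.chunk_eval(
+                 input=crf_decode,
+                 label=chunk_label,
+                 chunk_scheme='IOB',
+                 num_chunk_types=3)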
+    """
+    helper = LayerHelper("chunk_eval", **kwargs)
+
+    # prepare output
+    precision = helper.create_tmp_variable(dtype="float32")
+    recall = helper.create_tmp_variable(dtype="float32")
+    f1_score = helper.create_tmp_variable(dtype="float32")
+    num_infer_chunks = helper.create_tmp_variable(dtype="int64")
+    num_label_chunks = helper.create_tmp_variable(dtype="int64")
+    num_correct_chunks = helper.create_tmp_variable(dtype="int64")
+
+    helper.append_op(
+        type="chunk_eval",
+        inputs={"Inference": [input],
+                "Label": [label]},
+        outputs={
+            "Precision": [precision],
+            "Recall": [recall],
+            "F1-Score": [f1_score],
+            "NumInferChunks": [num_infer_chunks],
+            "NumLabelChunks": [num_label_chunks],
+            "NumCorrectChunks": [num_correct_chunks]
+        },
+        attrs={
+            "num_chunk_types": num_chunk_types,
+            "chunk_scheme": chunk_scheme,
+            "excluded_chunk_types": excluded_chunk_types or []
+        })
+    return (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
+            num_correct_chunks)
+
+
+def sequence_conv(input,
+                  num_filters,
+                  filter_size=3,
+                  filter_stride=1,
+                  padding=None,
+                  bias_attr=None,
+                  param_attr=None,
+                  act=None):
+    """
+    This function creates the op for sequence_conv, using the input and
+    the convolution configurations (such as filter size and stride) given
+    in the function's parameters.
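+
+    Examples:
+        .. code-block:: python
+
+            # a minimal usage sketch; with filter_size=3 each output step
+            # sees a context window of 3 time steps (`x` is illustrative):
+            x = fluid.layers.data(name='x', shape=[128],
+                                  dtype='float32', lod_level=1)
+            conv = fluid.layers.sequence_conv(input=x, num_filters=64,
+                                              filter_size=3, act='relu')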
+    """
+
+    # FIXME(dzh): want to unify the arguments of the python layer
+    # functions, so we ignore some unnecessary attributes,
+    # such as padding_trainable and context_start.
+
+    helper = LayerHelper('sequence_conv', **locals())
+    dtype = helper.input_dtype()
+    filter_shape = [filter_size * input.shape[1], num_filters]
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='sequence_conv',
+        inputs={
+            'X': [input],
+            'Filter': [filter_param],
+        },
+        outputs={"Out": pre_bias},
+        attrs={
+            'contextStride': filter_stride,
+            'contextStart': -int(filter_size / 2),
+            'contextLength': filter_size
+        })
+    pre_act = helper.append_bias_op(pre_bias)
+    return helper.append_activation(pre_act)
+
+
+def conv2d(input,
+           num_filters,
+           filter_size,
+           stride=None,
+           padding=None,
+           groups=None,
+           param_attr=None,
+           bias_attr=None,
+           use_cudnn=True,
+           act=None):
+    """
+    **Convolution2D Layer**
+
+    The convolution2D layer calculates the output based on the input, filter
+    and strides, paddings, dilations, groups parameters. Input(Input) and
+    Output(Output) are in NCHW format. Where N is batch size, C is the number of
+    channels, H is the height of the feature, and W is the width of the feature.
+    For details of the convolution layer, please refer to UFLDL's `convolution
+    <http://ufldl.stanford.edu/tutorial/supervised/FeatureExtractionUsingConvolution/>`_ .
+    If bias attribution and activation type are provided, bias is added to the
+    output of the convolution, and the corresponding activation function is
+    applied to the final result.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = \sigma (W \\ast X + b)
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast`: Convolution operation.
+    * :math:`b`: Bias value, a 2-D tensor with shape [M, 1].
+    * :math:`\\sigma`: Activation function.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
+                   different.
+
+    Example:
+
+        - Input:
+
+          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+
+          Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
+
+        - Output:
+
+          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+
+        Where
+
+        .. math::
+
+           H_{out} &= \\frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]} + 1 \\\\
+           W_{out} &= \\frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]} + 1
+
+    Args:
+       input(Variable): The input image with [N, C, H, W] format.
+       num_filters(int): The number of filters. It is the same as the channel
+           number of the output image.
+       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+           it must contain two integers, (filter_size_H, filter_size_W).
+           Otherwise, the filter will be a square.
+       stride(int|tuple): The stride size. If stride is a tuple, it must
+           contain two integers, (stride_H, stride_W). Otherwise, the
+           stride_H = stride_W = stride. Default: stride = 1.
+       padding(int|tuple): The padding size. If padding is a tuple, it must
+           contain two integers, (padding_H, padding_W). Otherwise, the
+           padding_H = padding_W = padding. Default: padding = 0.
+       groups(int): The groups number of the Conv2d Layer. According to grouped
+           convolution in Alex Krizhevsky's Deep CNN paper: when group=2,
+           the first half of the filters is only connected to the first half
+           of the input channels, while the second half of the filters is only
+           connected to the second half of the input channels. Default: groups=1
+       param_attr(ParamAttr): The parameters to the Conv2d Layer. Default: None
+       bias_attr(ParamAttr): Bias parameter for the Conv2d layer. Default: None
+       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+           library is installed. Default: True
+       act(str): Activation type. Default: None
+
+    Returns:
+        Variable: The tensor variable storing the convolution and
+                  non-linearity activation result.
+
+    Raises:
+        ValueError: If the shapes of input, filter_size, stride, padding and
+                    groups mismatch.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d = fluid.layers.conv2d(
+              input=data, num_filters=2, filter_size=3, act="relu")
+    """
+    if stride is None:
+        stride = [1, 1]
+    helper = LayerHelper('conv2d', **locals())
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:
+            raise ValueError("num_channels must be divisible by groups.")
+            num_filter_channels = num_channels // groups
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+
+    def _get_default_param_initializer():
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
+        return Normal(0.0, std, 0)
+
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=filter_shape,
+        dtype=dtype,
+        default_initializer=_get_default_param_initializer())
+
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='conv2d',
+        inputs={
+            'Input': input,
+            'Filter': filter_param,
+        },
+        outputs={"Output": pre_bias},
+        attrs={
+            'strides': stride,
+            'paddings': padding,
+            'groups': groups,
+            'use_cudnn': use_cudnn
+        })
+
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
+
+    return helper.append_activation(pre_act)
+
+
+def sequence_pool(input, pool_type, **kwargs):
+    """
+    This function adds the operator for sequence pooling.
+    It pools the features of all time-steps of each instance, applying
+    the pool_type given in the parameters on top of the input.
+
+    It supports four pool_type:
+
+    - average: :math:`Out[i] = \\frac{\sum_j X_{ij}}{len(X_i)}`
+    - sum:     :math:`Out[i] = \sum_j X_{ij}`
+    - sqrt:    :math:`Out[i] = \\frac{\sum_j X_{ij}}{\sqrt{len(X_i)}}`
+    - max:     :math:`Out[i] = max(X_i)`
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+
+       for different pool_type:
+         average: out.data = [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
+         sum    : out.data = [4, 12, 6], where 4=1+3, 12=2+4+6, 6=5+1
+         sqrt   : out.data = [2.82, 6.93, 4.24], where 2.82=(1+3)/sqrt(2),
+                    6.93=(2+4+6)/sqrt(3), 4.24=(5+1)/sqrt(2)
+         max    : out.data = [3, 6, 5], where 3=max(1,3), 6=max(2,4,6), 5=max(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+        pool_type (string): The pooling type of sequence_pool.
+            It supports average, sum, sqrt and max.
+
+    Returns:
+        The sequence pooling variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
+             sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
+             sqrt_x = fluid.layers.sequence_pool(input=x, pool_type='sqrt')
+             max_x = fluid.layers.sequence_pool(input=x, pool_type='max')
+    """
+    helper = LayerHelper('sequence_pool', input=input, **kwargs)
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+    max_index = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="sequence_pool",
+        inputs={"X": input},
+        outputs={"Out": pool_out,
+                 "MaxIndex": max_index},
+        attrs={"pooltype": pool_type.upper()})
+
+    # when pool_type is max, variable max_index is initialized,
+    # so we stop the gradient explicitly here
+    if pool_type == 'max':
+        max_index.stop_gradient = True
+
+    return pool_out
+
+
+def sequence_first_step(input, **kwargs):
+    """
+    This function gets the first step of a sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [1, 2, 5], where 1=first(1,3), 2=first(2,4,6), 5=first(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's first step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             x_first_step = fluid.layers.sequence_first_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="first")
+
+
+def sequence_last_step(input, **kwargs):
+    """
+    This function gets the last step of a sequence.
+
+    .. code-block:: text
+
+       x is a 1-level LoDTensor:
+         x.lod = [[0, 2, 5, 7]]
+         x.data = [1, 3, 2, 4, 6, 5, 1]
+         x.dims = [7, 1]
+
+       then output is a Tensor:
+         out.dim = [3, 1]
+         with condition len(x.lod[-1]) - 1 == out.dims[0]
+         out.data = [3, 6, 1], where 3=last(1,3), 6=last(2,4,6), 1=last(5,1)
+
+    Args:
+        input(variable): The input variable which is a LoDTensor.
+
+    Returns:
+        The sequence's last step variable which is a Tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+             x = fluid.layers.data(name='x', shape=[7, 1],
+                              dtype='float32', lod_level=1)
+             x_last_step = fluid.layers.sequence_last_step(input=x)
+    """
+    return sequence_pool(input=input, pool_type="last")
+
+
+def pool2d(input,
+           pool_size,
+           pool_type,
+           pool_stride=None,
+           pool_padding=None,
+           global_pooling=False,
+           use_cudnn=True,
+           name=None):
+    """
+    This function adds the operator for pooling in 2 dimensions, using the
+    pooling configurations given in the input parameters.
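+
+    Examples:
+        .. code-block:: python
+
+            # a minimal usage sketch: 2x2 max pooling with stride 2 halves
+            # the spatial size of a NCHW feature map
+            data = fluid.layers.data(
+                name='data', shape=[3, 32, 32], dtype='float32')
+            pool_out = fluid.layers.pool2d(input=data, pool_size=2,
+                                           pool_type='max', pool_stride=2)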
+    """
+    if pool_padding is None:
+        pool_padding = [0, 0]
+    if pool_stride is None:
+        pool_stride = [1, 1]
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." %
+            str(pool_type))
+    if isinstance(pool_size, int):
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+
+    helper = LayerHelper('pool2d', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding,
+            "use_cudnn": use_cudnn
+        })
+
+    return pool_out
+
+
+def batch_norm(input,
+               act=None,
+               is_test=False,
+               momentum=0.9,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               data_layout='NCHW',
+               name=None):
+    """
+    This function helps create an operator to implement
+    the BatchNorm layer using the configurations from the input parameters.
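+
+    Examples:
+        .. code-block:: python
+
+            # a minimal usage sketch: batch-normalize a conv feature map
+            # before the activation (`data` is illustrative)
+            conv = fluid.layers.conv2d(input=data, num_filters=16,
+                                       filter_size=3)
+            bn = fluid.layers.batch_norm(input=conv, act='relu')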
+    """
+    helper = LayerHelper('batch_norm', **locals())
+    dtype = helper.input_dtype()
+
+    input_shape = input.shape
+    if data_layout == 'NCHW':
+        channel_num = input_shape[1]
+    elif data_layout == 'NHWC':
+        channel_num = input_shape[-1]
+    else:
+        raise ValueError("unsupported data layout: " + data_layout)
+
+    param_shape = [channel_num]
+
+    # create parameter
+    scale = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=param_shape,
+        dtype=dtype,
+        default_initializer=Constant(1.0))
+
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+
+    mean = helper.create_global_variable(
+        dtype=input.dtype,
+        shape=param_shape,
+        persistable=True,
+        stop_gradient=True)
+    helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
+
+    variance = helper.create_global_variable(
+        dtype=input.dtype,
+        shape=param_shape,
+        persistable=True,
+        stop_gradient=True)
+    helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
+
+    # create output
+    # mean and mean_out share the same memory
+    mean_out = mean
+    # variance and variance out share the same memory
+    variance_out = variance
+    saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+
+    batch_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="batch_norm",
+        inputs={
+            "X": input,
+            "Scale": scale,
+            "Bias": bias,
+            "Mean": mean,
+            "Variance": variance
+        },
+        outputs={
+            "Y": batch_norm_out,
+            "MeanOut": mean_out,
+            "VarianceOut": variance_out,
+            "SavedMean": saved_mean,
+            "SavedVariance": saved_variance
+        },
+        attrs={"momentum": momentum,
+               "epsilon": epsilon,
+               "is_test": is_test})
+
+    return helper.append_activation(batch_norm_out)
+
+
+def beam_search_decode(ids, scores, name=None):
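+    """
+    Packs the ids and scores selected at every step of beam search into
+    complete result sequences. A minimal usage sketch (`ids_array` and
+    `scores_array` are illustrative variables holding the per-step beam
+    search outputs):
+
+    .. code-block:: python
+
+        sentence_ids, sentence_scores = fluid.layers.beam_search_decode(
+            ids=ids_array, scores=scores_array)
+    """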
+    helper = LayerHelper('beam_search_decode', **locals())
+    sentence_ids = helper.create_tmp_variable(dtype=ids.dtype)
+    sentence_scores = helper.create_tmp_variable(dtype=scores.dtype)
+
+    helper.append_op(
+        type="beam_search_decode",
+        inputs={"Ids": ids,
+                "Scores": scores},
+        outputs={
+            "SentenceIds": sentence_ids,
+            "SentenceScores": sentence_scores
+        })
+
+    return sentence_ids, sentence_scores
+
+
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     dilation=None,
+                     param_attr=None,
+                     use_cudnn=True,
+                     name=None):
+    """
+    **Convolution2D transpose layer**
+
+    The convolution2D transpose layer calculates the output based on the input,
+    filter, and dilations, strides, paddings. Input(Input) and output(Output)
+    are in NCHW format. Where N is batch size, C is the number of channels,
+    H is the height of the feature, and W is the width of the feature.
+    Parameters (dilations, strides, paddings) are two-element lists, whose two
+    elements represent height and width, respectively. For details of the
+    convolution transpose layer, please refer to the explanation and references
+    `therein <http://www.matthewzeiler.com/wp-content/uploads/2017/07/cvpr2010.pdf>`_.
+
+    For each input :math:`X`, the equation is:
+
+    .. math::
+
+        Out = W \\ast X
+
+    In the above equation:
+
+    * :math:`X`: Input value, a tensor with NCHW format.
+    * :math:`W`: Filter value, a tensor with MCHW format.
+    * :math:`\\ast` : Convolution transpose operation.
+    * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be
+                   different.
+
+    Example:
+
+        - Input:
+
+          Input shape: $(N, C_{in}, H_{in}, W_{in})$
+
+          Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
+
+        - Output:
+
+          Output shape: $(N, C_{out}, H_{out}, W_{out})$
+
+        Where
+
+        .. math::
+
+           H_{out} &= (H_{in} - 1) * strides[0] - 2 * paddings[0] + dilations[0] * (H_f - 1) + 1 \\\\
+           W_{out} &= (W_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (W_f - 1) + 1
+
+    Args:
+       input(Variable): The input image with [N, C, H, W] format.
+       num_filters(int): The number of filters. It is the same as the channel
+           number of the output image.
+       output_size(int|tuple|None): The output image size. If output size is a
+           tuple, it must contain two integers, (image_H, image_W). This
+           parameter only works when filter_size is None.
+       filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+           it must contain two integers, (filter_size_H, filter_size_W).
+           Otherwise, the filter will be a square. If None, the filter size is
+           computed from output_size.
+       padding(int|tuple): The padding size. If padding is a tuple, it must
+           contain two integers, (padding_H, padding_W). Otherwise, the
+           padding_H = padding_W = padding. Default: padding = 0.
+       stride(int|tuple): The stride size. If stride is a tuple, it must
+           contain two integers, (stride_H, stride_W). Otherwise, the
+           stride_H = stride_W = stride. Default: stride = 1.
+       dilation(int|tuple): The dilation size. If dilation is a tuple, it must
+           contain two integers, (dilation_H, dilation_W). Otherwise, the
+           dilation_H = dilation_W = dilation. Default: dilation = 1.
+       param_attr(ParamAttr): The parameters to the Conv2d_transpose Layer.
+                              Default: None
+       use_cudnn(bool): Use cudnn kernel or not, it is valid only when the cudnn
+           library is installed. Default: True
+       name(str|None): A name for this layer(optional). If set None, the layer
+           will be named automatically.
+
+    Returns:
+       Variable: The tensor variable storing the convolution transpose result.
+
+    Raises:
+       ValueError: If the shapes of input, filter_size, stride, padding and
+                   groups mismatch.
+
+    Examples:
+       .. code-block:: python
+
+          data = fluid.layers.data(
+              name='data', shape=[3, 32, 32], dtype='float32')
+          conv2d_transpose = fluid.layers.conv2d_transpose(
+              input=data, num_filters=2, filter_size=3)
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+
+    op_attr = dict()
+
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+
+    if isinstance(stride, int):
+        op_attr['strides'] = [stride, stride]
+    elif stride is not None:
+        op_attr['strides'] = stride
+
+    if isinstance(dilation, int):
+        op_attr['dilations'] = [dilation, dilation]
+    elif dilation is not None:
+        op_attr['dilations'] = dilation
+
+    if not isinstance(use_cudnn, bool):
+        raise ValueError("use_cudnn should be True or False")
+    op_attr['use_cudnn'] = use_cudnn
+
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+        dilation = op_attr.get('dilations', [1, 1])
+
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+
+        filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + 2 *
+                         padding[0] - 1) // dilation[0] + 1
+        filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + 2 *
+                         padding[1] - 1) // dilation[1] + 1
+        filter_size = [filter_size_h, filter_size_w]
+
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+
+    filter_shape = [input_channel, num_filters] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+
+    return out
+
+
+def sequence_expand(x, y, name=None):
+    """Sequence Expand Layer. This layer will expand the input variable **x**
+    according to LoD information of **y**. And the following examples will
+    explain how sequence_expand works:
+
+    .. code-block:: text
+
+        * Case 1
+            x is a LoDTensor:
+                x.lod = [[0,       2, 3],
+                         [0, 1,    3, 4]]
+                x.data = [a, b, c, d]
+                x.dims = [4, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0,    2,    4],
+                         [0, 3, 6, 7, 8]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 2-level LoDTensor:
+                out.lod = [[0,                2,    4],
+                           [0,       3,       6, 7, 8]]
+                out.data = [a, a, a, b, b, b, c, d]
+                out.dims = [8, 1]
+
+        * Case 2
+            x is a Tensor:
+                x.data = [a, b, c]
+                x.dims = [3, 1]
+
+            y is a LoDTensor:
+                y.lod = [[0, 2, 3, 6]]
+
+            with condition len(y.lod[-1]) - 1 == x.dims[0]
+
+            then output is a 1-level LoDTensor:
+                out.lod = [[0,    2, 3,      6]]
+                out.data = [a, a, b, c, c, c]
+                out.dims = [6, 1]
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a LoDTensor.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The expanded variable which is a LoDTensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[10], dtype='float32')
+            y = fluid.layers.data(name='y', shape=[10, 20],
+                             dtype='float32', lod_level=1)
+            out = fluid.layers.sequence_expand(x=x, y=y)
+    """
+    helper = LayerHelper('sequence_expand', input=x, **locals())
+    dtype = helper.input_dtype()
+    tmp = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='sequence_expand', inputs={'X': x,
+                                        'Y': y}, outputs={'Out': tmp})
+    return tmp
+
+
+def beam_search(pre_ids, ids, scores, beam_size, end_id, level=0):
+    '''
+    This function implements the beam search algorithm.
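+
+    A minimal usage sketch inside a decoding loop (all variables are
+    illustrative; `topk_scores` and `topk_indices` would come from a top-k
+    op over the accumulated scores):
+
+    .. code-block:: python
+
+        selected_ids, selected_scores = fluid.layers.beam_search(
+            pre_ids=pre_ids, ids=topk_indices, scores=topk_scores,
+            beam_size=4, end_id=1, level=0)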
+    '''
+    helper = LayerHelper('beam_search', **locals())
+    score_type = scores.dtype
+    id_type = ids.dtype
+
+    selected_scores = helper.create_tmp_variable(dtype=score_type)
+    selected_ids = helper.create_tmp_variable(dtype=id_type)
+
+    helper.append_op(
+        type='beam_search',
+        inputs={
+            'pre_ids': pre_ids,
+            'ids': ids,
+            'scores': scores,
+        },
+        outputs={
+            'selected_ids': selected_ids,
+            'selected_scores': selected_scores,
+        },
+        attrs={
+            # TODO(ChunweiYan) to assure other value support
+            'level': level,
+            'beam_size': beam_size,
+            'end_id': end_id,
+        })
+
+    return selected_ids, selected_scores
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              param_attr=None,
+              bias_attr=None,
+              name=None):
+    """Lstm unit layer. The equation of a lstm step is:
+
+        .. math::
+
+            i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)
+
+            f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)
+
+            c_t & = f_t c_{t-1} + i_t \\tanh (W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
+
+            o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)
+
+            h_t & = o_t \\tanh(c_t)
+
+    The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
+    should be the same. The implementation separates the linear transformation
+    from the non-linear transformation. Here, we take :math:`i_t` as an example.
+    The linear transformation is applied by calling a `fc` layer and the
+    equation is:
+
+        .. math::
+
+            L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+        .. math::
+
+            i_t = \sigma(L_{i_t})
+
+    This layer has two outputs, :math:`h_t` and :math:`c_t`.
+
+    Args:
+        x_t (Variable): The input value of current step, a 2-D tensor with shape
+            M x N, M for batch size and N for input size.
+        hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
+            with shape M x S, M for batch size and S for size of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
+            shape M x S, M for batch size and S for size of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights, if not False,
+            bias weights will be created and be set to default value.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        tuple: The hidden value and cell value of lstm unit.
+
+    Raises:
+        ValueError: If the rank of **x_t**, **hidden_t_prev** or **cell_t_prev**
+                    is not 2, or if the 1st dimensions of **x_t**,
+                    **hidden_t_prev** and **cell_t_prev** are not the same, or
+                    if the 2nd dimensions of **hidden_t_prev** and
+                    **cell_t_prev** are not the same.
+
+    Examples:
+
+        .. code-block:: python
+
+             x_t = fluid.layers.fc(input=x_t_data, size=10)
+             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30)
+             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+             hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                    hidden_t_prev=prev_hidden,
+                                                    cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
+        raise ValueError("The 2nd dimensions of hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return h, c
+
+
+def reduce_sum(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the sum of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the sum is performed. If
+            :attr:`None`, sum all elements of :attr:`input` and return a
+            Tensor variable with a single element, otherwise must be in the
+            range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
+            the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_sum(x)  # [3.5]
+            fluid.layers.reduce_sum(x, dim=0)  # [0.3, 0.5, 1.1, 1.6]
+            fluid.layers.reduce_sum(x, dim=-1)  # [1.9, 1.6]
+            fluid.layers.reduce_sum(x, dim=1, keep_dim=True)  # [[1.9], [1.6]]
+    """
+    helper = LayerHelper('reduce_sum', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_sum',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
+
+
+def reduce_mean(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the mean of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the mean is computed. If
+            :attr:`None`, compute the mean over all elements of :attr:`input`
+            and return a Tensor variable with a single element, otherwise
+            must be in the range :math:`[-rank(input), rank(input))`. If
+            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_mean(x)  # [0.4375]
+            fluid.layers.reduce_mean(x, dim=0)  # [0.15, 0.25, 0.55, 0.8]
+            fluid.layers.reduce_mean(x, dim=-1)  # [0.475, 0.4]
+            fluid.layers.reduce_mean(x, dim=1, keep_dim=True)  # [[0.475], [0.4]]
+    """
+    helper = LayerHelper('reduce_mean', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_mean',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
+
+
+def reduce_max(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the maximum of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the maximum is computed.
+            If :attr:`None`, compute the maximum over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_max(x)  # [0.9]
+            fluid.layers.reduce_max(x, dim=0)  # [0.2, 0.3, 0.6, 0.9]
+            fluid.layers.reduce_max(x, dim=-1)  # [0.9, 0.7]
+            fluid.layers.reduce_max(x, dim=1, keep_dim=True)  # [[0.9], [0.7]]
+    """
+    helper = LayerHelper('reduce_max', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_max',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
+
+
+def reduce_min(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the minimum of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int|None): The dimension along which the minimum is computed.
+            If :attr:`None`, compute the minimum over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with following elements:
+            #    [[0.2, 0.3, 0.5, 0.9]
+            #     [0.1, 0.2, 0.6, 0.7]]
+            # Each example is followed by the corresponding output tensor.
+            fluid.layers.reduce_min(x)  # [0.1]
+            fluid.layers.reduce_min(x, dim=0)  # [0.1, 0.2, 0.5, 0.7]
+            fluid.layers.reduce_min(x, dim=-1)  # [0.2, 0.1]
+            fluid.layers.reduce_min(x, dim=1, keep_dim=True)  # [[0.2], [0.1]]
+    """
+    helper = LayerHelper('reduce_min', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='reduce_min',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim is not None else 0,
+            'keep_dim': keep_dim,
+            'reduce_all': dim is None
+        })
+    return out
+
+
+def split(input, num_or_sections, dim=-1, name=None):
+    """
+    Split the input tensor into multiple sub-tensors.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        num_or_sections (int|list): If :attr:`num_or_sections` is an integer,
+            then the integer indicates the number of equal sized sub-tensors
+            that the tensor will be divided into. If :attr:`num_or_sections`
+            is a list of integers, the length of the list indicates the number
+            of sub-tensors and the integers indicate, in order, the sizes of
+            the sub-tensors along the :attr:`dim` dimension.
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
+            dimension to split along is :math:`rank(input) + dim`.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        List: The list of segmented tensor variables.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with shape [3, 9, 5]:
+            x0, x1, x2 = fluid.layers.split(x, num_or_sections=3, dim=1)
+            x0.shape  # [3, 3, 5]
+            x1.shape  # [3, 3, 5]
+            x2.shape  # [3, 3, 5]
+            x0, x1, x2 = fluid.layers.split(x, num_or_sections=[2, 3, 4], dim=1)
+            x0.shape  # [3, 2, 5]
+            x1.shape  # [3, 3, 5]
+            x2.shape  # [3, 4, 5]
+    """
+    helper = LayerHelper('split', **locals())
+    input_shape = input.shape
+    dim = (len(input_shape) + dim) if dim < 0 else dim
+    if isinstance(num_or_sections, int):
+        assert num_or_sections > 1, 'num_or_sections must be more than 1.'
+        num = num_or_sections
+    else:
+        assert len(num_or_sections) < input_shape[
+            dim], 'len(num_or_sections) must not be more than input.shape[dim].'
+        num = len(num_or_sections)
+    outs = [
+        helper.create_tmp_variable(dtype=helper.input_dtype())
+        for i in range(num)
+    ]
+    helper.append_op(
+        type='split',
+        inputs={'X': input},
+        outputs={'Out': outs},
+        attrs={
+            'num': num_or_sections if isinstance(num_or_sections, int) else 0,
+            'sections': num_or_sections
+            if isinstance(num_or_sections, list) else [],
+            'axis': dim
+        })
+    return outs
+
+
+def l2_normalize(x, axis, epsilon=1e-12, name=None):
+    """
+    **L2 normalize Layer**
+
+    The l2 normalize layer normalizes `x` along dimension `axis` using an L2
+    norm. For a 1-D tensor (`axis` is fixed to 0), this layer computes
+
+    output = x / sqrt(max(sum(x**2), epsilon))
+
+    For `x` with more dimensions, this layer independently normalizes each 1-D
+    slice along dimension `axis`.
+
+    Args:
+       x(Variable|list): The input tensor to l2_normalize layer.
+       axis(int): Dimension along which to normalize the input.
+       epsilon(float): A lower bound value for `x`'s l2 norm. sqrt(epsilon) will
+                       be used as the divisor if the l2 norm of `x` is less than
+                       sqrt(epsilon).
+       name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+
+    Returns:
+        Variable: The output tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.data(name="data",
+                                   shape=(3, 17, 13),
+                                   dtype="float32")
+          normed = fluid.layers.l2_normalize(x=data, axis=1)
+    """
+
+    if len(x.shape) == 1: axis = 0
+
+    helper = LayerHelper("l2_normalize", **locals())
+
+    square = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="square", inputs={"X": x}, outputs={"Out": square})
+
+    reduced_sum = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reduce_sum",
+        inputs={"X": square},
+        outputs={"Out": reduced_sum},
+        attrs={
+            "dim": 1 if axis is None else axis,
+            "keep_dim": True,
+            "reduce_all": False
+        })
+
+    # TODO(caoying) A lower bound value epsilon for the norm is needed to
+    # improve the numeric stability of reciprocal. This requires a maximum_op.
+    rsquare = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="reciprocal", inputs={"X": reduced_sum}, outputs={"Out": rsquare})
+
+    # TODO(caoying) the current elementwise_mul operator does not support a
+    # general broadcast rule which broadcasts input(Y) to have the same
+    # dimension with Input(X) starting from a specified dimension. So this
+    # expansion is required. Once a general broadcast rule is supported, this
+    # expansion can be removed.
+    rsquare_expanded = helper.create_tmp_variable(dtype=x.dtype)
+    expand_times = [1] * len(x.shape)
+    expand_times[axis] = int(x.shape[axis])
+    helper.append_op(
+        type="expand",
+        inputs={"X": rsquare},
+        outputs={"Out": rsquare_expanded},
+        attrs={"expand_times": expand_times})
+
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type="elementwise_mul",
+        inputs={"X": x,
+                "Y": rsquare_expanded},
+        outputs={"Out": out})
+    return out
+
+
+def matmul(x, y, transpose_x=False, transpose_y=False, name=None):
+    """
+    Applies matrix multiplication to two tensors.
+
+    Currently, the input tensors can be of any rank, but when the rank of
+    either input is larger than 3, the two inputs' ranks must be equal.
+
+    The actual behavior depends on the shapes of :math:`x`, :math:`y` and the
+    flag values of :attr:`transpose_x`, :attr:`transpose_y`. Specifically:
+
+    - If a transpose flag is specified, the last two dimensions of the tensor
+      are transposed. If the tensor is rank-1 of shape :math:`[D]`, then for
+      :math:`x` it is treated as :math:`[1, D]` in nontransposed form and as
+      :math:`[D, 1]` in transposed form, whereas for :math:`y` it is the
+      opposite: It is treated as :math:`[D, 1]` in nontransposed form and as
+      :math:`[1, D]` in transposed form.
+
+    - After transpose, the two tensors are 2-D or n-D and matrix multiplication
+      performs in the following way.
+
+      - If both are 2-D, they are multiplied like conventional matrices.
+      - If either is n-D, it is treated as a stack of matrices residing in the
+        last two dimensions and a batched matrix multiply supporting broadcast
+        applies on the two tensors.
+
+    Also note that if the raw tensor :math:`x` or :math:`y` is rank-1 and
+    nontransposed, the prepended or appended dimension :math:`1` will be
+    removed after matrix multiplication.
+
+    Args:
+        x (Variable): The input variable which is a Tensor or LoDTensor.
+        y (Variable): The input variable which is a Tensor or LoDTensor.
+        transpose_x (bool): Whether to transpose :math:`x` before multiplication.
+        transpose_y (bool): Whether to transpose :math:`y` before multiplication.
+        name(str|None): A name for this layer(optional). If set None, the layer
+            will be named automatically.
+
+    Returns:
+        Variable: The product Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # Examples to clarify shapes of the inputs and output
+            # x: [B, ..., M, K], y: [B, ..., K, N]
+            fluid.layers.matmul(x, y)  # out: [B, ..., M, N]
+
+            # x: [B, M, K], y: [B, K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+
+            # x: [B, M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [B, M, N]
+
+            # x: [M, K], y: [K, N]
+            fluid.layers.matmul(x, y)  # out: [M, N]
+
+            # x: [B, M, K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [B, M]
+
+            # x: [K], y: [K]
+            fluid.layers.matmul(x, y)  # out: [1]
+
+            # x: [M], y: [N]
+            fluid.layers.matmul(x, y, True, True)  # out: [M, N]
+    """
+
+    def __check_input(x, y):
+        if len(y.shape) > len(x.shape):
+            raise ValueError(
+                "Invalid inputs for matmul. "
+                "x's rank should be always greater than or equal to y'rank.")
+
+        x_shape = list(x.shape)
+        y_shape = list(y.shape)
+        if len(x_shape) == 1:
+            x_shape = [1] + x_shape
+        if len(y_shape) == 1:
+            y_shape = y_shape + [1]
+
+        # check the inner 2 dimensions
+        if transpose_x:
+            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
+        if transpose_y:
+            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
+        if x_shape[-1] != y_shape[-2]:
+            raise ValueError("Invalid inputs for matmul.")
+
+        if len(y_shape) > 2:
+            for i, dim_x in enumerate(x_shape[:-2]):
+                if dim_x != y_shape[i]:
+                    raise ValueError("Invalid inputs for matmul.")
+
+    __check_input(x, y)
+
+    helper = LayerHelper('matmul', **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(
+        type='matmul',
+        inputs={'X': x,
+                'Y': y},
+        outputs={'Out': out},
+        attrs={'transpose_X': transpose_x,
+               'transpose_Y': transpose_y})
+    return out
+
+
+def edit_distance(input,
+                  label,
+                  normalized=False,
+                  ignored_tokens=None,
+                  name=None):
+    """
+    EditDistance operator computes the edit distances between a batch of
+    hypothesis strings and their references. Edit distance, also called
+    Levenshtein distance, measures how dissimilar two strings are by counting
+    the minimum number of operations to transform one string into another.
+    Here the operations include insertion, deletion, and substitution.
+
+    For example, given hypothesis string A = "kitten" and reference
+    B = "sitting", the edit distance is 3, since transforming A into B
+    requires at least two substitutions and one insertion:
+
+    "kitten" -> "sitten" -> "sittin" -> "sitting"
+
+    Input(Hyps) is a LoDTensor consisting of all the hypothesis strings, with
+    the total number denoted by `batch_size` and the separation specified by
+    the LoD information. The `batch_size` reference strings are arranged in
+    the same order in the LoDTensor Input(Refs).
+
+    Output(Out) contains the `batch_size` results, each of which stands for
+    the edit distance of a pair of strings. If Attr(normalized) is true,
+    the edit distance will be divided by the length of the reference string.
+
+    Args:
+
+        input(Variable): The indices for hypothesis strings.
+
+        label(Variable): The indices for reference strings.
+
+        normalized(bool): Indicates whether to normalize the edit distance by
+                          the length of the reference string.
+
+        ignored_tokens(list of int): Tokens that should be removed before
+                                     calculating edit distance.
+
+    Returns:
+        Variable: sequence-to-sequence edit distance in shape [batch_size, 1],
+        together with a Variable holding the number of sequences in the batch.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[8], dtype='int64')
+            y = fluid.layers.data(name='y', shape=[7], dtype='int64')
+
+            cost, _ = fluid.layers.edit_distance(input=x, label=y)
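+
+            # a hypothetical variant: ignore token 0 (e.g. padding) and
+            # normalize by the reference length
+            cost, seq_num = fluid.layers.edit_distance(
+                input=x, label=y, normalized=True, ignored_tokens=[0])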
+    """
+    helper = LayerHelper("edit_distance", **locals())
+
+    # remove some tokens from input and labels
+    if ignored_tokens is not None and len(ignored_tokens) > 0:
+        erased_input = helper.create_tmp_variable(dtype="int64")
+        erased_label = helper.create_tmp_variable(dtype="int64")
+
+        helper.append_op(
+            type="sequence_erase",
+            inputs={"X": [input]},
+            outputs={"Out": [erased_input]},
+            attrs={"tokens": ignored_tokens})
+        input = erased_input
+
+        helper.append_op(
+            type="sequence_erase",
+            inputs={"X": [label]},
+            outputs={"Out": [erased_label]},
+            attrs={"tokens": ignored_tokens})
+        label = erased_label
+
+    # edit distance op
+    edit_distance_out = helper.create_tmp_variable(dtype="int64")
+    sequence_num = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="edit_distance",
+        inputs={"Hyps": [input],
+                "Refs": [label]},
+        outputs={"Out": [edit_distance_out],
+                 "SequenceNum": [sequence_num]},
+        attrs={"normalized": normalized})
+
+    return edit_distance_out, sequence_num
+
+
+def ctc_greedy_decoder(input, blank, name=None):
+    """
+    This op decodes sequences with a greedy policy in two steps:
+    1. Get the index of the max value for each row of input, i.e.
+       numpy.argmax(input, axis=1).
+    2. For each sequence from step 1, merge repeated tokens between two
+       blanks and delete all blanks.
+
+    A simple example as below:
+
+    .. code-block:: text
+
+        Given:
+
+        input.data = [[0.6, 0.1, 0.3, 0.1],
+                      [0.3, 0.2, 0.4, 0.1],
+                      [0.1, 0.5, 0.1, 0.3],
+                      [0.5, 0.1, 0.3, 0.1],
+
+                      [0.5, 0.1, 0.3, 0.1],
+                      [0.2, 0.2, 0.2, 0.4],
+                      [0.2, 0.2, 0.1, 0.5],
+                      [0.5, 0.1, 0.3, 0.1]]
+
+        input.lod = [[0, 4, 8]]
+
+        Then:
+
+        output.data = [[2],
+                       [1],
+                       [3]]
+
+        output.lod = [[0, 2, 3]]
+
+    Args:
+
+        input(Variable): (LoDTensor<float>), the probabilities of
+                         variable-length sequences, which is a 2-D Tensor with
+                         LoD information. Its shape is [Lp, num_classes + 1],
+                         where Lp is the sum of all input sequences' lengths
+                         and num_classes is the true number of classes (not
+                         including the blank label).
+
+        blank(int): the blank label index of Connectionist Temporal
+                    Classification (CTC) loss, which is in the half-open
+                    interval [0, num_classes + 1).
+
+    Returns:
+        Variable: CTC greedy decode result.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[8], dtype='float32')
+
+            cost = fluid.layers.ctc_greedy_decoder(input=x, blank=0)
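+
+            # conceptually, per sequence the op behaves like this NumPy /
+            # itertools sketch (the actual work is done by the top_k and
+            # ctc_align operators):
+            #   ids = np.argmax(probs, axis=1)
+            #   out = [k for k, g in itertools.groupby(ids) if k != blank]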
+    """
+    helper = LayerHelper("ctc_greedy_decoder", **locals())
+    # top 1 op
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="top_k",
+        inputs={"X": [input]},
+        outputs={"Out": [topk_out],
+                 "Indices": [topk_indices]},
+        attrs={"k": 1})
+
+    # ctc align op
+    ctc_out = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="ctc_align",
+        inputs={"Input": [topk_indices]},
+        outputs={"Output": [ctc_out]},
+        attrs={"merge_repeated": True,
+               "blank": blank})
+    return ctc_out
+
+
+def warpctc(input, label, blank=0, norm_by_times=False, **kwargs):
+    """
+    An operator integrating the open source Warp-CTC library
+    (https://github.com/baidu-research/warp-ctc)
+    to compute Connectionist Temporal Classification (CTC) loss.
+    It can be regarded as softmax with CTC, since a native softmax activation
+    is integrated into the Warp-CTC library to normalize values for each row
+    of the input tensor.
+
+    Args:
+       input(Variable): (LodTensor, default: LoDTensor<float>),
+         the unscaled probabilities of variable-length sequences,
+         which is a 2-D Tensor with LoD information.
+         Its shape is [Lp, num_classes + 1], where Lp is the sum of all input
+         sequences' length and num_classes is the true number of classes.
+         (not including the blank label).
+       label(Variable): (LodTensor, default: LoDTensor<int>), the ground truth
+         of variable-length sequence, which is a 2-D Tensor with LoD
+         information. It is of the shape [Lg, 1], where Lg is the sum of
+         all labels' length.
+       blank: (int, default: 0), the blank label index of Connectionist
+         Temporal Classification (CTC) loss, which is in the
+         half-opened interval [0, num_classes + 1).
+       norm_by_times: (bool, default: false), whether to normalize
+         the gradients by the number of time steps, which is also the
+         sequence's length. There is no need to normalize the gradients
+         if the warpctc layer is followed by a mean_op.
+
+    Returns:
+        Variable: The Connectionist Temporal Classification (CTC) loss,
+        which is a 2-D Tensor of the shape [batch_size, 1].
+
+    Examples:
+        .. code-block:: python
+
+            y = layers.data(
+                name='y', shape=[11, 8], dtype='float32', lod_level=1)
+            y_predict = layers.data(
+                name='y_predict', shape=[11, 1], dtype='float32')
+            cost = layers.warpctc(input=y_predict, label=y)
+
+    """
+    helper = LayerHelper('warpctc', **kwargs)
+    loss_out = helper.create_tmp_variable(dtype=input.dtype)
+    grad_out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='warpctc',
+        inputs={'Logits': [input],
+                'Label': [label]},
+        outputs={'WarpCTCGrad': [grad_out],
+                 'Loss': [loss_out]},
+        attrs={'blank': blank,
+               'norm_by_times': norm_by_times})
+    return loss_out
+
+
+def sequence_reshape(input, new_dim):
+    """
+    **Sequence Reshape Layer**
+
+    This layer rearranges the input sequences. The new dimension is set by the
+    user. The length of each sequence is computed from the original length,
+    the original dimension and the new dimension. The following example
+    illustrates how this layer works:
+
+    .. code-block:: text
+
+        x is a LoDTensor:
+            x.lod  = [[0, 2, 6]]
+            x.data = [[1, 2], [3, 4],
+                      [5, 6], [7, 8], [9, 10], [11, 12]]
+            x.dims = [6, 2]
+
+        set new_dim = 4
+
+        then out is a LoDTensor:
+            out.lod  = [[0, 1, 3]]
+            out.data = [[1, 2, 3, 4],
+                        [5, 6, 7, 8], [9, 10, 11, 12]]
+            out.dims = [3, 4]
+
+    Currently, only 1-level LoDTensor is supported. Please make sure that
+    (original length * original dimension) is divisible by the new dimension
+    for each sequence.
+
+    Args:
+       input (Variable): (LodTensor, default: LoDTensor<float>), a 2-D LoDTensor
+                with shape [N, M], where M is the dimension.
+       new_dim (int): New dimension which the input LoDTensor is reshaped to.
+
+    Returns:
+        Variable: Reshaped LoDTensor according to new dimension.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[5, 20],
+                              dtype='float32', lod_level=1)
+            x_reshaped = fluid.layers.sequence_reshape(input=x, new_dim=10)
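+
+            # each sequence length becomes (original length * original
+            # dimension) / new_dim; here dim 20 -> new_dim 10 doubles the
+            # length of every sequence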
+    """
+    helper = LayerHelper('sequence_reshape', **locals())
+    out = helper.create_tmp_variable(helper.input_dtype())
+    helper.append_op(
+        type='sequence_reshape',
+        inputs={'X': [input]},
+        outputs={'Out': [out]},
+        attrs={'new_dim': new_dim})
+    return out
+
+
+@autodoc()
+def nce(input,
+        label,
+        num_total_classes,
+        sample_weight=None,
+        param_attr=None,
+        bias_attr=None,
+        num_neg_samples=None):
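+    # Noise-contrastive estimation: a [num_total_classes, dim] weight matrix
+    # and a [num_total_classes, 1] bias are learned by discriminating the true
+    # class of each example from num_neg_samples sampled noise classes. The
+    # returned cost averages over the 1 + num_neg_samples samples drawn for
+    # each example.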
+    helper = LayerHelper('nce', **locals())
+    assert isinstance(input, Variable)
+    dim = input.shape[1]
+    assert isinstance(label, Variable)
+    num_true_class = label.shape[1]
+    w = helper.create_parameter(
+        attr=helper.param_attr,
+        shape=[num_total_classes, dim],
+        is_bias=False,
+        dtype=input.dtype)
+    b = helper.create_parameter(
+        attr=helper.bias_attr,
+        shape=[num_total_classes, 1],
+        is_bias=True,
+        dtype=input.dtype)
+    cost = helper.create_tmp_variable(dtype=input.dtype)
+    sample_logits = helper.create_tmp_variable(dtype=input.dtype)
+    sample_labels = helper.create_tmp_variable(dtype=label.dtype)
+
+    if num_neg_samples is None:
+        num_neg_samples = 10
+    else:
+        num_neg_samples = int(num_neg_samples)
+
+    attrs = {
+        'num_total_classes': int(num_total_classes),
+        'num_neg_samples': num_neg_samples
+    }
+
+    helper.append_op(
+        type='nce',
+        inputs={
+            'Input': input,
+            'Label': label,
+            'Weight': w,
+            'Bias': b,
+            'SampleWeight': sample_weight if sample_weight is not None else []
+        },
+        outputs={
+            'Cost': cost,
+            'SampleLogits': sample_logits,
+            'SampleLabels': sample_labels
+        },
+        attrs=attrs)
+    return cost / (num_neg_samples + 1)
+
+
+def transpose(x, perm, name=None):
+    """
+    **transpose Layer**
+
+    Permute the dimensions of `x` according to `perm`.
+
+    The `i`-th dimension of the returned tensor will correspond to the
+    perm[i]-th dimension of `x`.
+
+    Args:
+       x (Variable): The input Tensor.
+       perm (list): A permutation of the dimensions of `x`.
+
+    Returns:
+        Variable: A transposed Tensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[5, 10, 15], dtype='float32')
+            x_transposed = fluid.layers.transpose(x, perm=[1, 0, 2])
+    """
+
+    if len(perm) != len(x.shape):
+        raise ValueError(
+            "Input(perm) is the permutation of dimensions of Input(input). "
+            "Its length should be equal to Input(input)'s rank.")
+    for idx, dim in enumerate(perm):
+        if dim >= len(x.shape):
+            raise ValueError(
+                "Each element in perm should be less than x's rank. "
+                "%d-th element in perm is %d, which is out of range for "
+                "x's rank %d." % (idx, perm[idx], len(x.shape)))
+
+    helper = LayerHelper('transpose', **locals())
+    out = helper.create_tmp_variable(x.dtype)
+    helper.append_op(
+        type='transpose',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'axis': perm})
+    return out
+
+
+def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+    """
+    Extracts image patches from the input tensor to form a tensor of shape
+    {input.batch_size * output_height * output_width, filter_size_H *
+    filter_size_W * input.channels}, which is similar to im2col.
+    This op uses a filter/kernel to scan images and converts them to
+    sequences. After expanding, the number of time steps is
+    output_height * output_width for an image, where output_height and
+    output_width are calculated by the equation below:
+
+    .. math::
+
+        output\_size = 1 + \
+            (2 * padding + img\_size - block\_size + stride - 1) / stride
+
+    And the dimension of each time step is
+    filter_size_H * filter_size_W * input.channels.
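+
+    For instance, with img_size = 3, block_size = 2, stride = 1 and
+    padding = 0 (as in the example below), output_size = 1 + (0 + 3 - 2 +
+    1 - 1) / 1 = 2, so each image expands to 2 * 2 = 4 time steps, and two
+    images together give 8 output rows.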
+
+    Args:
+        input (Variable): The input should be a tensor in NCHW format.
+
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.
+
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride. Default: stride = 1.
+
+        padding(int|tuple): The padding size. If padding is a tuple, it can
+            contain two integers like (padding_H, padding_W) which means
+            padding_up = padding_down = padding_H and
+            padding_left = padding_right = padding_W. Or it can use
+            (padding_up, padding_left, padding_down, padding_right) to indicate
+            paddings of four direction. Otherwise, a scalar padding means
+            padding_up = padding_down = padding_left = padding_right = padding
+            Default: padding = 0.
+
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        output: The output is a LoDTensor with shape
+        {input.batch_size * output_height * output_width,
+        filter_size_H * filter_size_W * input.channels}.
+        If we regard output as a matrix, each row of this matrix is
+        a step of a sequence.
+
+    Examples:
+
+    As an example:
+
+        .. code-block:: text
+
+            Given:
+
+            x = [[[[ 6.  2.  1.]
+                   [ 8.  3.  5.]
+                   [ 0.  2.  6.]]
+
+                  [[ 2.  4.  4.]
+                   [ 6.  3.  0.]
+                   [ 6.  4.  7.]]]
+
+                 [[[ 6.  7.  1.]
+                   [ 5.  7.  9.]
+                   [ 2.  4.  8.]]
+
+                  [[ 1.  2.  1.]
+                   [ 1.  3.  5.]
+                   [ 9.  0.  8.]]]]
+
+            x.dims = {2, 2, 3, 3}
+
+            And:
+
+            filter = [2, 2]
+            stride = [1, 1]
+            padding = [0, 0]
+
+            Then:
+
+            output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
+                           [ 2.  1.  3.  5.  4.  4.  3.  0.]
+                           [ 8.  3.  0.  2.  6.  3.  6.  4.]
+                           [ 3.  5.  2.  6.  3.  0.  4.  7.]
+                           [ 6.  7.  5.  7.  1.  2.  1.  3.]
+                           [ 7.  1.  7.  9.  2.  1.  3.  5.]
+                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
+                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]
+
+            output.dims = {8, 8}
+
+            output.lod = [[0, 4, 8]]
+
+        The simple usage is:
+
+        .. code-block:: python
+
+            output = fluid.layers.im2sequence(
+                input=layer, stride=[1, 1], filter_size=[2, 2])
+
+    """
+
+    if isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+    if len(padding) == 2:
+        padding.append(padding[0])
+        padding.append(padding[1])
+
+    helper = LayerHelper('im2sequence', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='im2sequence',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'kernels': filter_size,
+            'strides': stride,
+            'paddings': padding,
+        })
+    return out
+
+
+def row_conv(input, future_context_size, param_attr=None, act=None):
+    """Row Conv Operator. This layer will apply lookahead convolution to
+    **input**. The input variable should be a 2D LoDTensor with shape [T, D].
+    Parameters with shape [future_context_size + 1, D] will be created. The math
+    equation of row convolution is as follows:
+
+    .. math::
+        Out_{i} = \sum_{j = i} ^ {i + \\tau} X_{j} \odot W_{i - j}
+
+    In the above equation:
+
+    * :math:`Out_{i}`: The i-th row of output variable with shape [1, D].
+    * :math:`\\tau`: Future context size.
+    * :math:`X_{j}`: The j-th row of input variable with shape [1, D].
+    * :math:`W_{i-j}`: The (i-j)-th row of parameters with shape [1, D].
+
+    For more details about row_conv, please refer to the paper \
+    (http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf) and
+    the design document \
+    (https://github.com/PaddlePaddle/Paddle/issues/2228#issuecomment-303903645).
+
+    Args:
+        input (Variable): Input variable, a 2D LoDTensor with shape [T, D].
+        future_context_size (int): Future context size. Please note, the shape
+            of convolution kernel is [future_context_size + 1, D].
+        param_attr (ParamAttr): Attributes of parameters, including
+            name, initializer etc.
+        act (str): Non-linear activation to be applied to output variable.
+
+    Returns:
+        Variable: The output tensor with same shape as input tensor.
+
+    Examples:
+        .. code-block:: python
+
+            x = fluid.layers.data(name='x', shape=[16],
+                            dtype='float32', lod_level=1)
+            out = fluid.layers.row_conv(input=x, future_context_size=2)
+    """
+    helper = LayerHelper('row_conv', **locals())
+    dtype = helper.input_dtype()
+    filter_shape = [future_context_size + 1, input.shape[1]]
+    filter_param = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    out = helper.create_tmp_variable(dtype)
+    helper.append_op(
+        type='row_conv',
+        inputs={'X': [input],
+                'Filter': [filter_param]},
+        outputs={'Out': [out]})
+    return helper.append_activation(out)
+
+
+def multiplex(inputs, index):
+    """
+    **Multiplex Layer**
+
+    Referring to the given index variable, this layer selects rows from the
+    input variables to construct a multiplex variable. Assuming that there are
+    :math:`m` input variables and :math:`I_i` represents the i-th input
+    variable and :math:`i` is in [0, :math:`m`). All input variables are
+    tensors with same shape [:math:`d_0`, :math:`d_1`, ..., :math:`d_R`].
+    Please note that the rank of the input tensor should be at least 2. Each
+    input variable will be treated as a 2-D matrix with shape
+    [:math:`M`, :math:`N`] where :math:`M` is :math:`d_0` and :math:`N` is
+    :math:`d_1` * :math:`d_2` * ... * :math:`d_R`. Let :math:`I_i[j]` be the
+    j-th row of the i-th input
+    variable. The given index variable should be a 2-D tensor with shape
+    [:math:`M`, 1]. Let `ID[i]` be the i-th index value of the index variable.
+    Then the output variable will be a tensor with shape [:math:`d_0`,
+    :math:`d_1`, ..., :math:`d_R`]. If we treat the output tensor as a 2-D
+    matrix with shape [:math:`M`, :math:`N`] and let :math:`O[i]` be the i-th
+    row of the matrix, then `O[i]` is equal to :math:`I_{ID[i]}[i]`.
+
+    Args:
+       inputs (list): A list of variables to gather from. All variables have the
+                same shape and the rank is at least 2.
+       index (Variable): Tensor<int32>, index variable which is a 2-D tensor
+                with shape [M, 1] where M is the batch size.
+
+    Returns:
+        Variable: Multiplex variable gathered from input variables.
+
+    Examples:
+        .. code-block:: python
+
+            x1 = fluid.layers.data(name='x1', shape=[4], dtype='float32')
+            x2 = fluid.layers.data(name='x2', shape=[4], dtype='float32')
+            index = fluid.layers.data(name='index', shape=[1], dtype='int32')
+            out = fluid.layers.multiplex(inputs=[x1, x2], index=index)
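+
+            # conceptually, with index = [[1], [0]] the op selects
+            # out[0] = x2[0] and out[1] = x1[1], i.e. O[i] = I_{ID[i]}[i]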
+    """
+    helper = LayerHelper('multiplex', **locals())
+
+    if not isinstance(inputs, list) or len(inputs) < 2:
+        raise ValueError("inputs should be a list object and contains at least "
+                         "2 elements.")
+
+    out = helper.create_tmp_variable(inputs[0].dtype)
+    helper.append_op(
+        type='multiplex',
+        inputs={'X': inputs,
+                'Ids': index},
+        outputs={'Out': [out]})
+    return out
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
new file mode 100644
index 0000000000..ee3172c7b8
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -0,0 +1,66 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from layer_function_generator import generate_layer_fn
+
+__activations__ = [
+    'sigmoid',
+    'logsigmoid',
+    'exp',
+    'relu',
+    'tanh',
+    'tanh_shrink',
+    'softshrink',
+    'sqrt',
+    'abs',
+    'ceil',
+    'floor',
+    'round',
+    'reciprocal',
+    'log',
+    'square',
+    'softplus',
+    'softsign',
+    'brelu',
+    'leaky_relu',
+    'soft_relu',
+    'elu',
+    'relu6',
+    'pow',
+    'stanh',
+    'hard_shrink',
+    'thresholded_relu',
+    'hard_sigmoid',
+    'swish',
+]
+
+__all__ = [
+    'mean',
+    'mul',
+    'reshape',
+    'scale',
+    'sigmoid_cross_entropy_with_logits',
+    'elementwise_add',
+    'elementwise_div',
+    'elementwise_sub',
+    'elementwise_mul',
+    'elementwise_max',
+    'elementwise_min',
+    'elementwise_pow',
+    'clip',
+    'clip_by_norm',
+    'sequence_softmax',
+] + __activations__
+
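+# Each name above corresponds to a registered operator of the same name;
+# generate_layer_fn builds a thin Python layer function for each one, so no
+# hand-written wrapper is needed.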
+for _OP in set(__all__):
+    globals()[_OP] = generate_layer_fn(_OP)
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
new file mode 100644
index 0000000000..c435c5206d
--- /dev/null
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -0,0 +1,343 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ..layer_helper import LayerHelper
+from ..param_attr import ParamAttr
+from ..framework import convert_np_dtype_to_dtype_
+from ..framework import Variable
+from ..initializer import Constant
+from ..core import DataType
+import numpy
+
+__all__ = [
+    'create_tensor',
+    'create_parameter',
+    'create_global_var',
+    'cast',
+    'concat',
+    'sums',
+    'assign',
+    'fill_constant_batch_size_like',
+    'fill_constant',
+    'ones',
+    'zeros',
+]
+
+
+def create_tensor(dtype, name=None):
+    helper = LayerHelper("create_tensor", **locals())
+    return helper.create_variable(name=helper.name, dtype=dtype)
+
+
+def create_parameter(shape,
+                     dtype,
+                     attr=None,
+                     is_bias=False,
+                     default_initializer=None):
+    """
+    Create a parameter.
+
+    Args:
+        shape(list[int]): shape of the parameter
+        dtype(string): element type of the parameter
+        attr(ParamAttr): attributes of the parameter
+        is_bias(bool): This can affect which default initializer is chosen
+                       when default_initializer is None. If is_bias,
+                       initializer.Constant(0.0) will be used. Otherwise,
+                       Xavier() will be used.
+        default_initializer(Initializer): initializer for the parameter
+
+    Returns:
+        Parameter: the created parameter
+    """
+    helper = LayerHelper("create_parameter", **locals())
+    if attr is None:
+        attr = ParamAttr()
+    return helper.create_parameter(attr, shape, dtype, is_bias,
+                                   default_initializer)
+
+
+def create_global_var(shape, value, dtype, persistable=False, name=None):
+    helper = LayerHelper("global_var", **locals())
+    var = helper.create_global_variable(
+        dtype=dtype, shape=shape, persistable=persistable, name=name)
+    helper.set_variable_initializer(
+        var, initializer=Constant(value=float(value)))
+    return var
+
+
+def cast(x, dtype):
+    """
+    This layer casts the input `x` to the data type given by `dtype` and
+    returns the result as the output.
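+
+    Examples:
+        .. code-block:: python
+
+            # a minimal sketch: cast float32 data to int64
+            data = fluid.layers.data(name='x', shape=[13], dtype='float32')
+            result = fluid.layers.cast(x=data, dtype='int64')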
+    """
+    helper = LayerHelper('cast', **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='cast',
+        inputs={'X': [x]},
+        outputs={'Out': [out]},
+        attrs={'in_dtype': x.dtype,
+               'out_dtype': out.dtype})
+    return out
+
+
+def concat(input, axis=0):
+    """
+    **Concat**
+
+    This function concatenates the input along the axis mentioned
+    and returns that as the output.
+
+    Args:
+        input(list): List of tensors to be concatenated
+        axis(int): Integer axis along which the tensors will be concatenated
+
+    Returns:
+        Variable: Output variable of the concatenation
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    """
+    helper = LayerHelper('concat', **locals())
+    out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='concat',
+        inputs={'X': input},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def sums(input, out=None):
+    """This function performs the sum operation on the input and returns the
+    result as the output.
+
+    Args:
+        input (Variable|list): The input tensor that has the elements
+                               that need to be summed up.
+        out (Variable|None): The output variable. If None, a new variable is
+                             created to hold the result.
+
+    Returns:
+        Variable: The tensor type variable that has the sum of input
+                  written to it.
+
+    Examples:
+        .. code-block:: python
+
+          tmp = fluid.layers.zeros(shape=[10], dtype='int32')
+          i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=10)
+          a0 = layers.array_read(array=tmp, i=i)
+          i = layers.increment(x=i)
+          a1 = layers.array_read(array=tmp, i=i)
+          mean_a0 = layers.mean(x=a0)
+          mean_a1 = layers.mean(x=a1)
+          a_sum = layers.sums(input=[mean_a0, mean_a1])
+    """
+    helper = LayerHelper('sum', **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out})
+    return out
+
+
+def assign(input, output):
+    """
+    **Assign**
+
+    This function copies the *input* Variable to the *output* Variable.
+
+    Args:
+        input(Variable|numpy.ndarray): The source variable
+        output(Variable): The destination variable
+
+    Returns:
+        Variable: The destination variable that was supplied as the *output*.
+
+    Examples:
+        .. code-block:: python
+
+          out = fluid.layers.create_tensor(dtype='float32')
+          hidden = fluid.layers.fc(input=data, size=10)
+          fluid.layers.assign(hidden, out)
+    """
+    helper = LayerHelper('assign', **locals())
+    if isinstance(input, Variable):
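+        # copying a Variable is implemented as a scale-by-1.0 op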
+        helper.append_op(
+            type='scale',
+            inputs={'X': [input]},
+            outputs={'Out': [output]},
+            attrs={'scale': 1.0})
+    elif isinstance(input, numpy.ndarray):
+        dtype = convert_np_dtype_to_dtype_(input.dtype)
+        if dtype == DataType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in input.flat]
+        elif dtype == DataType.INT32:
+            value_name = "int32_values"
+            values = [int(v) for v in input.flat]
+        else:
+            raise ValueError("Unsupported dtype %s" % input.dtype)
+        if input.size > 1024 * 1024:
+            raise ValueError("The size of input is too big. Please consider "
+                             "saving it to file and 'load_op' to load it")
+
+        helper.append_op(
+            type='assign_value',
+            outputs={'Out': [output]},
+            attrs={
+                'dtype': dtype,
+                'shape': list(input.shape),
+                value_name: values
+            })
+    else:
+        raise ValueError("Wrong type for assign input: %s" % type(input))
+
+    return output
+
+
+def fill_constant(shape, dtype, value, force_cpu=False, out=None):
+    """
+    **fill_constant**
+
+    This function creates a tensor with specified `shape` and `dtype`, and
+    initializes it with a constant specified by `value`.
+
+    The attribute `stop_gradient` of the created tensor is set to True.
+
+    Args:
+        shape(tuple|list|None): Shape of the output tensor.
+        dtype(np.dtype|core.DataType|str): Data type of the output tensor.
+        value(float): The constant value used to initialize the output tensor.
+        force_cpu(bool): Whether to force the output tensor to be placed on CPU.
+        out(Variable): The output tensor. If None, a new one is created.
+
+    Returns:
+        Variable: The tensor variable storing the output.
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
+    """
+
+    helper = LayerHelper("fill_constant", **locals())
+    if out is None:
+        out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant',
+        inputs={},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'dtype': out.dtype,
+            'value': float(value),
+            'force_cpu': force_cpu
+        })
+    out.stop_gradient = True
+    return out
+
+
+def fill_constant_batch_size_like(input,
+                                  shape,
+                                  dtype,
+                                  value,
+                                  input_dim_idx=0,
+                                  output_dim_idx=0):
+    """
+    **fill_constant_batch_size_like**
+
+    This function creates a tensor of specified *shape*, *dtype* and batch size,
+    and initializes this with a constant supplied in *value*. The batch size is
+    obtained from the `input` tensor.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        input(Variable): Tensor whose dimensions will be used to get batch size
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+        value(float): Constant value to initialize the output tensor
+        input_dim_idx(int): Index of input's batch size dimension
+        output_dim_idx(int): Index of output's batch size dimension
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.fill_constant_batch_size_like(
+              input=like, shape=[1], value=0, dtype='int64')
+    """
+    helper = LayerHelper("fill_constant_batch_size_like", **locals())
+    out = helper.create_tmp_variable(dtype=dtype)
+    helper.append_op(
+        type='fill_constant_batch_size_like',
+        inputs={'Input': input},
+        outputs={'Out': [out]},
+        attrs={
+            'shape': shape,
+            'dtype': out.dtype,
+            'value': float(value),
+            'input_dim_idx': input_dim_idx,
+            'output_dim_idx': output_dim_idx
+        })
+    out.stop_gradient = True
+    return out
+
+
+def ones(shape, dtype):
+    """
+    **ones**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 1.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.ones(shape=[1], dtype='int64')
+    """
+    return fill_constant(value=1.0, **locals())
+
+
+def zeros(shape, dtype):
+    """
+    **zeros**
+
+    This function creates a tensor of specified *shape* and
+    *dtype*, and initializes this with 0.
+
+    It also sets *stop_gradient* to True.
+
+    Args:
+        shape(tuple|list|None): Shape of output tensor
+        dtype(np.dtype|core.DataType|str): Data type of output tensor
+
+    Returns:
+        Variable: The tensor variable storing the output
+
+    Examples:
+        .. code-block:: python
+
+          data = fluid.layers.zeros(shape=[1], dtype='int64')
+    """
+    return fill_constant(value=0.0, **locals())
diff --git a/python/paddle/v2/fluid/learning_rate_decay.py b/python/paddle/v2/fluid/learning_rate_decay.py
new file mode 100644
index 0000000000..96b3e9a0d7
--- /dev/null
+++ b/python/paddle/v2/fluid/learning_rate_decay.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import layers
+from framework import Variable
+
+__all__ = ['exponential_decay', 'natural_exp_decay', 'inverse_time_decay']
+"""
+When training a model, it is often useful to decay the learning rate during
+the training process; this is called learning_rate_decay. There are many
+strategies for this, and this module provides some classical methods. Users
+can also implement their own learning_rate_decay strategies following the
+examples in this module.
+"""
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies exponential decay to the learning rate.
+
+    ```python
+    decayed_learning_rate = learning_rate *
+            decay_rate ^ (global_step / decay_steps)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the current training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If True, decay the learning rate at discrete
+          intervals of decay_steps.
+
+    Returns:
+        The decayed learning rate
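+
+    Examples:
+        ```python
+        # a sketch, assuming `global_step` is a float32 counter Variable
+        # maintained by the training loop
+        global_step = fluid.layers.create_global_var(
+            shape=[1], value=0.0, dtype='float32', persistable=True)
+        lr = exponential_decay(learning_rate=0.1, global_step=global_step,
+                               decay_steps=10000, decay_rate=0.5,
+                               staircase=True)
+        ```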
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for exponential_decay.")
+
+    # update learning_rate
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * (decay_rate**div_res)
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    """Applies natural exponential decay to the initial learning rate.
+
+    ```python
+    if not staircase:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * (global_step / decay_steps))
+    else:
+        decayed_learning_rate = learning_rate * exp(- decay_rate * floor(global_step / decay_steps))
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the current training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If True, decay the learning rate at discrete
+          intervals of decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for natural_exp_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+    return learning_rate * layers.exp(x=(-1 * decay_rate * div_res))
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    """Applies inverse time decay to the initial learning rate.
+
+    ```python
+    if staircase:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * floor(global_step / decay_step))
+    else:
+      decayed_learning_rate = learning_rate / (1 + decay_rate * global_step / decay_step)
+    ```
+    Args:
+        learning_rate: A scalar float32 value or a Variable. This
+          will be the initial learning rate during training.
+        global_step: A Variable that records the current training step.
+        decay_steps: A Python `int32` number.
+        decay_rate: A Python `float` number.
+        staircase: Boolean. If True, decay the learning rate at discrete
+          intervals of decay_steps.
+
+    Returns:
+        The decayed learning rate
+    """
+    if not isinstance(global_step, Variable):
+        raise ValueError("global_step is required for inverse_time_decay.")
+
+    div_res = global_step / decay_steps
+    if staircase:
+        div_res = layers.floor(x=div_res)
+
+    return learning_rate / (1 + decay_rate * div_res)
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
new file mode 100644
index 0000000000..956c5b66da
--- /dev/null
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -0,0 +1,252 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+import framework
+from framework import Program, default_main_program, Parameter, Variable
+import backward
+from backward import _rename_arg_
+from . import core
+
+dtype_to_size = {
+    core.DataType.FP16: 2,
+    core.DataType.FP32: 4,
+    core.DataType.FP64: 8,
+    core.DataType.INT16: 2,
+    core.DataType.INT32: 4,
+    core.DataType.INT64: 8,
+    core.DataType.BOOL: 1
+}
+
+
+class ControlFlowGraph(object):
+    def __init__(self, program, ops, forward_num):
+        self._program = program
+        self._ops = ops
+        self._forward_num = forward_num
+        self._successors = defaultdict(set)
+        self._presuccessors = defaultdict(set)
+        self._uses = defaultdict(set)
+        self._defs = defaultdict(set)
+        self._live_in = defaultdict(set)
+        self._live_out = defaultdict(set)
+
+    def _add_connections(self, connections):
+        for node1, node2 in connections:
+            self._add(node1, node2)
+
+    def _add(self, node1, node2):
+        self._successors[node1].add(node2)
+        self._presuccessors[node2].add(node1)
+
+    def _build_graph(self):
+        self.op_size = len(self._ops)
+        op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
+        self._add_connections(op_node_connections)
+        for i in range(self.op_size):
+            self._uses[i].update(self._ops[i].input_arg_names())
+            self._defs[i].update(self._ops[i].output_arg_names())
+
+    def _update_graph(self, old_name, new_name, begin_idx=0):
+        for i in range(begin_idx, self.op_size):
+            if old_name in self._uses[i]:
+                self._uses[i].remove(old_name)
+                self._uses[i].add(new_name)
+            if old_name in self._defs[i]:
+                self._defs[i].remove(old_name)
+                self._defs[i].add(new_name)
+            if old_name in self._live_in[i]:
+                self._live_in[i].remove(old_name)
+                self._live_in[i].add(new_name)
+            if old_name in self._live_out[i]:
+                self._live_out[i].remove(old_name)
+                self._live_out[i].add(new_name)
+
+    def _reach_fixed_point(self, live_in, live_out):
+        if len(live_in) != len(self._live_in):
+            return False
+        if len(live_out) != len(self._live_out):
+            return False
+        for i in range(self.op_size):
+            if live_in[i] != self._live_in[i]:
+                return False
+        for i in range(self.op_size):
+            if live_out[i] != self._live_out[i]:
+                return False
+        return True
+
+    def _dataflow_analyze(self):
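+        # Standard backward liveness analysis: iterate
+        #     live_in[i]  = use[i] | (live_out[i] - def[i])
+        #     live_out[i] = union of live_in[s] over successors s of i
+        # until a fixed point is reached.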
+        self._build_graph()
+        live_in = defaultdict(set)
+        live_out = defaultdict(set)
+        while True:
+            for i in range(self.op_size):
+                live_in[i] = set(self._live_in[i])
+                live_out[i] = set(self._live_out[i])
+                self._live_in[i] = self._uses[i] | (
+                    self._live_out[i] - self._defs[i])
+                for s in self._successors[i]:
+                    self._live_out[i] |= self._live_in[s]
+
+            if self._reach_fixed_point(live_in, live_out):
+                break
+
+    def _get_diff(self, a, b):
+        u = a & b
+        return a - u, b - u
+
+    def _has_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.has_var(str(var_name))
+        else:
+            return block_desc.has_var_recursive(str(var_name))
+
+    def _find_var(self, block_desc, var_name, is_forward):
+        if is_forward:
+            return block_desc.find_var(str(var_name))
+        else:
+            return block_desc.find_var_recursive(str(var_name))
+
+    def memory_optimize(self):
+        def check_var_validity(block_desc, x, is_forward):
+            if str(x) == "@EMPTY@":
+                return False
+            if not self._has_var(block_desc, x, is_forward):
+                return False
+            if self._find_var(block_desc, x, is_forward).persistable():
+                return False
+            if self._find_var(
+                    block_desc, x,
+                    is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
+                return False
+            return True
+
+        self._build_graph()
+        self._dataflow_analyze()
+        self.pool = []
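+        # self.pool holds (var_name, shape) pairs whose memory has become
+        # free; a later variable with an identical shape and dtype may reuse
+        # that storage instead of allocating new memory.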
+        for i in range(self.op_size):
+            op = self._ops[i]
+            if op.type() == "while" or op.type() == "while_grad":
+                continue
+            block_desc = op.block()
+            is_forward = i < self._forward_num
+            if self.pool:
+                defs_can_optimize = filter(
+                    lambda x: check_var_validity(block_desc, x, is_forward),
+                    self._defs[i])
+                out_pair = [
+                    (x, self._find_var(block_desc, x, is_forward).shape())
+                    for x in defs_can_optimize
+                ]
+                for x, x_shape in out_pair:
+                    for index, cache_pair in enumerate(self.pool):
+                        cache_var = cache_pair[0]
+                        cache_shape = cache_pair[1]
+                        if x_shape == cache_shape:
+                            if self._has_var(block_desc, cache_var, is_forward):
+                                x_dtype = self._find_var(block_desc, x,
+                                                         is_forward).dtype()
+                                cache_dtype = self._find_var(
+                                    block_desc, cache_var, is_forward).dtype()
+                                # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
+                                # and dtype_to_size[cache_dtype]
+                                if x_dtype == cache_dtype:
+                                    print(("Hit Cache !!!! cache pool index "
+                                           "is %d, var name is %s, "
+                                           "cached var name is %s, "
+                                           "var shape is %s ") %
+                                          (index, x, cache_var,
+                                           str(cache_shape)))
+                                    self.pool.pop(index)
+                                    if x == cache_var:
+                                        break
+                                    _rename_arg_(
+                                        self._ops, x, cache_var, begin_idx=i)
+                                    self._program.block(block_desc.id).var(
+                                        str(x)).desc = self._find_var(
+                                            block_desc, cache_var, is_forward)
+                                    self._update_graph(
+                                        x, cache_var, begin_idx=i)
+                                    break
+
+            in_diff, out_diff = self._get_diff(self._live_in[i],
+                                               self._live_out[i])
+            can_optimize = filter(
+                lambda x: check_var_validity(block_desc, x, is_forward),
+                in_diff)
+            if can_optimize:
+                for var_name in can_optimize:
+                    self.pool.append((var_name, self._find_var(
+                        block_desc, var_name, is_forward).shape()))
+
+
+def get_cfgs(input_program):
+    ops_list = []
+    pdesc = input_program.get_desc()
+    block_desc = pdesc.block(0)
+    op_size = block_desc.op_size()
+    # Get global block ops
+    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+
+    while_sub_block_ids = []
+    while_grad_sub_block_ids = []
+    while_pair = []
+
+    for i in range(op_size):
+        op = block_desc.op(i)
+        if op.type() == "while":
+            while_sub_block_ids.append(op.attr("sub_block").id)
+        elif op.type() == "while_grad":
+            while_grad_sub_block_ids.append(op.attr("sub_block").id)
+
+    # Find while/while_grad block pair
+    for grad_id in while_grad_sub_block_ids:
+        parent_id = pdesc.block(grad_id).parent
+        if parent_id in while_sub_block_ids:
+            while_pair.append((parent_id, grad_id))
+            while_sub_block_ids.remove(parent_id)
+
+    # Get while/while_grad block ops
+    for parent_id, grad_id in while_pair:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        while_grad_block = pdesc.block(grad_id)
+        while_grad_block_op_size = while_grad_block.op_size()
+        for i in range(while_grad_block_op_size):
+            while_block_ops.append(while_grad_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    # Process rest while block ops
+    for parent_id in while_sub_block_ids:
+        while_block_ops = []
+        while_block = pdesc.block(parent_id)
+        while_block_op_size = while_block.op_size()
+        for i in range(while_block_op_size):
+            while_block_ops.append(while_block.op(i))
+
+        ops_list.append((while_block_ops, while_block_op_size))
+
+    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    return cfgs
+
+
+def memory_optimize(input_program):
+    cfgs = get_cfgs(input_program)
+    for cfg in cfgs:
+        cfg.memory_optimize()
diff --git a/python/paddle/v2/fluid/net_drawer.py b/python/paddle/v2/fluid/net_drawer.py
new file mode 100644
index 0000000000..9b126f5197
--- /dev/null
+++ b/python/paddle/v2/fluid/net_drawer.py
@@ -0,0 +1,127 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+try:
+    from graphviz import Digraph
+except ImportError:
+    logger.info(
+        'Cannot import graphviz, which is required for drawing a network. This '
+        'can usually be installed in python with "pip install graphviz". Also, '
+        'pydot requires graphviz to convert dot files to pdf: in ubuntu, this '
+        'can usually be installed with "sudo apt-get install graphviz".')
+    print('net_drawer will not run correctly. Please install the correct '
+          'dependencies.')
+    exit(0)
+
+OP_STYLE = {
+    'shape': 'oval',
+    'color': '#0F9D58',
+    'style': 'filled',
+    'fontcolor': '#FFFFFF'
+}
+
+VAR_STYLE = {}
+
+GRAPH_STYLE = {"rankdir": "TB", }
+
+GRAPH_ID = 0
+
+
+def unique_id():
+    global GRAPH_ID
+    GRAPH_ID += 1
+    return GRAPH_ID
+
+
+def draw_node(op):
+    node = dict(OP_STYLE)  # copy so the shared style dict is not mutated
+    node["name"] = op.type
+    node["label"] = op.type
+    return node
+
+
+def draw_edge(var_parent, op, var, arg):
+    edge = dict(VAR_STYLE)  # copy so the shared style dict is not mutated
+    edge["label"] = "%s(%s)" % (var.parameter, arg)
+    edge["head_name"] = op.type
+    edge["tail_name"] = var_parent[arg]
+    return edge
+
+
+def parse_graph(program, graph, var_dict, **kwargs):
+
+    # fill the known variables
+    for block in program.blocks:
+        for var in block.vars:
+            if var not in var_dict:
+                var_dict[var] = "Feed"
+
+    temp_id = 0
+    proto = framework_pb2.ProgramDesc.FromString(
+        program.desc.serialize_to_string())
+    for block in proto.blocks:
+        for op in block.ops:
+            op.type = op.type + "_" + str(temp_id)
+            temp_id += 1
+            graph.node(**draw_node(op))
+            for o in op.outputs:
+                for arg in o.arguments:
+                    var_dict[arg] = op.type
+            for e in op.inputs:
+                for arg in e.arguments:
+                    if arg in var_dict:
+                        graph.edge(**draw_edge(var_dict, op, e, arg))
+        break  # only plot the first block
+
+
+def draw_graph(startup_program, main_program, **kwargs):
+    if "graph_attr" in kwargs:
+        GRAPH_STYLE.update(kwargs["graph_attr"])
+    if "node_attr" in kwargs:
+        OP_STYLE.update(kwargs["node_attr"])
+    if "edge_attr" in kwargs:
+        VAR_STYLE.update(kwargs["edge_attr"])
+
+    graph_id = unique_id()
+    filename = kwargs.get("filename")
+    if filename is None:
+        filename = str(graph_id) + ".gv"
+    g = Digraph(
+        name=str(graph_id),
+        filename=filename,
+        graph_attr=GRAPH_STYLE,
+        node_attr=OP_STYLE,
+        edge_attr=VAR_STYLE,
+        **kwargs)
+
+    var_dict = {}
+    parse_graph(startup_program, g, var_dict)
+    parse_graph(main_program, g, var_dict)
+
+    if filename is not None:
+        g.save()
+    return g
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
new file mode 100644
index 0000000000..cb63d43709
--- /dev/null
+++ b/python/paddle/v2/fluid/nets.py
@@ -0,0 +1,338 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import layers
+
+__all__ = [
+    "simple_img_conv_pool",
+    "sequence_conv_pool",
+    "glu",
+    "scaled_dot_product_attention",
+]
+
+
+def simple_img_conv_pool(input,
+                         num_filters,
+                         filter_size,
+                         pool_size,
+                         pool_stride,
+                         act,
+                         param_attr=None,
+                         pool_type='max',
+                         use_cudnn=True):
+    conv_out = layers.conv2d(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        param_attr=param_attr,
+        act=act,
+        use_cudnn=use_cudnn)
+
+    pool_out = layers.pool2d(
+        input=conv_out,
+        pool_size=pool_size,
+        pool_type=pool_type,
+        pool_stride=pool_stride,
+        use_cudnn=use_cudnn)
+    return pool_out
+
+
+def img_conv_group(input,
+                   conv_num_filter,
+                   pool_size,
+                   conv_padding=1,
+                   conv_filter_size=3,
+                   conv_act=None,
+                   param_attr=None,
+                   conv_with_batchnorm=False,
+                   conv_batchnorm_drop_rate=0.0,
+                   pool_stride=1,
+                   pool_type=None,
+                   use_cudnn=True):
+    """
+    Image Convolution Group, used for VGG-style networks.
+    """
+    tmp = input
+    assert isinstance(conv_num_filter, list) or \
+        isinstance(conv_num_filter, tuple)
+
+    def __extend_list__(obj):
+        if not hasattr(obj, '__len__'):
+            return [obj] * len(conv_num_filter)
+        else:
+            return obj
+
+    conv_padding = __extend_list__(conv_padding)
+    conv_filter_size = __extend_list__(conv_filter_size)
+    param_attr = __extend_list__(param_attr)
+    conv_with_batchnorm = __extend_list__(conv_with_batchnorm)
+    conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate)
+
+    for i in xrange(len(conv_num_filter)):
+        local_conv_act = conv_act
+        if conv_with_batchnorm[i]:
+            local_conv_act = None
+
+        tmp = layers.conv2d(
+            input=tmp,
+            num_filters=conv_num_filter[i],
+            filter_size=conv_filter_size[i],
+            padding=conv_padding[i],
+            param_attr=param_attr[i],
+            act=local_conv_act,
+            use_cudnn=use_cudnn)
+
+        if conv_with_batchnorm[i]:
+            tmp = layers.batch_norm(input=tmp, act=conv_act)
+            drop_rate = conv_batchnorm_drop_rate[i]
+            if abs(drop_rate) > 1e-5:
+                tmp = layers.dropout(x=tmp, dropout_prob=drop_rate)
+
+    pool_out = layers.pool2d(
+        input=tmp,
+        pool_size=pool_size,
+        pool_type=pool_type,
+        pool_stride=pool_stride,
+        use_cudnn=use_cudnn)
+    return pool_out
+
+
+def sequence_conv_pool(input,
+                       num_filters,
+                       filter_size,
+                       param_attr=None,
+                       act="sigmoid",
+                       pool_type="max"):
+    conv_out = layers.sequence_conv(
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        param_attr=param_attr,
+        act=act)
+
+    pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type)
+    return pool_out
+
+
+def glu(input, dim=-1):
+    """
+    The Gated Linear Unit (GLU), composed of a split, a sigmoid activation and
+    an elementwise multiplication. Specifically, the input is split into two
+    equal-sized parts :math:`a` and :math:`b` along the given dimension, and
+    the output is then computed as follows:
+
+        .. math::
+
+            {GLU}(a, b)= a \otimes \sigma(b)
+
+    Refer to `Language Modeling with Gated Convolutional Networks
+    <https://arxiv.org/pdf/1612.08083.pdf>`_.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (int): The dimension along which to split. If :math:`dim < 0`, the
+            dimension to split along is :math:`rank(input) + dim`.
+
+    Returns:
+        Variable: The Tensor variable with half the size of the input along
+            the split dimension.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a Tensor variable with shape [3, 6, 9]
+            fluid.nets.glu(input=x, dim=1)  # shape of output: [3, 3, 9]
+    """
+
+    a, b = layers.split(input, num_or_sections=2, dim=dim)
+    act_b = layers.sigmoid(x=b)
+    out = layers.elementwise_mul(x=a, y=act_b)
+    return out
+
+
+def scaled_dot_product_attention(queries,
+                                 keys,
+                                 values,
+                                 num_heads=1,
+                                 dropout_rate=0.):
+    """
+    The dot-product attention.
+
+    Attention mechanism can be seen as mapping a query and a set of key-value
+    pairs to an output. The output is computed as a weighted sum of the values,
+    where the weight assigned to each value is computed by a compatibility
+    function (dot-product here) of the query with the corresponding key.
+
+    The dot-product attention can be implemented through (batch) matrix
+    multiplication as follows:
+
+        .. math::
+
+            Attention(Q, K, V)= softmax(QK^\mathrm{T})V
+
+    Refer to `Attention Is All You Need
+    <https://arxiv.org/pdf/1706.03762.pdf>`_.
+
+    Args:
+
+        queries (Variable): The input variable which should be a 3-D Tensor.
+        keys (Variable): The input variable which should be a 3-D Tensor.
+        values (Variable): The input variable which should be a 3-D Tensor.
+        num_heads (int): Head number to compute the scaled dot product
+                         attention. Default value is 1.
+        dropout_rate (float): The dropout rate to drop the attention weight.
+                              Default value is 0.
+
+    Returns:
+
+        Variable: A 3-D Tensor computed by multi-head scaled dot product
+                  attention.
+
+    Raises:
+
+        ValueError: If input queries, keys, values are not 3-D Tensors.
+
+    NOTE:
+        1. When num_heads > 1, three linear projections are learned to map the
+        input queries, keys and values into queries', keys' and values',
+        respectively. queries', keys' and values' have the same shapes as
+        queries, keys and values.
+
+        2. When num_heads == 1, scaled_dot_product_attention has no learnable
+        parameters.
+
+    Examples:
+        .. code-block:: python
+
+            # Suppose q, k, v are Tensors with the following shape:
+            # q: [3, 5, 9], k: [3, 6, 9], v: [3, 6, 10]
+
+            contexts = fluid.nets.scaled_dot_product_attention(q, k, v)
+            contexts.shape  # [3, 5, 10]
+    """
+    if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3):
+        raise ValueError(
+            "Inputs quries, keys and values should all be 3-D tensors.")
+
+    if queries.shape[-1] != keys.shape[-1]:
+        raise ValueError(
+            "The hidden size of queries and keys should be the same.")
+    if keys.shape[-2] != values.shape[-2]:
+        raise ValueError(
+            "The max sequence length in query batch and in key batch "
+            "should be the same.")
+    if keys.shape[-1] % num_heads != 0:
+        raise ValueError("The hidden size of keys (%d) must be divisible "
+                         "by the number of attention heads (%d)." %
+                         (keys.shape[-1], num_heads))
+    if values.shape[-1] % num_heads != 0:
+        raise ValueError("The hidden size of values (%d) must be divisible "
+                         "by the number of attention heads (%d)." %
+                         (values.shape[-1], num_heads))
+
+    def __compute_qkv(queries, keys, values, num_heads):
+        """
+        Add linear projection to queries, keys, and values.
+
+        Args:
+            queries(Tensor): a 3-D input Tensor.
+            keys(Tensor): a 3-D input Tensor.
+            values(Tensor): a 3-D input Tensor.
+            num_heads(int): The number of heads. Linearly project the inputs
+                            ONLY when num_heads > 1.
+
+        Returns:
+            Tensor: linearly projected output Tensors: queries', keys' and
+                    values'. They have the same shapes as queries, keys and
+                    values.
+        """
+
+        if num_heads == 1:
+            return queries, keys, values
+
+        q = layers.fc(input=queries, size=queries.shape[-1], num_flatten_dims=2)
+        k = layers.fc(input=keys, size=keys.shape[-1], num_flatten_dims=2)
+        v = layers.fc(input=values, size=values.shape[-1], num_flatten_dims=2)
+        return q, k, v
+
+    def __split_heads(x, num_heads):
+        """
+        Reshape the last dimension of input tensor x so that it becomes two
+        dimensions.
+
+        Args:
+            x(Tensor): a 3-D input Tensor.
+            num_heads(int): The number of heads.
+
+        Returns:
+            Tensor: a 4-D Tensor with shape
+                    [batch_size, num_heads, max_sequence_length, m/num_heads],
+                    where m is the size of the last dimension of x.
+        """
+        if num_heads == 1:
+            return x
+
+        hidden_size = x.shape[-1]
+        # reshape the 3-D input: [batch_size, max_sequence_length, hidden_dim]
+        # into a 4-D output:
+        # [batch_size, max_sequence_length, num_heads, hidden_size_per_head].
+        reshaped = layers.reshape(
+            x=x,
+            shape=list(x.shape[:-1]) + [num_heads, hidden_size // num_heads])
+
+        # permute the dimensions into:
+        # [batch_size, num_heads, max_sequence_len, hidden_size_per_head]
+        return layers.transpose(x=reshaped, perm=[0, 2, 1, 3])
+
+    def __combine_heads(x):
+        """
+        Reshape the last two dimensions of input tensor x so that it becomes
+        one dimension.
+
+        Args:
+            x(Tensor): a 4-D input Tensor with shape
+                       [bs, num_heads, max_sequence_length, hidden_dim].
+
+        Returns:
+            Tensor: a Tensor with shape
+                    [bs, max_sequence_length, num_heads * hidden_dim].
+        """
+
+        if len(x.shape) == 3: return x
+        if len(x.shape) != 4:
+            raise ValueError("Input(x) should be a 4-D Tensor.")
+
+        trans_x = layers.transpose(x, perm=[0, 2, 1, 3])
+        return layers.reshape(
+            x=trans_x,
+            shape=map(int, [
+                trans_x.shape[0], trans_x.shape[1],
+                trans_x.shape[2] * trans_x.shape[3]
+            ]))
+
+    q, k, v = __compute_qkv(queries, keys, values, num_heads)
+
+    q = __split_heads(q, num_heads)
+    k = __split_heads(k, num_heads)
+    v = __split_heads(v, num_heads)
+
+    key_dim_per_head = keys.shape[-1] // num_heads
+    scaled_q = layers.scale(x=q, scale=key_dim_per_head**-0.5)
+    product = layers.matmul(x=scaled_q, y=k, transpose_y=True)
+
+    weights = layers.reshape(
+        x=layers.reshape(
+            x=product, shape=[-1, product.shape[-1]], act="softmax"),
+        shape=product.shape)
+    if dropout_rate:
+        weights = layers.dropout(
+            weights, dropout_prob=dropout_rate, is_test=False)
+    ctx_multiheads = layers.matmul(weights, v)
+    return __combine_heads(ctx_multiheads)
diff --git a/python/paddle/v2/fluid/op.py b/python/paddle/v2/fluid/op.py
new file mode 100644
index 0000000000..f368e0c2d8
--- /dev/null
+++ b/python/paddle/v2/fluid/op.py
@@ -0,0 +1,281 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+
+
+def get_all_op_protos():
+    """
+    Get all registered op protos from the PaddlePaddle C++ end.
+    :return: A list of registered OpProto.
+    """
+    protostrs = core.get_all_op_protos()
+    ret_values = []
+    for pbstr in protostrs:
+        op_proto = framework_pb2.OpProto.FromString(str(pbstr))
+        ret_values.append(op_proto)
+    return ret_values
+
+
+def is_str(s):
+    return isinstance(s, str) or isinstance(s, unicode)
+
+
+class OpDescCreationMethod(object):
+    """
+    Convert the user's input (only keyword arguments are supported) to OpDesc
+    based on the OpProto.
+
+    :param op_proto: The OpProto object.
+    :type op_proto: op_proto_pb2.OpProto
+    """
+
+    def __init__(self, op_proto):
+        if not isinstance(op_proto, framework_pb2.OpProto):
+            raise TypeError(
+                "Type of op_proto should be OpProto in PaddlePaddle.")
+        self.__op_proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        """
+        Convert user's input to OpDesc. Only keyword arguments are supported.
+        :return: The OpDesc based on user input.
+        :rtype: op_desc_pb2.OpDesc
+        """
+        if len(args) != 0:
+            raise ValueError("Only keyword arguments are supported.")
+        op_desc = framework_pb2.OpDesc()
+        for input_parameter in self.__op_proto__.inputs:
+            input_arguments = kwargs.get(input_parameter.name, [])
+            if is_str(input_arguments):
+                input_arguments = [input_arguments]
+
+            if not input_parameter.duplicable and len(input_arguments) > 1:
+                raise ValueError(
+                    "Input %s expects only one input, but %d are given." %
+                    (input_parameter.name, len(input_arguments)))
+
+            ipt = op_desc.inputs.add()
+            ipt.parameter = input_parameter.name
+            ipt.arguments.extend(input_arguments)
+
+        for output_parameter in self.__op_proto__.outputs:
+            output_arguments = kwargs.get(output_parameter.name, [])
+            if is_str(output_arguments):
+                output_arguments = [output_arguments]
+
+            if not output_parameter.duplicable and len(output_arguments) > 1:
+                raise ValueError(
+                    "Output %s expects only one output, but %d are given." %
+                    (output_parameter.name, len(output_arguments)))
+
+            out = op_desc.outputs.add()
+            out.parameter = output_parameter.name
+            out.arguments.extend(output_arguments)
+
+        # Types
+        op_desc.type = self.__op_proto__.type
+
+        # Attrs
+        for attr in self.__op_proto__.attrs:
+            if attr.generated:
+                continue
+            user_defined_attr = kwargs.get(attr.name, None)
+            if user_defined_attr is not None:
+                new_attr = op_desc.attrs.add()
+                new_attr.name = attr.name
+                new_attr.type = attr.type
+                if attr.type == framework_pb2.INT:
+                    new_attr.i = user_defined_attr
+                elif attr.type == framework_pb2.FLOAT:
+                    new_attr.f = user_defined_attr
+                elif attr.type == framework_pb2.STRING:
+                    new_attr.s = user_defined_attr
+                elif attr.type == framework_pb2.BOOLEAN:
+                    new_attr.b = user_defined_attr
+                elif attr.type == framework_pb2.INTS:
+                    new_attr.ints.extend(user_defined_attr)
+                elif attr.type == framework_pb2.FLOATS:
+                    new_attr.floats.extend(user_defined_attr)
+                elif attr.type == framework_pb2.STRINGS:
+                    new_attr.strings.extend(user_defined_attr)
+                elif attr.type == framework_pb2.BOOLEANS:
+                    new_attr.bools.extend(user_defined_attr)
+                elif attr.type == framework_pb2.INT_PAIRS:
+                    for p in user_defined_attr:
+                        pair = new_attr.int_pairs.add()
+                        pair.first = p[0]
+                        pair.second = p[1]
+                else:
+                    raise NotImplementedError(
+                        "A not supported attribute type: %s." % (
+                            str(attr.type)))
+
+        return op_desc
+
+    @staticmethod
+    def any_is_true(generator):
+        """
+        Reduce a boolean array to a single boolean value. If any element in
+        the array is True, this function will return True, otherwise False.
+        """
+        for flag in generator:
+            if flag:
+                return True
+        return False
+
+
+class OpInfo(object):
+    def __init__(self, name, method, inputs, outputs, attrs):
+        self.name = name
+        self.method = method
+        self.inputs = inputs
+        self.outputs = outputs
+        self.attrs = attrs
+
+
+def create_op_creation_method(op_proto):
+    """
+    Generate op creation method for an OpProto.
+    """
+    method = OpDescCreationMethod(op_proto)
+
+    def __impl__(*args, **kwargs):
+        opdesc = method(*args, **kwargs)
+        return core.Operator.create(opdesc.SerializeToString())
+
+    return OpInfo(
+        method=__impl__,
+        name=op_proto.type,
+        inputs=[(var.name, var.duplicable) for var in op_proto.inputs],
+        outputs=[(var.name, var.duplicable) for var in op_proto.outputs],
+        attrs=[attr.name for attr in op_proto.attrs])
+
+
+class OperatorFactory(object):
+    def __init__(self):
+        self.op_methods = dict()
+
+        for op_proto in get_all_op_protos():
+            method = create_op_creation_method(op_proto)
+            self.op_methods[method.name] = method
+
+    def __call__(self, *args, **kwargs):
+        if "type" in kwargs:
+            if len(args) != 0:
+                raise ValueError(
+                    "Except the argument \"type\","
+                    "all of the other arguments should be keyword arguments.")
+            t = kwargs.pop("type")
+        else:
+            if len(args) != 1:
+                raise ValueError(
+                    "Except the argument \"type\","
+                    "all of the other arguments should be keyword arguments.")
+            t = args[0]
+
+        return self.get_op_info(t).method(**kwargs)
+
+    def types(self):
+        return self.op_methods.keys()
+
+    def get_op_info(self, t):
+        if t not in self.op_methods:
+            raise ValueError("The operator: %s is not registered." % t)
+        return self.op_methods.get(t)
+
+    def get_op_input_names(self, type):
+        return map(lambda x: x[0], self.get_op_info(type).inputs)
+
+    def get_op_inputs(self, type):
+        return self.get_op_info(type).inputs
+
+    def get_op_output_names(self, type):
+        return map(lambda x: x[0], self.get_op_info(type).outputs)
+
+    def get_op_outputs(self, type):
+        return self.get_op_info(type).outputs
+
+    def get_op_attr_names(self, type):
+        return self.get_op_info(type).attrs
+
+
+class __RecurrentOp__(object):
+    __proto__ = None
+    type = "recurrent"
+
+    def __init__(self):
+        # cache recurrent_op's proto
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        if self.type not in args and "type" not in kwargs:
+            kwargs["type"] = self.type
+        # create proto
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create rnnop
+        return core.RecurrentOp.create(proto.SerializeToString())
+
+
+class __DynamicRecurrentOp__(object):
+    __proto__ = None
+    type = "dynamic_recurrent"
+
+    def __init__(self):
+        # cache dynamic_recurrent_op's proto
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        if self.type not in args and "type" not in kwargs:
+            kwargs["type"] = self.type
+        # create proto
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create rnnop
+        return core.DynamicRecurrentOp.create(proto.SerializeToString())
+
+
+class __CondOp__(object):
+    __proto__ = None
+    type = "cond"
+
+    def __init__(self):
+        # cache cond_op's proto
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        if self.type not in args and "type" not in kwargs:
+            kwargs["type"] = self.type
+        # create proto
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create condop
+        return core.CondOp.create(proto.SerializeToString())
+
+
+Operator = OperatorFactory()  # The default global factory
+RecurrentOp = __RecurrentOp__()
+DynamicRecurrentOp = __DynamicRecurrentOp__()
+CondOp = __CondOp__()
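+
+# A minimal usage sketch of the default factory (illustrative; the "sgd"
+# operator and its input/output names below match those used elsewhere in
+# fluid, but any registered operator type works):
+#
+#     op = Operator(
+#         "sgd", Param="W", Grad="W@GRAD", LearningRate="lr", ParamOut="W")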
diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py
new file mode 100644
index 0000000000..7844a4e2df
--- /dev/null
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -0,0 +1,601 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from collections import defaultdict
+
+import framework
+import layers
+from backward import append_backward
+from framework import unique_name, program_guard
+from initializer import Constant
+from layer_helper import LayerHelper
+from regularizer import append_regularization_ops
+from clip import append_gradient_clip_ops, error_clip_callback
+
+__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
+
+
+class Optimizer(object):
+    """Optimizer Base class.
+
+    Defines the common interface of an optimizer.
+    Users should not use this class directly,
+    but should use one of its implementations instead.
+    """
+
+    def __init__(self, learning_rate, global_step=None, regularization=None):
+        assert learning_rate is not None
+        self._global_step = global_step
+        self.regularization = regularization
+        self._global_learning_rate = learning_rate
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra variables associated with the parameters
+        # to train. These variables are called accumulators.
+        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
+        self.helper = None
+
+    def _create_global_learning_rate(self):
+        if isinstance(self._global_learning_rate, float):
+            self._global_learning_rate = layers.create_global_var(
+                name=unique_name("learning_rate"),
+                shape=[1],
+                value=float(self._global_learning_rate),
+                dtype='float32',
+                persistable=True)
+
+        if not isinstance(self._global_learning_rate, framework.Variable):
+            raise ValueError("learning rate should be a Variable, "
+                             "actual type is %s",
+                             type(self._global_learning_rate))
+
+    @property
+    def global_learning_rate(self):
+        """
+        Get the global decayed learning rate.
+        :return: the global learning rate
+        """
+        return self._global_learning_rate
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """ append optimize operator to block and return all the added optimize_op
+        """
+        raise NotImplementedError()
+
+    def _create_param_lr(self, param_and_grad):
+        # create learning rate variable for every parameter
+        param = param_and_grad[0]
+        param_lr = param.optimize_attr['learning_rate']
+        return self._global_learning_rate * param_lr
+
+    def _create_accumulators(self, block, parameters):
+        """Create all accumulators needed by the parameters
+
+        Args:
+            block: the block in which the loss variable is present
+            parameters: list of parameter variables for the optimizer
+        """
+        pass
+
+    def _finish_update(self, block):
+        """Finish any custom updates needed
+           before completing an optimization step
+
+        Args:
+            block: the block in which the loss variable is present
+
+        Returns:
+            list of finish ops or None
+        """
+        pass
+
+    def _add_accumulator(self, name, param, dtype=None, fill_value=0.0):
+        """Utility function to add an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be added
+            dtype: data type of the accumulator variable
+            fill_value: value to initialize the accumulator variable
+        """
+        if (name in self._accumulators and
+                param.name in self._accumulators[name]):
+            raise Exception("Accumulator {} already exists for parameter {}".
+                            format(name, param.name))
+
+        assert isinstance(self.helper, LayerHelper)
+        var = self.helper.create_global_variable(
+            name=unique_name(name),
+            persistable=True,
+            dtype=dtype or param.dtype,
+            type=param.type,
+            shape=param.shape)
+        self.helper.set_variable_initializer(
+            var, initializer=Constant(value=float(fill_value)))
+        self._accumulators[name][param.name] = var
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if (name not in self._accumulators or
+                param.name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def _increment_global_step(self, block):
+        """Increment the global step by 1 after every iteration
+
+        Args:
+            block: the block in which the loss variable is present
+
+        Returns:
+            list with global_step increment op as its only element
+        """
+        assert isinstance(block, framework.Block)
+        assert self._global_step is not None
+        # create the increment op
+        increment_op = block.append_op(
+            type="increment",
+            inputs={"X": self._global_step},
+            outputs={"Out": self._global_step},
+            attrs={"step": 1.0})
+
+        return increment_op
+
+    def create_optimization_pass(self,
+                                 parameters_and_grads,
+                                 loss,
+                                 startup_program=None):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pairs to update.
+          loss: the target that this optimization is for.
+          startup_program: the startup program in which newly created
+            variables are initialized.
+
+        Returns:
+          return_op_list: a list of operators that will complete one step of
+          optimization. This will include parameter update ops, global step
+          update ops and any other custom ops required by subclasses to manage
+          their internal state.
+        """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        # _initialize_tensors method. The subclass can extend the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters and extend _finish_update method to add custom ops.
+
+        # Create any accumulators
+        program = loss.block.program
+        with program_guard(program, startup_program):
+            self.helper = LayerHelper(self.__class__.__name__)
+            self._create_accumulators(loss.block,
+                                      [p[0] for p in parameters_and_grads])
+            self._create_global_learning_rate()
+
+            optimize_ops = []
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[0].trainable is True and param_and_grad[
+                        1] is not None:
+                    optimize_op = self._append_optimize_op(loss.block,
+                                                           param_and_grad)
+                    optimize_ops.append(optimize_op)
+
+            # Returned list of ops can include more ops in addition
+            # to optimization ops
+            return_ops = optimize_ops
+
+            # Get custom finish ops for subclasses
+            # FIXME: Need to fix this once we figure out how to handle dependencies
+            finish_ops = self._finish_update(loss.block)
+            if finish_ops is not None:
+                return_ops += finish_ops
+
+            if self._global_step is not None:
+                return_ops.append(self._increment_global_step(loss.block))
+            return return_ops
+
+    def minimize(self,
+                 loss,
+                 startup_program=None,
+                 parameter_list=None,
+                 no_grad_set=None):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines the interfaces `append_backward()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                       error_clip_callback)
+
+        params_grads = append_gradient_clip_ops(params_grads)
+
+        # Add regularization if any
+        params_grads = append_regularization_ops(params_grads,
+                                                 self.regularization)
+
+        optimize_ops = self.create_optimization_pass(params_grads, loss,
+                                                     startup_program)
+        return optimize_ops, params_grads
+
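+# A minimal usage sketch (illustrative; assumes a loss variable `avg_cost`
+# has been built beforehand):
+#
+#     sgd_optimizer = SGDOptimizer(learning_rate=0.001)
+#     opts, params_grads = sgd_optimizer.minimize(avg_cost)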
+
+class SGDOptimizer(Optimizer):
+    """ Simple SGD optimizer without any state.
+    """
+
+    def __init__(self, learning_rate, **kwargs):
+        assert learning_rate is not None
+        super(SGDOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "sgd"
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        # create the optimize op
+        sgd_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={"ParamOut": param_and_grad[0]})
+
+        return sgd_op
+
+
+class MomentumOptimizer(Optimizer):
+    """Simple Momentum optimizer with velocity state
+    """
+    _velocity_acc_str = "velocity"
+
+    def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(MomentumOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "momentum"
+        self._momentum = momentum
+        self._use_nesterov = bool(use_nesterov)
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._velocity_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the momentum optimize op
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={"mu": self._momentum,
+                   "use_nesterov": self._use_nesterov})
+
+        return momentum_op
+
+
+class AdagradOptimizer(Optimizer):
+    """Simple Adagrad optimizer with moment state
+    """
+    _moment_acc_str = "moment"
+
+    def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs):
+        assert learning_rate is not None
+        assert epsilon is not None
+        super(AdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adagrad"
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment_acc = self._get_accumulator(self._moment_acc_str,
+                                           param_and_grad[0])
+
+        # Create the adagrad optimizer op
+        adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={"ParamOut": param_and_grad[0],
+                     "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon})
+
+        return adagrad_op
+
+
+class AdamOptimizer(Optimizer):
+    """Implements the Adam Optimizer
+    """
+    _moment1_acc_str = "moment1"
+    _moment2_acc_str = "moment2"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 **kwargs):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(AdamOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adam"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        # Create beta1 and beta2 power tensors
+        beta_shape = [1]
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta1_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
+
+        self._beta2_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta2_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+
+        self.helper.set_variable_initializer(
+            self._beta2_pow_acc, initializer=Constant(self._beta2))
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(self._moment1_acc_str, p)
+            self._add_accumulator(self._moment2_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        # create the adam optimize op
+        adam_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "Moment1": moment1,
+                "Moment2": moment2,
+                "Beta1Pow": self._beta1_pow_acc,
+                "Beta2Pow": self._beta2_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "Moment1Out": moment1,
+                "Moment2Out": moment2
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            })
+
+        return adam_op
+
+    def _finish_update(self, block):
+        """Update Beta1 and Beta2 Power accumulators
+        """
+        assert isinstance(block, framework.Block)
+        main_block = block.program.global_block()
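+        # beta1_pow_acc holds beta1^t; scaling it by beta1 each step advances
+        # it to the next power (likewise for beta2_pow_acc below).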
+        scale_beta1 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        scale_beta2 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta2_pow_acc},
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"scale": self._beta2})
+
+        return [scale_beta1, scale_beta2]
+
+
+class AdamaxOptimizer(Optimizer):
+    """Implements the Adamax Optimizer
+    """
+    _moment_acc_str = "moment"
+    _inf_norm_acc_str = "inf_norm"
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8,
+                 **kwargs):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(AdamaxOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "adamax"
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        # Create beta1 power accumulator tensor
+        beta_shape = [1]
+        self._beta1_pow_acc = self.helper.create_global_variable(
+            name=unique_name('beta1_pow_acc'),
+            dtype='float32',
+            shape=beta_shape,
+            lod_level=0,
+            persistable=True)
+        self.helper.set_variable_initializer(
+            self._beta1_pow_acc, initializer=Constant(self._beta1))
+
+        # Create accumulator tensors for first moment and infinity norm
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+            self._add_accumulator(self._inf_norm_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0])
+        inf_norm = self._get_accumulator(self._inf_norm_acc_str,
+                                         param_and_grad[0])
+        # create the adamax optimize op
+        adamax_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "Moment": moment,
+                "InfNorm": inf_norm,
+                "Beta1Pow": self._beta1_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "MomentOut": moment,
+                "InfNormOut": inf_norm
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            })
+
+        return adamax_op
+
+    def _finish_update(self, block):
+        """Update Beta1 Power accumulator
+        """
+        assert isinstance(block, framework.Block)
+        main_block = block.program.global_block()
+        scale_beta1 = main_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        return [scale_beta1]
+
+
+class DecayedAdagradOptimizer(Optimizer):
+    """Simple Decayed Adagrad optimizer with moment state
+    """
+    _moment_acc_str = "moment"
+
+    def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs):
+        assert learning_rate is not None
+        assert decay is not None
+        assert epsilon is not None
+
+        super(DecayedAdagradOptimizer, self).__init__(
+            learning_rate=learning_rate, **kwargs)
+        self.type = "decayed_adagrad"
+        self._decay = decay
+        self._epsilon = epsilon
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(self._moment_acc_str, p)
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment_acc = self._get_accumulator(self._moment_acc_str,
+                                           param_and_grad[0])
+
+        # Create the decayed adagrad optimizer op
+        decayed_adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={"ParamOut": param_and_grad[0],
+                     "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon})
+
+        return decayed_adagrad_op
+
+
+# We shorten the class names, since users will use the optimizer with the
+# package name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# sgd = fluid.optimizer.SGD(...)
+#
+# There is no need to add an `Optimizer` suffix to the class name.
+SGD = SGDOptimizer
+Momentum = MomentumOptimizer
+Adagrad = AdagradOptimizer
+Adam = AdamOptimizer
+Adamax = AdamaxOptimizer
+DecayedAdagrad = DecayedAdagradOptimizer
diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py
new file mode 100644
index 0000000000..fc566b8a24
--- /dev/null
+++ b/python/paddle/v2/fluid/param_attr.py
@@ -0,0 +1,104 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from initializer import Initializer, Xavier, Constant
+from regularizer import WeightDecayRegularizer
+
+__all__ = [
+    'ParamAttr',
+    'WeightNormParamAttr',
+]
+
+
+class ParamAttr(object):
+    def __init__(self,
+                 name=None,
+                 initializer=None,
+                 learning_rate=1.0,
+                 regularizer=None,
+                 trainable=True,
+                 gradient_clip=None):
+        self.name = name
+        self.initializer = initializer
+        self.learning_rate = learning_rate
+        self.regularizer = regularizer
+        self.trainable = trainable
+        self.gradient_clip = gradient_clip
+
+    def set_default_initializer(self, initializer):
+        if initializer is None:
+            if self.initializer is None:
+                raise ValueError("ParamAttr.initializer is not set")
+            return
+
+        if self.initializer is not None:
+            return
+
+        self.initializer = initializer
+
+    def set_default_param_initializer(self):
+        self.set_default_initializer(Xavier())
+
+    def set_default_bias_initializer(self):
+        self.set_default_initializer(Constant(0.0))
+
+    @staticmethod
+    def to_attr(arg):
+        if arg is None:
+            return ParamAttr()
+        elif isinstance(arg, list) or isinstance(arg, tuple):
+            return [ParamAttr.to_attr(a) for a in arg]
+        elif isinstance(arg, ParamAttr):
+            return arg
+        elif isinstance(arg, str) or isinstance(arg, unicode):
+            return ParamAttr(name=arg)
+        elif isinstance(arg, Initializer):
+            return ParamAttr(initializer=arg)
+        elif isinstance(arg, WeightDecayRegularizer):
+            return ParamAttr(regularizer=arg)
+        elif isinstance(arg, bool):
+            return ParamAttr.to_attr(None) if arg else False
+        else:
+            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+
+    def to_kwargs(self, with_initializer=False):
+        kwargs = {
+            'name': self.name,
+            'optimize_attr': {
+                'learning_rate': self.learning_rate
+            },
+            'regularizer': self.regularizer,
+            'trainable': self.trainable,
+            'gradient_clip_attr': self.gradient_clip
+        }
+        if with_initializer:
+            kwargs['initializer'] = self.initializer
+        return kwargs
+
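+# A sketch of the conversions ParamAttr.to_attr performs (inputs illustrative):
+#
+#     ParamAttr.to_attr("fc_w")      # -> ParamAttr(name="fc_w")
+#     ParamAttr.to_attr(Xavier())    # -> ParamAttr(initializer=Xavier())
+#     ParamAttr.to_attr(["w", "b"])  # -> [ParamAttr(name="w"), ...]
+#     ParamAttr.to_attr(False)       # -> False (no attribute)
+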
+
+class WeightNormParamAttr(ParamAttr):
+    """
+    Used for weight normalization. Any field in ParamAttr can also be set here.
+    Besides, an extra field `dim` can be set to indicate the dimension that is
+    excluded when computing the norm (the parameter is normalized over all the
+    other dimensions).
+    """
+    # List to record the parameters reparameterized by weight normalization.
+    # Since these parameters are treated as Variables rather than Parameters,
+    # this list can be used to discriminate them and to help serialize these
+    # parameters for inference.
+    params_with_weight_norm = []
+
+    def __init__(self, dim=None, **kwargs):
+        super(WeightNormParamAttr, self).__init__(**kwargs)
+        self.dim = dim
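+
+# A hypothetical usage sketch (the layer, sizes and names are illustrative):
+#
+#     fc_w = WeightNormParamAttr(dim=None, name="fc_w")
+#     hidden = layers.fc(input=data, size=128, param_attr=fc_w)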
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
new file mode 100644
index 0000000000..51c1c8aa70
--- /dev/null
+++ b/python/paddle/v2/fluid/profiler.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+from contextlib import contextmanager
+import os
+
+__all__ = ['CudaProfiler']
+
+NVPROF_CONFIG = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+
+
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This function is used to profile CUDA programs via the CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kvp' or 'csv'.
+        config (list of string) : The profiler options and counters; refer to
+            the "Compute Command Line Profiler User Guide" for details.
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    config = NVPROF_CONFIG if config is None else config
+    config_file = 'nvprof_config_file'
+    with open(config_file, 'wb') as fp:
+        fp.writelines(["%s\n" % item for item in config])
+    core.nvprof_init(output_file, output_mode, config_file)
+    # Enables profiler collection by the active CUDA profiling tool.
+    core.nvprof_start()
+    yield
+    # Disables profiler collection.
+    core.nvprof_stop()
+    os.remove(config_file)
+
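+# A usage sketch (illustrative; assumes an executor `exe` and a program have
+# been built elsewhere):
+#
+#     with cuda_profiler("cuda_profile.txt", output_mode='csv'):
+#         for i in range(num_iters):
+#             exe.run(fluid.default_main_program())
+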
+
+def reset_profiler():
+    """The profiler clear interface.
+    reset_profiler will clear the previous time records.
+    """
+    core.reset_profiler()
+
+
+@contextmanager
+def profiler(state, sorted_key=None):
+    """The profiler interface.
+    Different from cuda_profiler, this profiler can be used to profile both CPU
+    and GPU programs. By default, it records the CPU and GPU operator kernels;
+    to profile other parts of a program, refer to the profiling tutorial
+    to add more records.
+
+    Args:
+        state (string) : The profiling state, which should be 'CPU' or 'GPU',
+            telling the profiler to use CPU timer or GPU timer for profiling.
+            Although users may have already specified the execution place
+            (CPUPlace/CUDAPlace) in the beginning, for flexibility the profiler
+            would not inherit this place.
+        sorted_key (string) : If None, the profiling results will be printed
+            in the order of first end time of events. Otherwise, the profiling
+            results will be sorted by this flag. This flag should be one
+            of 'calls', 'total', 'max', 'min' or 'ave'.
+            The `calls` means sorting by the number of calls.
+            The `total` means sorting by the total execution time.
+            The `max` means sorting by the maximum execution time.
+            The `min` means sorting by the minimum execution time.
+            The `ave` means sorting by the average execution time.
+    """
+
+    if state not in ['CPU', 'GPU']:
+        raise ValueError("The state must be 'CPU' or 'GPU'.")
+    prof_state = (core.ProfilerState.kCUDA
+                  if state == "GPU" else core.ProfilerState.kCPU)
+    core.enable_profiler(prof_state)
+    yield
+
+    sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or one of 'calls', "
+                         "'total', 'max', 'min' and 'ave'.")
+    key_map = {
+        'default': core.EventSortingKey.kDefault,
+        'calls': core.EventSortingKey.kCalls,
+        'total': core.EventSortingKey.kTotal,
+        'max': core.EventSortingKey.kMax,
+        'min': core.EventSortingKey.kMin,
+        'ave': core.EventSortingKey.kAve,
+    }
+    # TODO(qingqing) : redirect C++ ostream to Python stream.
+    # with core.ostream_redirect(stdout=True, stderr=True):
+    core.disable_profiler(key_map[sorted_key])
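+
+# A usage sketch (illustrative; assumes an executor `exe` and a program have
+# been built elsewhere):
+#
+#     with profiler('GPU', sorted_key='total'):
+#         for i in range(num_iters):
+#             exe.run(fluid.default_main_program())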
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
new file mode 100644
index 0000000000..0273da647a
--- /dev/null
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -0,0 +1,188 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import framework
+
+__all__ = [
+    'append_regularization_ops',
+    'L1Decay',
+    'L2Decay',
+]
+
+
+def append_regularization_ops(parameters_and_grads, regularization=None):
+    """Create and add backward regularization Operators
+
+    Creates and adds backward regularization operators in the BlockDesc.
+    This will add gradients of the regularizer function to the gradients
+    of the parameters and return these modified gradients. This is the
+    same as implementing weight decay in optimizers for regularization.
+
+    Args:
+        parameters_and_grads: A list of (parameters, gradients) pairs
+                              that need to be regularized.
+        regularization: A global regularizer. It is applied to a parameter
+                        only when the parameter has no regularizer of its own.
+
+    Returns:
+        list of (parameters, gradients) pair with the regularized gradient
+
+    Raises:
+        Exception: Unknown regularization type
+    """
+    params_and_grads = []
+    for param, grad in parameters_and_grads:
+        regularization_term = None
+        if param.regularizer is not None:
+            # Add variable for regularization term in grad block
+            regularization_term = param.regularizer(param, grad.block)
+        elif regularization is not None:
+            regularization_term = regularization(param, grad.block)
+
+        # If no gradient or no regularization specified,
+        # then we don't need to do anything
+        if grad is None or regularization_term is None:
+            params_and_grads.append((param, grad))
+            continue
+
+        assert grad.shape == regularization_term.shape
+
+        grad.block.append_op(
+            type='elementwise_add',
+            inputs={"X": grad,
+                    "Y": regularization_term},
+            outputs={"Out": grad})
+        params_and_grads.append((param, grad))
+
+    return params_and_grads
+
+
+class WeightDecayRegularizer(object):
+    """Base class for weight decay regularizers
+
+    Defines the common interface of weight-decay regularizers.
+    Weight-decay regularizers are added only during the backward
+    pass for faster regularization. They add operations to the network
+    that correspond to gradient of the regularization function.
+    Users should not use this class directly, but need to use one
+    of its implementations
+    """
+
+    def __init__(self):
+        pass
+
+    def __call__(self, param, block):
+        """Add corresponding weight decay operations to the network
+        """
+        raise NotImplementedError()
+
+    def __str__(self):
+        """Debug string
+        """
+        raise NotImplementedError()
+
+
+class L2DecayRegularizer(WeightDecayRegularizer):
+    """Implements the L2 Weight Decay Regularization
+    """
+
+    def __init__(self, regularization_coeff=0.0):
+        assert regularization_coeff is not None
+        super(L2DecayRegularizer, self).__init__()
+        self._regularization_coeff = regularization_coeff
+
+    def __call__(self, param, block):
+        """Add L2 weight decay ops to network
+
+        Adds L2 weight decay ops.
+        L2WeightDecay = reg_coeff * parameter
+
+        Args:
+            param: parameter variable for which regularization is applied
+            block: block in which variable is to be created
+
+        Returns:
+            new variable for weight decay
+        """
+        assert isinstance(param, framework.Parameter)
+        assert isinstance(block, framework.Block)
+        decay = block.create_var(
+            dtype="float32", shape=param.shape, lod_level=param.lod_level)
+        # Append Op to calculate decay
+        block.append_op(
+            type='scale',
+            inputs={"X": param},
+            outputs={"Out": decay},
+            attrs={"scale": self._regularization_coeff})
+
+        return decay
+
+    def __str__(self):
+        return "L2Decay, regularization_coeff=%f" % self._regularization_coeff
+
+
+class L1DecayRegularizer(WeightDecayRegularizer):
+    """Implements the L1 Weight Decay Regularization
+    """
+
+    def __init__(self, regularization_coeff=0.0):
+        assert regularization_coeff is not None
+        super(L1DecayRegularizer, self).__init__()
+        self._regularization_coeff = regularization_coeff
+
+    def __call__(self, param, block):
+        """Add L1 weight decay ops to network
+
+        Adds L1 weight decay ops.
+        L1WeightDecay = reg_coeff * sign(parameter)
+
+        Args:
+            param: parameter variable for which regularization is applied
+            block: block in which variable is to be created
+
+        Returns:
+            new variable for weight decay
+        """
+        assert isinstance(param, framework.Parameter)
+        assert isinstance(block, framework.Block)
+        decay = block.create_var(
+            dtype="float32", shape=param.shape, lod_level=param.lod_level)
+        # Append sign op
+        block.append_op(
+            type='sign', inputs={"X": param}, outputs={"Out": decay})
+
+        # Append scale op to the output of sign op
+        block.append_op(
+            type='scale',
+            inputs={"X": decay},
+            outputs={"Out": decay},
+            attrs={"scale": self._regularization_coeff})
+
+        return decay
+
+    def __str__(self):
+        return "L1Decay, regularization_coeff=%f" % self._regularization_coeff
+
+
+# We shorten the class names, since users will use the regularizer with the
+# package name. The sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=fluid.regularizer.L2Decay(1e-4))
+#
+# There is no need to add a `Regularizer` suffix to the class name.
+L1Decay = L1DecayRegularizer
+L2Decay = L2DecayRegularizer
diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore
new file mode 100644
index 0000000000..62f82151eb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/.gitignore
@@ -0,0 +1,4 @@
+image/
+fit_a_line.model/
+tmp
+cuda_profiler.txt
diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
new file mode 100644
index 0000000000..628ce60b40
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -0,0 +1,14 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+if(NOT WITH_DISTRIBUTE)
+    list(REMOVE_ITEM TEST_OPS test_recv_op)
+endif(NOT WITH_DISTRIBUTE)
+
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
+
+add_subdirectory(book)
+add_subdirectory(book_distribute)
+add_subdirectory(book_memory_optimization)
diff --git a/v1_api_demo/model_zoo/resnet/example/__init__.py b/python/paddle/v2/fluid/tests/__init__.py
similarity index 89%
rename from v1_api_demo/model_zoo/resnet/example/__init__.py
rename to python/paddle/v2/fluid/tests/__init__.py
index f662d68263..b94a21a7e4 100644
--- a/v1_api_demo/model_zoo/resnet/example/__init__.py
+++ b/python/paddle/v2/fluid/tests/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
new file mode 100644
index 0000000000..dda02c03fd
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -0,0 +1,35 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
+py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
+py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+py_test(test_recognize_digits_mlp_cpu
+  SRCS test_recognize_digits.py
+  ARGS mlp)
+py_test(test_recognize_digits_mlp_cuda
+  SRCS test_recognize_digits.py
+  ARGS mlp --use_cuda)
+py_test(test_recognize_digits_conv_cpu
+  SRCS test_recognize_digits.py
+  ARGS conv)
+py_test(test_recognize_digits_conv_cuda
+  SRCS test_recognize_digits.py
+  ARGS conv --use_cuda)
+py_test(test_recognize_digits_mlp_cpu_parallel
+  SRCS test_recognize_digits.py
+  ARGS mlp --parallel)
+py_test(test_recognize_digits_mlp_cuda_parallel
+  SRCS test_recognize_digits.py
+  ARGS mlp --use_cuda --parallel)
+py_test(test_recognize_digits_conv_cpu_parallel
+  SRCS test_recognize_digits.py
+  ARGS conv --parallel)
+py_test(test_recognize_digits_conv_cuda_parallel
+  SRCS test_recognize_digits.py
+  ARGS conv --use_cuda --parallel)
+
+# register the remaining tests with the default configuration
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/v1_api_demo/gan/data/download_cifar.sh b/python/paddle/v2/fluid/tests/book/__init__.py
old mode 100755
new mode 100644
similarity index 73%
rename from v1_api_demo/gan/data/download_cifar.sh
rename to python/paddle/v2/fluid/tests/book/__init__.py
index bbadc7c10c..b94a21a7e4
--- a/v1_api_demo/gan/data/download_cifar.sh
+++ b/python/paddle/v2/fluid/tests/book/__init__.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
-wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
-tar zxf cifar-10-python.tar.gz
-rm cifar-10-python.tar.gz
diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
new file mode 100644
index 0000000000..0b954c60b6
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
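+    # save and immediately reload the persistable variables each pass,
+    # presumably just to exercise the io API during training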
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+        print(avg_loss_value)
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # if the avg cost is less than 10.0, we consider the code to be working.
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
new file mode 100644
index 0000000000..30582a21d0
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
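+    # the classic CIFAR-10 ResNet stacks three stages of n two-conv basic
+    # blocks on top of the first conv layer, so depth = 6n + 2 must hold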
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+classdim = 10
+data_shape = [3, 32, 32]
+
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+opts = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 128
+PASS_NUM = 1
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+exe.run(fluid.default_startup_program())
+
+for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
+    for data in train_reader():
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc))
+        # this model is slow; if we can train on a mini-batch successfully, we consider the code to work properly.
+        exit(0)
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
new file mode 100644
index 0000000000..f85768de99
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -0,0 +1,222 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
+import time
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 10
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
+    # 8 input features: the word, five context words (ctx_n2..ctx_p2), the
+    # predicate, and the predicate-region mark
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
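+    # Pack a batch of variable-length sequences into one flat LoDTensor. The
+    # level-of-detail (LoD) list stores cumulative offsets: sequences of
+    # lengths [2, 3, 1] yield lod = [0, 2, 5, 6].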
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
+    avg_cost = fluid.layers.mean(x=crf_cost)
+
+    # TODO(qiao)
+    # check other optimizers and check why the output can become NaN
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    sgd_optimizer.minimize(avg_cost)
+
+    # TODO(qiao)
+    # add dependency tracking and move this config before the optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
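+    # IOB labels come in B-/I- pairs plus a single O label, so the number of
+    # chunk types is (label_dict_len - 1) / 2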
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
+        input=crf_decode,
+        label=target,
+        chunk_scheme="IOB",
+        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    # place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0)
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    embedding_param = fluid.global_scope().find_var(embedding_name).get_tensor()
+    embedding_param.set(
+        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)
+
+    start_time = time.time()
+    batch_id = 0
+    for pass_id in xrange(PASS_NUM):
+        chunk_evaluator.reset(exe)
+        for data in train_data():
+            cost, precision, recall, f1_score = exe.run(
+                fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[avg_cost] + chunk_evaluator.metrics)
+            pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                exe)
+
+            if batch_id % 10 == 0:
+                print("avg_cost:" + str(cost) + " precision:" + str(
+                    precision) + " recall:" + str(recall) + " f1_score:" + str(
+                        f1_score) + " pass_precision:" + str(
+                            pass_precision) + " pass_recall:" + str(pass_recall)
+                      + " pass_f1_score:" + str(pass_f1_score))
+                if batch_id != 0:
+                    print("second per batch: " + str((time.time() - start_time)
+                                                     / batch_id))
+
+            # exit early for CI
+            exit(0)
+
+            batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
new file mode 100644
index 0000000000..82b760d693
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -0,0 +1,255 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as pd
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 2
+max_length = 8
+topk_size = 50
+trg_dic_size = 10000
+beam_size = 2
+
+decoder_size = hidden_dim
+
+place = core.CPUPlace()
+
+
+def encoder():
+    # encoder
+    src_word_id = pd.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = pd.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = pd.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = pd.sequence_last_step(input=lstm_hidden0)
+    return encoder_out
+
+
+def decoder_train(context):
+    # decoder
+    trg_language_word = pd.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = pd.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = pd.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        pre_state = rnn.memory(init=context)
+        current_state = pd.fc(input=[current_word, pre_state],
+                              size=decoder_size,
+                              act='tanh')
+
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        rnn.update_memory(pre_state, current_state)
+        rnn.output(current_score)
+
+    return rnn()
+
+
+def decoder_decode(context):
+    init_state = context
+    array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
+    counter = pd.zeros(shape=[1], dtype='int64')
+
+    # fill the first element with init_state
+    state_array = pd.create_array('float32')
+    pd.array_write(init_state, array=state_array, i=counter)
+
+    # ids, scores as memory
+    ids_array = pd.create_array('int64')
+    scores_array = pd.create_array('float32')
+
+    init_ids = pd.data(name="init_ids", shape=[1], dtype="int64", lod_level=2)
+    init_scores = pd.data(
+        name="init_scores", shape=[1], dtype="float32", lod_level=2)
+
+    pd.array_write(init_ids, array=ids_array, i=counter)
+    pd.array_write(init_scores, array=scores_array, i=counter)
+
+    cond = pd.less_than(x=counter, y=array_len)
+
+    while_op = pd.While(cond=cond)
+    with while_op.block():
+        pre_ids = pd.array_read(array=ids_array, i=counter)
+        pre_state = pd.array_read(array=state_array, i=counter)
+        pre_score = pd.array_read(array=scores_array, i=counter)
+
+        # expand the lod of pre_state to be the same with pre_score
+        pre_state_expanded = pd.sequence_expand(pre_state, pre_score)
+
+        pre_ids_emb = pd.embedding(
+            input=pre_ids,
+            size=[dict_size, word_dim],
+            dtype='float32',
+            is_sparse=IS_SPARSE)
+
+        # use the rnn unit to update the decoder state
+        current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
+                              size=decoder_size,
+                              act='tanh')
+
+        # use score to do beam search
+        current_score = pd.fc(input=current_state,
+                              size=target_dict_dim,
+                              act='softmax')
+        topk_scores, topk_indices = pd.topk(current_score, k=topk_size)
+        selected_ids, selected_scores = pd.beam_search(
+            pre_ids, topk_indices, topk_scores, beam_size, end_id=10, level=0)
+
+        pd.increment(x=counter, value=1, in_place=True)
+
+        # update the memories
+        pd.array_write(current_state, array=state_array, i=counter)
+        pd.array_write(selected_ids, array=ids_array, i=counter)
+        pd.array_write(selected_scores, array=scores_array, i=counter)
+
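+        # re-evaluate the loop condition in place so the While op sees the
+        # updated counter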
+        pd.less_than(x=counter, y=array_len, cond=cond)
+
+    translation_ids, translation_scores = pd.beam_search_decode(
+        ids=ids_array, scores=scores_array)
+
+    # return init_ids, init_scores
+
+    return translation_ids, translation_scores
+
+
+def set_init_lod(data, lod, place):
+    res = core.LoDTensor()
+    res.set(data, place)
+    res.set_lod(lod)
+    return res
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def train_main():
+    context = encoder()
+    rnn_out = decoder_train(context)
+    label = pd.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = pd.cross_entropy(input=rnn_out, label=label)
+    avg_cost = pd.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(1):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                break
+            batch_id += 1
+
+
+def decode_main():
+    context = encoder()
+    translation_ids, translation_scores = decoder_decode(context)
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    init_ids_data = np.array([1 for _ in range(batch_size)], dtype='int64')
+    init_scores_data = np.array(
+        [1. for _ in range(batch_size)], dtype='float32')
+    init_ids_data = init_ids_data.reshape((batch_size, 1))
+    init_scores_data = init_scores_data.reshape((batch_size, 1))
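+    # 2-level LoD with offsets [0, 1, ..., batch_size] at both levels: every
+    # source sentence starts the beam with a single candidate (id 1, score 1.0)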
+    init_lod = [i for i in range(batch_size)] + [batch_size]
+    init_lod = [init_lod, init_lod]
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+    for _, data in enumerate(train_data()):
+        init_ids = set_init_lod(init_ids_data, init_lod, place)
+        init_scores = set_init_lod(init_scores_data, init_lod, place)
+
+        src_word_data = to_lodtensor(map(lambda x: x[0], data), place)
+
+        result_ids, result_scores = exe.run(
+            framework.default_main_program(),
+            feed={
+                'src_word_id': src_word_data,
+                'init_ids': init_ids,
+                'init_scores': init_scores
+            },
+            fetch_list=[translation_ids, translation_scores],
+            return_numpy=False)
+        print result_ids.lod()
+        break
+
+
+if __name__ == '__main__':
+    # train_main()
+    decode_main()
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
new file mode 100644
index 0000000000..b4b6020f58
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -0,0 +1,184 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import print_function
+import argparse
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import sys
+import numpy
+
+
+def parse_arg():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "nn_type",
+        help="The neural network type, in ['mlp', 'conv']",
+        type=str,
+        choices=['mlp', 'conv'])
+    parser.add_argument(
+        "--parallel",
+        help='Run in parallel or not',
+        default=False,
+        action="store_true")
+    parser.add_argument(
+        "--use_cuda",
+        help="Run the program by using CUDA",
+        default=False,
+        action="store_true")
+    return parser.parse_args()
+
+
+BATCH_SIZE = 64
+
+
+def loss_net(hidden, label):
+    prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
+    loss = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_loss = fluid.layers.mean(x=loss)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+    return prediction, avg_loss, acc
+
+
+def mlp(img, label):
+    hidden = fluid.layers.fc(input=img, size=200, act='tanh')
+    hidden = fluid.layers.fc(input=hidden, size=200, act='tanh')
+    return loss_net(hidden, label)
+
+
+def conv_net(img, label):
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        pool_size=2,
+        pool_stride=2,
+        act="relu")
+    return loss_net(conv_pool_2, label)
+
+
+def train(args, save_dirname=None):
+    print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
+
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if args.nn_type == 'mlp':
+        net_conf = mlp
+    else:
+        net_conf = conv_net
+
+    if args.parallel:
+        places = fluid.layers.get_places()
+        pd = fluid.layers.ParallelDo(places)
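+        # build one replica of the network per device and split each
+        # mini-batch across them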
+        with pd.do():
+            img_ = pd.read_input(img)
+            label_ = pd.read_input(label)
+            prediction, avg_loss, acc = net_conf(img_, label_)
+            for o in [avg_loss, acc]:
+                pd.write_output(o)
+
+        avg_loss, acc = pd()
+        # get the mean loss and acc across all devices.
+        avg_loss = fluid.layers.mean(x=avg_loss)
+        acc = fluid.layers.mean(x=acc)
+    else:
+        prediction, avg_loss, acc = net_conf(img, label)
+
+    test_program = fluid.default_main_program().clone()
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer.minimize(avg_loss)
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+
+    exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        for batch_id, data in enumerate(train_reader()):
+            # train a mini-batch, fetch nothing
+            exe.run(feed=feeder.feed(data))
+            if (batch_id + 1) % 10 == 0:
+                acc_set = []
+                avg_loss_set = []
+                for test_data in test_reader():
+                    acc_np, avg_loss_np = exe.run(program=test_program,
+                                                  feed=feeder.feed(test_data),
+                                                  fetch_list=[acc, avg_loss])
+                    acc_set.append(float(acc_np))
+                    avg_loss_set.append(float(avg_loss_np))
+                # get test acc and loss
+                acc_val = numpy.array(acc_set).mean()
+                avg_loss_val = numpy.array(avg_loss_set).mean()
+                if float(acc_val) > 0.85:  # test acc > 85%
+                    if save_dirname is not None:
+                        fluid.io.save_inference_model(save_dirname, ["img"],
+                                                      [prediction], exe)
+                    return
+                else:
+                    print(
+                        'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
+                        format(pass_id, batch_id + 1,
+                               float(avg_loss_val), float(acc_val)))
+
+
+def infer(args, save_dirname=None):
+    if save_dirname is None:
+        return
+
+    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    # Use fluid.io.load_inference_model to obtain the inference program desc,
+    # the feed_target_names (the names of variables that will be fed
+    # data using feed operators), and the fetch_targets (variables that
+    # we want to obtain data from using fetch operators).
+    [inference_program, feed_target_names,
+     fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
+
+    # The input to a conv layer should be 4-D or 5-D.
+    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+
+    # Construct feed as a dictionary of {feed_target_name: feed_target_data}
+    # and results will contain a list of data corresponding to fetch_targets.
+    results = exe.run(inference_program,
+                      feed={feed_target_names[0]: tensor_img},
+                      fetch_list=fetch_targets)
+    print("infer results: ", results[0])
+
+
+if __name__ == '__main__':
+    args = parse_arg()
+    if not args.use_cuda and not args.parallel:
+        save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
+    else:
+        save_dirname = None
+    train(args, save_dirname)
+    infer(args, save_dirname)
diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
new file mode 100644
index 0000000000..d4a694e572
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -0,0 +1,222 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import SGDOptimizer
+
+IS_SPARSE = True
+USE_GPU = False
+BATCH_SIZE = 256
+
+
+def get_usr_combined_features():
+    # FIXME(dzh): the old API integer_value(10) may have a range check;
+    # currently we don't have a user-configurable check.
+
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+
+    usr_fc = layers.fc(input=usr_emb, size=32)
+
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+
+    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+
+    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    # FIXME(dzh) : need tanh operator
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+
+    return mov_combined_features
+
+
+def model():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    # need cos sim
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+
+    avg_cost = layers.mean(x=square_cost)
+
+    return avg_cost
+
+
+def main():
+    cost = model()
+    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
+    opts = sgd_optimizer.minimize(cost)
+
+    if USE_GPU:
+        place = core.CUDAPlace(0)
+    else:
+        place = core.CPUPlace()
+
+    exe = Executor(place)
+    exe.run(framework.default_startup_program())
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    feeding = {
+        'user_id': 0,
+        'gender_id': 1,
+        'age_id': 2,
+        'job_id': 3,
+        'movie_id': 4,
+        'category_id': 5,
+        'movie_title': 6,
+        'score': 7
+    }
+
+    def func_feed(feeding, data):
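+        # manually build LoDTensors: dense columns become [N, 1] tensors,
+        # while the sequence columns (category_id, movie_title) also carry
+        # LoD offsets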
+        feed_tensors = {}
+        for (key, idx) in feeding.iteritems():
+            tensor = core.LoDTensor()
+            if key != "category_id" and key != "movie_title":
+                if key == "score":
+                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                        "float32")
+                else:
+                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                        "int64")
+            else:
+                numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
+                                 data)
+                lod_info = [len(item) for item in numpy_data]
+                offset = 0
+                lod = [offset]
+                for item in lod_info:
+                    offset += item
+                    lod.append(offset)
+                numpy_data = np.concatenate(numpy_data, axis=0)
+                tensor.set_lod([lod])
+
+            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
+            tensor.set(numpy_data, place)
+            feed_tensors[key] = tensor
+        return feed_tensors
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            outs = exe.run(framework.default_main_program(),
+                           feed=func_feed(feeding, data),
+                           fetch_list=[cost])
+            out = np.array(outs[0])
+            if out[0] < 6.0:
+                # if the avg cost is less than 6.0, we consider the code to be working.
+                exit(0)
+
+
+main()
diff --git a/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
new file mode 100644
index 0000000000..fdc6086176
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_rnn_encoder_decoder.py
@@ -0,0 +1,204 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+embedding_dim = 16
+batch_size = 10
+max_length = 50
+topk_size = 50
+encoder_size = decoder_size = hidden_dim
+IS_SPARSE = True
+USE_PEEPHOLES = False
+
+
+def bi_lstm_encoder(input_seq, hidden_size):
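+    # fluid's dynamic_lstm expects its input already projected to
+    # 4 * hidden_size (one slice per gate), hence the fc projections below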
+    input_forward_proj = fluid.layers.fc(input=input_seq,
+                                         size=hidden_size * 4,
+                                         bias_attr=True)
+    forward, _ = fluid.layers.dynamic_lstm(
+        input=input_forward_proj,
+        size=hidden_size * 4,
+        use_peepholes=USE_PEEPHOLES)
+    input_backward_proj = fluid.layers.fc(input=input_seq,
+                                          size=hidden_size * 4,
+                                          bias_attr=True)
+    backward, _ = fluid.layers.dynamic_lstm(
+        input=input_backward_proj,
+        size=hidden_size * 4,
+        is_reverse=True,
+        use_peepholes=USE_PEEPHOLES)
+
+    forward_last = fluid.layers.sequence_last_step(input=forward)
+    backward_first = fluid.layers.sequence_first_step(input=backward)
+
+    return forward_last, backward_first
+
+
+# FIXME(peterzhang2029): Replace this function with the lstm_unit_op.
+def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
+    def linear(inputs):
+        return fluid.layers.fc(input=inputs, size=size, bias_attr=True)
+
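+    # standard LSTM cell equations, one fc layer per gate:
+    #   f = sigmoid(W_f.[h, x]),  i = sigmoid(W_i.[h, x]),
+    #   o = sigmoid(W_o.[h, x]),  c~ = tanh(W_c.[h, x]),
+    #   c_t = f * c_prev + i * c~,  h_t = o * tanh(c_t)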
+    forget_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    input_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    output_gate = fluid.layers.sigmoid(x=linear([hidden_t_prev, x_t]))
+    cell_tilde = fluid.layers.tanh(x=linear([hidden_t_prev, x_t]))
+
+    cell_t = fluid.layers.sums(input=[
+        fluid.layers.elementwise_mul(
+            x=forget_gate, y=cell_t_prev), fluid.layers.elementwise_mul(
+                x=input_gate, y=cell_tilde)
+    ])
+
+    hidden_t = fluid.layers.elementwise_mul(
+        x=output_gate, y=fluid.layers.tanh(x=cell_t))
+
+    return hidden_t, cell_t
+
+
+def lstm_decoder_without_attention(target_embedding, decoder_boot, context,
+                                   decoder_size):
+    rnn = fluid.layers.DynamicRNN()
+
+    cell_init = fluid.layers.fill_constant_batch_size_like(
+        input=decoder_boot,
+        value=0.0,
+        shape=[-1, decoder_size],
+        dtype='float32')
+    cell_init.stop_gradient = False
+
+    with rnn.block():
+        current_word = rnn.step_input(target_embedding)
+        context = rnn.static_input(context)
+
+        hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
+        cell_mem = rnn.memory(init=cell_init)
+        decoder_inputs = fluid.layers.concat(
+            input=[context, current_word], axis=1)
+        h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
+        rnn.update_memory(hidden_mem, h)
+        rnn.update_memory(cell_mem, c)
+        out = fluid.layers.fc(input=h,
+                              size=target_dict_dim,
+                              bias_attr=True,
+                              act='softmax')
+        rnn.output(out)
+    return rnn()
+
+
+def seq_to_seq_net():
+    """Construct a seq2seq network."""
+
+    src_word_idx = fluid.layers.data(
+        name='source_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    src_embedding = fluid.layers.embedding(
+        input=src_word_idx,
+        size=[source_dict_dim, embedding_dim],
+        dtype='float32')
+
+    src_forward_last, src_backward_first = bi_lstm_encoder(
+        input_seq=src_embedding, hidden_size=encoder_size)
+
+    encoded_vector = fluid.layers.concat(
+        input=[src_forward_last, src_backward_first], axis=1)
+
+    decoder_boot = fluid.layers.fc(input=src_backward_first,
+                                   size=decoder_size,
+                                   bias_attr=False,
+                                   act='tanh')
+
+    trg_word_idx = fluid.layers.data(
+        name='target_sequence', shape=[1], dtype='int64', lod_level=1)
+
+    trg_embedding = fluid.layers.embedding(
+        input=trg_word_idx,
+        size=[target_dict_dim, embedding_dim],
+        dtype='float32')
+
+    prediction = lstm_decoder_without_attention(trg_embedding, decoder_boot,
+                                                encoded_vector, decoder_size)
+    label = fluid.layers.data(
+        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    return avg_cost
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    avg_cost = seq_to_seq_net()
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(2):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(framework.default_main_program(),
+                           feed={
+                               'source_sequence': word_data,
+                               'target_sequence': trg_word,
+                               'label_sequence': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                exit(0)
+            batch_id += 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
new file mode 100644
index 0000000000..df27399dd2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out = convolution_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_data():
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and pass_acc > 0.8:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
new file mode 100644
index 0000000000..529223eba8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -0,0 +1,113 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
+
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
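+    # stack the remaining LSTM layers, reversing direction on every
+    # even-numbered layer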
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0]
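+# A note on the stack above: is_reverse=(i % 2 == 0) alternates the LSTM
+# direction at each depth, and the odd stacked_num means the classifier is
+# fed max-pooled features from both the final fc and the final lstm outputs.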
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
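+# A small illustration of the LoD layout built by to_lodtensor (hypothetical
+# input): three sequences of lengths 3, 1 and 2 give lod = [0, 3, 4, 6] and a
+# flattened int64 tensor of shape [6, 1], e.g.
+#
+#     t = to_lodtensor([[1, 2, 3], [9], [4, 7]], fluid.CPUPlace())
+#     # t.lod() == [[0, 3, 4, 6]]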
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out = stacked_lstm_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_data():
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
+                                        fetch_list=[cost, acc_out])
+            pass_acc = accuracy.eval(exe)
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                  " pass_acc=" + str(pass_acc))
+            if cost_val < 1.0 and acc_val > 0.8:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
new file mode 100644
index 0000000000..117f74c59a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -0,0 +1,160 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+from paddle.v2.fluid.layer_helper import LayerHelper
+
+
+def lstm(x, c_pre_init, hidden_dim, forget_bias=None):
+    """
+    This function helps create an operator for the LSTM (Long Short Term
+    Memory) cell that can be used inside an RNN.
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+    rnn = fluid.layers.StaticRNN()
+    with rnn.step():
+        c_pre = rnn.memory(init=c_pre_init)
+        x_t = rnn.step_input(x)
+
+        before_fc = fluid.layers.concat(input=[x_t, c_pre], axis=1)
+        after_fc = fluid.layers.fc(input=before_fc, size=hidden_dim * 4)
+
+        dtype = x.dtype
+        c = helper.create_tmp_variable(dtype)
+        h = helper.create_tmp_variable(dtype)
+
+        helper.append_op(
+            type='lstm_unit',
+            inputs={"X": after_fc,
+                    "C_prev": c_pre},
+            outputs={"C": c,
+                     "H": h},
+            attrs={"forget_bias": forget_bias})
+
+        rnn.update_memory(c_pre, c)
+        rnn.output(h)
+
+    return rnn()
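+# In each step above, after_fc packs the four gate pre-activations into one
+# tensor of width hidden_dim * 4; the lstm_unit op splits it and applies the
+# usual cell update (roughly c = sigmoid(f + forget_bias) * c_pre +
+# sigmoid(i) * tanh(j) and h = sigmoid(o) * tanh(c) -- a sketch of the
+# standard formulation; see the lstm_unit op for the exact definition).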
+
+
+def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
+    data = fluid.layers.data(
+        name="words",
+        shape=[seq_len * batch_size, 1],
+        append_batch_size=False,
+        dtype="int64",
+        lod_level=1)
+    label = fluid.layers.data(
+        name="label",
+        shape=[batch_size, 1],
+        append_batch_size=False,
+        dtype="int64")
+
+    emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim])
+    emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim])
+    emb = fluid.layers.transpose(x=emb, perm=[1, 0, 2])
+
+    c_pre_init = fluid.layers.fill_constant(
+        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
+    c_pre_init.stop_gradient = False
+    layer_1_out = lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
+    layer_1_out = fluid.layers.transpose(x=layer_1_out, perm=[1, 0, 2])
+
+    prediction = fluid.layers.fc(input=layer_1_out,
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    adam_optimizer.minimize(avg_cost)
+    acc = fluid.layers.accuracy(input=prediction, label=label)
+
+    return avg_cost, acc
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def chop_data(data, chop_len=80, batch_size=50):
+    data = [(x[0][:chop_len], x[1]) for x in data if len(x[0]) >= chop_len]
+
+    return data[:batch_size]
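+# chop_data drops samples shorter than chop_len, truncates the rest and keeps
+# only the first batch_size of them, so every batch fed below has exactly the
+# fixed shape [seq_len * batch_size, 1] that lstm_net's data layer declares.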
+
+
+def prepare_feed_data(data, place):
+    tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+
+    label = np.array(map(lambda x: x[1], data)).astype("int64")
+    label = label.reshape([len(label), 1])
+    tensor_label = fluid.LoDTensor()
+    tensor_label.set(label, place)
+
+    return tensor_words, tensor_label
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "load word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    cost, acc = lstm_net(dict_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in xrange(PASS_NUM):
+        for data in train_data():
+            chopped_data = chop_data(data)
+            tensor_words, tensor_label = prepare_feed_data(chopped_data, place)
+
+            outs = exe.run(fluid.default_main_program(),
+                           feed={"words": tensor_words,
+                                 "label": tensor_label},
+                           fetch_list=[cost, acc])
+            cost_val = np.array(outs[0])
+            acc_val = np.array(outs[1])
+
+            print("cost=" + str(cost_val) + " acc=" + str(acc_val))
+            if acc_val > 0.7:
+                exit(0)
+    exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py
new file mode 100644
index 0000000000..8cf54846fe
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -0,0 +1,87 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+PASS_NUM = 100
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 32
+IS_SPARSE = True
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+
+first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+embed_first = fluid.layers.embedding(
+    input=first_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_second = fluid.layers.embedding(
+    input=second_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_third = fluid.layers.embedding(
+    input=third_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_forth = fluid.layers.embedding(
+    input=forth_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
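+# The four context-word embeddings above share one [dict_size, EMBED_SIZE]
+# parameter matrix because they all use param_attr='shared_w'; only a single
+# embedding table is learned.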
+
+concat_embed = fluid.layers.concat(
+    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+sgd_optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(
+    feed_list=[first_word, second_word, third_word, forth_word, next_word],
+    place=place)
+
+exe.run(fluid.default_startup_program())
+
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        avg_cost_np = exe.run(fluid.default_main_program(),
+                              feed=feeder.feed(data),
+                              fetch_list=[avg_cost])
+        if avg_cost_np[0] < 5.0:
+            exit(0)  # if avg cost is less than 5.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt b/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt
new file mode 100644
index 0000000000..4d7664469e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/CMakeLists.txt
@@ -0,0 +1,5 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
new file mode 100644
index 0000000000..9774edebfb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_fit_a_line.py
@@ -0,0 +1,77 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
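+# A typical single-machine smoke test (endpoints are illustrative; with
+# trainers=2 two trainer processes would normally be launched):
+#
+#   PSERVERS=127.0.0.1:6174 SERVER_ENDPOINT=127.0.0.1:6174 \
+#       TRAINING_ROLE=PSERVER python notest_dist_fit_a_line.py &
+#   PSERVERS=127.0.0.1:6174 TRAINING_ROLE=TRAINER python notest_dist_fit_a_line.py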
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+else:
+    trainer_prog = t.get_trainer_program()
+
+    exe.run(fluid.default_startup_program())
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
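+        # exercise checkpointing: save the persistable parameters and then
+        # reload them at the start of every pass (the model directory is
+        # just a local test path)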
+        fluid.io.save_persistables(exe, "./fit_a_line.model/")
+        fluid.io.load_persistables(exe, "./fit_a_line.model/")
+        for data in train_reader():
+            avg_loss_value = exe.run(trainer_prog,
+                                     feed=feeder.feed(data),
+                                     fetch_list=[avg_cost])
+            print("loss:" + str(avg_loss_value))
+            if avg_loss_value[0] < 10.0:
+                exit(0)
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
new file mode 100644
index 0000000000..298ecfc386
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_image_classification.py
@@ -0,0 +1,170 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+import sys
+
+TRAINERS = 5
+BATCH_SIZE = 128
+PASS_NUM = 100
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
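+# The assert above encodes the usual CIFAR ResNet sizing: each of the three
+# stages stacks n = (depth - 2) / 6 basic blocks of two conv layers on top of
+# the initial conv, so e.g. depth=32 gives n=5 blocks per stage.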
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+classdim = 10
+data_shape = [3, 32, 32]
+
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("training vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("training resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.cifar.train10(), buf_size=128 * 10),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            print("pass_id:" + str(pass_id) + "loss:" + str(loss) + " pass_acc:"
+                  + str(pass_acc))
+            # this model is slow; if a couple of mini-batches run through
+            # without error we consider the distributed setup to work properly.
+    print("trainer run end")
+else:
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
new file mode 100644
index 0000000000..08bb67b0a1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_label_semantic_roles.py
@@ -0,0 +1,240 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
+import time
+import os
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 20
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
+    # 8 features
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
+    avg_cost = fluid.layers.mean(x=crf_cost)
+
+    # TODO(qiao)
+    # check other optimizers and check why out will be NAN
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+
+    # TODO(qiao)
+    # add dependency track and move this config before optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
+        input=crf_decode,
+        label=target,
+        chunk_scheme="IOB",
+        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
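+    # under the IOB scheme every chunk type contributes a B- and an I- tag
+    # plus a single shared O tag, hence num_chunk_types is
+    # ceil((label_dict_len - 1) / 2)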
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        start_time = time.time()
+        batch_id = 0
+        exe.run(fluid.default_startup_program())
+        embedding_param = fluid.global_scope().find_var(
+            embedding_name).get_tensor()
+        embedding_param.set(
+            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
+            place)
+        for pass_id in xrange(PASS_NUM):
+            chunk_evaluator.reset(exe)
+            for data in train_data():
+                cost, precision, recall, f1_score = exe.run(
+                    trainer_prog,
+                    feed=feeder.feed(data),
+                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
+                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                    exe)
+
+                if batch_id % 10 == 0:
+                    print("avg_cost:" + str(cost) + " precision:" + str(
+                        precision) + " recall:" + str(recall) + " f1_score:" +
+                          str(f1_score) + " pass_precision:" + str(
+                              pass_precision) + " pass_recall:" + str(
+                                  pass_recall) + " pass_f1_score:" + str(
+                                      pass_f1_score))
+                    if batch_id != 0:
+                        print("second per batch: " + str((time.time(
+                        ) - start_time) / batch_id))
+
+                batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
new file mode 100644
index 0000000000..04b3113690
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_dist_word2vec.py
@@ -0,0 +1,111 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+PASS_NUM = 100
+EMBED_SIZE = 32
+HIDDEN_SIZE = 256
+N = 5
+BATCH_SIZE = 32
+IS_SPARSE = True
+TRAINERS = 2
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+
+first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
+second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
+third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
+forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
+next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')
+
+embed_first = fluid.layers.embedding(
+    input=first_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_second = fluid.layers.embedding(
+    input=second_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_third = fluid.layers.embedding(
+    input=third_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+embed_forth = fluid.layers.embedding(
+    input=forth_word,
+    size=[dict_size, EMBED_SIZE],
+    dtype='float32',
+    is_sparse=IS_SPARSE,
+    param_attr='shared_w')
+
+concat_embed = fluid.layers.concat(
+    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
+hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
+predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
+avg_cost = fluid.layers.mean(x=cost)
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+train_reader = paddle.batch(
+    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    feeder = fluid.DataFeeder(
+        feed_list=[first_word, second_word, third_word, forth_word, next_word],
+        place=place)
+    exe.run(fluid.default_startup_program())
+    for pass_id in range(PASS_NUM):
+        for data in train_reader():
+            avg_cost_np = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+            print("avg_cost_np", avg_cost_np)
+            if avg_cost_np[0] < 5.0:
+                # if avg cost is less than 5.0, we think our code is good.
+                exit(0)
+else:
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
new file mode 100644
index 0000000000..adeacd4adf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_machine_translation.py
@@ -0,0 +1,157 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import os
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
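+# The decoder above is a plain recurrent decoder: its memory starts from the
+# encoder's last step, is overwritten with fc1 (the new hidden state) via
+# update_memory, and a softmax over the target dictionary is emitted per step.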
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+        batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    t = fluid.DistributeTranspiler()
+    # all parameter server endpoints list for splitting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        exe.run(framework.default_startup_program())
+
+        batch_id = 0
+        for pass_id in xrange(2):
+            for data in train_data():
+                word_data = to_lodtensor(map(lambda x: x[0], data), place)
+                trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+                trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+                outs = exe.run(trainer_prog,
+                               feed={
+                                   'src_word_id': word_data,
+                                   'target_language_word': trg_word,
+                                   'target_language_next_word': trg_word_next
+                               },
+                               fetch_list=[avg_cost])
+                avg_cost_val = np.array(outs[0])
+                print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                      " avg_cost=" + str(avg_cost_val))
+                if batch_id > 3:
+                    exit(0)
+                batch_id += 1
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
new file mode 100644
index 0000000000..f18ca05c78
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_conv_dist.py
@@ -0,0 +1,95 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+conv_pool_1 = fluid.nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+conv_pool_2 = fluid.nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu")
+
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+BATCH_SIZE = 50
+PASS_NUM = 3
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+pserver_endpoints = os.getenv("PSERVERS")  # all pserver endpoints
+trainers = int(os.getenv("TRAINERS"))  # total trainer count
+current_endpoint = os.getenv("SERVER_ENDPOINT")  # current pserver endpoint
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t = fluid.DistributeTranspiler()
+t.transpile(
+    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        batch_id = 0
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            if batch_id % 100 == 0:
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+else:
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
new file mode 100644
index 0000000000..7733248cb4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recognize_digits_mlp_dist.py
@@ -0,0 +1,89 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+BATCH_SIZE = 128
+PASS_NUM = 100
+
+images = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+# TODO(aroraabhinav) Add regularization and error clipping after
+# Issue 7432(https://github.com/PaddlePaddle/Paddle/issues/7432) is resolved.
+hidden1 = fluid.layers.fc(input=images, size=128, act='relu')
+hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# all parameter server endpoints list for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint)
+    pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+    exe.run(pserver_startup)
+    exe.run(pserver_prog)
+elif training_role == "TRAINER":
+    trainer_prog = t.get_trainer_program()
+    feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        batch_id = 0
+        for data in train_reader():
+            loss, acc = exe.run(trainer_prog,
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            if batch_id % 100 == 0:
+                print("batch_id %d, loss: %f, acc: %f" %
+                      (batch_id, loss, pass_acc))
+            batch_id += 1
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+else:
+    print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
new file mode 100644
index 0000000000..2d8885e377
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_recommender_system_dist.py
@@ -0,0 +1,216 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.optimizer import SGDOptimizer
+
+IS_SPARSE = True
+BATCH_SIZE = 256
+PASS_NUM = 100
+
+
+def get_usr_combined_features():
+    USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1
+    uid = layers.data(name='user_id', shape=[1], dtype='int64')
+    usr_emb = layers.embedding(
+        input=uid,
+        dtype='float32',
+        size=[USR_DICT_SIZE, 32],
+        param_attr='user_table',
+        is_sparse=IS_SPARSE)
+    usr_fc = layers.fc(input=usr_emb, size=32)
+    USR_GENDER_DICT_SIZE = 2
+
+    usr_gender_id = layers.data(name='gender_id', shape=[1], dtype='int64')
+    usr_gender_emb = layers.embedding(
+        input=usr_gender_id,
+        size=[USR_GENDER_DICT_SIZE, 16],
+        param_attr='gender_table',
+        is_sparse=IS_SPARSE)
+    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
+
+    USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
+    usr_age_id = layers.data(name='age_id', shape=[1], dtype="int64")
+    usr_age_emb = layers.embedding(
+        input=usr_age_id,
+        size=[USR_AGE_DICT_SIZE, 16],
+        is_sparse=IS_SPARSE,
+        param_attr='age_table')
+    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
+
+    USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
+    usr_job_id = layers.data(name='job_id', shape=[1], dtype="int64")
+    usr_job_emb = layers.embedding(
+        input=usr_job_id,
+        size=[USR_JOB_DICT_SIZE, 16],
+        param_attr='job_table',
+        is_sparse=IS_SPARSE)
+    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
+
+    concat_embed = layers.concat(
+        input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc], axis=1)
+
+    usr_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+    return usr_combined_features
+
+
+def get_mov_combined_features():
+    MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1
+    mov_id = layers.data(name='movie_id', shape=[1], dtype='int64')
+    mov_emb = layers.embedding(
+        input=mov_id,
+        dtype='float32',
+        size=[MOV_DICT_SIZE, 32],
+        param_attr='movie_table',
+        is_sparse=IS_SPARSE)
+    mov_fc = layers.fc(input=mov_emb, size=32)
+
+    CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())
+    category_id = layers.data(name='category_id', shape=[1], dtype='int64')
+    mov_categories_emb = layers.embedding(
+        input=category_id, size=[CATEGORY_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_categories_hidden = layers.sequence_pool(
+        input=mov_categories_emb, pool_type="sum")
+
+    MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())
+    mov_title_id = layers.data(name='movie_title', shape=[1], dtype='int64')
+    mov_title_emb = layers.embedding(
+        input=mov_title_id, size=[MOV_TITLE_DICT_SIZE, 32], is_sparse=IS_SPARSE)
+    mov_title_conv = nets.sequence_conv_pool(
+        input=mov_title_emb,
+        num_filters=32,
+        filter_size=3,
+        act="tanh",
+        pool_type="sum")
+
+    concat_embed = layers.concat(
+        input=[mov_fc, mov_categories_hidden, mov_title_conv], axis=1)
+
+    mov_combined_features = layers.fc(input=concat_embed, size=200, act="tanh")
+    return mov_combined_features
+
+
+def model():
+    usr_combined_features = get_usr_combined_features()
+    mov_combined_features = get_mov_combined_features()
+
+    # need cos sim
+    inference = layers.cos_sim(X=usr_combined_features, Y=mov_combined_features)
+    scale_infer = layers.scale(x=inference, scale=5.0)
+
+    label = layers.data(name='score', shape=[1], dtype='float32')
+    square_cost = layers.square_error_cost(input=scale_infer, label=label)
+    avg_cost = layers.mean(x=square_cost)
+
+    return avg_cost
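+# cos_sim yields values in [-1, 1]; scaling by 5.0 stretches the prediction
+# towards the MovieLens 1-5 rating range before the squared error is taken.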
+
+
+def func_feed(feeding, data, place):
+    feed_tensors = {}
+    for (key, idx) in feeding.iteritems():
+        tensor = core.LoDTensor()
+        if key != "category_id" and key != "movie_title":
+            if key == "score":
+                numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                    "float32")
+            else:
+                numpy_data = np.array(map(lambda x: x[idx], data)).astype(
+                    "int64")
+        else:
+            numpy_data = map(lambda x: np.array(x[idx]).astype("int64"), data)
+            lod_info = [len(item) for item in numpy_data]
+            offset = 0
+            lod = [offset]
+            for item in lod_info:
+                offset += item
+                lod.append(offset)
+            numpy_data = np.concatenate(numpy_data, axis=0)
+            tensor.set_lod([lod])
+
+        numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
+        tensor.set(numpy_data, place)
+        feed_tensors[key] = tensor
+    return feed_tensors
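+# func_feed builds by hand what fluid.DataFeeder usually does: dense columns
+# become [N, 1] int64/float32 tensors, while the variable-length
+# 'category_id' and 'movie_title' columns are flattened and carry an explicit
+# LoD -- e.g. two titles of lengths 2 and 3 give lod = [0, 2, 5].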
+
+
+def main():
+    cost = model()
+    optimizer = SGDOptimizer(learning_rate=0.2)
+    optimize_ops, params_grads = optimizer.minimize(cost)
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.movielens.train(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+
+    # all parameter server endpoints list for splitting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
+        trainer_prog = t.get_trainer_program()
+
+        feeding = {
+            'user_id': 0,
+            'gender_id': 1,
+            'age_id': 2,
+            'job_id': 3,
+            'movie_id': 4,
+            'category_id': 5,
+            'movie_title': 6,
+            'score': 7
+        }
+
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                outs = exe.run(trainer_prog,
+                               feed=func_feed(feeding, data, place),
+                               fetch_list=[cost])
+                out = np.array(outs[0])
+                print("cost=" + str(out[0]))
+                if out[0] < 6.0:
+                    print("Training complete. Average cost is less than 6.0.")
+                    # if avg cost less than 6.0, we think our code is good.
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
new file mode 100644
index 0000000000..49f26d6b69
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_conv_dist.py
@@ -0,0 +1,125 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import os
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
+
+
+def to_lodtensor(data, place):
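+    # pack a batch of variable-length sequences into a single flat LoDTensor;
+    # the lod list records cumulative offsets marking sequence boundaries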
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+
+    # list of all parameter server endpoints, used for splitting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for the current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
+        trainer_prog = t.get_trainer_program()
+        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+        for pass_id in xrange(PASS_NUM):
+            accuracy.reset(exe)
+            for data in train_data():
+                cost_val, acc_val = exe.run(trainer_prog,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[cost, acc_out])
+                pass_acc = accuracy.eval(exe)
+                print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                      " pass_acc=" + str(pass_acc))
+                if cost_val < 1.0 and pass_acc > 0.8:
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
new file mode 100644
index 0000000000..bff376a0e2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/notest_understand_sentiment_dynamic_lstm.py
@@ -0,0 +1,135 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import os
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    assert stacked_num % 2 == 1
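+    # layers alternate direction (is_reverse on even-numbered layers), so an
+    # odd stacked_num ends with a forward layer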
+
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    # add bias attr
+
+    # TODO(qijun) linear act
+    fc1 = fluid.layers.fc(input=emb, size=hid_dim)
+    lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim)
+
+    inputs = [fc1, lstm1]
+
+    for i in range(2, stacked_num + 1):
+        fc = fluid.layers.fc(input=inputs, size=hid_dim)
+        lstm, cell = fluid.layers.dynamic_lstm(
+            input=fc, size=hid_dim, is_reverse=(i % 2) == 0)
+        inputs = [fc, lstm]
+
+    fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max')
+    lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max')
+
+    prediction = fluid.layers.fc(input=[fc_last, lstm_last],
+                                 size=class_dim,
+                                 act='softmax')
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    print "loaded word dict successfully"
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out, optimize_ops, params_grads = stacked_lstm_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+    t = fluid.DistributeTranspiler()
+    # list of all parameter server endpoints, used for splitting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for the current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint)
+        pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+        exe.run(pserver_startup)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        exe.run(fluid.default_startup_program())
+        trainer_prog = t.get_trainer_program()
+        for pass_id in xrange(PASS_NUM):
+            accuracy.reset(exe)
+            for data in train_data():
+                cost_val, acc_val = exe.run(trainer_prog,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[cost, acc_out])
+                pass_acc = accuracy.eval(exe)
+                print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                      " pass_acc=" + str(pass_acc))
+                if cost_val < 1.0 and acc_val > 0.8:
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py b/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py
new file mode 100644
index 0000000000..4a50049bf2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/test_split_var.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import unittest
+from paddle.v2.fluid.distribute_transpiler import split_dense_variable
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import random
+
+
+class TestSplitVar(unittest.TestCase):
+    def test_check_output(self):
+        # split below shapes to 10 servers
+        shapes = [[3, 5], [1024], [28, 784], [8, 1020], [800, 10]]
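+        # block sizes appear to be rounded to whole rows: e.g. [28, 784] has
+        # 21952 elements, so a ten-way split gives 9 blocks of 3 rows
+        # (2352 elements) plus one 784-element remainder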
+        expected_sizes = [
+            [15], [1024],
+            [2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 2352, 784],
+            [2040, 2040, 2040, 2040],
+            [1150, 1150, 1150, 1150, 1150, 1150, 1100]
+        ]
+        var_list = []
+        program = fluid.Program()
+        for shape in shapes:
+            var = program.global_block().create_var(
+                name=str(random.randint(10000, 99999)),
+                persistable=True,
+                # dtype=core.VarDesc.VarType.LOD_TENSOR,
+                shape=shape)
+            var_list.append(var)
+        blocks = split_dense_variable(var_list, 10)
+        all_sizes = []
+        for s in expected_sizes:
+            for s2 in s:
+                all_sizes.append(s2)
+        for i, block_str in enumerate(blocks):
+            varname, block_id, size = block_str.split(":")
+            self.assertEqual(int(size), all_sizes[i])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt b/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
new file mode 100644
index 0000000000..213af5d27f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/CMakeLists.txt
@@ -0,0 +1,11 @@
+file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
+string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
+
+list(REMOVE_ITEM TEST_OPS test_memopt_image_classification_train)
+py_test(test_memopt_image_classification_train_resnet SRCS test_memopt_image_classification_train.py ARGS resnet)
+py_test(test_memopt_image_classification_train_vgg SRCS test_memopt_image_classification_train.py ARGS vgg)
+
+# default test
+foreach(src ${TEST_OPS})
+    py_test(${src} SRCS ${src}.py)
+endforeach()
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
new file mode 100644
index 0000000000..7ad5e2c594
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -0,0 +1,66 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+# the random seed and the order of the training data must be fixed so that
+# the loss computed by the default program can be compared accurately
+# against the memory-optimized version.
+fluid.default_startup_program().random_seed = 111
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.1)
+sgd_optimizer.minimize(avg_cost)
+
+fluid.memory_optimize(fluid.default_main_program())
+
+BATCH_SIZE = 200
+
+# fix the order of training data
+train_reader = paddle.batch(
+    paddle.dataset.uci_housing.train(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.uci_housing.train(), buf_size=500),
+#     batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+exe.run(fluid.default_startup_program())
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    fluid.io.save_persistables(exe, "./fit_a_line.model/")
+    fluid.io.load_persistables(exe, "./fit_a_line.model/")
+    for data in train_reader():
+        avg_loss_value, = exe.run(fluid.default_main_program(),
+                                  feed=feeder.feed(data),
+                                  fetch_list=[avg_cost])
+
+        if avg_loss_value[0] < 10.0:
+            exit(0)  # an average cost below 10.0 means the model trains as expected.
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
new file mode 100644
index 0000000000..26673afd83
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
@@ -0,0 +1,158 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import sys
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+# the random seed and the order of the training data must be fixed so that
+# the loss computed by the default program can be compared accurately
+# against the memory-optimized version.
+fluid.default_startup_program().random_seed = 111
+
+
+def resnet_cifar10(input, depth=32):
+    def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'):
+        tmp = fluid.layers.conv2d(
+            input=input,
+            filter_size=filter_size,
+            num_filters=ch_out,
+            stride=stride,
+            padding=padding,
+            act=None,
+            bias_attr=False)
+        return fluid.layers.batch_norm(input=tmp, act=act)
+
+    def shortcut(input, ch_in, ch_out, stride):
+        if ch_in != ch_out:
+            return conv_bn_layer(input, ch_out, 1, stride, 0, None)
+        else:
+            return input
+
+    def basicblock(input, ch_in, ch_out, stride):
+        tmp = conv_bn_layer(input, ch_out, 3, stride, 1)
+        tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None)
+        short = shortcut(input, ch_in, ch_out, stride)
+        return fluid.layers.elementwise_add(x=tmp, y=short, act='relu')
+
+    def layer_warp(block_func, input, ch_in, ch_out, count, stride):
+        tmp = block_func(input, ch_in, ch_out, stride)
+        for i in range(1, count):
+            tmp = block_func(tmp, ch_out, ch_out, 1)
+        return tmp
+
+    assert (depth - 2) % 6 == 0
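+    # CIFAR-10 ResNet uses depth = 6n + 2: three stages of n basic blocks,
+    # each with two convolutions, plus the stem convolution and classifier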
+    n = (depth - 2) // 6
+    conv1 = conv_bn_layer(
+        input=input, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 16, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 32, 64, n, 2)
+    pool = fluid.layers.pool2d(
+        input=res3, pool_size=8, pool_type='avg', pool_stride=1)
+    return pool
+
+
+def vgg16_bn_drop(input):
+    def conv_block(input, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0])
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
+    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
+    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
+    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
+    return fc2
+
+
+classdim = 10
+data_shape = [3, 32, 32]
+
+images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+net_type = "vgg"
+if len(sys.argv) >= 2:
+    net_type = sys.argv[1]
+
+if net_type == "vgg":
+    print("train vgg net")
+    net = vgg16_bn_drop(images)
+elif net_type == "resnet":
+    print("train resnet")
+    net = resnet_cifar10(images, 32)
+else:
+    raise ValueError("%s network is not supported" % net_type)
+
+predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+
+optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+opts = optimizer.minimize(avg_cost)
+
+accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+fluid.memory_optimize(fluid.default_main_program())
+
+BATCH_SIZE = 128
+PASS_NUM = 1
+
+# fix the order of training data
+train_reader = paddle.batch(
+    paddle.dataset.cifar.train10(), batch_size=BATCH_SIZE)
+
+# train_reader = paddle.batch(
+#     paddle.reader.shuffle(
+#         paddle.dataset.cifar.train10(), buf_size=128 * 10),
+#     batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+exe.run(fluid.default_startup_program())
+
+i = 0
+for pass_id in range(PASS_NUM):
+    accuracy.reset(exe)
+    for data in train_reader():
+        loss, acc = exe.run(fluid.default_main_program(),
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost] + accuracy.metrics)
+        pass_acc = accuracy.eval(exe)
+        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+            pass_acc))
+        # this model is slow, so if it can train for two mini-batches, we assume it works properly.
+        if i > 2:
+            exit(0)
+        i += 1
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
new file mode 100644
index 0000000000..ffd53e7a78
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
@@ -0,0 +1,144 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+
+dict_size = 30000
+source_dict_dim = target_dict_dim = dict_size
+src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+hidden_dim = 32
+word_dim = 16
+IS_SPARSE = True
+batch_size = 10
+max_length = 50
+topk_size = 50
+trg_dic_size = 10000
+
+decoder_size = hidden_dim
+
+# the random seed and the order of the training data must be fixed so that
+# the loss computed by the default program can be compared accurately
+# against the memory-optimized version.
+fluid.default_startup_program().random_seed = 111
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_last_step(input=lstm_hidden0)
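+    # the last hidden state of the encoder LSTM serves as the fixed-length
+    # sentence encoding that initializes the decoder memory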
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = core.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
+
+    fluid.memory_optimize(fluid.default_main_program())
+
+    # fix the order of training data
+    train_data = paddle.batch(
+        paddle.dataset.wmt14.train(dict_size), batch_size=batch_size)
+
+    # train_data = paddle.batch(
+    #     paddle.reader.shuffle(
+    #         paddle.dataset.wmt14.train(dict_size), buf_size=1000),
+    #     batch_size=batch_size)
+
+    place = core.CPUPlace()
+    exe = Executor(place)
+
+    exe.run(framework.default_startup_program())
+
+    batch_id = 0
+    for pass_id in xrange(10):
+        for data in train_data():
+            word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
+            outs = exe.run(fluid.default_main_program(),
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 2:
+                exit(0)
+            batch_id += 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/decorators.py b/python/paddle/v2/fluid/tests/decorators.py
new file mode 100644
index 0000000000..0a8a2ccc4d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/decorators.py
@@ -0,0 +1,43 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+
+__all__ = ['many_times', 'prog_scope']
+
+
+def many_times(times):
+    def __impl__(fn):
+        def __fn__(*args, **kwargs):
+            for _ in range(times):
+                fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
+
+
+def prog_scope():
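+    # run the decorated function inside fresh Program and Scope instances so
+    # that state does not leak between tests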
+    def __impl__(fn):
+        def __fn__(*args, **kwargs):
+            prog = fluid.Program()
+            startup_prog = fluid.Program()
+            scope = fluid.core.Scope()
+            with fluid.scope_guard(scope):
+                with fluid.program_guard(prog, startup_prog):
+                    fn(*args, **kwargs)
+
+        return __fn__
+
+    return __impl__
diff --git a/python/paddle/v2/fluid/tests/demo/fc_gan.py b/python/paddle/v2/fluid/tests/demo/fc_gan.py
new file mode 100644
index 0000000000..0652c8134d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py
@@ -0,0 +1,171 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import errno
+import math
+import os
+
+import matplotlib
+import numpy
+
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+
+NOISE_SIZE = 100
+NUM_PASS = 1000
+NUM_REAL_IMGS_IN_BATCH = 121
+NUM_TRAIN_TIMES_OF_DG = 3
+LEARNING_RATE = 2e-5
+
+
+def D(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='D.w1',
+                             bias_attr='D.b1')
+    logits = fluid.layers.fc(input=hidden,
+                             size=1,
+                             act=None,
+                             param_attr='D.w2',
+                             bias_attr='D.b2')
+    return logits
+
+
+def G(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='G.w1',
+                             bias_attr='G.b1')
+    img = fluid.layers.fc(input=hidden,
+                          size=28 * 28,
+                          act='tanh',
+                          param_attr='G.w2',
+                          bias_attr='G.b2')
+    return img
+
+
+def plot(gen_data):
+    gen_data.resize(gen_data.shape[0], 28, 28)
+    n = int(math.ceil(math.sqrt(gen_data.shape[0])))
+    fig = plt.figure(figsize=(n, n))
+    gs = gridspec.GridSpec(n, n)
+    gs.update(wspace=0.05, hspace=0.05)
+
+    for i, sample in enumerate(gen_data):
+        ax = plt.subplot(gs[i])
+        plt.axis('off')
+        ax.set_xticklabels([])
+        ax.set_yticklabels([])
+        ax.set_aspect('equal')
+        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
+
+    return fig
+
+
+def main():
+    try:
+        os.makedirs("./out")
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+
+    startup_program = fluid.Program()
+    d_program = fluid.Program()
+    dg_program = fluid.Program()
+
+    with fluid.program_guard(d_program, startup_program):
+        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
+        d_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(img),
+            label=fluid.layers.data(
+                name='label', shape=[1], dtype='float32'))
+        d_loss = fluid.layers.mean(x=d_loss)
+
+    with fluid.program_guard(dg_program, startup_program):
+        noise = fluid.layers.data(
+            name='noise', shape=[NOISE_SIZE], dtype='float32')
+        g_img = G(x=noise)
+        g_program = dg_program.clone()
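+        # clone before attaching D so g_program can run the generator alone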
+        dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(g_img),
+            label=fluid.layers.fill_constant_batch_size_like(
+                input=noise, dtype='float32', shape=[-1, 1], value=1.0))
+        dg_loss = fluid.layers.mean(x=dg_loss)
+
+    opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
+
+    opt.minimize(loss=d_loss, startup_program=startup_program)
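+    # for the generator step, restrict the parameter list to G's weights so
+    # the discriminator stays fixed while dg_loss is minimized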
+    opt.minimize(
+        loss=dg_loss,
+        startup_program=startup_program,
+        parameter_list=[
+            p.name for p in g_program.global_block().all_parameters()
+        ])
+    exe = fluid.Executor(fluid.CPUPlace())
+    exe.run(startup_program)
+
+    num_true = NUM_REAL_IMGS_IN_BATCH
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=60000),
+        batch_size=num_true)
+
+    for pass_id in range(NUM_PASS):
+        for batch_id, data in enumerate(train_reader()):
+            num_true = len(data)
+            n = numpy.random.uniform(
+                low=-1.0, high=1.0,
+                size=[num_true * NOISE_SIZE]).astype('float32').reshape(
+                    [num_true, NOISE_SIZE])
+            generated_img = exe.run(g_program,
+                                    feed={'noise': n},
+                                    fetch_list=[g_img])[0]
+            real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
+            real_data = real_data.reshape(num_true, 784)
+            total_data = numpy.concatenate([real_data, generated_img])
+            total_label = numpy.concatenate([
+                numpy.ones(
+                    shape=[real_data.shape[0], 1], dtype='float32'),
+                numpy.zeros(
+                    shape=[real_data.shape[0], 1], dtype='float32')
+            ])
+            d_loss_np = exe.run(d_program,
+                                feed={'img': total_data,
+                                      'label': total_label},
+                                fetch_list=[d_loss])[0]
+            for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
+                n = numpy.random.uniform(
+                    low=-1.0, high=1.0,
+                    size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
+                        [2 * num_true, NOISE_SIZE, 1, 1])
+                dg_loss_np = exe.run(dg_program,
+                                     feed={'noise': n},
+                                     fetch_list=[dg_loss])[0]
+            print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format(
+                pass_id, batch_id, d_loss_np, dg_loss_np))
+        # plot the generated images once per pass
+        fig = plot(generated_img)
+        plt.savefig(
+            'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight')
+        plt.close(fig)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py
new file mode 100644
index 0000000000..3f6d7070c2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/op_test.py
@@ -0,0 +1,534 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+import itertools
+import paddle.v2.fluid.core as core
+import collections
+from paddle.v2.fluid.backward import append_backward
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.framework import Program, OpProtoHolder
+
+
+def randomize_probability(batch_size, class_num, dtype='float32'):
+    prob = np.random.uniform(
+        0.1, 1.0, size=(batch_size, class_num)).astype(dtype)
+    prob_sum = prob.sum(axis=1)
+    for i in xrange(len(prob)):
+        prob[i] /= prob_sum[i]
+    return prob
+
+
+def create_op(scope, op_type, inputs, outputs, attrs):
+    kwargs = dict()
+
+    def __create_var__(name, var_name):
+        scope.var(var_name).get_tensor()
+        kwargs[name].append(var_name)
+
+    for in_name, in_dup in Operator.get_op_inputs(op_type):
+        if in_name in inputs:
+            kwargs[in_name] = []
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, _ = item[0], item[1]
+                    __create_var__(in_name, sub_in_name)
+            else:
+                __create_var__(in_name, in_name)
+
+    for out_name, out_dup in Operator.get_op_outputs(op_type):
+        if out_name in outputs:
+            kwargs[out_name] = []
+            if out_dup:
+                sub_out = outputs[out_name]
+                for item in sub_out:
+                    sub_out_name, _ = item[0], item[1]
+                    __create_var__(out_name, sub_out_name)
+            else:
+                __create_var__(out_name, out_name)
+
+    for attr_name in Operator.get_op_attr_names(op_type):
+        if attr_name in attrs:
+            kwargs[attr_name] = attrs[attr_name]
+
+    return Operator(op_type, **kwargs)
+
+
+def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
+    for in_name, in_dup in Operator.get_op_inputs(op.type()):
+        if in_name in inputs:
+            if in_dup:
+                sub_in = inputs[in_name]
+                for item in sub_in:
+                    sub_in_name, sub_in_val = item[0], item[1]
+                    __set_input__(sub_in_name, sub_in_val)
+            else:
+                __set_input__(in_name, inputs[in_name])
+
+
+def get_numeric_gradient(place,
+                         scope,
+                         op,
+                         inputs,
+                         input_to_check,
+                         output_names,
+                         delta=0.005,
+                         in_place=False):
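+    # estimate d(mean of outputs)/d(input) with a central finite difference:
+    # each element is perturbed by +/- delta and the change in the mean
+    # output is divided by 2 * delta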
+    # FIXME: rewrite this method using compile-time concepts
+    set_input(scope, op, inputs, place)
+
+    def product(dim):
+        return reduce(lambda a, b: a * b, dim, 1)
+
+    def get_output():
+        sum = []
+        for output_name in output_names:
+            op.run(scope, place)
+            sum.append(
+                np.array(scope.find_var(output_name).get_tensor()).mean())
+        return np.array(sum).mean()
+
+    tensor_to_check = scope.find_var(input_to_check).get_tensor()
+    tensor_size = product(tensor_to_check.get_dims())
+    tensor_to_check_dtype = tensor_to_check.dtype()
+    if tensor_to_check_dtype == core.DataType.FP32:
+        tensor_to_check_dtype = np.float32
+    elif tensor_to_check_dtype == core.DataType.FP64:
+        tensor_to_check_dtype = np.float64
+    else:
+        raise ValueError("Not supported data type " + str(
+            tensor_to_check_dtype))
+
+    gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
+
+    def __get_elem__(tensor, i):
+        if tensor_to_check_dtype == np.float32:
+            return tensor.get_float_element(i)
+        else:
+            return tensor.get_double_element(i)
+
+    def __set_elem__(tensor, i, e):
+        if tensor_to_check_dtype == np.float32:
+            tensor.set_float_element(i, e)
+        else:
+            tensor.set_double_element(i, e)
+
+    # compute the numeric gradient one element at a time, looping over every
+    # element of the tensor.
+    for i in xrange(tensor_size):
+        if in_place:
+            set_input(scope, op, inputs, place)
+
+        # get one input element through its index i.
+        origin = __get_elem__(tensor_to_check, i)
+        # add delta to it, run op and then get the sum of the result tensor.
+        x_pos = origin + delta
+        __set_elem__(tensor_to_check, i, x_pos)
+        y_pos = get_output()
+
+        if in_place:
+            set_input(scope, op, inputs, place)
+
+        x_neg = origin - delta
+        __set_elem__(tensor_to_check, i, x_neg)
+        y_neg = get_output()
+
+        __set_elem__(tensor_to_check, i, origin)
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+
+
+def append_input_output(block, op_proto, np_list, is_input):
+    '''Insert VarDesc and generate Python variable instance'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+            shape = None
+            lod_level = None
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):
+                shape = list(np_value[0].shape)
+                lod_level = len(np_value[1])
+            else:
+                shape = list(np_value.shape)
+                lod_level = 0
+        return block.create_var(
+            dtype="float32", shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
+
+
+class OpTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        '''Fix random seeds to remove randomness from tests'''
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+
+        np.random.seed(123)
+        random.seed(124)
+
+    @classmethod
+    def tearDownClass(cls):
+        '''Restore random seeds'''
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+
+    def feed_var(self, input_vars, place):
+        feed_map = {}
+        for var_name in input_vars:
+            if isinstance(input_vars[var_name], list):
+                for name, np_value in self.inputs[var_name]:
+                    tensor = core.LoDTensor()
+                    if isinstance(np_value, tuple):
+                        tensor.set(np_value[0], place)
+                        tensor.set_lod(np_value[1])
+                    else:
+                        tensor.set(np_value, place)
+                    feed_map[name] = tensor
+            else:
+                tensor = core.LoDTensor()
+                if isinstance(self.inputs[var_name], tuple):
+                    tensor.set(self.inputs[var_name][0], place)
+                    tensor.set_lod(self.inputs[var_name][1])
+                else:
+                    tensor.set(self.inputs[var_name], place)
+                feed_map[var_name] = tensor
+
+        return feed_map
+
+    def check_output_with_place(self, place, atol):
+        op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
+
+        program = Program()
+        block = program.global_block()
+
+        inputs = append_input_output(block, op_proto, self.inputs, True)
+        outputs = append_input_output(block, op_proto, self.outputs, False)
+        op = block.append_op(
+            type=self.op_type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=self.attrs if hasattr(self, "attrs") else dict())
+        # infer variable type and infer shape in compile-time
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name, var in outputs.iteritems():
+            if var_name in self.outputs:
+                if isinstance(var, list):
+                    for v in var:
+                        fetch_list.append(v)
+                else:
+                    fetch_list.append(var)
+
+        feed_map = self.feed_var(inputs, place)
+
+        exe = Executor(place)
+        outs = exe.run(program,
+                       feed=feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
+
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
+            if out_name not in self.outputs:
+                continue
+
+            def find_actual(target_name, fetch_list):
+                found = [
+                    i for i, var in enumerate(fetch_list)
+                    if var.name == target_name
+                ]
+                self.assertTrue(
+                    len(found) == 1, "Found {} outputs named {}".format(
+                        len(found), target_name))
+                return found[0]
+
+            if out_dup:
+                sub_out = self.outputs[out_name]
+                if not isinstance(sub_out, list):
+                    raise AssertionError("sub_out type %s is not list" %
+                                         type(sub_out))
+                for item in sub_out:
+                    sub_out_name, expect = item[0], item[1]
+                    idx = find_actual(sub_out_name, fetch_list)
+                    actual = outs[idx]
+                    actual_t = np.array(actual)
+                    expect_t = expect[0] \
+                        if isinstance(expect, tuple) else expect
+                    self.assertTrue(
+                        np.allclose(
+                            actual_t, expect_t, atol=atol),
+                        "Output (" + sub_out_name + ") has diff at " +
+                        str(place))
+                    if isinstance(expect, tuple):
+                        self.assertListEqual(
+                            actual.lod(), expect[1], "Output (" + sub_out_name +
+                            ") has different lod at " + str(place))
+            else:
+                idx = find_actual(out_name, fetch_list)
+                actual = outs[idx]
+                actual_t = np.array(actual)
+                expect = self.outputs[out_name]
+                expect_t = expect[0] if isinstance(expect, tuple) else expect
+                self.assertTrue(
+                    np.allclose(
+                        actual_t, expect_t, atol=atol),
+                    "Output (" + out_name + ") has diff at " + str(place))
+                if isinstance(expect, tuple):
+                    self.assertListEqual(actual.lod(), expect[1],
+                                         "Output (" + out_name +
+                                         ") has different lod at " + str(place))
+
+    def check_output(self, atol=1e-5):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_output_with_place(place, atol)
+
+    def __assert_is_close(self, numeric_grads, analytic_grads, names,
+                          max_relative_error, msg_prefix):
+
+        for a, b, name in itertools.izip(numeric_grads, analytic_grads, names):
+            abs_a = np.abs(a)
+            abs_a[abs_a < 1e-3] = 1
+
+            diff_mat = np.abs(a - b) / abs_a
+            max_diff = np.max(diff_mat)
+
+            def err_msg():
+                offset = np.argmax(diff_mat > max_relative_error)
+                return ("%s Variable %s max gradient diff %f over limit %f, "
+                        "the first error element is %d, %f, %f") % (
+                            msg_prefix, name, max_diff, max_relative_error,
+                            offset, a.flatten()[offset], b.flatten()[offset])
+
+            self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def check_grad(self,
+                   inputs_to_check,
+                   output_names,
+                   no_grad_set=None,
+                   numeric_grad_delta=0.005,
+                   in_place=False,
+                   max_relative_error=0.005,
+                   user_defined_grads=None):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type):
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_grad_with_place(place, inputs_to_check, output_names,
+                                       no_grad_set, numeric_grad_delta,
+                                       in_place, max_relative_error,
+                                       user_defined_grads)
+
+    def check_grad_with_place(self,
+                              place,
+                              inputs_to_check,
+                              output_names,
+                              no_grad_set=None,
+                              numeric_grad_delta=0.005,
+                              in_place=False,
+                              max_relative_error=0.005,
+                              user_defined_grads=None):
+        self.scope = core.Scope()
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
+
+        if no_grad_set is None:
+            no_grad_set = set()
+
+        if not isinstance(output_names, list):
+            output_names = [output_names]
+
+        numeric_grads = user_defined_grads or [
+            get_numeric_gradient(
+                place,
+                self.scope,
+                self.op,
+                self.inputs,
+                input_to_check,
+                output_names,
+                delta=numeric_grad_delta,
+                in_place=in_place) for input_to_check in inputs_to_check
+        ]
+        analytic_grads = self._get_gradient(inputs_to_check, place,
+                                            output_names, no_grad_set)
+
+        self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
+                               max_relative_error,
+                               "Gradient Check On %s" % str(place))
+
+    @staticmethod
+    def _create_var_descs_(block, var_dict):
+        # FIXME: Try unify with `append_input_output`
+        for param_name in var_dict:
+            var = var_dict[param_name]
+            if not isinstance(var, list) and not isinstance(var, tuple):
+                var = [(param_name, var, None)]
+            if not isinstance(var[0], list) and not isinstance(var[0], tuple):
+                var = [(param_name, var[0], var[1])]
+
+            for i, item in enumerate(var):
+                if not isinstance(item[0], basestring):
+                    item = [[param_name] + list(item)]
+                if len(item) == 2:
+                    if isinstance(item[1], tuple):
+                        var[i] = [item[0], item[1][0], item[1][1]]
+                    else:
+                        # only set var name and value, set lod to None
+                        var[i] = list(item) + [None]
+            var_descs = [(block.create_var(
+                name=name, shape=each.shape, dtype=each.dtype), each, lod)
+                         for name, each, lod in var]
+
+            yield param_name, var_descs
+
+    @staticmethod
+    def _merge_list(iterable):
+        return reduce(lambda a, b: list(a) + list(b), iterable, [])
+
+    @staticmethod
+    def _numpy_to_lod_tensor(np_value, lod, place):
+        tensor = core.LoDTensor()
+        tensor.set(np_value, place)
+        if lod is not None:
+            tensor.set_lod(lod)
+        return tensor
+
+    def _get_gradient(self, input_to_check, place, output_names, no_grad_set):
+        prog = Program()
+        block = prog.global_block()
+        inputs_with_np = {
+            key: value
+            for (key, value) in OpTest._create_var_descs_(
+                block, getattr(self, 'inputs', {}))
+        }
+        outputs_with_np = {
+            key: val
+            for (key, val) in OpTest._create_var_descs_(
+                block, getattr(self, 'outputs', {}))
+        }
+        inputs = {
+            k: [item[0] for item in inputs_with_np[k]]
+            for k in inputs_with_np
+        }
+        outputs = {
+            k: [item[0] for item in outputs_with_np[k]]
+            for k in outputs_with_np
+        }
+
+        op = block.append_op(
+            type=self.op_type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=getattr(self, 'attrs', {}))
+
+        # infer variable type and infer shape in compile-time
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        mean_inputs = map(block.var, output_names)
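+        # reduce the checked outputs to a single scalar loss (the mean of
+        # each output, then summed and rescaled) so append_backward starts
+        # from one gradient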
+
+        if len(mean_inputs) == 1:
+            loss = block.create_var(dtype=mean_inputs[0].dtype, shape=[1])
+            op = block.append_op(
+                inputs={"X": mean_inputs}, outputs={"Out": loss}, type='mean')
+            op.desc.infer_var_type(block.desc)
+            op.desc.infer_shape(block.desc)
+        else:
+            avg_sum = []
+            for cur_loss in mean_inputs:
+                cur_avg_loss = block.create_var(dtype=cur_loss.dtype, shape=[1])
+                op = block.append_op(
+                    inputs={"X": [cur_loss]},
+                    outputs={"Out": [cur_avg_loss]},
+                    type="mean")
+                op.desc.infer_var_type(block.desc)
+                op.desc.infer_shape(block.desc)
+                avg_sum.append(cur_avg_loss)
+
+            loss_sum = block.create_var(dtype=avg_sum[0].dtype, shape=[1])
+            op_sum = block.append_op(
+                inputs={"X": avg_sum}, outputs={"Out": loss_sum}, type='sum')
+            op_sum.desc.infer_var_type(block.desc)
+            op_sum.desc.infer_shape(block.desc)
+
+            loss = block.create_var(dtype=loss_sum.dtype, shape=[1])
+            op_loss = block.append_op(
+                inputs={"X": loss_sum},
+                outputs={"Out": loss},
+                type='scale',
+                attrs={'scale': 1.0 / float(len(avg_sum))})
+            op_loss.desc.infer_var_type(block.desc)
+            op_loss.desc.infer_shape(block.desc)
+
+        param_grad_list = append_backward(
+            loss=loss, parameter_list=input_to_check, no_grad_set=no_grad_set)
+
+        feed_dict = {
+            item[0].name: OpTest._numpy_to_lod_tensor(item[1], item[2], place)
+            for p_name in inputs_with_np for item in inputs_with_np[p_name]
+        }
+
+        fetch_list = [g for p, g in param_grad_list]
+        executor = Executor(place)
+        return map(
+            np.array,
+            executor.run(prog, feed_dict, fetch_list, return_numpy=False))
diff --git a/python/paddle/v2/fluid/tests/test_accuracy_op.py b/python/paddle/v2/fluid/tests/test_accuracy_op.py
new file mode 100644
index 0000000000..ac3f3bdff4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_accuracy_op.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAccuracyOp(OpTest):
+    def setUp(self):
+        self.op_type = "accuracy"
+        n = 8192
+        infer = np.random.random((n, 1)).astype("float32")
+        indices = np.random.randint(0, 2, (n, 1))
+        label = np.random.randint(0, 2, (n, 1))
+        self.inputs = {'Out': infer, 'Indices': indices, "Label": label}
+        num_correct = 0
+        for rowid in xrange(n):
+            for ele in indices[rowid]:
+                if ele == label[rowid]:
+                    num_correct += 1
+                    break
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype("float32"),
+            'Correct': np.array([num_correct]).astype("int32"),
+            'Total': np.array([n]).astype("int32")
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
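The per-row loop in setUp above can be expressed as a single vectorized check; a numpy sketch of the equivalence (`accuracy_vectorized` is an illustrative name, not an API from the diff):

import numpy as np

def accuracy_vectorized(indices, label):
    # a row counts as correct if any of its indices matches its label
    return np.any(indices == label, axis=1).mean()

n = 8192
indices = np.random.randint(0, 2, (n, 1))
label = np.random.randint(0, 2, (n, 1))
num_correct = sum(1 for r in range(n) if indices[r, 0] == label[r, 0])
assert np.isclose(accuracy_vectorized(indices, label), num_correct / float(n))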
diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py
new file mode 100644
index 0000000000..1de5d446b8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_activation_op.py
@@ -0,0 +1,488 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from scipy.special import expit
+
+
+class TestExp(OpTest):
+    def setUp(self):
+        self.op_type = "exp"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.exp(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "sigmoid"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': 1 / (1 + np.exp(-self.inputs['X']))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+
+
+class TestLogSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "logsigmoid"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+
+
+class TestTanh(OpTest):
+    def setUp(self):
+        self.op_type = "tanh"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.tanh(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestTanhShrink(OpTest):
+    def setUp(self):
+        self.op_type = "tanh_shrink"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - np.tanh(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+
+
+class TestHardShrink(OpTest):
+    def setUp(self):
+        self.op_type = "hard_shrink"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        threshold = 0.5
+
+        self.inputs = {'X': x}
+        self.attrs = {'lambda': threshold}
+
+        t = np.copy(x)
+        t[(t >= -threshold) & (t <= threshold)] = 0
+        self.outputs = {'Out': t}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.005)
+
+
+class TestSoftShrink(OpTest):
+    def setUp(self):
+        self.op_type = "softshrink"
+        lambda_val = 0.1
+        self.attrs = {'lambda': lambda_val}
+        self.inputs = {
+            'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32")
+        }
+        y = np.copy(self.inputs['X'])
+        y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
+            y - lambda_val)
+        self.outputs = {'Out': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestSqrt(OpTest):
+    def setUp(self):
+        self.op_type = "sqrt"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.sqrt(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestAbs(OpTest):
+    def setUp(self):
+        self.op_type = "abs"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        # Because we use delta = 0.005 when calculating the numeric gradient,
+        # if x is too small, e.g. 0.002, then x_neg will be -0.003 and x_pos
+        # will be 0.007, straddling the kink at 0 and making the numeric
+        # gradient inaccurate; we should avoid such values.
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.abs(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestCeil(OpTest):
+    def setUp(self):
+        self.op_type = "ceil"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.ceil(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestFloor(OpTest):
+    def setUp(self):
+        self.op_type = "floor"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.floor(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestRound(OpTest):
+    def setUp(self):
+        self.op_type = "round"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.round(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestRelu(OpTest):
+    def setUp(self):
+        self.op_type = "relu"
+        x = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        # Same reason as in TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], 0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestBRelu(OpTest):
+    def setUp(self):
+        self.op_type = "brelu"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        t_min = 1.0
+        t_max = 4.0
+        # Same reason as in TestAbs
+        x[np.abs(x - t_min) < 0.005] = t_min + 0.02
+        x[np.abs(x - t_max) < 0.005] = t_max + 0.02
+
+        self.inputs = {'X': x}
+        self.attrs = {'t_min': t_min, 't_max': t_max}
+        t = np.copy(x)
+        t[t < t_min] = t_min
+        t[t > t_max] = t_max
+        self.outputs = {'Out': t}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
+
+
+class TestRelu6(OpTest):
+    def setUp(self):
+        self.op_type = "relu6"
+        x = np.random.uniform(-1, 1, [4, 10]).astype("float32")
+        threshold = 6.0
+        # Same reason as in TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {
+            'Out': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
+
+
+class TestSoftRelu(OpTest):
+    def setUp(self):
+        self.op_type = "soft_relu"
+        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        threshold = 2.0
+        # Same reason as in TestAbs
+        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+        x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': threshold}
+        t = np.copy(x)
+        t[t < -threshold] = -threshold
+        t[t > threshold] = threshold
+        self.outputs = {'Out': np.log((np.exp(t) + 1))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
+
+
+class TestELU(OpTest):
+    def setUp(self):
+        self.op_type = "elu"
+        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        alpha = 1.
+        # Note: unlike other ReLU extensions, the standard ELU (i.e. alpha = 1)
+        # is differentiable at 0, so we can skip modifications like
+        # x[np.abs(x) < 0.005] = 0.02 here
+        self.inputs = {'X': x}
+        self.attrs = {'alpha': alpha}
+        self.outputs = {
+            'Out': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
+
+
+class TestReciprocal(OpTest):
+    def setUp(self):
+        self.op_type = "reciprocal"
+        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
+        self.outputs = {'Out': np.reciprocal(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+
+class TestLog(OpTest):
+    def setUp(self):
+        self.op_type = "log"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.log(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestSquare(OpTest):
+    def setUp(self):
+        self.op_type = "square"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.square(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestPow(OpTest):
+    def setUp(self):
+        self.op_type = "pow"
+        self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
+        self.attrs = {'factor': 3.0}
+        self.outputs = {'Out': np.power(self.inputs['X'], 3)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.02)
+
+
+class TestSTanh(OpTest):
+    def setUp(self):
+        self.op_type = "stanh"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        }
+        scale_a = 2.0 / 3.0
+        scale_b = 1.7159
+        self.attrs = {'scale_a': scale_a, 'scale_b': scale_b}
+        self.outputs = {'Out': scale_b * np.tanh(self.inputs['X'] * scale_a)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestSoftplus(OpTest):
+    def setUp(self):
+        self.op_type = "softplus"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float64")
+        }
+        self.outputs = {'Out': np.log(1 + np.exp(self.inputs['X']))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestSoftsign(OpTest):
+    def setUp(self):
+        self.op_type = "softsign"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {
+            'Out': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.007)
+
+
+class TestThresholdedRelu(OpTest):
+    def setUp(self):
+        self.op_type = "thresholded_relu"
+        threshold = 0.25
+        self.relative_error = 0.005
+        X = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+
+        # Same reason as TestAbs
+        X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
+
+        self.inputs = {'X': X}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {'Out': (X > threshold) * X}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=self.relative_error)
+
+
+class TestHardSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "hard_sigmoid"
+        self.relative_error = 0.002
+
+        X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
+        slope = 0.2
+        offset = 0.5
+        lower_threshold = -offset / slope
+        upper_threshold = (1 - offset) / slope
+
+        self.inputs = {'X': X}
+        # Same reason as TestAbs
+        X[np.abs(X - lower_threshold) < self.relative_error] = \
+            lower_threshold + 0.2
+        X[np.abs(X - upper_threshold) < self.relative_error] = \
+            upper_threshold - 0.2
+
+        temp = X * slope + offset
+        self.outputs = {'Out': np.maximum(0.0, np.minimum(1.0, temp))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.002)
+
+
+class TestSwish(OpTest):
+    def setUp(self):
+        self.op_type = "swish"
+        X = np.random.uniform(0.1, 1, [11, 17]).astype("float32")
+        self.inputs = {'X': X}
+        self.attrs = {'beta': 2.3}
+        self.outputs = {'Out': X * expit(self.attrs['beta'] * X)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.008)
+
+
+if __name__ == "__main__":
+    unittest.main()
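Several tests above nudge inputs away from non-differentiable points, citing delta = 0.005. A sketch of a central-difference numeric gradient shows why; the exact perturbation scheme used by OpTest.check_grad may differ, this only illustrates the failure mode:

import numpy as np

def numeric_grad(f, x, delta=0.005):
    # central difference: (f(x + delta) - f(x - delta)) / (2 * delta)
    return (f(x + delta) - f(x - delta)) / (2.0 * delta)

# Far from the kink of abs() the estimate matches the true gradient ...
assert np.isclose(numeric_grad(np.abs, 1.0), 1.0)
# ... but within delta of 0 it is badly wrong (the true gradient is +1 there):
assert np.isclose(numeric_grad(np.abs, 0.002), 0.4, atol=1e-6)

This is exactly why TestAbs, TestRelu, TestBRelu, etc. push sampled values at least 0.005 away from their kinks before running check_grad.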
diff --git a/python/paddle/v2/fluid/tests/test_adadelta_op.py b/python/paddle/v2/fluid/tests/test_adadelta_op.py
new file mode 100644
index 0000000000..949318d007
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_adadelta_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdadeltaOp1(OpTest):
+    def setUp(self):
+        self.op_type = "adadelta"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The squared gradient is positive
+        avg_squared_grad = np.random.random((102, 105)).astype("float32")
+        # The squared update is positive
+        avg_squared_update = np.random.random((102, 105)).astype("float32")
+
+        rho = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': avg_squared_grad,
+            'AvgSquaredUpdate': avg_squared_update
+        }
+
+        self.attrs = {'rho': rho, 'epsilon': epsilon}
+
+        avg_squared_grad_out = rho * avg_squared_grad + \
+            (1 - rho) * np.square(grad)
+        update = -np.multiply(
+            np.sqrt(
+                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
+                          epsilon)), grad)
+
+        avg_squared_update_out = rho * avg_squared_update + \
+            (1 - rho) * np.square(update)
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': avg_squared_grad_out,
+            'AvgSquaredUpdateOut': avg_squared_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdadeltaOp2(OpTest):
+    '''Test Adadelta op with default attribute values
+    '''
+
+    def setUp(self):
+        self.op_type = "adadelta"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The squared gradient is positive
+        avg_squared_grad = np.random.random((102, 105)).astype("float32")
+        # The squared update is positive
+        avg_squared_update = np.random.random((102, 105)).astype("float32")
+
+        rho = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': avg_squared_grad,
+            'AvgSquaredUpdate': avg_squared_update
+        }
+
+        avg_squared_grad_out = rho * avg_squared_grad + \
+            (1 - rho) * np.square(grad)
+        update = -np.multiply(
+            np.sqrt(
+                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
+                          epsilon)), grad)
+
+        avg_squared_update_out = rho * avg_squared_update + \
+            (1 - rho) * np.square(update)
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': avg_squared_grad_out,
+            'AvgSquaredUpdateOut': avg_squared_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
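The two setUp bodies above repeat the same Adadelta update rule; for reference, the same formulas factored into one helper (`adadelta_step` is an illustrative name mirroring the test, not an API in the diff):

import numpy as np

def adadelta_step(param, grad, avg_sq_grad, avg_sq_update, rho, epsilon):
    # E[g^2] <- rho * E[g^2] + (1 - rho) * g^2
    avg_sq_grad_out = rho * avg_sq_grad + (1 - rho) * np.square(grad)
    # update = -sqrt((E[dx^2] + eps) / (E[g^2] + eps)) * g
    update = -np.sqrt((avg_sq_update + epsilon) /
                      (avg_sq_grad_out + epsilon)) * grad
    # E[dx^2] <- rho * E[dx^2] + (1 - rho) * update^2
    avg_sq_update_out = rho * avg_sq_update + (1 - rho) * np.square(update)
    return param + update, avg_sq_grad_out, avg_sq_update_out

p, g = np.zeros(3), np.ones(3)
p_out, eg2, edx2 = adadelta_step(p, g, np.zeros(3), np.zeros(3), 0.95, 1e-6)

Both test classes could compute their expected outputs through such a helper, since they only differ in whether the attrs dict is passed to the operator.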
diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py
new file mode 100644
index 0000000000..3556bcf8ba
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py
@@ -0,0 +1,190 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from op_test import OpTest
+import math
+
+
+class TestAdagradOp1(OpTest):
+    ''' Test Adagrad operator with explicit attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdagradOp2(OpTest):
+    ''' Test Adagrad operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSparseAdagradOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Grad Variable
+        height = 10
+        rows = [0, 4, 7, 4]
+        row_numel = 12
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and initialize moment Variable
+        moment = scope.var('Moment').get_tensor()
+        moment_np_array = np.full((height, row_numel), 2.0).astype("float32")
+        moment.set(moment_np_array, place)
+
+        # create and run adagrad operator
+        adagrad_op = Operator(
+            "adagrad",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            Moment='Moment',
+            MomentOut='Moment',
+            LearningRate='LearningRate',
+            epsilon=2.0)
+
+        adagrad_op.run(scope, place)
+
+        # get and compare moment result
+        moment_result_array = np.array(moment)
+
+        self.assertAlmostEqual(6.0, moment_result_array[rows[0], 0])
+        self.assertAlmostEqual(3.0, moment_result_array[rows[0], 2])
+        self.assertAlmostEqual(2.0, moment_result_array[1, 0])
+        # 2.0 + (1.0 + 1.0)^2
+        self.assertAlmostEqual(6.0, moment_result_array[rows[1], 10])
+        self.assertAlmostEqual(6.0, moment_result_array[rows[3], 4])
+
+        self.assertAlmostEqual(2.0, moment_result_array[5, 8])
+        self.assertAlmostEqual(3.0, moment_result_array[rows[2], 1])
+        self.assertAlmostEqual(18.0, moment_result_array[rows[2], 8])
+
+        # get and compare param result
+        result_array = np.array(param)
+
+        def get_out(param, lr, grad, m, epsilon):
+            return param - lr * grad / (math.sqrt(m) + epsilon)
+
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+            result_array[rows[0], 0],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+            result_array[rows[0], 2],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[1, 0], places=5)
+
+        # grad_merge = 1.0 + 1.0
+        # m = 6.0
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 2.0, 6.0, 2.0),
+            result_array[rows[1], 10],
+            places=5)
+
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 0.0, 2.0, 2.0), result_array[5, 8], places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 1.0, 3.0, 2.0),
+            result_array[rows[2], 1],
+            places=5)
+        self.assertAlmostEqual(
+            get_out(5.0, 2.0, 4.0, 18.0, 2.0),
+            result_array[rows[2], 8],
+            places=5)
+
+    def test_sparse_adagrad(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
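In TestSparseAdagradOp above, rows = [0, 4, 7, 4] lists row 4 twice, so the operator must merge duplicate rows before applying the dense update; that is what the "2.0 + (1.0 + 1.0)^2" assertions verify. A numpy sketch of the merge, assuming scatter-add semantics for duplicate rows:

import numpy as np

height, row_numel = 10, 12
rows = [0, 4, 7, 4]                      # row 4 appears twice
np_array = np.ones((len(rows), row_numel), dtype=np.float32)
np_array[0, 0] = 2.0
np_array[2, 8] = 4.0

# merge duplicate rows into a dense gradient (scatter-add)
grad_merged = np.zeros((height, row_numel), dtype=np.float32)
np.add.at(grad_merged, rows, np_array)

assert grad_merged[4, 10] == 2.0         # 1.0 + 1.0 from the duplicate row
moment = np.full((height, row_numel), 2.0, dtype=np.float32)
moment_out = moment + grad_merged * grad_merged
assert moment_out[4, 10] == 6.0          # 2.0 + (1.0 + 1.0)^2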
diff --git a/python/paddle/v2/fluid/tests/test_adam_op.py b/python/paddle/v2/fluid/tests/test_adam_op.py
new file mode 100644
index 0000000000..df1fa8983c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_adam_op.py
@@ -0,0 +1,315 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from paddle.v2.fluid import core
+from paddle.v2.fluid.op import Operator
+
+
+class TestAdamOp1(OpTest):
+    def setUp(self):
+        '''Test Adam Op with supplied attributes
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.004
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, self.attrs)
+
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamOp2(OpTest):
+    def setUp(self):
+        '''Test Adam Op with default attribute values
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
+
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamOpMultipleSteps(OpTest):
+    def setUp(self):
+        '''Test Adam Operator with supplied attributes
+        '''
+        self.op_type = "adam"
+        self.num_steps = 10
+
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+    def test_check_output(self):
+        for _ in range(self.num_steps):
+            param_out, moment1_out, \
+                moment2_out = adam_step(self.inputs, self.attrs)
+
+            self.outputs = {
+                'Moment1Out': moment1_out,
+                'Moment2Out': moment2_out,
+                'ParamOut': param_out
+            }
+
+            # Verify output for this step
+            self.check_output()
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = param_out
+            self.inputs['Moment1'] = moment1_out
+            self.inputs['Moment2'] = moment2_out
+
+            # Update powers of Beta1 and Beta2 for next time step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+            self.inputs['Beta2Pow'] *= self.attrs['beta2']
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(
+                -1, 1, (102, 105)).astype("float32")
+
+
+def adam_step(inputs, attributes):
+    '''
+    Simulate one step of the adam optimizer
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment1 and moment2
+    '''
+    param = inputs['Param']
+    grad = inputs['Grad']
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment1_out = beta1 * moment1 + (1 - beta1) * grad
+    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
+    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
+    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
+    return param_out, moment1_out, moment2_out
+
+
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+    '''
+    Simulate one step of the adam optimizer with a sparse
+    (SelectedRows) gradient
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment1 and moment2
+    '''
+    param = inputs['Param']
+    # grad = inputs['Grad']
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment1_out = np.zeros(shape=[height, row_numel])
+    moment2_out = np.zeros(shape=[height, row_numel])
+    param_out = np.zeros(shape=[height, row_numel])
+
+    for idx, row_id in enumerate(rows):
+        moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
+                                                         ) * np_grad[idx]
+        moment2_out[row_id] = beta2 * moment2[row_id] + (
+            1 - beta2) * np.square(np_grad[idx])
+        lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
+        param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
+            np.sqrt(moment2_out[row_id]) + epsilon))
+    return param_out, moment1_out, moment2_out
+
+
+class TestSparseAdamOp(unittest.TestCase):
+    def setup(self, scope, place):
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+
+        height = 10
+        rows = [0, 4, 7]
+        self.rows = rows
+        row_numel = 12
+        self.row_numel = row_numel
+        self.dense_inputs = {
+            "Param": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment1": np.full((height, row_numel), 5.0).astype("float32"),
+            "Moment2": np.full((height, row_numel), 5.0).astype("float32"),
+            'Beta1Pow': np.array([beta1**10]).astype("float32"),
+            'Beta2Pow': np.array([beta2**10]).astype("float32"),
+            "LearningRate": np.full((1), 2.0).astype("float32")
+        }
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        self.sparse_inputs = ["Grad"]
+
+        param_out, mom1, mom2 = adam_step_sparse(
+            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        self.outputs = {
+            "ParamOut": param_out,
+            "Moment1Out": mom1,
+            "Moment2Out": mom2
+        }
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        self.setup(scope, place)
+
+        op_args = dict()
+        for key, np_array in self.dense_inputs.iteritems():
+            var = scope.var(key).get_tensor()
+            var.set(np_array, place)
+            op_args[key] = key
+        for s in self.sparse_inputs:
+            op_args[s] = s
+        for s in self.outputs:
+            var = scope.var(s).get_tensor()
+            var.set(self.outputs[s], place)
+            op_args[s] = s
+        for k in self.attrs:
+            op_args[k] = self.attrs[k]
+
+        # create and run adam operator
+        adam_op = Operator("adam", **op_args)
+        adam_op.run(scope, place)
+
+        for key, np_array in self.outputs.iteritems():
+            out_var = scope.var(key).get_tensor()
+            actual = np.array(out_var)
+            actual = actual.reshape([actual.size])
+            np_array = np_array.reshape([np_array.size])
+            for idx, row_id in enumerate(self.rows):
+                j = 0
+                while j < self.row_numel:
+                    pos = row_id * self.row_numel + j
+                    self.assertLess(
+                        abs((actual[pos] - np_array[pos]) / actual[pos]),
+                        0.00001)
+                    j += 1
+
+    def test_sparse_adam(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
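TestAdamOpMultipleSteps above carries Beta1Pow and Beta2Pow across steps by multiplying in one more factor of beta1 / beta2 each step, which feeds the bias-corrected learning rate in adam_step. A small sketch of that bookkeeping:

import numpy as np

lr, beta1, beta2 = 0.001, 0.9, 0.999
beta1_pow, beta2_pow = beta1 ** 10, beta2 ** 10
for t in range(11, 14):
    beta1_pow *= beta1      # Beta1Pow tracks beta1**t
    beta2_pow *= beta2      # Beta2Pow tracks beta2**t (not beta1)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    assert np.isclose(beta1_pow, beta1 ** t)
    assert np.isclose(beta2_pow, beta2 ** t)

Keeping the two accumulators separate is what makes the per-step check_output calls in the test see the correctly bias-corrected step size.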
diff --git a/python/paddle/v2/fluid/tests/test_adamax_op.py b/python/paddle/v2/fluid/tests/test_adamax_op.py
new file mode 100644
index 0000000000..e285c454f0
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_adamax_op.py
@@ -0,0 +1,186 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdamaxOp1(OpTest):
+    def setUp(self):
+        '''Test Adamax Operator with supplied attributes
+        '''
+        self.op_type = "adamax"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.78
+        beta2 = 0.899
+        epsilon = 1e-5
+        beta1_pow = beta1**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'InfNormOut': inf_norm_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOp2(OpTest):
+    '''Test Adamax Operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adamax"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'InfNormOut': inf_norm_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOpMultipleSteps(OpTest):
+    def setUp(self):
+        '''Test Adamax Operator with supplied attributes
+        '''
+        self.op_type = "adamax"
+        self.num_steps = 10
+
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.8
+        beta2 = 0.99
+        epsilon = 1e-5
+        beta1_pow = 1
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+
+    def test_check_output(self):
+        for _ in range(self.num_steps):
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
+
+            self.outputs = {
+                'ParamOut': param_out,
+                'MomentOut': moment_out,
+                'InfNormOut': inf_norm_out
+            }
+
+            # Verify output for this step
+            self.check_output()
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = param_out
+            self.inputs['Moment'] = moment_out
+            self.inputs['InfNorm'] = inf_norm_out
+
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(
+                -1, 1, (102, 105)).astype("float32")
+
+
+def adamax_step(inputs, attributes):
+    '''
+    Simulate one step of the adamax optimizer
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment and inf_norm
+    '''
+    param = inputs['Param']
+    grad = inputs['Grad']
+    moment = inputs['Moment']
+    inf_norm = inputs['InfNorm']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment_out = beta1 * moment + (1 - beta1) * grad
+    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
+    lr_t = (lr / (1 - beta1_pow))
+    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
+
+    return param_out, moment_out, inf_norm_out
+
+
+if __name__ == "__main__":
+    unittest.main()
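Note that adamax_step above adds epsilon inside the max rather than to the denominator, so the infinity-norm accumulator stays strictly positive and the divide is safe even for an all-zero gradient; a tiny check of that property:

import numpy as np

inf_norm = np.zeros(3)
grad = np.zeros(3)
beta2, epsilon = 0.999, 1e-8
# inf_norm_out = max(beta2 * inf_norm + eps, |g|) > 0 even when g == 0
inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
assert np.all(inf_norm_out > 0)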
diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
new file mode 100644
index 0000000000..a32c24486e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -0,0 +1,102 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+from paddle.v2.fluid.framework import default_main_program
+import numpy
+
+
+class TestArrayReadWrite(unittest.TestCase):
+    def test_read_write(self):
+        x = [
+            layers.data(name='x0', shape=[100]),
+            layers.data(name='x1', shape=[100]),
+            layers.data(name='x2', shape=[100])
+        ]
+
+        for each_x in x:
+            each_x.stop_gradient = False
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = False
+        arr = layers.array_write(x=x[0], i=i)
+        i = layers.increment(x=i)
+        arr = layers.array_write(x=x[1], i=i, array=arr)
+        i = layers.increment(x=i)
+        arr = layers.array_write(x=x[2], i=i, array=arr)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = False
+        a0 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        a1 = layers.array_read(array=arr, i=i)
+        i = layers.increment(x=i)
+        a2 = layers.array_read(array=arr, i=i)
+
+        mean_a0 = layers.mean(x=a0)
+        mean_a1 = layers.mean(x=a1)
+        mean_a2 = layers.mean(x=a2)
+
+        a_sum = layers.sums(input=[mean_a0, mean_a1, mean_a2])
+
+        mean_x0 = layers.mean(x=x[0])
+        mean_x1 = layers.mean(x=x[1])
+        mean_x2 = layers.mean(x=x[2])
+
+        x_sum = layers.sums(input=[mean_x0, mean_x1, mean_x2])
+
+        scope = core.Scope()
+        cpu = core.CPUPlace()
+
+        exe = Executor(cpu)
+
+        tensor = numpy.random.random(size=(100, 100)).astype('float32')
+
+        outs = exe.run(feed={'x0': tensor,
+                             'x1': tensor,
+                             'x2': tensor},
+                       fetch_list=[a_sum, x_sum],
+                       scope=scope)
+        self.assertEqual(outs[0], outs[1])
+
+        total_sum = layers.sums(input=[a_sum, x_sum])
+        total_sum_scaled = layers.scale(x=total_sum, scale=1 / 6.0)
+
+        append_backward(total_sum_scaled)
+
+        g_vars = map(default_main_program().global_block().var,
+                     [each_x.name + "@GRAD" for each_x in x])
+        g_out = [
+            item.sum()
+            for item in exe.run(
+                feed={'x0': tensor,
+                      'x1': tensor,
+                      'x2': tensor},
+                fetch_list=g_vars)
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
+        # Since the final gradient is 1 and the network is linear (built from
+        # mean, sums and scale ops), the input gradients should sum to 1.
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
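Why the delta=0.1 assertion holds: total_sum_scaled = (a_sum + x_sum) / 6, each x_i feeds both terms (once through the array read, once directly), and each mean contributes 1/N per element, so the input gradients sum to exactly 1. The arithmetic, for the 100x100 tensors used above:

import numpy as np

N = 100 * 100                          # elements per input tensor
# d(total)/d(x_i[j]) = 2 * (1/6) * (1/N): two paths into the loss,
# the 1/6 scale, and the 1/N from each mean
per_element_grad = 2.0 / (6.0 * N)
g_out_sum = 3 * N * per_element_grad   # three inputs of N elements each
assert np.isclose(g_out_sum, 1.0)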
diff --git a/python/paddle/v2/fluid/tests/test_assign_op.py b/python/paddle/v2/fluid/tests/test_assign_op.py
new file mode 100644
index 0000000000..fbbfe0d02c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_op.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import op_test
+import numpy
+import unittest
+
+
+class TestAssignOp(op_test.OpTest):
+    def setUp(self):
+        self.op_type = "assign"
+        x = numpy.random.random(size=(100, 10))
+        self.inputs = {'X': x}
+        self.outputs = {'Out': x}
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_backward(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_assign_value_op.py b/python/paddle/v2/fluid/tests/test_assign_value_op.py
new file mode 100644
index 0000000000..93970f863b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_value_op.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class TestAssignValueOp(op_test.OpTest):
+    def setUp(self):
+        self.op_type = "assign_value"
+        x = numpy.random.random(size=(2, 5)).astype(numpy.float32)
+        self.inputs = {}
+        self.outputs = {'Out': x}
+        self.attrs = {
+            'shape': x.shape,
+            'dtype': framework.convert_np_dtype_to_dtype_(x.dtype),
+            'fp32_values': [float(v) for v in x.flat]
+        }
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_assign(self):
+        val = (
+            -100 + 200 * numpy.random.random(size=(2, 5))).astype(numpy.int32)
+        x = layers.create_tensor(dtype="float32")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        fetched_x = exe.run(fluid.default_main_program(),
+                            feed={},
+                            fetch_list=[x])[0]
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
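The attrs in setUp above serialize the constant tensor as a shape plus a flat list of fp32 values; a sketch of the round trip that the operator is expected to perform:

import numpy as np

x = np.random.random(size=(2, 5)).astype(np.float32)
attrs = {'shape': x.shape, 'fp32_values': [float(v) for v in x.flat]}

# the flat value list plus the shape attribute fully determine the tensor
restored = np.array(attrs['fp32_values'],
                    dtype=np.float32).reshape(attrs['shape'])
assert np.array_equal(restored, x)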
diff --git a/python/paddle/v2/fluid/tests/test_auc_op.py b/python/paddle/v2/fluid/tests/test_auc_op.py
new file mode 100644
index 0000000000..5e4caedf5d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_auc_op.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAucOp(OpTest):
+    def setUp(self):
+        self.op_type = "auc"
+        pred = np.random.random((128, 2)).astype("float32")
+        indices = np.random.randint(0, 2, (128, 2))
+        labels = np.random.randint(0, 2, (128, 1))
+        num_thresholds = 200
+        self.inputs = {'Out': pred, 'Indices': indices, 'Label': labels}
+        self.attrs = {'curve': 'ROC', 'num_thresholds': num_thresholds}
+        # NOTE: sklearn uses a different way to generate thresholds,
+        #       which makes the result differ slightly:
+        # from sklearn.metrics import roc_curve, auc
+        # fpr, tpr, thresholds = roc_curve(labels, pred)
+        # auc_value = auc(fpr, tpr)
+        # so we calculate AUC again using numpy for testing
+        kepsilon = 1e-7  # to account for floating point imprecisions
+        thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                      for i in range(num_thresholds - 2)]
+        thresholds = [0.0 - kepsilon] + thresholds + [1.0 + kepsilon]
+
+        # calculate TP, FN, TN and FP counts
+        tp_list = np.ndarray((num_thresholds, ))
+        fn_list = np.ndarray((num_thresholds, ))
+        tn_list = np.ndarray((num_thresholds, ))
+        fp_list = np.ndarray((num_thresholds, ))
+        for idx_thresh, thresh in enumerate(thresholds):
+            tp, fn, tn, fp = 0, 0, 0, 0
+            for i, lbl in enumerate(labels):
+                if lbl:
+                    if pred[i, 0] >= thresh:
+                        tp += 1
+                    else:
+                        fn += 1
+                else:
+                    if pred[i, 0] >= thresh:
+                        fp += 1
+                    else:
+                        tn += 1
+            tp_list[idx_thresh] = tp
+            fn_list[idx_thresh] = fn
+            tn_list[idx_thresh] = tn
+            fp_list[idx_thresh] = fp
+
+        epsilon = 1e-6
+        tpr = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fn_list + epsilon)
+        fpr = fp_list.astype("float32") / (fp_list + tn_list + epsilon)
+        rec = (tp_list.astype("float32") + epsilon) / (
+            tp_list + fp_list + epsilon)
+
+        x = fpr[:num_thresholds - 1] - fpr[1:]
+        y = (tpr[:num_thresholds - 1] + tpr[1:]) / 2.0
+        auc_value = np.sum(x * y)
+
+        self.outputs = {'AUC': auc_value}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
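The last three lines of the AUC computation above are a midpoint-rule integration of the ROC curve; because fpr decreases as the threshold grows, the sum equals the sign-flipped trapezoidal area (numpy 1.x's np.trapz), e.g. with made-up points:

import numpy as np

fpr = np.array([1.0, 0.6, 0.3, 0.0])   # decreasing with threshold
tpr = np.array([1.0, 0.9, 0.5, 0.0])
x = fpr[:-1] - fpr[1:]
y = (tpr[:-1] + tpr[1:]) / 2.0
assert np.isclose(np.sum(x * y), -np.trapz(tpr, fpr))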
diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
new file mode 100644
index 0000000000..cf13166f25
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -0,0 +1,365 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.framework import grad_var_name
+
+
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
+def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+
+    if data_format == "NCHW":
+        n, c, h, w = x.shape
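+        # per-channel batch statistics over N, H, W; var via E[x^2] - E[x]^2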
+        x_square = x * x
+        x_square_sum = np.sum(x_square, (0, 2, 3))
+        x_sum = np.sum(x, axis=(0, 2, 3))
+        element_count = np.size(x) / int(np.shape(x)[1])
+        mean = x_sum / element_count
+        var = x_square_sum / element_count - mean * mean
+        mean_tile = np.reshape(mean, (1, c, 1, 1))
+        mean_tile = np.tile(mean_tile, (n, 1, h, w))
+        var_tile = np.reshape(var, (1, c, 1, 1))
+        var_tile = np.tile(var_tile, (n, 1, h, w))
+        normalized = (x - mean_tile) / np.sqrt(var_tile + epsilon)
+        scale_tile = np.reshape(scale, (1, c, 1, 1))
+        scale_tile = np.tile(scale_tile, (n, 1, h, w))
+        offset_tile = np.reshape(offset, (1, c, 1, 1))
+        offset_tile = np.tile(offset_tile, (n, 1, h, w))
+        y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
+        return y, mean, var
+    elif data_format == "NHWC":
+        x_square = x * x
+        x_square_sum = np.sum(x_square, (0, 1, 2))
+        x_sum = np.sum(x, axis=(0, 1, 2))
+        element_count = np.size(x) / int(np.shape(x)[-1])
+        mean = x_sum / element_count
+        var = x_square_sum / element_count - mean * mean
+        normalized = (x - mean) / np.sqrt(var + epsilon)
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
+    else:
+        raise ValueError("Unknown data order.")
+
+
+def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+    # Use the following formulas to calculate gradients:
+    # grad_scale =
+    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
+    #
+    # grad_offset = sum(output_y)
+    #
+    # grad_x =
+    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
+    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
+
+    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
+
+    if data_format == "NCHW":
+        x = np.transpose(x, (0, 2, 3, 1))
+        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
+
+    grad_x = scale * (grad_y - np.mean(
+        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            grad_y * (x - mean), axis=(0, 1, 2)) /
+                      (var + epsilon)) / np.sqrt(var + epsilon)
+    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+                        axis=(0, 1, 2))
+    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+
+    # transfer back to N, C, H, W
+    if data_format == "NCHW":
+        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
+        x = np.transpose(x, (0, 3, 1, 2))
+        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
+    return grad_x, grad_scale, grad_offset
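+
+
+# Editorial sketch (not exercised by the tests below): cross-check grad_x
+# from _reference_grad against central finite differences of the NHWC
+# forward pass with grad_y of all ones. The step size `delta` and the use
+# of a tiny input are assumptions, not part of the original test.
+def _finite_difference_grad_x(x, scale, offset, epsilon, delta=1e-3):
+    grad = np.zeros_like(x)
+    for idx in np.ndindex(*x.shape):
+        x_pos, x_neg = x.copy(), x.copy()
+        x_pos[idx] += delta
+        x_neg[idx] -= delta
+        y_pos, _, _ = _reference_training(x_pos, scale, offset, epsilon,
+                                          "NHWC")
+        y_neg, _, _ = _reference_training(x_neg, scale, offset, epsilon,
+                                          "NHWC")
+        # with grad_y == ones, d(sum(y))/dx[idx] is the matching grad_x entry
+        grad[idx] = (y_pos.sum() - y_neg.sum()) / (2.0 * delta)
+    return grad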
+
+
+def create_or_get_tensor(scope, var_name, var, place):
+    tensor = scope.var(var_name).get_tensor()
+    if var is not None:
+        assert isinstance(var, np.ndarray)
+        tensor.set_lod([[]])
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+    return tensor
+
+
+def set_output_grad(scope, outputs, place, feed_dict=None):
+    def __set_tensor__(name, data=None):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if data is None:
+            if out_dtype == core.DataType.FP64:
+                data = np.ones(out_tensor.shape(), dtype=np.float64)
+            elif out_dtype == core.DataType.FP32:
+                data = np.ones(out_tensor.shape(), dtype=np.float32)
+            else:
+                raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
+
+    for output in outputs:
+        data = None
+        if feed_dict is not None and output in feed_dict:
+            data = feed_dict[output]
+        __set_tensor__(output, data)
+
+
+class TestBatchNormOp(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def test_python(self):
+        data_format = "NHWC"
+        epsilon = 0.00001
+        momentum = 0.9
+
+        # N, H, W, C: 2, 3, 4, 5
+        n, h, w, c = 2, 3, 4, 5
+        x_shape = [n, h, w, c]
+        scale_shape = [c]
+
+        x_val = np.random.random_sample(x_shape).astype(np.float32)
+        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.ones(scale_shape).astype(np.float32)
+
+        # run forward
+        y_out, saved_mean, var_ref = _reference_training(
+            x_val, scale_val, bias_val, epsilon, "NHWC")
+
+        # update moving mean and variance
+        mean_out = saved_mean * (1. - momentum) + momentum * mean
+        variance_out = var_ref * (1. - momentum) + momentum * variance
+        saved_variance = 1. / np.sqrt(var_ref + epsilon)
+
+        # run the N, C, H, W case; it should produce the same results
+        x_shape2 = [n, c, h, w]
+        x_val2 = np.transpose(x_val, (0, 3, 1, 2))
+        y_out2, saved_mean2, var_ref2 = _reference_training(
+            x_val2, scale_val, bias_val, epsilon, "NCHW")
+
+        self.__assert_close(saved_mean, saved_mean2, "batch mean")
+        self.__assert_close(var_ref, var_ref2, "batch variance")
+
+        # transfer (N, C, H, W) back to (N, H, W, C)
+        y_out2_trans = np.transpose(y_out2, (0, 2, 3, 1))
+        self.__assert_close(y_out, y_out2_trans, "batch output")
+        print 'python: NHWC, NCHW, forward checking passed'
+
+        # test backward now
+        # NHWC
+        self.y_grad = np.random.random_sample(x_shape).astype(np.float32)
+        y_grad = self.y_grad
+        # y_grad = np.ones(x_shape).astype(np.float32)
+        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
+            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, "NHWC")
+
+        # NCHW
+        y_grad2 = np.transpose(y_grad, (0, 3, 1, 2))
+        # y_grad2 = np.ones(x_shape2).astype(np.float32)
+        x_grad_ref2, scale_grad_ref2, bias_grad_ref2 = _reference_grad(
+            x_val2, y_grad2, scale_val, saved_mean2, var_ref2, epsilon, "NCHW")
+
+        self.__assert_close(scale_grad_ref, scale_grad_ref2, "scale gradient")
+        self.__assert_close(bias_grad_ref, bias_grad_ref2, "bias gradient")
+
+        x_grad_transpose = np.transpose(x_grad_ref2, (0, 2, 3, 1))
+        self.__assert_close(x_grad_ref, x_grad_transpose, "x gradient")
+        print 'python: NHWC, NCHW, backward checking passed'
+
+    def test_forward_backward(self):
+        def test_with_place(place, data_layout, shape):
+            # attr
+            epsilon = 0.00001
+            momentum = 0.9
+
+            if len(shape) == 2:
+                x_shape = shape
+                c = shape[1]
+            else:
+                # n, h, w, c = 2, 3, 4, 2
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
+                if data_format == "NHWC":
+                    x_shape = [n, h, w, c]
+                elif data_format == "NCHW":
+                    x_shape = [n, c, h, w]
+                else:
+                    raise ValueError("Unknown data type.")
+            scale_shape = [c]
+
+            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+            mean = np.zeros(scale_shape).astype(np.float32)
+            variance = np.ones(scale_shape).astype(np.float32)
+
+            # run forward
+            y_out, saved_mean, var_ref = _reference_training(
+                x_val, scale_val, bias_val, epsilon, data_layout)
+
+            # update moving mean and variance
+            mean_out = saved_mean * (1. - momentum) + momentum * mean
+            variance_out = var_ref * (1. - momentum) + momentum * variance
+            saved_variance = 1. / np.sqrt(var_ref + epsilon)
+
+            # for gradient test
+            # y_grad = np.ones(x_shape).astype(np.float32)
+            y_grad = np.zeros(x_shape).astype(np.float32)
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
+                y_grad[0, 0, 0, 0] = 1.
+            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
+            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
+                x_val, y_grad, scale_val, saved_mean, var_ref, epsilon,
+                data_layout)
+
+            scope = core.Scope()
+
+            # create input
+            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
+            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
+                                                place)
+            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
+                                               place)
+            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
+            variance_tensor = create_or_get_tensor(scope, "variance", variance,
+                                                   place)
+
+            # create output
+            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
+            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
+                                                     place)
+            saved_variance_tensor = create_or_get_tensor(
+                scope, "saved_variance", None, place)
+            mean_out_tensor = mean_tensor
+            variance_out_tensor = variance_tensor
+
+            batch_norm_op = Operator(
+                "batch_norm",
+                # inputs
+                X="x_val",
+                Scale="scale_val",
+                Bias="bias_val",
+                Mean="mean",
+                Variance="variance",
+                # outputs
+                Y="y_out",
+                MeanOut="mean",
+                VarianceOut="variance",
+                SavedMean="saved_mean",
+                SavedVariance="saved_variance",
+                # attrs
+                is_test=False,
+                data_layout=data_layout,
+                momentum=momentum,
+                epsilon=epsilon)
+
+            batch_norm_op.run(scope, place)
+
+            # check forward result
+            self.__assert_close(y_tensor, y_out, "y_out")
+            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
+            self.__assert_close(saved_variance_tensor, saved_variance,
+                                "saved_variance")
+            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
+            if isinstance(place, core.CUDAPlace):
+                atol = 5e-2
+            else:
+                atol = 1e-4
+            self.__assert_close(variance_out_tensor, variance_out,
+                                "variance_out", atol)
+            print "op test forward passed: ", str(place), data_layout
+
+            # run backward
+            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
+            set_output_grad(
+                scope,
+                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
+                place,
+                feed_dict={"y_out": y_grad})
+            batch_norm_op_grad.run(scope, place)
+
+            x_grad_tensor = create_or_get_tensor(scope,
+                                                 grad_var_name("x_val"), None,
+                                                 place)
+            scale_grad_tensor = create_or_get_tensor(scope,
+                                                     grad_var_name("scale_val"),
+                                                     None, place)
+            bias_grad_tensor = create_or_get_tensor(scope,
+                                                    grad_var_name("bias_val"),
+                                                    None, place)
+
+            # check gradient output
+            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
+            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
+            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
+            print "op test backward passed: ", str(place), data_layout
+
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            for data_format in ["NCHW", "NHWC"]:
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
new file mode 100644
index 0000000000..3674784985
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_beam_search_decode_op.py
@@ -0,0 +1,88 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+
+
+class TestBeamSearchDecodeOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self.cpu_place = core.CPUPlace()
+
+    def append_lod_tensor(self, tensor_array, lod, data):
+        lod_tensor = core.LoDTensor()
+        lod_tensor.set_lod(lod)
+        lod_tensor.set(data, self.cpu_place)
+        tensor_array.append(lod_tensor)
+
+    def test_get_set(self):
+        ids = self.scope.var("ids").get_lod_tensor_array()
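+        # LoD uses offset notation: level 0 [0, 3, 6] splits the six beam
+        # entries into two sentences of three beams each, and level 1 gives
+        # each beam's offsets into the flat id/score data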
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="int64"))
+        self.append_lod_tensor(
+            ids, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="int64"))
+
+        scores = self.scope.var("scores").get_lod_tensor_array()
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 2, 3, 4, 5, 6]],
+            np.array(
+                [1, 2, 3, 4, 5, 6], dtype="float64"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 1, 1, 3, 5, 5, 6]],
+            np.array(
+                [0, 1, 2, 3, 4, 5], dtype="float64"))
+        self.append_lod_tensor(
+            scores, [[0, 3, 6], [0, 0, 1, 2, 3, 4, 5]],
+            np.array(
+                [0, 1, 2, 3, 4], dtype="float64"))
+
+        sentence_ids = self.scope.var("sentence_ids").get_tensor()
+        sentence_scores = self.scope.var("sentence_scores").get_tensor()
+
+        beam_search_decode_op = Operator(
+            "beam_search_decode",
+            # inputs
+            Ids="ids",
+            Scores="scores",
+            # outputs
+            SentenceIds="sentence_ids",
+            SentenceScores="sentence_scores")
+
+        beam_search_decode_op.run(self.scope, self.cpu_place)
+
+        expected_lod = [[0, 4, 8], [0, 1, 3, 6, 9, 10, 13, 16, 19]]
+        self.assertEqual(sentence_ids.lod(), expected_lod)
+        self.assertEqual(sentence_scores.lod(), expected_lod)
+
+        expected_data = np.array(
+            [2, 1, 0, 3, 1, 0, 3, 2, 1, 5, 4, 3, 2, 4, 4, 3, 6, 5, 4], "int64")
+        self.assertTrue(np.array_equal(np.array(sentence_ids), expected_data))
+        self.assertTrue(
+            np.array_equal(np.array(sentence_scores), expected_data))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_beam_search_op.py b/python/paddle/v2/fluid/tests/test_beam_search_op.py
new file mode 100644
index 0000000000..4da463df26
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_beam_search_op.py
@@ -0,0 +1,78 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from paddle.v2.fluid.op import Operator, DynamicRecurrentOp
+import paddle.v2.fluid.core as core
+import unittest
+import numpy as np
+
+
+def create_tensor(scope, name, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class BeamSearchOpTester(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        self._create_ids()
+        self._create_scores()
+        self._create_pre_ids()
+        self.scope.var('selected_ids')
+        self.scope.var('selected_scores')
+
+    def test_run(self):
+        op = Operator(
+            'beam_search',
+            pre_ids="pre_ids",
+            ids='ids',
+            scores='scores',
+            selected_ids='selected_ids',
+            selected_scores='selected_scores',
+            level=0,
+            beam_size=2,
+            end_id=0, )
+        op.run(self.scope, core.CPUPlace())
+        selected_ids = self.scope.find_var("selected_ids").get_tensor()
+        print 'selected_ids', np.array(selected_ids)
+        print 'lod', selected_ids.lod()
+
+    def _create_pre_ids(self):
+        np_data = np.array([[1, 2, 3, 4]], dtype='int64')
+        tensor = create_tensor(self.scope, "pre_ids", np_data)
+
+    def _create_ids(self):
+        self.lod = [[0, 1, 4], [0, 1, 2, 3, 4]]
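+        # offset-based LoD: level 0 [0, 1, 4] groups the four candidate rows
+        # into two source items (one row, then three rows)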
+        np_data = np.array(
+            [[4, 2, 5], [2, 1, 3], [3, 5, 2], [8, 2, 1]], dtype='int64')
+        tensor = create_tensor(self.scope, "ids", np_data)
+        tensor.set_lod(self.lod)
+
+    def _create_scores(self):
+        np_data = np.array(
+            [
+                [0.5, 0.3, 0.2],
+                [0.6, 0.3, 0.1],
+                [0.9, 0.5, 0.1],
+                [0.7, 0.5, 0.1],
+            ],
+            dtype='float32')
+        tensor = create_tensor(self.scope, "scores", np_data)
+        tensor.set_lod(self.lod)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
new file mode 100644
index 0000000000..4b03f512c2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_bilinear_tensor_product_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestBilinearTensorProductOp(OpTest):
+    def setUp(self):
+        self.op_type = "bilinear_tensor_product"
+        batch_size = 6
+        size0 = 3
+        size1 = 4
+        size2 = 5
+        a = np.random.random((batch_size, size0)).astype("float32")
+        b = np.random.random((batch_size, size1)).astype("float32")
+        w = np.random.random((size2, size0, size1)).astype("float32")
+        bias = np.random.random((1, size2)).astype("float32")
+        output = np.zeros((batch_size, size2)).astype("float32")
+        for i in range(size2):
+            w_i = w[i, :, :]
+            output[:, i] = np.sum(np.matmul(a, w_i) * b, axis=1)
+        self.inputs = {
+            'X': a,
+            'Y': b,
+            'Weight': w,
+            'Bias': bias,
+        }
+        self.outputs = {'Out': output + bias}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y', 'Weight', 'Bias'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
new file mode 100644
index 0000000000..7413829897
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -0,0 +1,100 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def bipartite_match(distance, match_indices, match_dist):
+    """Bipartite Matching algorithm.
+    Arg:
+        distance (numpy.array) : The distance of two entries with shape [M, N].
+        match_indices (numpy.array): the matched indices from column to row
+            with shape [1, N], it must be initialized to -1.
+        match_dist (numpy.array): The matched distance from column to row
+            with shape [1, N], it must be initialized to 0.
+    """
+    match_pair = []
+    row, col = distance.shape
+    for i in range(row):
+        for j in range(col):
+            match_pair.append((i, j, distance[i][j]))
+
+    match_sorted = sorted(match_pair, key=lambda tup: tup[2], reverse=True)
+
+    row_indices = -1 * np.ones((row, ), dtype=np.int)
+
+    idx = 0
+    for i, j, dist in match_sorted:
+        if idx >= row:
+            break
+        if match_indices[j] == -1 and row_indices[i] == -1 and dist > 0:
+            match_indices[j] = i
+            row_indices[i] = j
+            match_dist[j] = dist
+            idx += 1
+
+
+def batch_bipartite_match(distance, lod):
+    """Bipartite Matching algorithm for batch input.
+    Arg:
+        distance (numpy.array) : The distance of two entries with shape [M, N].
+        lod (list of int): The offsets of each input in this batch.
+    """
+    n = len(lod) - 1
+    m = distance.shape[1]
+    match_indices = -1 * np.ones((n, m), dtype=np.int)
+    match_dist = np.zeros((n, m), dtype=np.float32)
+    for i in range(len(lod) - 1):
+        bipartite_match(distance[lod[i]:lod[i + 1], :], match_indices[i, :],
+                        match_dist[i, :])
+    return match_indices, match_dist
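+
+
+# A tiny worked example (editorial sketch, not used by the tests below):
+# greedy matching on the largest distances first pairs (row 0, col 1) and
+# then (row 1, col 0), so the column-to-row indices come out as [1, 0].
+def _demo_bipartite_match():
+    dist = np.array([[0.1, 0.9], [0.8, 0.2]], dtype=np.float32)
+    match_indices, match_dist = batch_bipartite_match(dist, [0, 2])
+    assert match_indices.tolist() == [[1, 0]]
+    assert np.allclose(match_dist, [[0.8, 0.9]])
+    return match_indices, match_dist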
+
+
+class TestBipartiteMatchOpWithLoD(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[0, 5, 11, 23]]
+        dist = np.random.random((23, 217)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
+
+        self.inputs = {'DistMat': (dist, lod)}
+        self.outputs = {
+            'ColToRowMatchIndices': (match_indices),
+            'ColToRowMatchDis': (match_dist),
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBipartiteMatchOpWithoutLoD(OpTest):
+    def setUp(self):
+        self.op_type = 'bipartite_match'
+        lod = [[0, 8]]
+        dist = np.random.random((8, 17)).astype('float32')
+        match_indices, match_dist = batch_bipartite_match(dist, lod[0])
+
+        self.inputs = {'DistMat': dist}
+        self.outputs = {
+            'ColToRowMatchIndices': match_indices,
+            'ColToRowMatchDis': match_dist,
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_calc_gradient.py b/python/paddle/v2/fluid/tests/test_calc_gradient.py
new file mode 100644
index 0000000000..c773e81768
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_calc_gradient.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import calc_gradient
+
+
+class TestCalcGradient(unittest.TestCase):
+    def test_calc_gradient(self):
+        x = layers.create_parameter(dtype="float32", shape=[5, 10])
+        y = layers.create_parameter(dtype="float32", shape=[10, 8])
+        mul_out = layers.mul(x=x, y=y)
+        mean_out = layers.mean(x=mul_out)
+        a = calc_gradient(mean_out, mul_out)
+        b = calc_gradient(mean_out, x)
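+        # `a` is d(mean_out)/d(mul_out) and `b` is d(mean_out)/dx; fetching
+        # both below checks that the generated gradient graph actually runs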
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[a, b])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_cast_op.py b/python/paddle/v2/fluid/tests/test_cast_op.py
new file mode 100644
index 0000000000..327b246ed8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cast_op.py
@@ -0,0 +1,40 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import op_test
+import unittest
+import numpy as np
+import paddle.v2.fluid.core as core
+
+
+class TestCastOp(op_test.OpTest):
+    def setUp(self):
+        ipt = np.random.random(size=[10, 10])
+        self.inputs = {'X': ipt.astype('float32')}
+        self.outputs = {'Out': ipt.astype('float64')}
+        self.attrs = {
+            'in_dtype': int(core.DataType.FP32),
+            'out_dtype': int(core.DataType.FP64)
+        }
+        self.op_type = 'cast'
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_grad(self):
+        self.check_grad(['X'], ['Out'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
new file mode 100644
index 0000000000..5c3efe9baa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
@@ -0,0 +1,199 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class Segment(object):
+    def __init__(self, chunk_type, start_idx, end_idx):
+        self.chunk_type = chunk_type
+        self.start_idx = start_idx
+        self.end_idx = end_idx
+
+    def __str__(self):
+        return '(Segment: %s, %s, %s)' % (self.chunk_type, self.start_idx,
+                                          self.end_idx)
+
+    __repr__ = __str__
+
+
+class TestChunkEvalOp(OpTest):
+    num_sequences = 5
+    batch_size = 50
+
+    def parse_scheme(self):
+        if self.scheme == 'IOB':
+            self.num_tag_types = 2
+        elif self.scheme == 'IOE':
+            self.num_tag_types = 2
+
+    def fill_with_chunks(self, data, chunks):
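+        # tag encoding: a chunk of type t spans tags t * num_tag_types ..
+        # t * num_tag_types + (num_tag_types - 1); for IOB the B tag comes
+        # first followed by I tags, for IOE the I tags come first, then E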
+        for chunk in chunks:
+            if self.scheme == 'IOB':
+                data[chunk.start_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.start_idx + 1:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                         self.num_tag_types - 1)
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1
+                ) if chunk.start_idx < chunk.end_idx else data[chunk.start_idx]
+            elif self.scheme == 'IOE':
+                data[chunk.start_idx:
+                     chunk.end_idx] = chunk.chunk_type * self.num_tag_types
+                data[chunk.end_idx] = chunk.chunk_type * self.num_tag_types + (
+                    self.num_tag_types - 1)
+
+    def rand_chunks(self, starts, num_chunks):
+        if num_chunks < 0:
+            num_chunks = np.random.randint(starts[-1])
+        chunks = []
+        # generate chunk beginnings
+        chunk_begins = sorted(
+            np.random.choice(
+                range(starts[-1]), num_chunks, replace=False))
+        seq_chunk_begins = []
+        begin_idx = 0
+        # divide chunks into sequences
+        for i in range(len(starts) - 1):
+            tmp_chunk_begins = []
+            while begin_idx < len(chunk_begins) and chunk_begins[
+                    begin_idx] < starts[i + 1]:
+                tmp_chunk_begins.append(chunk_begins[begin_idx])
+                begin_idx += 1
+            seq_chunk_begins.append(tmp_chunk_begins)
+        # generate chunk ends
+        chunk_ends = []
+        for i in range(len(seq_chunk_begins)):
+            for j in range(len(seq_chunk_begins[i])):
+                low = seq_chunk_begins[i][j]
+                high = seq_chunk_begins[i][j + 1] if j < len(seq_chunk_begins[
+                    i]) - 1 else starts[i + 1]
+                chunk_ends.append(np.random.randint(low, high))
+        # generate chunks
+        for chunk_pos in zip(chunk_begins, chunk_ends):
+            chunk_type = np.random.randint(self.num_chunk_types)
+            chunks.append(Segment(chunk_type, *chunk_pos))
+        return chunks
+
+    def gen_chunks(self, infer, label, starts):
+        chunks = self.rand_chunks(starts,
+                                  self.num_infer_chunks + self.num_label_chunks
+                                  - self.num_correct_chunks)
+        correct_chunks = np.random.choice(
+            range(len(chunks)), self.num_correct_chunks, replace=False)
+        infer_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in correct_chunks],
+            self.num_infer_chunks - self.num_correct_chunks,
+            replace=False)
+        infer_chunks = sorted(correct_chunks.tolist() + infer_chunks.tolist())
+        label_chunks = np.random.choice(
+            [x for x in range(len(chunks)) if x not in infer_chunks],
+            self.num_label_chunks - self.num_correct_chunks,
+            replace=False)
+        label_chunks = sorted(correct_chunks.tolist() + label_chunks.tolist())
+        self.fill_with_chunks(infer, [chunks[idx] for idx in infer_chunks])
+        self.fill_with_chunks(label, [chunks[idx] for idx in label_chunks])
+        # exclude types in excluded_chunk_types
+        if len(self.excluded_chunk_types) > 0:
+            for idx in correct_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_correct_chunks -= 1
+            for idx in infer_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_infer_chunks -= 1
+            for idx in label_chunks:
+                if chunks[idx].chunk_type in self.excluded_chunk_types:
+                    self.num_label_chunks -= 1
+        return self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks
+
+    def set_confs(self):
+        # Use the IOB scheme and labels with 2 chunk types
+        self.scheme = 'IOB'
+        self.num_chunk_types = 2
+        self.excluded_chunk_types = []
+        self.other_chunk_type = self.num_chunk_types
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9
+
+    def set_data(self):
+        infer = np.zeros((self.batch_size, )).astype('int64')
+        infer.fill(self.num_chunk_types * self.num_tag_types)
+        label = np.copy(infer)
+        starts = np.random.choice(
+            range(1, self.batch_size), self.num_sequences - 1,
+            replace=False).tolist()
+        starts.extend([0, self.batch_size])
+        starts = sorted(starts)
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = self.gen_chunks(
+            infer, label, starts)
+        self.inputs = {
+            'Inference': (infer, [starts]),
+            'Label': (label, [starts])
+        }
+        precision = float(
+            self.num_correct_chunks
+        ) / self.num_infer_chunks if self.num_infer_chunks else 0
+        recall = float(self.num_correct_chunks
+                       ) / self.num_label_chunks if self.num_label_chunks else 0
+        f1 = float(2 * precision * recall) / (
+            precision + recall) if self.num_correct_chunks else 0
+        self.outputs = {
+            'Precision': np.asarray(
+                [precision], dtype='float32'),
+            'Recall': np.asarray(
+                [recall], dtype='float32'),
+            'F1-Score': np.asarray(
+                [f1], dtype='float32'),
+            'NumInferChunks': np.asarray(
+                [self.num_infer_chunks], dtype='int64'),
+            'NumLabelChunks': np.asarray(
+                [self.num_label_chunks], dtype='int64'),
+            'NumCorrectChunks': np.asarray(
+                [self.num_correct_chunks], dtype='int64')
+        }
+
+    def setUp(self):
+        self.op_type = 'chunk_eval'
+        self.set_confs()
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestChunkEvalOpWithExclude(TestChunkEvalOp):
+    def set_confs(self):
+        # Use the IOE scheme and labels with 3 chunk types
+        self.scheme = 'IOE'
+        self.num_chunk_types = 3
+        self.excluded_chunk_types = [1]
+        self.other_chunk_type = self.num_chunk_types
+        self.attrs = {
+            'num_chunk_types': self.num_chunk_types,
+            'chunk_scheme': self.scheme,
+            'excluded_chunk_types': self.excluded_chunk_types
+        }
+        self.parse_scheme()
+        self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 15, 18, 20
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
new file mode 100644
index 0000000000..b30f321c79
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_clip_by_norm_op.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestClipByNormOp(OpTest):
+    def setUp(self):
+        self.max_relative_error = 0.006
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        input[np.abs(input) < self.max_relative_error] = 0.5
+        self.op_type = "clip_by_norm"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['max_norm'] = self.max_norm
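+        # reference semantics: rescale X onto the L2 ball of radius max_norm
+        # only when its global norm exceeds max_norm; otherwise pass through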
+        norm = np.sqrt(np.sum(np.square(input)))
+        if norm > self.max_norm:
+            output = self.max_norm * input / norm
+        else:
+            output = input
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1.0
+
+
+class TestCase1(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (100, )
+        self.max_norm = 1e20
+
+
+class TestCase2(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.max_norm = 0.1
+
+
+class TestCase3(TestClipByNormOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max_norm = 1.0
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_clip_op.py b/python/paddle/v2/fluid/tests/test_clip_op.py
new file mode 100644
index 0000000000..ef0b75e286
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_clip_op.py
@@ -0,0 +1,72 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestClipOp(OpTest):
+    def setUp(self):
+        self.max_relative_error = 0.006
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        input[np.abs(input - self.min) < self.max_relative_error] = 0.5
+        input[np.abs(input - self.max) < self.max_relative_error] = 0.5
+        self.op_type = "clip"
+        self.inputs = {'X': input, }
+        self.attrs = {}
+        self.attrs['min'] = self.min
+        self.attrs['max'] = self.max
+        self.outputs = {
+            'Out': np.clip(self.inputs['X'], self.attrs['min'],
+                           self.attrs['max'])
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=self.max_relative_error)
+
+    def initTestCase(self):
+        self.shape = (4, 4)
+        self.max = 0.7
+        self.min = 0.1
+
+
+class TestCase1(TestClipOp):
+    def initTestCase(self):
+        self.shape = (8, 16, 8)
+        self.max = 0.7
+        self.min = 0.0
+
+
+class TestCase2(TestClipOp):
+    def initTestCase(self):
+        self.shape = (8, 16)
+        self.max = 1.0
+        self.min = 0.0
+
+
+class TestCase3(TestClipOp):
+    def initTestCase(self):
+        self.shape = (4, 8, 16)
+        self.max = 0.7
+        self.min = 0.2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_compare_op.py b/python/paddle/v2/fluid/tests/test_compare_op.py
new file mode 100644
index 0000000000..c9be80fc45
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_compare_op.py
@@ -0,0 +1,44 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import op_test
+import unittest
+import numpy
+
+
+def create_test_class(op_type, typename, callback):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = numpy.random.random(size=(10, 7)).astype(typename)
+            b = numpy.random.random(size=(10, 7)).astype(typename)
+            c = callback(a, b)
+            self.inputs = {'X': a, 'Y': b}
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+
+        def test_output(self):
+            self.check_output()
+
+    cls_name = "{0}_{1}".format(op_type, typename)
+    Cls.__name__ = cls_name
+    globals()[cls_name] = Cls
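+    # binding the class into globals() lets unittest's default loader pick
+    # up each generated case by its module-level name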
+
+
+for _type_name in {'float32', 'float64', 'int32', 'int64'}:
+    create_test_class('less_than', _type_name, lambda _a, _b: _a < _b)
+    create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b)
+    create_test_class('equal', _type_name, lambda _a, _b: _a == _b)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_concat_op.py b/python/paddle/v2/fluid/tests/test_concat_op.py
new file mode 100644
index 0000000000..ea0a95ebec
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_concat_op.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestConcatOp(OpTest):
+    def setUp(self):
+        self.op_type = "concat"
+        x0 = np.random.random((2, 1, 4, 5)).astype('float32')
+        x1 = np.random.random((2, 2, 4, 5)).astype('float32')
+        x2 = np.random.random((2, 3, 4, 5)).astype('float32')
+        axis = 1
+        self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]}
+        self.attrs = {'axis': axis}
+        self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_cond_op.py b/python/paddle/v2/fluid/tests/test_cond_op.py
new file mode 100644
index 0000000000..4b7ca0963e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cond_op.py
@@ -0,0 +1,131 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import paddle.v2.fluid.core as core
+import unittest
+import numpy as np
+from paddle.v2.fluid.op import Operator, CondOp
+
+
+class PySimpleCond(object):
+    '''
+    A simple implementation of dynamic if-else based on numpy
+    '''
+
+    def __init__(self):
+        array = [1] * 10
+        for i in range(1, 10, 2):
+            array[i] = 0
+        self.cond = np.array(array)
+        self.x = np.ones(shape=(10, 1)).astype("float32")
+
+    def forward(self):
+        self.index_t = np.where(self.cond == 1)
+        self.index_f = np.where(self.cond == 0)
+        y_t = self.x[self.index_t]
+        y_f = self.x[self.index_f]
+        y_t = y_t * 2.
+        y_f = y_f * (-2.)
+        output = np.zeros(shape=(10, 1))
+        output[self.index_t] = y_t
+        output[self.index_f] = y_f
+        return output
+
+
+class PySimpleCondTest(unittest.TestCase):
+    def setUp(self):
+        self.condnn = PySimpleCond()
+
+    def test_forward(self):
+        output = self.condnn.forward()
+
+
+def create_tensor(scope, name, shape, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class TestCondOp(unittest.TestCase):
+    '''
+    Test CondOp
+
+    equation:
+        cond = [True, False, True, False, ...]
+        y[index_t] = x[index_t] * 2.
+        y[index_f] = x[index_f] * -2.
+    outputs:
+        y
+    '''
+
+    def setUp(self):
+        self.py_cond = PySimpleCond()
+
+    def forward(self):
+        self.scope = core.Scope()
+        self.create_global_variables()
+        self.create_cond_op()
+        self.create_sub_net()
+        self.condop.run(self.scope, core.CPUPlace())
+        return np.array(self.scope.find_var("Out").get_tensor())
+
+    def create_global_variables(self):
+        x_np_data = self.py_cond.x
+        create_tensor(self.scope, "X", [10, 1], x_np_data)
+        cond_np_data = self.py_cond.cond.astype("int32")
+        create_tensor(self.scope, "cond", [10, 1], cond_np_data)
+        self.scope.var("SubScopes")
+        self.scope.var("IndexTensors")
+        self.scope.var("Out")
+
+    def create_cond_op(self):
+        self.condop = CondOp(
+            Cond="cond",
+            Xs=["X"],
+            Outs=["Out"],
+            SubScopes="SubScopes",
+            IndexTensors="IndexTensors")
+
+    def create_sub_net(self):
+        truenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=2.)
+        truenet.append_op(scale_op_t)
+        truenet.complete_add_op(True)
+        self.condop.set_truenet(truenet)
+
+        falsenet = core.Net.create()
+        scale_op_t = Operator("scale", X='X', Out='Out', scale=-2.)
+        falsenet.append_op(scale_op_t)
+        falsenet.complete_add_op(True)
+        self.condop.set_falsenet(falsenet)
+
+    def test_forward(self):
+        print 'test cond op forward'
+        pd_output = self.forward()
+        py_output = self.py_cond.forward()
+        print 'pd_output', pd_output
+        print
+        print 'py_output', py_output
+        self.assertEqual(pd_output.shape, py_output.shape)
+        print 'test passed'
+        return 0
+
+
+if __name__ == "__main__":
+    exit(
+        0
+    )  # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py
new file mode 100644
index 0000000000..5ee729cfee
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+import numpy
+
+
+class ConditionalBlock(unittest.TestCase):
+    def test_forward(self):
+        data = layers.data(name='X', shape=[1], dtype='float32')
+        data.stop_gradient = False
+        cond = layers.ConditionalBlock(inputs=[data])
+        out = layers.create_tensor(dtype='float32')
+        with cond.block():
+            hidden = layers.fc(input=data, size=10)
+            layers.assign(hidden, out)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        exe.run(default_startup_program())
+
+        x = numpy.random.random(size=(10, 1)).astype('float32')
+
+        outs = exe.run(feed={'X': x}, fetch_list=[out])[0]
+        print outs
+        loss = layers.mean(x=out)
+        append_backward(loss=loss)
+        outs = exe.run(
+            feed={'X': x},
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
+        print outs
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_const_value.py b/python/paddle/v2/fluid/tests/test_const_value.py
new file mode 100644
index 0000000000..d5b7cfded1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_const_value.py
@@ -0,0 +1,28 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class TestConstValue(unittest.TestCase):
+    def test_const_value(self):
+        self.assertEqual(framework.GRAD_VAR_SUFFIX, "@GRAD")
+        self.assertEqual(framework.TEMP_VAR_NAME, "@TEMP@")
+        self.assertEqual(framework.ZERO_VAR_SUFFIX, "@ZERO")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
new file mode 100644
index 0000000000..24de74d730
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -0,0 +1,250 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def conv2d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_h, in_w = input.shape
+    out_c, f_c, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
+        'dilation']
+    out_h = 1 + (in_h + 2 * pad[0] - (dilation[0] * (f_h - 1) + 1)) / stride[0]
+    out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1]
+    out = np.zeros((in_n, out_c, out_h, out_w))
+
+    d_block_h = (dilation[0] * (f_h - 1) + 1)
+    d_block_w = (dilation[1] * (f_w - 1) + 1)
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
+                       mode='constant',
+                       constant_values=0)
+
+    filter_dilation = np.zeros((out_c, f_c, d_block_h, d_block_w))
+    filter_dilation[:, :, 0:d_block_h:dilation[0], 0:d_block_w:dilation[
+        1]] = filter
+
+    for i in range(out_h):
+        for j in range(out_w):
+            for g in range(group):
+                input_pad_masked = \
+                    input_pad[:, g * f_c:(g + 1) * f_c,
+                    i * stride[0]:i * stride[0] + d_block_h,
+                    j * stride[1]:j * stride[1] + d_block_w]
+
+                f_sub = filter_dilation[g * sub_out_c:(g + 1) *
+                                        sub_out_c, :, :, :]
+                for k in range(sub_out_c):
+                    out[:, g * sub_out_c + k, i, j] = \
+                        np.sum(input_pad_masked * f_sub[k, :, :, :],
+                               axis=(1, 2, 3))
+
+    return out
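+
+
+# Shape sanity check (editorial sketch, not exercised by the tests): with a
+# 5x5 input, a 3x3 filter, stride 1, no padding and no dilation, the output
+# is 3x3 since out = 1 + (5 + 0 - 3) / 1 = 3.
+def _demo_conv2d_shape():
+    x = np.random.random((2, 3, 5, 5)).astype("float32")
+    w = np.random.random((6, 3, 3, 3)).astype("float32")
+    param = {'stride': [1, 1], 'pad': [0, 0], 'dilation': [1, 1]}
+    out = conv2d_forward_naive(x, w, 1, param)
+    assert out.shape == (2, 6, 3, 3)
+    return out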
+
+
+class TestConv2dOp(OpTest):
+    def setUp(self):
+        self.use_cudnn = False
+        self.init_op_type()
+        self.init_group()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv2d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilation': self.dilations
+        }
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv2d_forward_naive(input, filter, self.groups,
+                                      conv2d_param).astype('float32')
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'Filter']),
+                'Output',
+                max_relative_error=0.02)
+        else:
+            self.check_grad(
+                set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def test_check_grad_no_filter(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad(
+                ['Input'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad(
+                ['Filter'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv2d"
+
+
+class TestWithPad(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithStride(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.input_size = [2, 3, 6, 6]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+
+class TestWithGroup(TestConv2dOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithDilation(TestConv2dOp):
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.input_size = [2, 3, 10, 10]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [2, 2]
+
+    def init_group(self):
+        self.groups = 3
+
+
+# ---------------- Conv2dCUDNN ----------------
+class TestCUDNN(TestConv2dOp):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
+class TestCUDNNWithPad(TestWithPad):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
+class TestCUDNNWithStride(TestWithStride):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
+class TestCUDNNWithGroup(TestWithGroup):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
+class TestCUDNNWith1x1(TestWith1x1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d"
+
+
+# cuDNN v5 does not support dilated convolution.
+# class TestCUDNNWithDilation(TestWithDilation):
+#     def init_op_type(self):
+#         self.use_cudnn = True
+#         self.op_type = "conv2d"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
new file mode 100644
index 0000000000..0c76e222c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv2d_transpose_op.py
@@ -0,0 +1,217 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def conv2dtranspose_forward_naive(input_, filter_, attrs):
+    in_n, in_c, in_h, in_w = input_.shape
+    f_c, out_c, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+    d_block_h = dilations[0] * (f_h - 1) + 1
+    d_block_w = dilations[1] * (f_w - 1) + 1
+    out_h = (in_h - 1) * stride[0] + d_block_h
+    out_w = (in_w - 1) * stride[1] + d_block_w
+
+    out = np.zeros((in_n, out_c, out_h, out_w))
+
+    for n in range(in_n):
+        for i in range(in_h):
+            for j in range(in_w):
+                input_masked = input_[n, :, i, j]  # (c)
+                input_masked = np.reshape(input_masked, (in_c, 1, 1))
+                input_masked = np.tile(input_masked, (1, f_h, f_w))
+
+                for k in range(out_c):
+                    tmp_out = np.sum(input_masked * filter_[:, k, :, :], axis=0)
+                    i1, i2 = i * stride[0], i * stride[0] + d_block_h
+                    j1, j2 = j * stride[1], j * stride[1] + d_block_w
+                    out[n, k, i1:i2:dilations[0], j1:j2:dilations[1]] += tmp_out
+
+    out = out[:, :, pad[0]:out_h - pad[0], pad[1]:out_w - pad[1]]
+    return out
+
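+# Illustrative note (added for clarity): before cropping by `pad`, the
+# transposed-convolution output size is out = (in - 1) * stride + d_block.
+# E.g. in_h = 5, stride = 1, f_h = 3, dilation = 1 gives 4 + 3 = 7, and a
+# pad of 1 on each side crops it back to 5.
+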
+
+class TestConv2dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.use_cudnn = False
+        self.init_op_type()
+        self.init_test_case()
+
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
+        }
+
+        output = conv2dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype('float32')
+
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad_no_input(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad(
+                ['Filter'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Input']))
+
+    def test_check_grad_no_filter(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad(
+                ['Input'],
+                'Output',
+                max_relative_error=0.02,
+                no_grad_set=set(['Filter']))
+
+    def test_check_grad(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'Filter']),
+                'Output',
+                max_relative_error=0.02)
+        else:
+            self.check_grad(
+                set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2d_transpose"
+
+
+class TestWithPad(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+class TestWithStride(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+class TestWithDilation(TestConv2dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [2, 2]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+
+# ------------ test_cudnn ------------
+class TestCUDNN(TestConv2dTransposeOp):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
+class TestCUDNNWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
+class TestCUDNNWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1]
+        self.stride = [2, 2]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv2d_transpose"
+
+
+# cuDNN v5 does not support dilated convolution.
+# class TestCUDNNWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1]
+#         self.stride = [2, 2]
+#         self.dilations = [2, 2]
+#         self.input_size = [2, 3, 5, 5]  # NCHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3]
+#
+#     def init_op_type(self):
+#         self.use_cudnn = True
+#         self.op_type = "conv2d_transpose"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py
new file mode 100644
index 0000000000..8121e32865
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py
@@ -0,0 +1,250 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def conv3d_forward_naive(input, filter, group, conv_param):
+    in_n, in_c, in_d, in_h, in_w = input.shape
+    out_c, f_c, f_d, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    sub_out_c = out_c / group
+
+    stride, pad, dilation = conv_param['stride'], conv_param['pad'], conv_param[
+        'dilations']
+
+    out_d = 1 + (in_d + 2 * pad[0] - (dilation[0] * (f_d - 1) + 1)) / stride[0]
+    out_h = 1 + (in_h + 2 * pad[1] - (dilation[1] * (f_h - 1) + 1)) / stride[1]
+    out_w = 1 + (in_w + 2 * pad[2] - (dilation[2] * (f_w - 1) + 1)) / stride[2]
+
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    d_block_d = (dilation[0] * (f_d - 1) + 1)
+    d_block_h = (dilation[1] * (f_h - 1) + 1)
+    d_block_w = (dilation[2] * (f_w - 1) + 1)
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], ),
+                               (pad[2], )),
+                       mode='constant',
+                       constant_values=0)
+
+    filter_dilation = np.zeros((out_c, f_c, d_block_d, d_block_h, d_block_w))
+    filter_dilation[:, :, 0:d_block_d:dilation[0], 0:d_block_h:dilation[1], 0:
+                    d_block_w:dilation[2]] = filter
+
+    for d in range(out_d):
+        for i in range(out_h):
+            for j in range(out_w):
+                for g in range(group):
+                    input_pad_masked = \
+                        input_pad[:, g * f_c:(g + 1) * f_c,
+                        d * stride[0]:d * stride[0] + d_block_d,
+                        i * stride[1]:i * stride[1] + d_block_h,
+                        j * stride[2]:j * stride[2] + d_block_w]
+
+                    f_sub = filter_dilation[g * sub_out_c:(g + 1) *
+                                            sub_out_c, :, :, :, :]
+                    for k in range(sub_out_c):
+                        out[:, g * sub_out_c + k, d, i, j] = \
+                            np.sum(input_pad_masked * f_sub[k, :, :, :, :],
+                                   axis=(1, 2, 3, 4))
+
+    return out
+
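+# Illustrative note (added for clarity): the same size formula as conv2d,
+# extended to depth.  For the default case below, in_d = 4, pad = 0,
+# dilation = 1, f_d = 3 and stride = 1 give out_d = 1 + (4 + 0 - 3) / 1 = 2.
+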
+
+class TestConv3dOp(OpTest):
+    def setUp(self):
+        self.use_cudnn = False
+        self.init_group()
+        self.init_op_type()
+        self.init_dilation()
+        self.init_test_case()
+
+        conv3d_param = {
+            'stride': self.stride,
+            'pad': self.pad,
+            'dilations': self.dilations
+        }
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv3d_forward_naive(input, filter, self.groups,
+                                      conv3d_param).astype("float32")
+
+        self.inputs = {'Input': input, 'Filter': filter}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'Filter']),
+                'Output',
+                max_relative_error=0.03)
+        else:
+            self.check_grad(
+                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+
+    def test_check_grad_no_filter(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad(
+                ['Input'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad(
+                ['Filter'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+    def init_dilation(self):
+        self.dilations = [1, 1, 1]
+
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv3d"
+
+
+class TestCase1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3, 3]
+
+
+class TestWithGroup1(TestConv3dOp):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithGroup2(TestCase1):
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWith1x1(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 4, 4, 4]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 1, 1, 1]
+
+    def init_dilation(self):
+        self.dilations = [1, 1, 1]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestWithDilation(TestConv3dOp):
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.input_size = [2, 3, 6, 6, 6]  # NCDHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 2, 2, 2]
+
+    def init_dilation(self):
+        self.dilations = [2, 2, 2]
+
+    def init_group(self):
+        self.groups = 3
+
+
+class TestCUDNN(TestConv3dOp):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d"
+
+
+class TestWithGroup1CUDNN(TestWithGroup1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d"
+
+
+class TestWithGroup2CUDNN(TestWithGroup2):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d"
+
+
+class TestWith1x1CUDNN(TestWith1x1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d"
+
+
+# FIXME(typhoonzero): find a way to determine from Python
+# whether cuDNN > v6 is available.
+# class TestWithDilationCUDNN(TestWithDilation):
+#     def init_op_type(self):
+#         self.use_cudnn = True
+#         self.op_type = "conv3d"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
new file mode 100644
index 0000000000..4934c5a34e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv3d_transpose_op.py
@@ -0,0 +1,224 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def conv3dtranspose_forward_naive(input_, filter_, attrs):
+    in_n, in_c, in_d, in_h, in_w = input_.shape
+    f_c, out_c, f_d, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride, pad, dilations = attrs['strides'], attrs['paddings'], attrs[
+        'dilations']
+
+    d_block_d = dilations[0] * (f_d - 1) + 1
+    d_block_h = dilations[1] * (f_h - 1) + 1
+    d_block_w = dilations[2] * (f_w - 1) + 1
+    out_d = (in_d - 1) * stride[0] + d_block_d
+    out_h = (in_h - 1) * stride[1] + d_block_h
+    out_w = (in_w - 1) * stride[2] + d_block_w
+    out = np.zeros((in_n, out_c, out_d, out_h, out_w))
+
+    for n in range(in_n):
+        for d in range(in_d):
+            for i in range(in_h):
+                for j in range(in_w):
+                    input_masked = input_[n, :, d, i, j]  # (c)
+                    input_masked = np.reshape(input_masked, (in_c, 1, 1, 1))
+                    input_masked = np.tile(input_masked, (1, f_d, f_h, f_w))
+
+                    for k in range(out_c):
+                        tmp_out = np.sum(input_masked * filter_[:, k, :, :, :],
+                                         axis=0)
+                        d1, d2 = d * stride[0], d * stride[0] + d_block_d
+                        i1, i2 = i * stride[1], i * stride[1] + d_block_h
+                        j1, j2 = j * stride[2], j * stride[2] + d_block_w
+                        out[n, k, d1:d2:dilations[0], i1:i2:dilations[1], j1:j2:
+                            dilations[2]] += tmp_out
+
+    out = out[:, :, pad[0]:out_d - pad[0], pad[1]:out_h - pad[1], pad[2]:out_w -
+              pad[2]]
+    return out
+
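+# Illustrative note (added for clarity): per dimension, out = (in - 1) *
+# stride + d_block before cropping.  For the default case below (in = 5,
+# stride = 1, f = 3, dilation = 1, pad = 0) each spatial dimension becomes
+# 4 + 3 = 7.
+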
+
+class TestConv3dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.use_cudnn = False
+        self.init_op_type()
+        self.init_test_case()
+
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            'dilations': self.dilations,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
+        }
+
+        output = conv3dtranspose_forward_naive(input_, filter_,
+                                               self.attrs).astype("float32")
+
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['Input', 'Filter']),
+                'Output',
+                max_relative_error=0.03)
+        else:
+            self.check_grad(
+                set(['Input', 'Filter']), 'Output', max_relative_error=0.03)
+
+    def test_check_grad_no_filter(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Input'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Filter']))
+        else:
+            self.check_grad(
+                ['Input'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, ['Filter'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Input']))
+        else:
+            self.check_grad(
+                ['Filter'],
+                'Output',
+                max_relative_error=0.03,
+                no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0, 0]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv3d_transpose"
+
+
+class TestWithPad(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithStride(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+class TestWithDilation(TestConv3dTransposeOp):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [2, 2, 2]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+
+# ------------ test_cudnn ------------
+class TestCUDNN(TestConv3dTransposeOp):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
+class TestCUDNNWithPad(TestWithPad):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [1, 1, 1]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
+class TestCUDNNWithStride(TestWithStride):
+    def init_test_case(self):
+        self.pad = [1, 1, 1]
+        self.stride = [2, 2, 2]
+        self.dilations = [1, 1, 1]
+        self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3, 3]
+
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "conv3d_transpose"
+
+
+# cuDNN v5 does not support dilated convolution.
+# class TestCUDNNWithDilation(TestWithDilation):
+#     def init_test_case(self):
+#         self.pad = [1, 1, 1]
+#         self.stride = [2, 2, 2]
+#         self.dilations = [2, 2, 2]
+#         self.input_size = [2, 3, 5, 5, 5]  # NCDHW
+#         f_c = self.input_size[1]
+#         self.filter_size = [f_c, 6, 3, 3, 3]
+#
+#     def init_op_type(self):
+#         self.use_cudnn = True
+#         self.op_type = "conv3d_transpose"
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_conv_shift_op.py b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
new file mode 100644
index 0000000000..7029d5a2eb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_conv_shift_op.py
@@ -0,0 +1,61 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv_shift_forward(x, y):
+    out = np.zeros_like(x)
+    M = x.shape[1]
+    N = y.shape[1]
+    y_half_width = (N - 1) / 2
+    for i in xrange(M):
+        for j in xrange(N):
+            out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
+    return out
+
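+# Illustrative note (added for clarity): conv_shift combines each row of x
+# with the matching row of y via circular shifts, centred on y's middle
+# element.  E.g. with x = [1, 2, 3] and y = [0, 1, 0] (y_half_width = 1),
+# only j = 1 contributes and the output is x itself.
+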
+
+class TestConvShiftOp(OpTest):
+    def setUp(self):
+        self.op_type = "conv_shift"
+
+        batch_size = 4
+        x_dim = 17
+        y_dim = 3  # must be odd and <= x_dim
+        x = np.random.random((batch_size, x_dim)).astype("float32")
+        y = np.random.random((batch_size, y_dim)).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+
+        out = conv_shift_forward(x, y)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_cos_sim_op.py b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
new file mode 100644
index 0000000000..33db12ba9c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cos_sim_op.py
@@ -0,0 +1,107 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestCosSimOp(OpTest):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5)).astype("float32"),
+            'Y': np.random.random((6, 5)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.06)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.06, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Y'))
+
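+# The variants below exercise broadcasting (a Y with leading dimension 1 is
+# shared across the batch) and higher-rank inputs; in every case the
+# expected value is cos(X, Y) = sum(X * Y) / (||X|| * ||Y||) per sample.
+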
+
+class TestCosSimOp2(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5)).astype("float32"),
+            'Y': np.random.random((1, 5)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=1)
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=1)
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=1) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+
+
+class TestCosSimOp3(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5, 2)).astype("float32"),
+            'Y': np.random.random((6, 5, 2)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+
+
+class TestCosSimOp4(TestCosSimOp):
+    def setUp(self):
+        self.op_type = "cos_sim"
+        self.inputs = {
+            'X': np.random.random((6, 5, 2)).astype("float32"),
+            'Y': np.random.random((1, 5, 2)).astype("float32")
+        }
+        expect_x_norm = np.linalg.norm(self.inputs['X'], axis=(1, 2))
+        expect_y_norm = np.linalg.norm(self.inputs['Y'], axis=(1, 2))
+        expect_out = (self.inputs['X'] * self.inputs['Y']).sum(axis=(1, 2)) / \
+            expect_x_norm / expect_y_norm
+        self.outputs = {
+            'XNorm': np.expand_dims(expect_x_norm, 1),
+            'YNorm': np.expand_dims(expect_y_norm, 1),
+            'Out': np.expand_dims(expect_out, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_create_op_doc_string.py b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
new file mode 100644
index 0000000000..2b7951ecea
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_create_op_doc_string.py
@@ -0,0 +1,25 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.layers as layers
+
+
+class TestDocString(unittest.TestCase):
+    def test_layer_doc_string(self):
+        print(layers.dropout.__doc__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
new file mode 100644
index 0000000000..f819387cdc
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
@@ -0,0 +1,160 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import random
+import numpy as np
+
+from op_test import OpTest
+
+
+class CRFDecoding(object):
+    def __init__(self, emission_weights, transition_weights,
+                 seq_start_positions):
+        assert (emission_weights.shape[0] == seq_start_positions[-1])
+        self.tag_num = emission_weights.shape[1]
+        self.seq_num = len(seq_start_positions) - 1
+
+        self.seq_start_positions = seq_start_positions
+        self.x = emission_weights
+
+        self.a = transition_weights[0, :]
+        self.b = transition_weights[1, :]
+        self.w = transition_weights[2:, :]
+
+        self.track = np.zeros(
+            (seq_start_positions[-1], self.tag_num), dtype="int64")
+        self.decoded_path = np.zeros(
+            (seq_start_positions[-1], 1), dtype="int64")
+
+    def _decode_one_sequence(self, decoded_path, x):
+        seq_len, tag_num = x.shape
+        alpha = np.zeros((seq_len, tag_num), dtype="float64")
+        track = np.zeros((seq_len, tag_num), dtype="int64")
+
+        for i in range(tag_num):
+            alpha[0, i] = self.a[i] + x[0, i]
+
+        for k in range(1, seq_len):
+            for i in range(tag_num):
+                max_score = -np.finfo("float64").max
+                max_idx = 0
+                for j in range(tag_num):
+                    score = alpha[k - 1, j] + self.w[j, i]
+                    if score > max_score:
+                        max_score = score
+                        max_idx = j
+                alpha[k, i] = max_score + x[k, i]
+                track[k, i] = max_idx
+
+        max_score = -np.finfo("float64").max
+        max_idx = 0
+        for i in range(tag_num):
+            score = alpha[seq_len - 1, i] + self.b[i]
+            if score > max_score:
+                max_score = score
+                max_idx = i
+
+        decoded_path[-1] = max_idx
+        for i in range(seq_len - 1, 0, -1):
+            decoded_path[i - 1] = max_idx = track[i, max_idx]
+
+    def decode(self):
+        for i in range(self.seq_num):
+            start = self.seq_start_positions[i]
+            end = self.seq_start_positions[i + 1]
+            self._decode_one_sequence(self.decoded_path[start:end, :],
+                                      self.x[start:end, :])
+        return self.decoded_path
+
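+# Illustrative note (added for clarity): _decode_one_sequence is a plain
+# Viterbi pass with
+#     alpha[0, i] = a[i] + x[0, i],
+#     alpha[k, i] = max_j(alpha[k - 1, j] + w[j, i]) + x[k, i],
+# a stop weight b[i] added at the last step, and `track` used to backtrack
+# the argmax path.
+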
+
+class TestCRFDecodingOp1(OpTest):
+    """
+    Compare the dynamic-programming decoder against randomly generated
+    parameters and inputs, with no ground truth given.
+    """
+
+    def set_test_data(self):
+        SEQ_NUM = 3
+        TAG_NUM = 17
+        MAX_SEQ_LEN = 10
+
+        lod = [[0]]
+        for i in range(SEQ_NUM):
+            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
+        emission = np.random.uniform(-1, 1,
+                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+        transition = np.random.uniform(-0.5, 0.5,
+                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
+
+        self.inputs = {
+            "Emission": (emission, lod),
+            "Transition": transition,
+        }
+
+        decoder = CRFDecoding(emission, transition, lod[0])
+        decoded_path = decoder.decode()
+
+        self.outputs = {"ViterbiPath": decoded_path}
+
+    def setUp(self):
+        self.op_type = "crf_decoding"
+        self.set_test_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCRFDecodingOp2(OpTest):
+    """
+    Check the op's behaviour when the ground-truth Label is given: the
+    output marks which labels match the known optimal path (constructed
+    here so that the last tag always wins).
+    """
+
+    def setUp(self):
+        self.op_type = "crf_decoding"
+        TAG_NUM = 5
+
+        lod = [[0, 1, 3, 6, 10]]
+        transition = np.repeat(
+            np.arange(
+                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
+            TAG_NUM + 2,
+            axis=0)
+        emission = np.repeat(
+            np.arange(
+                TAG_NUM, dtype="float64").reshape(1, TAG_NUM),
+            lod[-1][-1],
+            axis=0)
+
+        labels = np.random.randint(
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+        predicted_labels = np.ones(
+            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+        expected_output = (labels == predicted_labels).astype("int64")
+
+        self.inputs = {
+            "Emission": (emission, lod),
+            "Transition": transition,
+            "Label": (labels, lod)
+        }
+
+        self.outputs = {"ViterbiPath": expected_output}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_crop_op.py b/python/paddle/v2/fluid/tests/test_crop_op.py
new file mode 100644
index 0000000000..36bf176168
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_crop_op.py
@@ -0,0 +1,105 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def crop(data, offsets, crop_shape):
+    def indexOf(shape, index):
+        result = []
+        for dim in reversed(shape):
+            result.append(index % dim)
+            index = index / dim
+        return result[::-1]
+
+    result = []
+    for i, value in enumerate(data.flatten()):
+        index = indexOf(data.shape, i)
+        selected = True
+        if len(index) == len(offsets):
+            for j, offset in enumerate(offsets):
+                selected = selected and (
+                    offset <= index[j] < crop_shape[j] + offset)
+            if selected:
+                result.append(value)
+    return np.array(result).reshape(crop_shape)
+
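+# Illustrative note (added for clarity): for the default case below,
+# crop(x, [1, 2], (2, 2)) on an 8x8 input is simply the numpy slice
+# x[1:3, 2:4].
+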
+
+class TestCropOp(OpTest):
+    def setUp(self):
+        self.op_type = "crop"
+        self.crop_by_input = False
+        self.attrs = {}
+        self.initTestCase()
+        self.attrs['offsets'] = self.offsets
+        if self.crop_by_input:
+            self.inputs = {
+                'X': np.random.random(self.x_shape).astype("float32"),
+                'Y': np.random.random(self.crop_shape).astype("float32")
+            }
+        else:
+            self.attrs['shape'] = self.crop_shape
+            self.inputs = {
+                'X': np.random.random(self.x_shape).astype("float32"),
+            }
+        self.outputs = {
+            'Out': crop(self.inputs['X'], self.offsets, self.crop_shape)
+        }
+
+    def initTestCase(self):
+        self.x_shape = (8, 8)
+        self.crop_shape = (2, 2)
+        self.offsets = [1, 2]
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.006)
+
+
+class TestCase1(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (16, 8, 32)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 5, 3]
+
+
+class TestCase2(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (4, 8)
+        self.crop_shape = [4, 8]
+        self.offsets = [0, 0]
+
+
+class TestCase3(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (4, 8, 16)
+        self.crop_shape = [2, 2, 3]
+        self.offsets = [1, 5, 3]
+        self.crop_by_input = True
+
+
+class TestCase4(TestCropOp):
+    def initTestCase(self):
+        self.x_shape = (4, 4)
+        self.crop_shape = [4, 4]
+        self.offsets = [0, 0]
+        self.crop_by_input = True
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
new file mode 100644
index 0000000000..ae8e9be6de
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_cross_entropy_op.py
@@ -0,0 +1,109 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest, randomize_probability
+
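+# Reference formulas checked below: with a discrete label,
+# Y_i = -log(X[i, label_i]); with soft labels, Y_i = -sum_j L_ij * log(X_ij).
+# A one-hot soft label reduces to the discrete case.
+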
+
+class TestCrossEntropyOp1(OpTest):
+    """Test cross-entropy with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        batch_size = 30
+        class_num = 10
+
+        X = randomize_probability(batch_size, class_num, dtype='float64')
+
+        label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64")
+        cross_entropy = np.asmatrix(
+            [[-np.log(X[i][label[i][0]])] for i in range(X.shape[0])],
+            dtype="float64")
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": False}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+
+
+class TestCrossEntropyOp2(OpTest):
+    """Test cross-entropy with vectorized soft labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        batch_size = 5
+        class_num = 37
+
+        X = randomize_probability(batch_size, class_num)
+        label = np.random.uniform(0.1, 1.0,
+                                  [batch_size, class_num]).astype("float32")
+        label /= label.sum(axis=1, keepdims=True)
+        cross_entropy = (-label * np.log(X)).sum(
+            axis=1, keepdims=True).astype("float32")
+
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+
+
+class TestCrossEntropyOp3(OpTest):
+    """Test cross-entropy with vectorized one-hot representation of labels.
+    """
+
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        batch_size = 5
+        class_num = 17
+
+        X = randomize_probability(batch_size, class_num)
+        label_index = np.random.randint(
+            0, class_num, (batch_size), dtype="int32")
+        label = np.zeros(X.shape)
+        label[np.arange(batch_size), label_index] = 1
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(X[i][label_index[i]])] for i in range(X.shape[0])],
+            dtype="float32")
+        cross_entropy2 = (-label * np.log(X)).sum(
+            axis=1, keepdims=True).astype("float32")
+        # A one-hot soft label must reproduce the discrete formulation.
+        np.testing.assert_allclose(
+            np.asarray(cross_entropy), cross_entropy2, rtol=1e-5)
+
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_ctc_align.py b/python/paddle/v2/fluid/tests/test_ctc_align.py
new file mode 100644
index 0000000000..773c69d1ad
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_ctc_align.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
+def CTCAlign(input, lod, blank, merge_repeated):
+    lod0 = lod[0]
+    result = []
+    for i in range(len(lod0) - 1):
+        prev_token = -1
+        for j in range(lod0[i], lod0[i + 1]):
+            token = input[j][0]
+            if (token != blank) and not (merge_repeated and
+                                         token == prev_token):
+                result.append(token)
+            prev_token = token
+    result = np.array(result).reshape([len(result), 1]).astype("int32")
+    return result
+
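+# Illustrative note (added for clarity): with blank = 0, the sequence
+# [0, 1, 2, 2, 0, 4] decodes to [1, 2, 4] when merge_repeated is True and
+# to [1, 2, 2, 4] when it is False.
+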
+
+class TestCTCAlignOp(OpTest):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 11, 18]]
+        self.blank = 0
+        self.merge_repeated = False
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0]).reshape(
+                [18, 1]).astype("int32")
+
+    def setUp(self):
+        self.config()
+        output = CTCAlign(self.input, self.input_lod, self.blank,
+                          self.merge_repeated)
+
+        self.inputs = {"Input": (self.input, self.input_lod), }
+        self.outputs = {"Output": output}
+        self.attrs = {
+            "blank": self.blank,
+            "merge_repeated": self.merge_repeated
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestCTCAlignOpCase1(TestCTCAlignOp):
+    def config(self):
+        self.op_type = "ctc_align"
+        self.input_lod = [[0, 11, 19]]
+        self.blank = 0
+        self.merge_repeated = True
+        self.input = np.array(
+            [0, 1, 2, 2, 0, 4, 0, 4, 5, 0, 6, 6, 0, 0, 7, 7, 7, 0, 0]).reshape(
+                [19, 1]).astype("int32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/v2/fluid/tests/test_data_feeder.py
new file mode 100644
index 0000000000..f967221015
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+
+
+def test_converter():
+    img = fluid.layers.data(name='image', shape=[1, 28, 28])
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
+    print(result)
+
+
+if __name__ == '__main__':
+    test_converter()
diff --git a/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
new file mode 100644
index 0000000000..78d4e3608e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_decayed_adagrad_op.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
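+# Update rule under test: moment_out = decay * moment + (1 - decay) * grad^2,
+# param_out = param - lr * grad / (sqrt(moment_out) + epsilon).  E.g. with
+# decay = 0.8, grad = 1 and moment = 0, moment_out = 0.2 and the step size
+# is 0.01 / (sqrt(0.2) + 1e-8) ~= 0.0224.
+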
+
+class TestDecayedAdagradOp1(OpTest):
+    ''' Test DecayedAdagrad operator with explicit attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "decayed_adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        decay = 0.80
+        epsilon = 1e-8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'decay': decay, 'epsilon': epsilon}
+
+        moment_out = decay * moment + (1 - decay) * grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDecayedAdagradOp2(OpTest):
+    ''' Test DecayedAdagrad operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "decayed_adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        decay = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'decay': decay, 'epsilon': epsilon}
+
+        moment_out = decay * moment + (1 - decay) * grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_default_scope_funcs.py b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
new file mode 100644
index 0000000000..5ff52f6d6b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_default_scope_funcs.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.v2.fluid.default_scope_funcs import *
+import unittest
+
+
+class TestDefaultScopeFuncs(unittest.TestCase):
+    def test_cur_scope(self):
+        self.assertIsNotNone(get_cur_scope())
+
+    def test_none_variable(self):
+        self.assertIsNone(find_var("test"))
+
+    def test_create_var_get_var(self):
+        var_a = var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
+        enter_local_scope()
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
+        leave_local_scope()
+
+    def test_var_get_int(self):
+        def __new_scope__():
+            i = var("var_i")
+            self.assertFalse(i.is_int())
+            i.set_int(10)
+            self.assertTrue(i.is_int())
+            self.assertEqual(10, i.get_int())
+
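+        # scoped_function runs the callback in a fresh local scope each time,
+        # so var_i is re-created and starts out uninitialized on every call.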
+        for _ in xrange(10):
+            scoped_function(__new_scope__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_detection_output_op.py b/python/paddle/v2/fluid/tests/test_detection_output_op.py
new file mode 100644
index 0000000000..8a5e06b38f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_detection_output_op.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestDetectionOutputOp(OpTest):
+    def setUp(self):
+        self.op_type = "detection_output"
+        self.init_test_case()
+
+        # loc.shape: (1, 4, 4, 1, 1)
+        # conf.shape: (1, 4, 2, 1, 1)
+
+        loc = np.array([[[[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]],
+                         [[[0.1]], [[0.1]], [[0.1]], [[0.1]]]]])
+        conf = np.array([[[[[0.1]], [[0.9]]], [[[0.2]], [[0.8]]],
+                          [[[0.3]], [[0.7]]], [[[0.4]], [[0.6]]]]])
+        priorbox = np.array([
+            0.1, 0.1, 0.5, 0.5, 0.1, 0.1, 0.2, 0.2, 0.2, 0.2, 0.6, 0.6, 0.1,
+            0.1, 0.2, 0.2, 0.3, 0.3, 0.7, 0.7, 0.1, 0.1, 0.2, 0.2, 0.4, 0.4,
+            0.8, 0.8, 0.1, 0.1, 0.2, 0.2
+        ])
+
+        output = np.array([
+            0, 1, 0.68997443, 0.099959746, 0.099959746, 0.50804031, 0.50804031
+        ])
+        self.inputs = {
+            'Loc': loc.astype('float32'),
+            'Conf': conf.astype('float32'),
+            'PriorBox': priorbox.astype('float32')
+        }
+        self.attrs = {
+            'num_classes': self.num_classes,
+            'top_k': self.top_k,
+            'nms_top_k': self.nms_top_k,
+            'background_label_id': self.background_label_id,
+            'nms_threshold': self.nms_threshold,
+            'confidence_threshold': self.confidence_threshold,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def init_test_case(self):
+        self.num_classes = 2
+        self.top_k = 10
+        self.nms_top_k = 20
+        self.background_label_id = 0
+        self.nms_threshold = 0.01
+        self.confidence_threshold = 0.01
+
+
+if __name__ == '__main__':
+    # FIXME: detection_output_op will be rewritten. This unittest should be
+    # re-enabled after the rewrite.
+    exit(0)  # temporarily disable this unittest
diff --git a/python/paddle/v2/fluid/tests/test_dropout_op.py b/python/paddle/v2/fluid/tests/test_dropout_op.py
new file mode 100644
index 0000000000..b0c55df9f5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dropout_op.py
@@ -0,0 +1,86 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestDropoutOp(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
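+        # With dropout_prob=0.0 nothing is dropped, so the expected output
+        # equals the input and the mask is all ones.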
+        self.outputs = {
+            'Out': self.inputs['X'],
+            'Mask': np.ones((32, 64)).astype('float32')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+
+class TestDropoutOp2(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {'dropout_prob': 1.0, 'fix_seed': True, 'is_test': False}
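+        # With dropout_prob=1.0 everything is dropped, so both the expected
+        # output and the mask are all zeros.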
+        self.outputs = {
+            'Out': np.zeros((32, 64)).astype('float32'),
+            'Mask': np.zeros((32, 64)).astype('float32')
+        }
+
+
+class TestDropoutOp3(TestDropoutOp):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 2)).astype("float32")}
+        self.attrs = {'dropout_prob': 0.0, 'fix_seed': True, 'is_test': False}
+        self.outputs = {
+            'Out': self.inputs['X'],
+            'Mask': np.ones((32, 64, 2)).astype('float32')
+        }
+
+
+class TestDropoutOp4(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64)).astype("float32")}
+        self.attrs = {'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True}
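+        # In test mode (is_test=True) no mask is applied; the expected output
+        # is the input scaled by (1 - dropout_prob).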
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDropoutOp5(OpTest):
+    def setUp(self):
+        self.op_type = "dropout"
+        self.inputs = {'X': np.random.random((32, 64, 3)).astype("float32")}
+        self.attrs = {'dropout_prob': 0.75, 'is_test': True}
+        self.outputs = {
+            'Out': self.inputs['X'] * (1.0 - self.attrs['dropout_prob'])
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
new file mode 100644
index 0000000000..2ac926c63c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
@@ -0,0 +1,143 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import unittest
+import numpy
+
+
+class TestDynRNN(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.BATCH_SIZE = 2
+        self.train_data = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict),
+            batch_size=self.BATCH_SIZE)
+
+    def test_plain_while_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            sentence = fluid.layers.data(
+                name='word', shape=[1], dtype='int64', lod_level=1)
+            sent_emb = fluid.layers.embedding(
+                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+
+            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+
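+            # The rank table sorts sequences by length (longest first), so the
+            # while loop below can process a shrinking batch of still-active
+            # sequences one time step at a time.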
+            rank_table = fluid.layers.lod_rank_table(x=sent_emb)
+
+            sent_emb_array = fluid.layers.lod_tensor_to_array(
+                x=sent_emb, table=rank_table)
+
+            seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+            i.stop_gradient = False
+
+            boot_mem = fluid.layers.fill_constant_batch_size_like(
+                input=fluid.layers.array_read(
+                    array=sent_emb_array, i=i),
+                value=0,
+                shape=[-1, 100],
+                dtype='float32')
+            boot_mem.stop_gradient = False
+
+            mem_array = fluid.layers.array_write(x=boot_mem, i=i)
+
+            cond = fluid.layers.less_than(x=i, y=seq_len)
+            cond.stop_gradient = False
+            while_op = fluid.layers.While(cond=cond)
+            out = fluid.layers.create_array(dtype='float32')
+
+            with while_op.block():
+                mem = fluid.layers.array_read(array=mem_array, i=i)
+                ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
+
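+                # Shrink the memory batch to the sequences still active at
+                # step i; shorter sequences finish earlier and drop out.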
+                mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)
+
+                hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
+
+                fluid.layers.array_write(x=hidden, i=i, array=out)
+                fluid.layers.increment(x=i, in_place=True)
+                fluid.layers.array_write(x=hidden, i=i, array=mem_array)
+                fluid.layers.less_than(x=i, y=seq_len, cond=cond)
+
+            all_timesteps = fluid.layers.array_to_lod_tensor(
+                x=out, table=rank_table)
+            last = fluid.layers.sequence_last_step(input=all_timesteps)
+            logits = fluid.layers.fc(input=last, size=1, act=None)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=logits, label=label)
+            loss = fluid.layers.mean(x=loss)
+            sgd = fluid.optimizer.SGD(1e-4)
+            sgd.minimize(loss=loss)
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(startup_program)
+        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
+
+        data = next(self.train_data())
+        val = exe.run(main_program, feed=feeder.feed(data),
+                      fetch_list=[loss])[0]
+        self.assertEqual((1, ), val.shape)
+        print(val)
+        self.assertFalse(numpy.isnan(val))
+
+    def test_train_dyn_rnn(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            sentence = fluid.layers.data(
+                name='word', shape=[1], dtype='int64', lod_level=1)
+            sent_emb = fluid.layers.embedding(
+                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+
+            rnn = fluid.layers.DynamicRNN()
+
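+            # DynamicRNN hides the while-loop bookkeeping used above:
+            # step_input yields one time step per iteration, memory carries
+            # the hidden state, and update_memory/output record the results.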
+            with rnn.block():
+                in_ = rnn.step_input(sent_emb)
+                mem = rnn.memory(shape=[100], dtype='float32')
+                out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh')
+                rnn.update_memory(mem, out_)
+                rnn.output(out_)
+
+            last = fluid.layers.sequence_last_step(input=rnn())
+            logits = fluid.layers.fc(input=last, size=1, act=None)
+            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=logits, label=label)
+            loss = fluid.layers.mean(x=loss)
+            adam = fluid.optimizer.Adam(1e-3)
+            adam.minimize(loss=loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(startup_program)
+        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
+        data = next(self.train_data())
+        loss_0 = exe.run(main_program,
+                         feed=feeder.feed(data),
+                         fetch_list=[loss])[0]
+        for _ in xrange(100):
+            val = exe.run(main_program,
+                          feed=feeder.feed(data),
+                          fetch_list=[loss])[0]
+        # The loss should have decreased after 100 mini-batches.
+        self.assertLess(val[0], loss_0[0])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
new file mode 100644
index 0000000000..dd608432df
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_gradient_check.py
@@ -0,0 +1,378 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy
+import random
+import collections
+import paddle.v2.fluid as fluid
+import unittest
+from decorators import *
+
+
+class Memory(object):
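+    # Memory cell for the Python reference RNN: `ex` holds the value from the
+    # previous time step, `cur` the value written during the current step;
+    # next() promotes cur to ex once a step finishes.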
+    def __init__(self, shape, dtype='float32'):
+        self.ex = numpy.zeros(shape=shape, dtype=dtype)
+        self.cur = None
+
+    def update(self, val):
+        assert val.shape == self.ex.shape
+        assert val.dtype == self.ex.dtype
+        self.cur = val
+
+    def next(self):
+        self.ex = self.cur
+        self.cur = None
+
+    def __next__(self):
+        self.next()
+
+    def reset(self):
+        self.ex = numpy.zeros(shape=self.ex.shape, dtype=self.ex.dtype)
+        self.cur = None
+
+
+class Output(object):
+    def __init__(self):
+        self.outs = []
+
+    def next_sequence(self):
+        self.outs.append([])
+
+    def out(self, val):
+        self.outs[-1].append(val)
+
+    def last(self):
+        return self.outs[-1][-1]
+
+
+class BaseRNN(object):
+    def __init__(self, ins, mems, params, outs, num_seq=5, max_seq_len=15):
+        self.num_seq = num_seq
+        self.inputs = collections.defaultdict(list)
+
+        for _ in xrange(num_seq):
+            seq_len = random.randint(1, max_seq_len - 1)
+            for iname in ins:
+                ishape = ins[iname].get('shape', None)
+                idtype = ins[iname].get('dtype', 'float32')
+                lst = []
+                for _ in xrange(seq_len):
+                    lst.append(numpy.random.random(size=ishape).astype(idtype))
+                self.inputs[iname].append(lst)
+
+        self.mems = dict()
+        for mname in mems:
+            mshape = mems[mname].get('shape', None)
+            mdtype = mems[mname].get('dtype', 'float32')
+            self.mems[mname] = Memory(shape=mshape, dtype=mdtype)
+
+        self.params = dict()
+        for pname in params:
+            pshape = params[pname].get('shape', None)
+            pdtype = params[pname].get('dtype', 'float32')
+            self.params[pname] = numpy.random.random(size=pshape).astype(pdtype)
+
+        self.outputs = dict()
+
+        for oname in outs:
+            self.outputs[oname] = Output()
+
+    def step(self, **kwargs):
+        raise NotImplementedError()
+
+    def exe(self):
+        retv = dict()
+        for out in self.outputs:
+            retv[out] = []
+
+        for seq_id in xrange(self.num_seq):
+            for mname in self.mems:
+                self.mems[mname].reset()
+            for out in self.outputs:
+                self.outputs[out].next_sequence()
+
+            iname0 = self.inputs.keys()[0]
+            seq_len = len(self.inputs[iname0][seq_id])
+
+            for step_id in xrange(seq_len):
+                xargs = dict()
+
+                for iname in self.inputs:
+                    xargs[iname] = self.inputs[iname][seq_id][step_id]
+
+                for mname in self.mems:
+                    xargs[mname] = self.mems[mname]
+
+                for pname in self.params:
+                    xargs[pname] = self.params[pname]
+
+                for out in self.outputs:
+                    xargs[out] = self.outputs[out]
+
+                self.step(**xargs)
+
+                for mname in self.mems:
+                    next(self.mems[mname])
+
+            for out in self.outputs:
+                retv[out].append(self.outputs[out].last())
+
+        for out in retv:
+            retv[out] = numpy.array(retv[out])
+        return retv
+
+    def to_feed(self, place):
+        feed_dict = dict()
+
+        for iname in self.inputs:
+            lod = [0]
+            np_flatten = []
+            for seq_id in xrange(len(self.inputs[iname])):
+                seq_len = len(self.inputs[iname][seq_id])
+                lod.append(lod[-1] + seq_len)
+                np_flatten.extend(self.inputs[iname][seq_id])
+
+            t = fluid.Tensor()
+            t.set(numpy.array(np_flatten), place)
+            t.set_lod([lod])
+            feed_dict[iname] = t
+
+        for pname in self.params:
+            feed_dict[pname] = self.params[pname]
+        return feed_dict
+
+    def get_numeric_gradient_of_param(self, param_name, delta=0.001):
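+        # Central-difference estimate of d(mean_out)/d(param):
+        #   g[i][j] ~ (f(p[i][j] + delta) - f(p[i][j] - delta)) / (2 * delta)
+        # where f is the mean over all outputs (see _exe_mean_out_).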
+        p = self.params[param_name]
+        if len(p.shape) != 2:
+            raise ValueError("Getting the numeric gradient of a parameter"
+                             " is only supported for matrices")
+        g = numpy.zeros(shape=p.shape, dtype=p.dtype)
+
+        for i in xrange(p.shape[0]):
+            for j in xrange(p.shape[1]):
+                o = p[i][j]
+                p[i][j] += delta
+                pos = self._exe_mean_out_()
+                p[i][j] -= 2 * delta
+                neg = self._exe_mean_out_()
+                p[i][j] = o
+                g[i][j] = (pos - neg) / (delta * 2)
+        return g
+
+    def get_numeric_gradient_of_input(self,
+                                      input_name,
+                                      delta=0.001,
+                                      return_one_tensor=True):
+        ipt = self.inputs[input_name]
+        grad = []
+
+        for seq in ipt:
+            seq_grad = []
+            for item in seq:
+                item_grad = numpy.zeros(shape=item.shape, dtype=item.dtype)
+                if len(item.shape) != 1:
+                    raise ValueError("Not support")
+
+                for i in xrange(len(item)):
+                    o = item[i]
+                    item[i] += delta
+                    pos = self._exe_mean_out_()
+                    item[i] -= 2 * delta
+                    neg = self._exe_mean_out_()
+                    item[i] = o
+                    item_grad[i] = (pos - neg) / (delta * 2)
+                seq_grad.append(item_grad)
+            grad.append(seq_grad)
+
+        if not return_one_tensor:
+            return grad
+
+        for i in xrange(len(grad)):
+            grad[i] = numpy.concatenate(grad[i])
+        grad = numpy.concatenate(grad)
+        return grad
+
+    def _exe_mean_out_(self):
+        outs = self.exe()
+        return numpy.array([o.mean() for o in outs.itervalues()]).mean()
+
+
+class SeedFixedTestCase(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Fix random seeds to remove randomness from tests"""
+        cls._np_rand_state = numpy.random.get_state()
+        cls._py_rand_state = random.getstate()
+
+        numpy.random.seed(123)
+        random.seed(124)
+
+    @classmethod
+    def tearDownClass(cls):
+        """Restore random seeds"""
+        numpy.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+
+
+class TestSimpleMul(SeedFixedTestCase):
+    DATA_NAME = 'X'
+    DATA_WIDTH = 32
+    PARAM_NAME = 'W'
+    HIDDEN_WIDTH = 10
+    OUT_NAME = 'Out'
+
+    class SimpleMul(BaseRNN):
+        def __init__(self):
+            base = TestSimpleMul
+            super(base.SimpleMul, self).__init__({
+                base.DATA_NAME: {
+                    'shape': [base.DATA_WIDTH]
+                }
+            }, {}, {
+                base.PARAM_NAME: {
+                    'shape': [base.DATA_WIDTH, base.HIDDEN_WIDTH]
+                }
+            }, [base.OUT_NAME])
+
+        def step(self, X, W, Out):
+            Out.out(numpy.matmul(X, W))
+
+    # Run many times locally to make sure the fixed random seed does not break CI.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMul.SimpleMul()
+        dat = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        dat.stop_gradient = False
+
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(dat)
+            o = fluid.layers.fc(input=d,
+                                param_attr=self.PARAM_NAME,
+                                bias_attr=False,
+                                size=self.HIDDEN_WIDTH,
+                                act=None)
+            rnn.output(o)
+
+        out = rnn()
+        out = fluid.layers.sequence_pool(out, pool_type='last')
+        loss = fluid.layers.mean(x=out)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        out, w_g, i_g = map(numpy.array,
+                            exe.run(feed=py_rnn.to_feed(cpu),
+                                    fetch_list=[
+                                        out, self.PARAM_NAME + "@GRAD",
+                                        self.DATA_NAME + "@GRAD"
+                                    ],
+                                    return_numpy=False))
+        out_by_python = py_rnn.exe()[self.OUT_NAME]
+        self.assertTrue(numpy.allclose(out, out_by_python))
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.05))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(
+            input_name=self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.05))
+
+
+class TestSimpleMulWithMemory(SeedFixedTestCase):
+    DATA_WIDTH = 32
+    HIDDEN_WIDTH = 20
+    DATA_NAME = 'X'
+    PARAM_NAME = 'W'
+
+    class SimpleMulWithMemory(BaseRNN):
+        def __init__(self):
+            super(TestSimpleMulWithMemory.SimpleMulWithMemory, self).__init__({
+                TestSimpleMulWithMemory.DATA_NAME: {
+                    'shape': [TestSimpleMulWithMemory.DATA_WIDTH]
+                }
+            }, {'Mem': {
+                'shape': [TestSimpleMulWithMemory.HIDDEN_WIDTH]
+            }}, {
+                TestSimpleMulWithMemory.PARAM_NAME: {
+                    'shape': [
+                        TestSimpleMulWithMemory.DATA_WIDTH,
+                        TestSimpleMulWithMemory.HIDDEN_WIDTH
+                    ]
+                }
+            }, ['Out'])
+
+        def step(self, X, Mem, W, Out):
+            o = numpy.matmul(X, W)
+            assert isinstance(Mem, Memory)
+            o += Mem.ex
+            Mem.update(o)
+            assert isinstance(Out, Output)
+            Out.out(o)
+
+    # many_times is used locally for debugging, to make sure the calculation is stable.
+    # @many_times(10)
+    @prog_scope()
+    def test_forward_backward(self):
+        py_rnn = TestSimpleMulWithMemory.SimpleMulWithMemory()
+        data = fluid.layers.data(
+            name=self.DATA_NAME, shape=[self.DATA_WIDTH], lod_level=1)
+        data.stop_gradient = False
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            d = rnn.step_input(data)
+            mem = rnn.memory(value=0.0, shape=[self.HIDDEN_WIDTH])
+            hidden = fluid.layers.fc(input=d,
+                                     size=self.HIDDEN_WIDTH,
+                                     param_attr=self.PARAM_NAME,
+                                     bias_attr=False,
+                                     act=None)
+            o = fluid.layers.elementwise_add(x=hidden, y=mem)
+            rnn.update_memory(mem, o)
+            rnn.output(o)
+
+        out = rnn()
+        last = fluid.layers.sequence_pool(input=out, pool_type='last')
+        loss = fluid.layers.mean(x=last)
+        fluid.backward.append_backward(loss)
+
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        feed = py_rnn.to_feed(cpu)
+        last_np, w_g, i_g = map(numpy.array,
+                                exe.run(feed=feed,
+                                        fetch_list=[
+                                            last, self.PARAM_NAME + "@GRAD",
+                                            self.DATA_NAME + "@GRAD"
+                                        ],
+                                        return_numpy=False))
+        last_by_py, = py_rnn.exe().values()
+        w_g_num = py_rnn.get_numeric_gradient_of_param(self.PARAM_NAME)
+        self.assertTrue(numpy.allclose(last_np, last_by_py))
+
+        self.assertTrue(numpy.allclose(w_g_num, w_g, rtol=0.1))
+        i_g_num = py_rnn.get_numeric_gradient_of_input(self.DATA_NAME)
+        i_g_num = i_g_num.reshape(i_g.shape)
+
+        # Since this RNN performs many floating-point additions, the result
+        # can be numerically unstable, hence the relaxed rtol of 0.1.
+        self.assertTrue(numpy.allclose(i_g_num, i_g, rtol=0.1))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
new file mode 100644
index 0000000000..d14923b6b3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
@@ -0,0 +1,206 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2 as paddle
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid as fluid
+from paddle.v2.fluid.backward import append_backward
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import Program, switch_main_program
+import bisect
+import numpy as np
+
+fluid.default_startup_program().random_seed = 1
+
+
+class TestDyRnnStaticInput(unittest.TestCase):
+    def setUp(self):
+        self._delta = 0.005
+        self._max_sequence_len = 3
+        self._program = Program()
+        switch_main_program(self._program)
+        self.output_dim = 10
+        self.place = core.CPUPlace()
+        self.prepare_x_tensor()
+        self.prepare_static_input_tensor()
+        self.exe = fluid.Executor(self.place)
+
+    def prepare_x_tensor(self):
+        self.x_tensor_dim = 10
+        lod = [[0, 2, 3, 6]]
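+        # LoD offsets [0, 2, 3, 6] describe three sequences of lengths 2, 1
+        # and 3; the tensor's first dimension is their total length.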
+        shape = [lod[0][-1], self.x_tensor_dim]
+        self.x_tensor_data = np.random.random(shape).astype('float32')
+        self.x_tensor = core.LoDTensor()
+        self.x_tensor.set_lod(lod)
+        self.x_tensor.set(self.x_tensor_data, self.place)
+
+    def prepare_static_input_tensor(self):
+        self.static_input_tensor_dim = 4
+        lod = [[0, 1, 3, 6]]
+        shape = [lod[0][-1], self.static_input_tensor_dim]
+        self.static_input_data = np.random.random(shape).astype('float32')
+        self.static_input_tensor = core.LoDTensor()
+        self.static_input_tensor.set_lod(lod)
+        self.static_input_tensor.set(self.static_input_data, self.place)
+
+    def fetch_value(self, var):
+        fetch_outs = self.exe.run(feed={
+            'x_tensor': self.x_tensor,
+            'static_input_tensor': self.static_input_tensor
+        },
+                                  fetch_list=[var],
+                                  return_numpy=False)
+        return self._lodtensor_to_ndarray(fetch_outs[0])
+
+    def _lodtensor_to_ndarray(self, lod_tensor):
+        dims = lod_tensor.get_dims()
+        ndarray = np.zeros(shape=dims).astype('float32')
+        for i in xrange(np.product(dims)):
+            ndarray.ravel()[i] = lod_tensor.get_float_element(i)
+        return ndarray, lod_tensor.lod()
+
+    def build_graph(self, only_forward=False):
+        x_tensor = fluid.layers.data(
+            name='x_tensor',
+            shape=[self.x_tensor_dim],
+            dtype='float32',
+            lod_level=1)
+        x_tensor.stop_gradient = False
+
+        static_input_tensor = fluid.layers.data(
+            name='static_input_tensor',
+            shape=[self.static_input_tensor_dim],
+            dtype='float32',
+            lod_level=1)
+        static_input_tensor.stop_gradient = False
+
+        if only_forward:
+            static_input_out_array = self._program.global_block().create_var(
+                name='static_input_out_array',
+                type=core.VarDesc.VarType.LOD_TENSOR_ARRAY,
+                dtype='float32')
+            static_input_out_array.stop_gradient = True
+
+        rnn = fluid.layers.DynamicRNN()
+        with rnn.block():
+            step_x = rnn.step_input(x_tensor)
+            step_static_input = rnn.static_input(static_input_tensor)
+            if only_forward:
+                fluid.layers.array_write(
+                    x=step_static_input,
+                    i=rnn.step_idx,
+                    array=static_input_out_array)
+            last = fluid.layers.sequence_pool(
+                input=step_static_input, pool_type='last')
+            projected = fluid.layers.fc(input=[step_x, last],
+                                        size=self.output_dim)
+            rnn.output(projected)
+
+        if only_forward:
+            static_input_step_outs = []
+            step_idx = fluid.layers.fill_constant(
+                shape=[1], dtype='int64', value=0)
+            step_idx.stop_gradient = True
+
+            for i in xrange(self._max_sequence_len):
+                step_out = fluid.layers.array_read(static_input_out_array,
+                                                   step_idx)
+                step_out.stop_gradient = True
+                static_input_step_outs.append(step_out)
+                fluid.layers.increment(x=step_idx, value=1.0, in_place=True)
+
+        if only_forward:
+            return static_input_step_outs
+
+        last = fluid.layers.sequence_pool(input=rnn(), pool_type='last')
+        loss = fluid.layers.mean(x=last)
+        append_backward(loss)
+        static_input_grad = self._program.global_block().var(
+            framework.grad_var_name('static_input_tensor'))
+        return static_input_grad, loss
+
+    def get_seq_len_from_lod(self, lod):
+        return [lod[0][i + 1] - lod[0][i] for i in xrange(len(lod[0]) - 1)]
+
+    def get_expected_static_step_outs(self):
+        x_lod = self.x_tensor.lod()
+        x_seq_len = self.get_seq_len_from_lod(x_lod)
+        x_seq_len_sorted = sorted(x_seq_len)
+        x_sorted_indices = np.argsort(x_seq_len)[::-1]
+
+        static_lod = self.static_input_tensor.lod()
+        static_sliced = [
+            self.static_input_data[static_lod[0][i]:static_lod[0][i + 1]]
+            for i in xrange(len(static_lod[0]) - 1)
+        ]
+        static_seq_len = self.get_seq_len_from_lod(static_lod)
+        static_reordered = []
+        for i in xrange(len(x_sorted_indices)):
+            static_reordered.extend(static_sliced[x_sorted_indices[i]].tolist())
+        static_seq_len_reordered = [
+            static_seq_len[x_sorted_indices[i]]
+            for i in xrange(len(x_sorted_indices))
+        ]
+
+        static_step_outs = []
+        static_step_lods = []
+
+        for i in xrange(self._max_sequence_len):
+            end = len(x_seq_len) - bisect.bisect_left(x_seq_len_sorted, i + 1)
+            lod = [0]
+            for j in xrange(end):
+                lod.append(static_seq_len_reordered[j] + lod[-1])
+            static_step_lods.append([lod])
+            end = lod[-1]
+            static_step_outs.append(
+                np.array(static_reordered[:end]).astype('float32'))
+
+        return static_step_outs, static_step_lods
+
+    def test_step_out(self):
+        static_step_outs = self.build_graph(only_forward=True)
+        self.exe.run(framework.default_startup_program())
+        expected_outs, expected_lods = self.get_expected_static_step_outs()
+        for i in xrange(self._max_sequence_len):
+            step_out, lod = self.fetch_value(static_step_outs[i])
+            self.assertTrue(np.allclose(step_out, expected_outs[i]))
+            self.assertTrue(np.allclose(lod, expected_lods[i]))
+
+    def test_network_gradient(self):
+        static_input_grad, loss = self.build_graph()
+        self.exe.run(framework.default_startup_program())
+
+        actual_gradients, actual_lod = self.fetch_value(static_input_grad)
+
+        static_input_shape = self.static_input_tensor.get_dims()
+        numeric_gradients = np.zeros(shape=static_input_shape).astype('float32')
+        # calculate numeric gradients
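+        # using central differences:
+        #   d(loss)/dx_i ~ (loss(x_i + delta) - loss(x_i - delta)) / (2 * delta)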
+        tensor_size = np.product(static_input_shape)
+        for i in xrange(tensor_size):
+            origin = self.static_input_tensor.get_float_element(i)
+            x_pos = origin + self._delta
+            self.static_input_tensor.set_float_element(i, x_pos)
+            y_pos = self.fetch_value(loss)[0][0]
+            x_neg = origin - self._delta
+            self.static_input_tensor.set_float_element(i, x_neg)
+            y_neg = self.fetch_value(loss)[0][0]
+            self.static_input_tensor.set_float_element(i, origin)
+            numeric_gradients.ravel()[i] = (y_pos - y_neg) / self._delta / 2
+        self.assertTrue(np.allclose(actual_gradients, numeric_gradients, 0.001))
+        self.assertTrue(np.allclose(actual_lod, self.static_input_tensor.lod()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_edit_distance_op.py b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
new file mode 100644
index 0000000000..bebdc5cba3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_edit_distance_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def Levenshtein(hyp, ref):
+    """ Compute the Levenshtein distance between two strings.
+
+    :param hyp: hypothesis string in index
+    :type hyp: list
+    :param ref: reference string in index
+    :type ref: list
+    """
+    m = len(hyp)
+    n = len(ref)
+    if m == 0:
+        return n
+    if n == 0:
+        return m
+
+    dist = np.zeros((m + 1, n + 1)).astype("float32")
+    for i in range(0, m + 1):
+        dist[i][0] = i
+    for j in range(0, n + 1):
+        dist[0][j] = j
+
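+    # DP recurrence: dist[i][j] is the edit distance between the first i
+    # tokens of hyp and the first j tokens of ref.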
+    for i in range(1, m + 1):
+        for j in range(1, n + 1):
+            cost = 0 if hyp[i - 1] == ref[j - 1] else 1
+            deletion = dist[i - 1][j] + 1
+            insertion = dist[i][j - 1] + 1
+            substitution = dist[i - 1][j - 1] + cost
+            dist[i][j] = min(deletion, insertion, substitution)
+    return dist[m][n]
+
+
+class TestEditDistanceOp(OpTest):
+    def setUp(self):
+        self.op_type = "edit_distance"
+        normalized = False
+        x1 = np.array([[0, 12, 3, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[0, 12, 4, 7, 8]]).astype("int64")
+        x1 = np.transpose(x1)
+        x2 = np.transpose(x2)
+        x1_lod = [0, 1, 5]
+        x2_lod = [0, 3, 4]
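+        # The LoD offsets split each flattened batch into two sequences:
+        # x1[0:1] is compared with x2[0:3], and x1[1:5] with x2[3:4].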
+
+        num_strs = len(x1_lod) - 1
+        distance = np.zeros((num_strs, 1)).astype("float32")
+        sequence_num = np.array(2).astype("int64")
+        for i in range(0, num_strs):
+            distance[i] = Levenshtein(
+                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
+                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+            if normalized is True:
+                len_ref = x2_lod[i + 1] - x2_lod[i]
+                distance[i] = distance[i] / len_ref
+        self.attrs = {'normalized': normalized}
+        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
+        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestEditDistanceOpNormalized(OpTest):
+    def setUp(self):
+        self.op_type = "edit_distance"
+        normalized = True
+        x1 = np.array([[0, 10, 3, 6, 5, 8, 2]]).astype("int64")
+        x2 = np.array([[0, 10, 4, 6, 7, 8]]).astype("int64")
+        x1 = np.transpose(x1)
+        x2 = np.transpose(x2)
+        x1_lod = [0, 1, 3, 6]
+        x2_lod = [0, 2, 3, 5]
+
+        num_strs = len(x1_lod) - 1
+        distance = np.zeros((num_strs, 1)).astype("float32")
+        sequence_num = np.array(3).astype("int64")
+        for i in range(0, num_strs):
+            distance[i] = Levenshtein(
+                hyp=x1[x1_lod[i]:x1_lod[i + 1]],
+                ref=x2[x2_lod[i]:x2_lod[i + 1]])
+            if normalized is True:
+                len_ref = x2_lod[i + 1] - x2_lod[i]
+                distance[i] = distance[i] / len_ref
+        self.attrs = {'normalized': normalized}
+        self.inputs = {'Hyps': (x1, [x1_lod]), 'Refs': (x2, [x2_lod])}
+        self.outputs = {'Out': distance, 'SequenceNum': sequence_num}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_add_op.py b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
new file mode 100644
index 0000000000..3564772fb5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_add_op.py
@@ -0,0 +1,147 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseAddOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
+
+
+class TestElementwiseAddOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseAddOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
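+        # axis=0 aligns Y (shape [2]) with X's 0th dimension, i.e. Y is
+        # broadcast as if reshaped to (2, 1, 1).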
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseAddOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 3, 4)
+        }
+
+
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_add"
+        self.inputs = {
+            'X': np.random.rand(2, 1).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] + self.inputs['Y'].reshape(1, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_div_op.py b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
new file mode 100644
index 0000000000..77b113af76
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_div_op.py
@@ -0,0 +1,128 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class ElementwiseDivOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        """ Warning
+        CPU gradient check error!
+        'X': np.random.random((32,84)).astype("float32"),
+        'Y': np.random.random((32,84)).astype("float32")
+        """
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+
+
+class TestElementwiseDivOp_scalar(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype(np.float32),
+            'Y': np.random.uniform(0.1, 1, [1]).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] / self.inputs['Y']}
+
+
+class TestElementwiseDivOp_Vector(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [32]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [32]).astype("float32")
+        }
+        self.outputs = {'Out': np.divide(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseDivOp_broadcast_0(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [2]).astype("float32")
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_1(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseDivOp_broadcast_2(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [4]).astype("float32")
+        }
+
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseDivOp_broadcast_3(ElementwiseDivOp):
+    def setUp(self):
+        self.op_type = "elementwise_div"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [3, 4]).astype("float32")
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.divide(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_max_op.py b/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
new file mode 100644
index 0000000000..0fc15693b1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_max_op.py
@@ -0,0 +1,130 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        # If x and y are equal, max() is not differentiable, so the test data
+        # are generated so that x and y are never too close to each other.
+        x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseMaxOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.random_integers(-5, 5, [2, 3, 4]).astype("float32")
+        y = np.array([0.5]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMaxOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.random((32, )).astype("float32")
+        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.maximum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMaxOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
+        y = x[:, 0, 0] + sgn * \
+            np.random.uniform(1, 2, (2, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
+        y = x[0, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
+        y = x[0, 0, :] + sgn * \
+            np.random.uniform(1, 2, (4, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseMaxOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_max"
+        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
+        y = x[0, :, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.maximum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_min_op.py b/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
new file mode 100644
index 0000000000..51584d6980
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_min_op.py
@@ -0,0 +1,130 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        # If x and y are equal, min() is not differentiable, so the test data
+        # are generated so that x and y are never too close to each other.
+        x = np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        sgn = np.random.choice([-1, 1], [13, 17]).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
+
+
+class TestElementwiseMinOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.random_integers(-5, 5, [2, 3, 4]).astype("float32")
+        y = np.array([0.5]).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMinOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.random((32, )).astype("float32")
+        sgn = np.random.choice([-1, 1], (32, )).astype("float32")
+        y = x + sgn * np.random.uniform(0.1, 1, (32, )).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+        self.outputs = {'Out': np.minimum(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMinOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (2, )).astype(np.float32)
+        y = x[:, 0, 0] + sgn * \
+            np.random.uniform(1, 2, (2, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(2, 1, 1))
+        }
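+
+
+# A hedged sketch (illustration only; the tests above do not call it): with
+# attrs {'axis': k}, a rank-1 Y is aligned to dimension k of X, so NumPy
+# reproduces the op's broadcast by viewing Y as below.
+def _axis_broadcast_view(y, x_ndim, axis):
+    shape = [1] * x_ndim
+    shape[axis] = y.size
+    return y.reshape(shape)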
+
+
+class TestElementwiseMinOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, )).astype(np.float32)
+        y = x[0, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 1))
+        }
+
+
+class TestElementwiseMinOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (4, )).astype(np.float32)
+        y = x[0, 0, :] + sgn * \
+            np.random.uniform(1, 2, (4, )).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 1, 4))
+        }
+
+
+class TestElementwiseMinOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_min"
+        x = np.random.uniform(0.5, 1, (2, 3, 4, 5)).astype(np.float32)
+        sgn = np.random.choice([-1, 1], (3, 4)).astype(np.float32)
+        y = x[0, :, :, 0] + sgn * \
+            np.random.uniform(1, 2, (3, 4)).astype(np.float32)
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out':
+            np.minimum(self.inputs['X'], self.inputs['Y'].reshape(1, 3, 4, 1))
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
new file mode 100644
index 0000000000..12dfa6599c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_mul_op.py
@@ -0,0 +1,117 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class ElementwiseMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
+        }
+        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
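+
+
+# Analytic gradients exercised by the checks above: for Out = X * Y, the
+# element-wise product rule gives dOut/dX = Y and dOut/dY = X, which the
+# numeric gradient checker approximates with finite differences.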
+
+
+class TestElementwiseMulOp_scalar(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']}
+
+
+class TestElementwiseMulOp_Vector(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float64"),
+            'Y': np.random.random((32, )).astype("float64")
+        }
+        self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
+
+
+class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(2).astype(np.float64)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(3).astype(np.float64)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(4).astype(np.float64)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
+    def setUp(self):
+        self.op_type = "elementwise_mul"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
+            'Y': np.random.rand(3, 4).astype(np.float64)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
new file mode 100644
index 0000000000..e31749df9b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_pow_op.py
@@ -0,0 +1,43 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwisePowOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestElementwisePowOp_scalar(TestElementwisePowOp):
+    def setUp(self):
+        self.op_type = "elementwise_pow"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype('float32'),
+            'Y': np.random.rand(1).astype('float32')
+        }
+        self.outputs = {'Out': np.power(self.inputs['X'], self.inputs['Y'])}
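+
+
+# Hedged sketch of the analytic gradients (these tests only check the
+# forward pass): for Out = X**Y with X > 0 as sampled above,
+# dOut/dX = Y * X**(Y - 1) and dOut/dY = X**Y * log(X).
+def _pow_grad_reference(x, y, dout):
+    return dout * y * np.power(x, y - 1), dout * np.power(x, y) * np.log(x)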
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
new file mode 100644
index 0000000000..cf53d85bba
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_elementwise_sub_op.py
@@ -0,0 +1,119 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestElementwiseOp(OpTest):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.005)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.005, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y'))
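+
+
+# For Out = X - Y the analytic gradients are constant (dOut/dX = +1,
+# dOut/dY = -1), so the checks above mainly guard the broadcast and
+# reduction paths of the backward kernel.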
+
+
+class TestElementwiseSubOp_scalar(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(1).astype(np.float32)
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_Vector(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.random((32, )).astype("float32"),
+            'Y': np.random.random((32, )).astype("float32")
+        }
+        self.outputs = {'Out': self.inputs['X'] - self.inputs['Y']}
+
+
+class TestElementwiseSubOp_broadcast_0(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(2).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 0}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(2, 1, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_1(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(3).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 1)
+        }
+
+
+class TestElementwiseSubOp_broadcast_2(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4).astype(np.float32),
+            'Y': np.random.rand(4).astype(np.float32)
+        }
+
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 1, 4)
+        }
+
+
+class TestElementwiseSubOp_broadcast_3(TestElementwiseOp):
+    def setUp(self):
+        self.op_type = "elementwise_sub"
+        self.inputs = {
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
+            'Y': np.random.rand(3, 4).astype(np.float32)
+        }
+
+        self.attrs = {'axis': 1}
+        self.outputs = {
+            'Out': self.inputs['X'] - self.inputs['Y'].reshape(1, 3, 4, 1)
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_error_clip.py b/python/paddle/v2/fluid/tests/test_error_clip.py
new file mode 100644
index 0000000000..6f7718f4d8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_error_clip.py
@@ -0,0 +1,81 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+BATCH_SIZE = 128
+CLIP_MAX = 2e-6
+CLIP_MIN = -1e-6
+
+prog = fluid.framework.Program()
+
+with fluid.program_guard(main_program=prog):
+    image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+    hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+    hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+    predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+    label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+prog_clip = prog.clone()
+prog_clip.block(0).var(hidden1.name).set_error_clip(
+    fluid.clip.ErrorClipByValue(
+        max=CLIP_MAX, min=CLIP_MIN))
+
+avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
+fluid.backward.append_backward(loss=avg_cost)
+fluid.backward.append_backward(
+    loss=avg_cost_clip, callback=fluid.clip.error_clip_callback)
+
+hidden1_grad = prog.block(0).var(hidden1.name + "@GRAD")
+hidden1_grad_clip = prog_clip.block(0).var(hidden1.name + "@GRAD")
+
+hidden2_grad = prog.block(0).var(hidden2.name + "@GRAD")
+hidden2_grad_clip = prog_clip.block(0).var(hidden2.name + "@GRAD")
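+
+# Only hidden1 registers an ErrorClipByValue, so its gradient in the clipped
+# program should equal the unclipped gradient clamped to
+# [CLIP_MIN, CLIP_MAX], while hidden2's gradient must match exactly; the
+# training loop below asserts both conditions.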
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+exe.run(fluid.default_startup_program())
+
+count = 0
+for data in train_reader():
+    count += 1
+    if count > 5:
+        break
+    out1, out2 = exe.run(prog,
+                         feed=feeder.feed(data),
+                         fetch_list=[hidden1_grad, hidden2_grad])
+    out1_clip, out2_clip = exe.run(
+        prog_clip,
+        feed=feeder.feed(data),
+        fetch_list=[hidden1_grad_clip, hidden2_grad_clip])
+    out1_expected = out1.clip(min=CLIP_MIN, max=CLIP_MAX)
+    if not ((out1_expected == out1_clip).all() and (out2 == out2_clip).all()):
+        exit(1)
+
+exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_exception.py b/python/paddle/v2/fluid/tests/test_exception.py
new file mode 100644
index 0000000000..cd57ca586b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_exception.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+import unittest
+
+
+class TestException(unittest.TestCase):
+    def test_exception(self):
+        ex = None
+        try:
+            core.__unittest_throw_exception__()
+        except core.EnforceNotMet as ex:
+            self.assertIn("test exception", ex.message)
+
+        self.assertIsNotNone(ex)
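+        # Note: "except ... as ex" keeps ex bound after the block only on
+        # Python 2 (which this suite targets); Python 3 unbinds the name
+        # when the except clause exits.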
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
new file mode 100644
index 0000000000..44f93be6cb
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
@@ -0,0 +1,44 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy
+import paddle.v2.fluid.core as core
+
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.layers import mul, data
+
+
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        a = data(name='a', shape=[784], dtype='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            dtype='float32',
+            append_batch_size=False)
+        out = mul(x=a, y=b)
+        place = core.CPUPlace()
+        a_np = numpy.random.random((100, 784)).astype('float32')
+        b_np = numpy.random.random((784, 100)).astype('float32')
+        exe = Executor(place)
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
+        out = outs[0]
+        self.assertEqual((100, 100), out.shape)
+        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_expand_op.py b/python/paddle/v2/fluid/tests/test_expand_op.py
new file mode 100644
index 0000000000..b1a1cbc0fa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_expand_op.py
@@ -0,0 +1,111 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestExpandOpRank1(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random(12).astype("float32")}
+        self.attrs = {'expand_times': [2]}
+        output = np.tile(self.inputs['X'], 2)
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
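+# expand_times mirrors the reps argument of np.tile: entry i multiplies
+# dimension i of X, so shape (12,) with expand_times=[2] becomes (24,),
+# and (12, 14) with [2, 3] becomes (24, 42).
+
+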
+class TestExpandOpRank2_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank2(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((12, 14)).astype("float32")}
+        self.attrs = {'expand_times': [2, 3]}
+        output = np.tile(self.inputs['X'], (2, 3))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3_Corner(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [1, 1, 1]}
+        output = np.tile(self.inputs['X'], (1, 1, 1))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank3(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5)).astype("float32")}
+        self.attrs = {'expand_times': [2, 1, 4]}
+        output = np.tile(self.inputs['X'], (2, 1, 4))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestExpandOpRank4(OpTest):
+    def setUp(self):
+        self.op_type = "expand"
+        self.inputs = {'X': np.random.random((2, 4, 5, 7)).astype("float32")}
+        self.attrs = {'expand_times': [3, 2, 1, 2]}
+        output = np.tile(self.inputs['X'], (3, 2, 1, 2))
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_feed_fetch_method.py b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
new file mode 100644
index 0000000000..827a7590ff
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_feed_fetch_method.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+import unittest
+import numpy as np
+
+
+class TestFeedFetch(unittest.TestCase):
+    def test_feed_fetch(self):
+        scope = core.Scope()
+        place = core.CPUPlace()
+        input_array = np.ones((4, 4, 6)).astype("float32")
+        input_array[0, 0, 0] = 3
+        input_array[3, 3, 5] = 10
+        input_tensor = core.LoDTensor([[0, 2, 4]])
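+        # level-0 LoD offsets [0, 2, 4]: two sequences, occupying rows
+        # [0, 2) and [2, 4) of the 4 x 4 x 6 tensor.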
+        input_tensor.set(input_array, place)
+
+        core.set_feed_variable(scope, input_tensor, "feed", 0)
+
+        output_tensor = core.get_fetch_variable(scope, "feed", 0)
+
+        output_lod = output_tensor.lod()
+        self.assertEqual(0, output_lod[0][0])
+        self.assertEqual(2, output_lod[0][1])
+        self.assertEqual(4, output_lod[0][2])
+
+        output_array = np.array(output_tensor)
+        self.assertEqual(3, output_array[0, 0, 0])
+        self.assertEqual(10, output_array[3, 3, 5])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
new file mode 100644
index 0000000000..f34a1ceb23
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fill_constant_batch_size_like_op.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFillConstantBatchSizeLikeWhenFirstDimIsBatchSize(OpTest):
+    def setUp(self):
+        self.op_type = "fill_constant_batch_size_like"
+        self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
+        self.attrs = {'value': 3.5, 'shape': [-1, 132, 7]}
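+        # The -1 entry marks the batch dimension; it is filled from Input's
+        # dim input_dim_idx (0 by default), i.e. 219 here, which yields the
+        # (219, 132, 7) reference output below.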
+
+        out = np.random.random((219, 132, 7)).astype("float32")
+        out.fill(3.5)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFillConstantBatchSizeLikeWhenSecondDimIsBatchSize(OpTest):
+    def setUp(self):
+        self.op_type = "fill_constant_batch_size_like"
+        self.inputs = {'Input': np.random.random((219, 232)).astype("float32")}
+        self.attrs = {
+            'value': 3.5,
+            'shape': [132, -1, 7],
+            'input_dim_idx': 0,
+            'output_dim_idx': 1
+        }
+
+        out = np.random.random((132, 219, 7)).astype("float32")
+        out.fill(3.5)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_fill_constant_op.py b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
new file mode 100644
index 0000000000..a05fa39729
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fill_constant_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFillConstantOp1(OpTest):
+    def setUp(self):
+        '''Test fill_constant op with specified value
+        '''
+        self.op_type = "fill_constant"
+
+        self.inputs = {}
+        self.attrs = {'shape': [123, 92], 'value': 3.8}
+        self.outputs = {'Out': np.full((123, 92), 3.8)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestFillConstantOp2(OpTest):
+    def setUp(self):
+        '''Test fill_constant op with default value
+        '''
+        self.op_type = "fill_constant"
+
+        self.inputs = {}
+        self.attrs = {'shape': [123, 92]}
+        self.outputs = {'Out': np.full((123, 92), 0.0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py
new file mode 100644
index 0000000000..901546f6f8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fill_op.py
@@ -0,0 +1,38 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+
+
+class TestFillOp(OpTest):
+    def setUp(self):
+        self.op_type = "fill"
+        val = np.random.random(size=[100, 200])
+        self.inputs = {}
+        self.attrs = {
+            'value': val.flatten().tolist(),
+            'shape': [100, 200],
+            'dtype': int(core.DataType.FP64)
+        }
+        self.outputs = {'Out': val.astype('float64')}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/v1_api_demo/traffic_prediction/data/get_data.sh b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
old mode 100755
new mode 100644
similarity index 53%
rename from v1_api_demo/traffic_prediction/data/get_data.sh
rename to python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
index f2fa548d47..b7f0b96647
--- a/v1_api_demo/traffic_prediction/data/get_data.sh
+++ b/python/paddle/v2/fluid/tests/test_fill_zeros_like_op.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,22 +12,20 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-set -e
-set -x
+import unittest
+import numpy as np
+from op_test import OpTest
 
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
 
-#download the dataset
-echo "Downloading traffic data..."
-wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
+class TestFillZerosLikeOp(OpTest):
+    def setUp(self):
+        self.op_type = "fill_zeros_like"
+        self.inputs = {'X': np.random.random((219, 232)).astype("float32")}
+        self.outputs = {'Out': np.zeros_like(self.inputs["X"])}
 
-#extract package
-echo "Unzipping..."
-tar -zxvf traffic_data.tar.gz
+    def test_check_output(self):
+        self.check_output()
 
-echo "data/speeds.csv" > train.list
-echo "data/speeds.csv" > test.list
-echo "data/speeds.csv" > pred.list
 
-echo "Done."
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_framework_debug_str.py b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
new file mode 100644
index 0000000000..f8fcfb2249
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_framework_debug_str.py
@@ -0,0 +1,27 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from paddle.v2.fluid.framework import Program
+
+
+class TestDebugStringFramework(unittest.TestCase):
+    def test_debug_str(self):
+        p = Program()
+        p.current_block().create_var(name='t', shape=[0, 1])
+        self.assertRaises(ValueError, callableObj=p.__str__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_ftrl_op.py b/python/paddle/v2/fluid/tests/test_ftrl_op.py
new file mode 100644
index 0000000000..895337de0f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_ftrl_op.py
@@ -0,0 +1,76 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFTRLOp(OpTest):
+    def setUp(self):
+        self.op_type = "ftrl"
+        w = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        sq_accum = np.full((102, 105), 0.1).astype("float32")
+        linear_accum = np.full((102, 105), 0.1).astype("float32")
+        lr = np.array([0.01]).astype("float32")
+        l1 = 0.1
+        l2 = 0.2
+        lr_power = -0.5
+
+        self.inputs = {
+            'Param': w,
+            'SquaredAccumulator': sq_accum,
+            'LinearAccumulator': linear_accum,
+            'Grad': g,
+            'LearningRate': lr
+        }
+        self.attrs = {
+            'l1': l1,
+            'l2': l2,
+            'lr_power': lr_power,
+            'learning_rate': lr
+        }
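+        # Reference FTRL-Proximal update (per McMahan et al.), which the
+        # code below mirrors with alpha = lr and beta = 0:
+        #   sigma = (sqrt(n + g^2) - sqrt(n)) / alpha
+        #   z    += g - sigma * w
+        #   n    += g^2
+        #   w     = (l1 * sign(z) - z) / (sqrt(n) / alpha + 2 * l2)
+        #           if |z| > l1, else 0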
+        new_accum = sq_accum + g * g
+        if lr_power == -0.5:
+            linear_out = linear_accum + g - (
+                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
+        else:
+            linear_out = linear_accum + g - ((np.power(
+                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
+
+        x = (l1 * np.sign(linear_out) - linear_out)
+        if lr_power == -0.5:
+            y = (np.sqrt(new_accum) / lr) + (2 * l2)
+        else:
+            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
+        pre_shrink = x / y
+        param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
+
+        sq_accum_out = sq_accum + g * g
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'SquaredAccumOut': sq_accum_out,
+            'LinearAccumOut': linear_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gather_op.py b/python/paddle/v2/fluid/tests/test_gather_op.py
new file mode 100644
index 0000000000..7675636797
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gather_op.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestGatherOp(OpTest):
+    def setUp(self):
+        self.op_type = "gather"
+        xnp = np.random.random((10, 20)).astype("float32")
+        self.inputs = {'X': xnp, 'Index': np.array([1, 3, 5]).astype("int32")}
+        self.outputs = {'Out': self.inputs["X"][self.inputs["Index"]]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
new file mode 100644
index 0000000000..79beb8b1fc
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py
@@ -0,0 +1,63 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.executor import Executor
+
+
+class TestGaussianRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "gaussian_random"
+        self.inputs = {}
+        self.attrs = {"shape": [1000, 784], "mean": .0, "std": 1., "seed": 10}
+
+        self.outputs = ["Out"]
+
+    def test_cpu(self):
+        self.gaussian_random_test(place=fluid.CPUPlace())
+
+    def test_gpu(self):
+        if core.is_compiled_with_cuda():
+            self.gaussian_random_test(place=fluid.CUDAPlace(0))
+
+    def gaussian_random_test(self, place):
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+        tensor = outs[0]
+
+        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
+        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
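+        # With 1000 * 784 = 784,000 samples, the standard error of the mean
+        # is roughly 1 / sqrt(784000), about 0.0011, so delta=0.1 is a very
+        # loose and therefore flake-resistant bound.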
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_get_places_op.py b/python/paddle/v2/fluid/tests/test_get_places_op.py
new file mode 100644
index 0000000000..68698c5f4a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_get_places_op.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid as fluid
+import decorators
+import unittest
+
+
+class TestGetPlaces(unittest.TestCase):
+    @decorators.prog_scope()
+    def test_get_places(self):
+        places = fluid.layers.get_places()
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(fluid.default_main_program())
+        self.assertEqual(places.type, fluid.core.VarDesc.VarType.PLACE_LIST)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gradient_clip.py b/python/paddle/v2/fluid/tests/test_gradient_clip.py
new file mode 100644
index 0000000000..9337791c21
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gradient_clip.py
@@ -0,0 +1,82 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+BATCH_SIZE = 128
+CLIP = 1
+
+prog = fluid.framework.Program()
+with fluid.program_guard(main_program=prog):
+    image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+
+    hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+    hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+    predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+
+    label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+prog_clip = prog.clone()
+
+avg_cost_clip = prog_clip.block(0).var(avg_cost.name)
+
+p_g = fluid.backward.append_backward(loss=avg_cost)
+p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip)
+
+with fluid.program_guard(main_program=prog_clip):
+    fluid.clip.set_gradient_clip(
+        fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP))
+    p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip)
+
+grad_list = [elem[1] for elem in p_g]
+grad_clip_list = [elem[1] for elem in p_g_clip]
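+
+# GradientClipByGlobalNorm rescales each gradient by
+# clip_norm / max(global_norm, clip_norm), so the clipped global norm should
+# equal min(global_norm, CLIP); the loop below recomputes both norms and
+# checks exactly that.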
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
+exe.run(fluid.default_startup_program())
+
+count = 0
+for data in train_reader():
+    count += 1
+    if count > 5:
+        break
+    out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list)
+    out_clip = exe.run(prog_clip,
+                       feed=feeder.feed(data),
+                       fetch_list=grad_clip_list)
+    global_norm = 0
+    for v in out[1:]:
+        global_norm += np.sum(np.power(v, 2))
+    global_norm = np.sqrt(global_norm)
+
+    global_norm_clip = 0
+    for v in out_clip[1:]:
+        global_norm_clip += np.sum(np.power(v, 2))
+    global_norm_clip = np.sqrt(global_norm_clip)
+
+    if not np.isclose(
+            a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3):
+        exit(1)
+exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_gru_op.py b/python/paddle/v2/fluid/tests/test_gru_op.py
new file mode 100644
index 0000000000..69cfd6c481
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gru_op.py
@@ -0,0 +1,172 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_lstm_op import identity, sigmoid, tanh, relu
+
+
+class TestGRUOp(OpTest):
+    lod = [[0, 2, 6, 9]]
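+    # level-0 LoD offsets: three sequences of lengths 2, 4 and 3 (9 rows).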
+    batch_size = lod[0][-1]
+    frame_size = 5
+    activate = {
+        'identity': identity,
+        'sigmoid': sigmoid,
+        'tanh': tanh,
+        'relu': relu
+    }
+
+    @staticmethod
+    def seq_to_batch(lod, is_reverse):
+        idx_in_seq_list = []
+        seq_starts = lod[0]
+        seq_lens = []
+        for i in range(len(seq_starts) - 1):
+            seq_lens.append(seq_starts[i + 1] - seq_starts[i])
+        sorted_seqs = sorted(
+            range(len(seq_lens)), key=lambda x: seq_lens[x], reverse=True)
+        num_batch = seq_lens[sorted_seqs[0]]
+        for batch_idx in range(num_batch):
+            idx_in_seq = []
+            for i in range(len(seq_lens)):
+                if seq_lens[sorted_seqs[i]] <= batch_idx:
+                    break
+                idx = (seq_starts[sorted_seqs[i] + 1] - 1 - batch_idx
+                       ) if is_reverse else (
+                           seq_starts[sorted_seqs[i]] + batch_idx)
+                idx_in_seq.append(idx)
+            idx_in_seq_list.append(idx_in_seq)
+        return idx_in_seq_list, sorted_seqs
+
+    def gru_step(self, x, h_p, w, b):
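+        # One GRU step with the op's gate layout [u, r, c]:
+        #   u = act_gate(x_u + h_p . W_u + b_u)    (update gate)
+        #   r = act_gate(x_r + h_p . W_r + b_r)    (reset gate)
+        #   c = act(x_c + (r * h_p) . W_c + b_c)   (candidate)
+        #   h = u * c + (1 - u) * h_p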
+        batch_size = x.shape[0]
+        frame_size = w.shape[0]
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        return g, r_h_p, h
+
+    def gru(self):
+        input, lod = self.inputs['Input']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
+            (1, self.frame_size * 3))
+        batch_gate = self.outputs['BatchGate']
+        batch_reset_hidden_prev = self.outputs['BatchResetHiddenPrev']
+        batch_hidden = self.outputs['BatchHidden']
+        hidden = self.outputs['Hidden']
+        idx_in_seq_list = self.idx_in_seq_list
+        h_p = self.inputs['H0'][self.sorted_seqs] if 'H0' in self.inputs \
+            else np.zeros((len(idx_in_seq_list[0]), self.frame_size))
+        num_batch = len(idx_in_seq_list)
+        end_idx = 0
+        for batch_idx in range(num_batch):
+            x = input[idx_in_seq_list[batch_idx]]
+            g, r_h_p, h = self.gru_step(x, h_p, w, b)
+            if batch_idx < (num_batch - 1):
+                h_p = h[:len(idx_in_seq_list[batch_idx + 1])]
+            start_idx = end_idx
+            end_idx = start_idx + len(idx_in_seq_list[batch_idx])
+            batch_gate[start_idx:end_idx] = g
+            batch_reset_hidden_prev[start_idx:end_idx] = r_h_p
+            batch_hidden[start_idx:end_idx] = h
+            hidden[idx_in_seq_list[batch_idx]] = h
+        return batch_gate, batch_reset_hidden_prev, hidden
+
+    def set_data(self):
+        lod = self.lod
+        self.idx_in_seq_list, self.sorted_seqs = self.seq_to_batch(
+            lod, self.is_reverse)
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        input = np.random.rand(batch_size, frame_size * 3).astype('float64')
+        h0 = np.random.rand(len(self.idx_in_seq_list[0]),
+                            frame_size).astype('float64')
+        weight = np.random.rand(frame_size, frame_size * 3).astype('float64')
+        bias = np.random.rand(1, frame_size * 3).astype('float64')
+
+        self.inputs = {
+            'Input': (input, lod),
+            'H0': h0,
+            'Weight': weight,
+            'Bias': bias
+        }
+
+        self.outputs = {
+            'BatchGate': np.zeros(
+                (batch_size, frame_size * 3), dtype='float64'),
+            'BatchResetHiddenPrev': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'BatchHidden': np.zeros(
+                (batch_size, frame_size), dtype='float64'),
+            'Hidden': np.zeros(
+                (batch_size, frame_size), dtype='float64')
+        }
+
+    def set_confs(self):
+        self.is_reverse = False
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+    def setUp(self):
+        self.op_type = "gru"
+        self.set_confs()
+        self.set_data()
+        self.gru()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'H0', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpNoInitial(TestGRUOp):
+    def set_data(self):
+        super(TestGRUOpNoInitial, self).set_data()
+        self.inputs.pop('H0')
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'Weight', 'Bias'], ['Hidden'])
+
+
+class TestGRUOpReverse(TestGRUOp):
+    def set_confs(self):
+        self.is_reverse = True
+        self.attrs = {
+            'activation': 'tanh',
+            'gate_activation': 'sigmoid',
+            'is_reverse': self.is_reverse
+        }
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_gru_unit_op.py b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
new file mode 100644
index 0000000000..71f13c4513
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_gru_unit_op.py
@@ -0,0 +1,134 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class GRUActivationType(object):
+    identity = 0
+    sigmoid = 1
+    tanh = 2
+    relu = 3
+
+
+def identity(x):
+    return x
+
+
+def sigmoid(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh(x):
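+    # via the identity tanh(x) = 2 * sigmoid(2x) - 1, reusing sigmoid above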
+    return 2. * sigmoid(2. * x) - 1.
+
+
+def relu(x):
+    return np.maximum(x, 0)
+
+
+class TestGRUUnitOp(OpTest):
+    batch_size = 5
+    frame_size = 10
+    activate = {
+        GRUActivationType.identity: identity,
+        GRUActivationType.sigmoid: sigmoid,
+        GRUActivationType.tanh: tanh,
+        GRUActivationType.relu: relu,
+    }
+
+    def set_inputs(self):
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        self.op_type = 'gru_unit'
+        self.inputs = {
+            'Input': np.random.uniform(
+                -0.1, 0.1, (batch_size, frame_size * 3)).astype('float64'),
+            'HiddenPrev': np.random.uniform(
+                -0.1, 0.1, (batch_size, frame_size)).astype('float64'),
+            'Weight': np.random.uniform(
+                -1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
+                (frame_size, frame_size * 3)).astype('float64'),
+        }
+        self.attrs = {
+            'activation': GRUActivationType.tanh,
+            'gate_activation': GRUActivationType.sigmoid
+        }
+
+    def set_outputs(self):
+        # GRU calculations
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        x = self.inputs['Input']
+        h_p = self.inputs['HiddenPrev']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
+            (1, frame_size * 3))
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]
+        r = u_r[:, frame_size:frame_size * 2]
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * c + (1 - u) * h_p
+        self.outputs = {
+            'Gate': g.astype('float64'),
+            'ResetHiddenPrev': r_h_p.astype('float64'),
+            'Hidden': h.astype('float64')
+        }
+
+    def setUp(self):
+        self.set_inputs()
+        self.set_outputs()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'])
+
+
+class TestGRUUnitOpWithBias(TestGRUUnitOp):
+    def set_inputs(self):
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        super(TestGRUUnitOpWithBias, self).set_inputs()
+        self.inputs['Bias'] = np.random.uniform(
+            -0.1, 0.1, (1, frame_size * 3)).astype('float64')
+        self.attrs = {
+            'activation': GRUActivationType.identity,
+            'gate_activation': GRUActivationType.sigmoid
+        }
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight', 'Bias'], ['Hidden'])
+
+    def test_check_grad_ignore_input(self):
+        self.check_grad(
+            ['HiddenPrev', 'Weight', 'Bias'], ['Hidden'],
+            no_grad_set=set('Input'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
new file mode 100644
index 0000000000..71ff47316e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
@@ -0,0 +1,42 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestHingeLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'hinge_loss'
+        samples_num = 64
+        logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32')
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32')
+
+        self.inputs = {
+            'Logits': logits,
+            'Labels': labels,
+        }
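+        # Hinge loss with {0, 1} labels mapped to {-1, +1}:
+        #   loss = max(0, 1 - y * logits), where y = 2 * labels - 1.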
+        loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Logits'], 'Loss', max_relative_error=0.008)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
new file mode 100644
index 0000000000..e4560af778
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_huber_loss_op.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def huber_loss_forward(val, delta):
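+    # Huber loss on a residual: quadratic (0.5 * val^2) for |val| <= delta,
+    # linear (delta * (|val| - 0.5 * delta)) beyond, so large residuals are
+    # penalized less severely than with squared error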
+    abs_val = abs(val)
+    if abs_val <= delta:
+        return 0.5 * val * val
+    else:
+        return delta * (abs_val - 0.5 * delta)
+
+
+class TestHuberLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'huber_loss'
+        samples_num = 64
+        delta = 1.0
+        self.inputs = {
+            'X': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
+            'Y': np.random.uniform(0, 1., (samples_num, 1)).astype('float32'),
+        }
+        residual = self.inputs['Y'] - self.inputs['X']
+        loss = np.vectorize(huber_loss_forward)(residual,
+                                                delta).astype('float32')
+        self.attrs = {'delta': delta}
+        self.outputs = {
+            'Residual': residual,
+            'Out': loss.reshape((samples_num, 1))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.008)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.008, no_grad_set=set(['X']))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.008, no_grad_set=set(['Y']))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_im2sequence_op.py b/python/paddle/v2/fluid/tests/test_im2sequence_op.py
new file mode 100644
index 0000000000..2cab3e31a5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_im2sequence_op.py
@@ -0,0 +1,167 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def get_output_shape(attrs, in_shape):
+    img_height = in_shape[2]
+    img_width = in_shape[3]
+
+    paddings = attrs['paddings']
+    kernels = attrs['kernels']
+    strides = attrs['strides']
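+    # output size = 1 + ceil((input + pad_begin + pad_end - kernel) / stride);
+    # the "+ stride - 1" term makes the integer division round up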
+
+    output_height = 1 + (img_height + paddings[0] + paddings[2] -
+                         kernels[0] + strides[0] - 1) / strides[0]
+
+    output_width = 1 + (img_width + paddings[1] + paddings[3] -
+                        kernels[1] + strides[1] - 1) / strides[1]
+
+    return output_height, output_width
+
+
+def im2col(attrs, im, col):
+    """
+    im: {CHW}
+    col:
+        {outputHeight, outputWidth, inputChannels, filterHeight, filterWidth}
+    """
+    input_channels, input_height, input_width = im.shape
+    output_height, output_width, _, filter_height, filter_width = col.shape
+
+    stride_height, stride_width = attrs['strides']
+    padding_height, padding_width = attrs['paddings'][0:2]
+
+    for col_row_idx in range(0, output_height):
+        for col_col_idx in range(0, output_width):
+            for channel in range(0, input_channels):
+                for filter_row_idx in range(0, filter_height):
+                    for filter_col_idx in range(0, filter_width):
+                        im_row_offset = col_row_idx * stride_height \
+                            + filter_row_idx - padding_height
+
+                        im_col_offset = col_col_idx * stride_width \
+                            + filter_col_idx - padding_width
+
+                        if (im_row_offset < 0 or
+                                im_row_offset >= input_height or
+                                im_col_offset < 0 or
+                                im_col_offset >= input_width):
+                            col[col_row_idx][col_col_idx][channel][
+                                filter_row_idx][filter_col_idx] = 0.0
+                        else:
+                            im_offset = (channel * input_height + im_row_offset
+                                         ) * input_width + im_col_offset
+
+                            col[col_row_idx][col_col_idx][channel][
+                                filter_row_idx][filter_col_idx] = im[channel][
+                                    im_row_offset][im_col_offset]
+
+
+def Im2Sequence(inputs, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape)
+    img_channels = inputs.shape[1]
+    batch_size = inputs.shape[0]
+    out = np.zeros([
+        batch_size, output_height, output_width, img_channels,
+        attrs['kernels'][0], attrs['kernels'][1]
+    ]).astype("float32")
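+    # intermediate col buffer layout: [N, outH, outW, C, kH, kW]; it is
+    # flattened below so that each row holds one kH x kW sliding window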
+
+    for i in range(len(inputs)):
+        im2col(attrs, inputs[i], out[i])
+
+    out = out.reshape([
+        batch_size * output_height * output_width,
+        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+    ])
+    return out
+
+
+class TestBlockExpandOp(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 4
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 1, 1, 1]
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+
+        out = Im2Sequence(x, self.attrs)
+        self.inputs = {'X': x}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestBlockExpandOpCase2(TestBlockExpandOp):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1]
+        }
+
+
+class TestBlockExpandOpCase3(TestBlockExpandOp):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 0, 2, 0]
+        }
+
+
+class TestBlockExpandOpCase4(TestBlockExpandOp):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0]
+        }
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
new file mode 100644
index 0000000000..c64cfed5f5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
@@ -0,0 +1,83 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program
+
+
+def conv_block(input, num_filter, groups, dropouts):
+    return nets.img_conv_group(
+        input=input,
+        pool_size=2,
+        pool_stride=2,
+        conv_num_filter=[num_filter] * groups,
+        conv_filter_size=3,
+        conv_act='relu',
+        conv_with_batchnorm=True,
+        conv_batchnorm_drop_rate=dropouts,
+        pool_type='max')
+
+
+class TestLayer(unittest.TestCase):
+    def test_batch_norm_layer(self):
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            hidden1 = fluid.layers.batch_norm(input=images)
+            hidden2 = fluid.layers.fc(input=hidden1, size=128, act='relu')
+            fluid.layers.batch_norm(input=hidden2)
+
+        print(str(main_program))
+
+    def test_dropout_layer(self):
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.dropout(x=images, dropout_prob=0.5)
+
+        print(str(main_program))
+
+    def test_img_conv_group(self):
+        main_program = Program()
+        startup_program = Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            images = fluid.layers.data(
+                name='pixel', shape=[3, 48, 48], dtype='float32')
+            conv1 = conv_block(images, 64, 2, [0.3, 0])
+            conv_block(conv1, 256, 3, [0.4, 0.4, 0])
+
+        print(str(main_program))
+
+    def test_elementwise_add_with_act(self):
+        main_program = Program()
+        startup_program = Program()
+        with fluid.program_guard(main_program, startup_program):
+            image1 = fluid.layers.data(
+                name='pixel1', shape=[3, 48, 48], dtype='float32')
+            image2 = fluid.layers.data(
+                name='pixel2', shape=[3, 48, 48], dtype='float32')
+            fluid.layers.elementwise_add(x=image1, y=image2, act='relu')
+        print(main_program)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_infer_shape.py b/python/paddle/v2/fluid/tests/test_infer_shape.py
new file mode 100644
index 0000000000..521096388a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_infer_shape.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.core as core
+
+
+class TestInferShape(unittest.TestCase):
+    def test_sum_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        shape = [10, 20]
+
+        # prepare input/output
+        x1 = block.var("x1")
+        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x1.set_shape(shape)
+        x2 = block.var("x2")
+        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x2.set_shape(shape)
+
+        out = block.var("out")
+        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
+
+        # prepare the operator
+        sum_op_desc = block.append_op()
+        sum_op_desc.set_type("sum")
+        sum_op_desc.set_input("X", ["x1", "x2"])
+        sum_op_desc.set_output("Out", ["out"])
+
+        sum_op_desc.check_attrs()
+        sum_op_desc.infer_shape(block)
+        self.assertEqual(out.shape(), shape)
+
+    def test_mul_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        x_shape = [10, 20]
+        y_shape = [20, 30]
+
+        # prepare input/output
+        x1 = block.var("x")
+        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x1.set_shape(x_shape)
+        x2 = block.var("y")
+        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x2.set_shape(y_shape)
+
+        out = block.var("out")
+        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
+
+        # prepare the operator
+        mul_op_desc = block.append_op()
+        mul_op_desc.set_type("mul")
+        mul_op_desc.set_input("X", ["x"])
+        mul_op_desc.set_input("Y", ["y"])
+        mul_op_desc.set_output("Out", ["out"])
+        mul_op_desc.set_attr("x_num_col_dims", 1)
+        mul_op_desc.set_attr("y_num_col_dims", 1)
+
+        mul_op_desc.check_attrs()
+        mul_op_desc.infer_shape(block)
+        self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py
new file mode 100644
index 0000000000..adf428aa84
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py
@@ -0,0 +1,87 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import paddle.v2.fluid.core as core
+
+import paddle.v2.fluid.executor as executor
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.io import save_inference_model, load_inference_model
+
+
+class TestBook(unittest.TestCase):
+    def test_fit_line_inference_model(self):
+        MODEL_DIR = "./tmp/inference_model"
+
+        init_program = Program()
+        program = Program()
+
+        with program_guard(program, init_program):
+            x = layers.data(name='x', shape=[2], dtype='float32')
+            y = layers.data(name='y', shape=[1], dtype='float32')
+
+            y_predict = layers.fc(input=x, size=1, act=None)
+
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+
+            sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+            sgd_optimizer.minimize(avg_cost, init_program)
+
+        place = core.CPUPlace()
+        exe = executor.Executor(place)
+
+        exe.run(init_program, feed={}, fetch_list=[])
+
+        for i in xrange(100):
+            tensor_x = np.array(
+                [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32")
+            tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32")
+
+            exe.run(program,
+                    feed={'x': tensor_x,
+                          'y': tensor_y},
+                    fetch_list=[avg_cost])
+
+        save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program)
+        expected = exe.run(program,
+                           feed={'x': tensor_x,
+                                 'y': tensor_y},
+                           fetch_list=[avg_cost])[0]
+
+        reload(executor)  # reload to build a new scope
+        exe = executor.Executor(place)
+
+        [infer_prog, feed_var_names, fetch_vars] = load_inference_model(
+            MODEL_DIR, exe)
+
+        outs = exe.run(
+            infer_prog,
+            feed={feed_var_names[0]: tensor_x,
+                  feed_var_names[1]: tensor_y},
+            fetch_list=fetch_vars)
+        actual = outs[0]
+
+        self.assertEqual(feed_var_names, ["x", "y"])
+        self.assertEqual(len(fetch_vars), 1)
+        self.assertEqual(str(fetch_vars[0]), str(avg_cost))
+        self.assertEqual(expected, actual)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py
new file mode 100644
index 0000000000..67746b4d7d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
@@ -0,0 +1,368 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.initializer as initializer
+
+DELTA = 0.00001
+
+
+class TestConstantInitializer(unittest.TestCase):
+    def test_constant_initializer_default_value(self):
+        """Test the constant initializer with default value
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.ConstantInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'fill_constant')
+        self.assertAlmostEqual(init_op.attr('value'), 0.0, delta=DELTA)
+
+    def test_constant_initializer(self):
+        """Test constant initializer with supplied value
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.ConstantInitializer(2.3))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'fill_constant')
+        self.assertAlmostEqual(init_op.attr('value'), 2.3, delta=DELTA)
+
+
+class TestUniformInitializer(unittest.TestCase):
+    def test_uniform_initializer_default_value(self):
+        """Test the uniform initializer with default value
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        self.assertAlmostEqual(init_op.attr('min'), -1.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_uniform_initializer_random_seed(self):
+        """Test the uniform initializer with manually setting seed
+        """
+        program = framework.Program()
+        program.random_seed = 123
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer())
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer(seed=456))
+        init_op = block.ops[1]
+        self.assertEqual(init_op.attr("seed"), 123)
+        init_op1 = block.ops[0]
+        self.assertEqual(init_op1.attr("seed"), 456)
+
+    def test_uniform_initializer(self):
+        """Test uniform initializer with supplied attributes
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.UniformInitializer(-4.2, 3.1, 123))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        self.assertAlmostEqual(init_op.attr('min'), -4.2, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), 3.1, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 123)
+
+
+class TestNormalInitializer(unittest.TestCase):
+    def test_normal_initializer_default_value(self):
+        """Test the normal initializer with default value
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.NormalInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), 1.0, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_initializer(self):
+        """Test normal initializer with supplied attributes
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.NormalInitializer(2.3, 1.9, 123))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        self.assertAlmostEqual(init_op.attr('mean'), 2.3, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), 1.9, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 123)
+
+
+class TestXavierInitializer(unittest.TestCase):
+    def test_uniform_xavier_initializer(self):
+        """Test Xavier initializer with uniform distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.XavierInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
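+        # Xavier (Glorot) uniform bound: limit = sqrt(6 / (fan_in + fan_out))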
+        limit = np.sqrt(6.0 / (param.shape[0] + param.shape[1]))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_uniform_xavier_initializer_conv(self):
+        """Test Xavier initializer with uniform distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.XavierInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        receptive_field_size = float(15 * 20)
+        limit = np.sqrt(6.0 / (
+            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_xavier_initializer(self):
+        """Test Xavier initializer with normal distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.XavierInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        std = np.sqrt(2.0 / (param.shape[0] + param.shape[1]))
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_xavier_initializer_conv(self):
+        """Test Xavier initializer with normal distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.XavierInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        receptive_field_size = float(15 * 20)
+        std = np.sqrt(2.0 / (
+            (param.shape[0] + param.shape[1]) * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_xavier_initializer_supplied_arguments(self):
+        """Test the Xavier initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.XavierInitializer(
+                fan_in=12, fan_out=23, seed=134))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / (12 + 23))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 134)
+
+
+class TestMSRAInitializer(unittest.TestCase):
+    def test_uniform_msra_initializer(self):
+        """Test MSRA initializer with uniform distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
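+        # MSRA (He) uniform bound: limit = sqrt(6 / fan_in)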
+        limit = np.sqrt(6.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_uniform_msra_initializer_conv(self):
+        """Test MSRA initializer with uniform distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer())
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        receptive_field_size = float(15 * 20)
+        limit = np.sqrt(6.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer(self):
+        """Test MSRA initializer with normal distribution on
+           for matrix multiply.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        std = np.sqrt(2.0 / param.shape[0])
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_normal_msra_initializer_conv(self):
+        """Test MSRA initializer with normal distribution on
+           for convolutions.
+        """
+        program = framework.Program()
+        block = program.global_block()
+        param = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10, 15, 20],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(uniform=False))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'gaussian_random')
+        receptive_field_size = float(15 * 20)
+        std = np.sqrt(2.0 / (param.shape[1] * receptive_field_size))
+        self.assertAlmostEqual(init_op.attr('mean'), 0.0, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('std'), std, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 0)
+
+    def test_msra_initializer_supplied_arguments(self):
+        """Test the MSRA initializer with supplied arguments
+        """
+        program = framework.Program()
+        block = program.global_block()
+        block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="param",
+            initializer=initializer.MSRAInitializer(
+                fan_in=12, seed=134))
+        self.assertEqual(len(block.ops), 1)
+        init_op = block.ops[0]
+        self.assertEqual(init_op.type, 'uniform_random')
+        limit = np.sqrt(6.0 / 12)
+        self.assertAlmostEqual(init_op.attr('min'), -limit, delta=DELTA)
+        self.assertAlmostEqual(init_op.attr('max'), limit, delta=DELTA)
+        self.assertEqual(init_op.attr('seed'), 134)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_iou_similarity_op.py b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
new file mode 100644
index 0000000000..128f2e4977
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_iou_similarity_op.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestIOUSimilarityOp(OpTest):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "iou_similarity"
+        self.boxes1 = np.array(
+            [[4.0, 3.0, 7.0, 5.0], [5.0, 6.0, 10.0, 7.0]]).astype('float32')
+        self.boxes2 = np.array([[3.0, 4.0, 6.0, 8.0], [14.0, 14.0, 15.0, 15.0],
+                                [0.0, 0.0, 20.0, 20.0]]).astype('float32')
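+        # expected IoU = intersection / union, e.g. boxes1[0] vs boxes2[0]:
+        # intersection area 2.0, union 6.0 + 12.0 - 2.0 = 16.0 -> 2/16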
+        self.output = np.array(
+            [[2.0 / 16.0, 0, 6.0 / 400.0],
+             [1.0 / 16.0, 0.0, 5.0 / 400.0]]).astype('float32')
+
+        self.inputs = {'X': self.boxes1, 'Y': self.boxes2}
+
+        self.outputs = {'Out': self.output}
+
+
+class TestIOUSimilarityOpWithLoD(TestIOUSimilarityOp):
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        super(TestIOUSimilarityOpWithLoD, self).setUp()
+        self.boxes1_lod = [[0, 1, 2]]
+        self.output_lod = [[0, 1, 2]]
+
+        self.inputs = {'X': (self.boxes1, self.boxes1_lod), 'Y': self.boxes2}
+        self.outputs = {'Out': (self.output, self.output_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_is_empty_op.py b/python/paddle/v2/fluid/tests/test_is_empty_op.py
new file mode 100644
index 0000000000..7c17e3d57a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_is_empty_op.py
@@ -0,0 +1,56 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
+
+
+def create_tensor(scope, name, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set_dims(np_data.shape)
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class TestIsEmptyOp(unittest.TestCase):
+    def setUp(self):
+        self.scope = core.Scope()
+        # create input variables
+        np_data0 = np.array([0, 1, 2])
+        create_tensor(self.scope, "X0", np_data0)
+
+        np_data1 = np.array([1])
+        t = create_tensor(self.scope, "X1", np_data1)
+        t.set_dims([0])
+
+        # create output variables
+        self.scope.var("out")
+
+    def test_no_empty(self):
+        self.one_case("X0", False)
+
+    def test_empty(self):
+        self.one_case("X1", True)
+
+    def one_case(self, input, target):
+        op = Operator(type="is_empty", X=input, Out="out")
+        op.run(self.scope, core.CPUPlace())
+        out = self.scope.var("out").get_tensor()
+        self.assertEqual(np.array(out)[0], target)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_l1_norm_op.py b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
new file mode 100644
index 0000000000..bbc2087846
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_l1_norm_op.py
@@ -0,0 +1,42 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+from op_test import OpTest
+
+
+class TestL1NormOp(OpTest):
+    """Test l1_norm
+    """
+
+    def setUp(self):
+        self.op_type = "l1_norm"
+        self.max_relative_error = 0.005
+
+        X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
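+        # push entries away from zero: the gradient of |x| is undefined at
+        # x = 0, which would make the numeric gradient check unstable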
+        X[np.abs(X) < self.max_relative_error] = 0.1
+        self.inputs = {'X': X}
+        self.outputs = {'Out': np.sum(np.abs(X))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=self.max_relative_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_label_smooth_op.py b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
new file mode 100644
index 0000000000..19a4df5744
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_label_smooth_op.py
@@ -0,0 +1,55 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLabelSmoothOp(OpTest):
+    def config(self):
+        self.op_type = "label_smooth"
+        self.epsilon = 0.1
+        batch_size, self.label_dim = 5, 10
+        self.label = np.zeros((batch_size, self.label_dim)).astype("float64")
+        nonzero_index = np.random.randint(self.label_dim, size=(batch_size))
+        self.label[np.arange(batch_size), nonzero_index] = 1
+
+    def setUp(self):
+        self.config()
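+        # label smoothing against the uniform distribution:
+        # soft_label = (1 - epsilon) * hard_label + epsilon / label_dim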
+        smoothed_label = (1 - self.epsilon
+                          ) * self.label + self.epsilon / self.label_dim
+        self.inputs = {'X': self.label}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLabelSmoothOpWithPriorDist(TestLabelSmoothOp):
+    def setUp(self):
+        self.config()
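+        # with a prior distribution supplied, the uniform term is replaced:
+        # soft_label = (1 - epsilon) * hard_label + epsilon * prior_dist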
+        dist = np.random.random((1, self.label_dim))
+        smoothed_label = (1 - self.epsilon) * self.label + self.epsilon * dist
+        self.inputs = {'X': self.label, 'PriorDist': dist}
+        self.attrs = {'epsilon': self.epsilon}
+        self.outputs = {'Out': smoothed_label}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
new file mode 100644
index 0000000000..7d5dc7d1a6
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py
@@ -0,0 +1,252 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+from operator import mul
+from op_test import OpTest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from paddle.v2.fluid.framework import grad_var_name
+
+
+def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
+    x_shape = x.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape = [N, D]
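+    # layer norm forward: y = scale * (x - mean) / sqrt(var + epsilon) + beta,
+    # where mean and variance are taken over the D elements of each row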
+
+    mean = np.mean(x, axis=1)
+    var = np.var(x, axis=1) + epsilon
+    output = scale.reshape([1, D]) * np.divide(
+        (x - mean.reshape([N, 1])),
+        (np.sqrt(var)).reshape([N, 1])) + beta.reshape([1, D])
+
+    x.shape, output.shape = x_shape, x_shape
+    return output, mean, var
+
+
+def _reference_layer_norm_grad(x, grad_y, scale, mean, var, begin_norm_axis=1):
+    x_shape = x.shape
+    scale_shape = scale.shape
+    N = reduce(mul, x_shape[0:begin_norm_axis], 1)
+    D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+    x.shape, grad_y.shape = [N, D], [N, D]
+    var.shape, mean.shape = [N, 1], [N, 1]
+    scale.shape = [1, D]
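+    # analytic gradients derived from the forward definition: d_bias and
+    # d_scale reduce over the N rows; grad_x sums the direct, mean, and
+    # variance (std) contributions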
+
+    # d_bias
+    d_bias = np.sum(grad_y, axis=0).reshape([1, D])
+    # d_scale
+    d_scale = np.sum(((x - mean) * np.sqrt(1 / var)) * grad_y,
+                     axis=0).reshape([1, D])
+    # dx
+    dx_end = scale * np.sqrt(1.0 / var) * grad_y
+    d_mean_0 = np.sum(-np.sqrt(1.0 / var) * grad_y * scale, axis=1).reshape(
+        [N, 1])  # the second term equals zero.
+    d_mean = 1.0 / D * d_mean_0
+    d_std = np.sum(
+        -(1.0 / var) * (x - mean) * grad_y * scale, axis=1).reshape([N, 1]) * (
+            1.0 / D * np.sqrt(1.0 / var).reshape([N, 1]) * (x - mean))
+
+    grad_x = dx_end + d_mean + d_std
+
+    grad_y.shape = x_shape
+    x.shape = x_shape
+    scale.shape = scale_shape
+    return grad_x, d_scale, d_bias
+
+
+def get_backward_op(scope, op, no_grad_set):
+    backward_op = core.Operator.backward(op, no_grad_set)
+    for input in backward_op.input_vars():
+        var = scope.var(input)
+        var.get_tensor()
+    for output in backward_op.output_vars():
+        var = scope.var(output)
+        var.get_tensor()
+    return backward_op
+
+
+def create_or_get_tensor(scope, var_name, var, place):
+    tensor = scope.var(var_name).get_tensor()
+    if var is not None:
+        assert isinstance(var, np.ndarray)
+        tensor.set_lod([[]])
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+    return tensor
+
+
+def set_output_grad(scope, outputs, place, feed_dict=None):
+    def __set_tensor__(name, data=None):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if data is None:
+            if out_dtype == core.DataType.FP64:
+                data = np.ones(out_tensor.shape(), dtype=np.float64)
+            elif out_dtype == core.DataType.FP32:
+                data = np.ones(out_tensor.shape(), dtype=np.float32)
+            else:
+                raise ValueError("Not supported data type " + str(out_dtype))
+        grad_tensor.set(data, place)
+
+    for output in outputs:
+        data = None
+        if output in feed_dict:
+            data = feed_dict[output]
+        __set_tensor__(output, data)
+
+
+class TestLayerNormOp(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(
+            np.allclose(
+                np.array(tensor).reshape(np_array.shape), np_array, atol=atol),
+            msg)
+
+    def __assert_grad_close(self,
+                            tensor,
+                            np_array,
+                            name,
+                            place,
+                            max_relative_error=0.02):
+        a = np.array(tensor).reshape(np_array.shape)
+        b = np_array
+        abs_a = np.abs(a)
+        abs_a[abs_a < 1e-5] = 1
+
+        diff_mat = np.abs(a - b) / abs_a
+        max_diff = np.max(diff_mat)
+
+        def err_msg():
+            offset = np.argmax(diff_mat > max_relative_error)
+            return ("%s Variable %s max gradient diff %f over limit %f, "
+                    "the first error element is %d, %f, %f") % (
+                        "Gradient Check On %s" % str(place), name, max_diff,
+                        max_relative_error, offset, a.flatten()[offset],
+                        b.flatten()[offset])
+
+        self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def check_forward_backward(self, shape, begin_norm_axis):
+        def test_with_place(place, shape, begin_norm_axis=1):
+            # setUp
+            assert begin_norm_axis > 0 and begin_norm_axis < len(
+                shape), 'begin_norm_axis must be between 0 and len(shape)-1.'
+            # attr
+            epsilon = 0.00001
+            x_shape = shape
+            D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
+            scale_shape = [D]
+            np.random.seed(123)
+            x_val = np.random.random_sample(x_shape).astype(np.float32)
+            scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+            bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+            y_grad = np.random.random_sample(x_shape).astype(np.float32)
+
+            # run forward
+            y_out, saved_mean, var_ref = _reference_layer_norm_naive(
+                x_val, scale_val, bias_val, epsilon, begin_norm_axis)
+            naive_fw = {"Y": y_out, "Mean": saved_mean, "Variance": var_ref}
+
+            # get gradient
+            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_layer_norm_grad(
+                x_val, y_grad, scale_val, saved_mean, var_ref, begin_norm_axis)
+            naive_grad = {
+                "X": x_grad_ref,
+                "Scale": scale_grad_ref,
+                "Bias": bias_grad_ref
+            }
+
+            scope = core.Scope()
+
+            # create input
+            input_map = {"X": x_val, "Scale": scale_val, "Bias": bias_val}
+            for i_name in input_map:
+                create_or_get_tensor(scope, i_name, input_map[i_name], place)
+
+            # create output
+            output_map = {"Y": None, "Mean": None, "Variance": None}
+            output_tensor = {}
+            for o_name in output_map:
+                output_tensor[o_name] = create_or_get_tensor(
+                    scope, o_name, output_map[o_name], place)
+
+            layer_norm_op = Operator(
+                "layer_norm",
+                # inputs
+                X="X",
+                Scale="Scale",
+                Bias="Bias",
+                # outputs
+                Y="Y",
+                Mean="Mean",
+                Variance="Variance",
+                # attrs
+                epsilon=epsilon,
+                begin_norm_axis=begin_norm_axis)
+
+            layer_norm_op.run(scope, place)
+
+            # check forward result
+            atol = 5e-2 if isinstance(place, core.CUDAPlace) else 1e-4
+            for o_tensor in output_tensor:
+                self.__assert_close(output_tensor[o_tensor], naive_fw[o_tensor],
+                                    o_tensor, atol)
+
+            # run backward
+            layer_norm_op_grad = get_backward_op(scope, layer_norm_op, set())
+            set_output_grad(
+                scope, ["Y", "Mean", "Variance"],
+                place,
+                feed_dict={"Y": y_grad})
+            layer_norm_op_grad.run(scope, place)
+
+            # get output
+            grad_tensor = {}
+            for o_name in naive_grad:
+                grad_tensor[o_name] = create_or_get_tensor(
+                    scope, grad_var_name(o_name), None, place)
+
+            # check gradient output
+            for o_grad in naive_grad:
+                self.__assert_grad_close(grad_tensor[o_grad],
+                                         naive_grad[o_grad], o_grad + "@GRAD",
+                                         place)
+
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu() and core.op_support_gpu("layer_norm"):
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            test_with_place(place, shape, begin_norm_axis)
+
+    def test_check_forward_backward_with_scale_and_bias(self):
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=1)
+        self.check_forward_backward(shape=[2, 3, 4, 5], begin_norm_axis=3)
+
+    def test_check_forward_backward_with_scale(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward_with_bias(self):
+        pass  # TODO(zcd)
+
+    def test_check_forward_backward(self):
+        pass  # TODO(zcd)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
new file mode 100644
index 0000000000..3f54e28def
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -0,0 +1,306 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.nets as nets
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program
+from paddle.v2.fluid.param_attr import ParamAttr
+import decorators
+
+
+class TestBook(unittest.TestCase):
+    def test_fit_a_line(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+            program.append_backward(avg_cost)
+
+        print(str(program))
+
+    def test_recognize_digits_mlp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            # Change g_program, so that the following layers use `g_program`
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden1 = layers.fc(input=images, size=128, act='relu')
+            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
+            predict = layers.fc(input=[hidden2, hidden1],
+                                size=10,
+                                act='softmax',
+                                param_attr=["sftmax.w1", "sftmax.w2"])
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
+
+    def test_simple_conv2d(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
+            layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
+
+        print(str(program))
+
+    def test_conv2d_transpose(self):
+        program = Program()
+        with program_guard(program):
+            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
+            layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
+        print(str(program))
+
+    def test_recognize_digits_conv(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            conv_pool_1 = nets.simple_img_conv_pool(
+                input=images,
+                filter_size=5,
+                num_filters=2,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+            conv_pool_2 = nets.simple_img_conv_pool(
+                input=conv_pool_1,
+                filter_size=5,
+                num_filters=4,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+
+            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+
+            program.append_backward(avg_cost)
+
+        print(str(program))
+
+    def test_word_embedding(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            dict_size = 10000
+            embed_size = 32
+            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
+            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
+            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
+            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
+            next_word = layers.data(name='nextw', shape=[1], dtype='int64')
+
+            embed_first = layers.embedding(
+                input=first_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_second = layers.embedding(
+                input=second_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            embed_third = layers.embedding(
+                input=third_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+            embed_forth = layers.embedding(
+                input=forth_word,
+                size=[dict_size, embed_size],
+                dtype='float32',
+                param_attr='shared_w')
+
+            concat_embed = layers.concat(
+                input=[embed_first, embed_second, embed_third, embed_forth],
+                axis=1)
+
+            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
+            predict_word = layers.fc(input=hidden1,
+                                     size=dict_size,
+                                     act='softmax')
+            cost = layers.cross_entropy(input=predict_word, label=next_word)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
+
+    def test_linear_chain_crf(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            label_dict_len = 10
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden = layers.fc(input=images, size=128)
+            crf = layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+            layers.chunk_eval(
+                input=crf_decode,
+                label=label,
+                chunk_scheme="IOB",
+                num_chunk_types=(label_dict_len - 1) / 2)
+            self.assertIsNotNone(crf)
+            self.assertIsNotNone(crf_decode)
+
+        print(str(program))
+
+    def test_sigmoid_cross_entropy(self):
+        program = Program()
+        with program_guard(program):
+            dat = layers.data(name='data', shape=[10], dtype='float32')
+            lbl = layers.data(name='label', shape=[10], dtype='float32')
+            self.assertIsNotNone(
+                layers.sigmoid_cross_entropy_with_logits(
+                    x=dat, label=lbl))
+        print(str(program))
+
+    def test_sequence_expand(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10], dtype='float32')
+            y = layers.data(
+                name='y', shape=[10, 20], dtype='float32', lod_level=1)
+            self.assertIsNotNone(layers.sequence_expand(x=x, y=y))
+        print(str(program))
+
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 30], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=30)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+        print(str(program))
+
+    def test_dynamic_lstmp(self):
+        program = Program()
+        with program_guard(program):
+            hidden_dim, proj_dim = 16, 8
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            fc_out = layers.fc(input=seq_data, size=4 * hidden_dim)
+            self.assertIsNotNone(
+                layers.dynamic_lstmp(
+                    input=fc_out, size=4 * hidden_dim, proj_size=proj_dim))
+        print(str(program))
+
+    def test_sequence_softmax(self):
+        program = Program()
+        with program_guard(program):
+            seq_data = layers.data(
+                name='seq_data', shape=[10, 10], dtype='float32', lod_level=1)
+            seq = layers.fc(input=seq_data, size=20)
+            self.assertIsNotNone(layers.sequence_softmax(x=seq))
+        print(str(program))
+
+    def test_get_places(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.get_places(device_count=4)
+            self.assertIsNotNone(x)
+        print(str(program))
+
+    def test_sequence_reshape(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[8], dtype='float32', lod_level=1)
+            out = layers.sequence_reshape(input=x, new_dim=16)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_im2sequence(self):
+        print("test_im2sequence")
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            output = layers.im2sequence(
+                input=x, stride=[1, 1], filter_size=[2, 2])
+            self.assertIsNotNone(output)
+        print(str(program))
+
+    @decorators.prog_scope()
+    def test_nce(self):
+        window_size = 5
+        words = []
+        for i in xrange(window_size):
+            words.append(
+                layers.data(
+                    name='word_{0}'.format(i), shape=[1], dtype='int64'))
+
+        dict_size = 10000
+        label_word = int(window_size / 2) + 1
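+        # the word at position label_word serves as the prediction target and
+        # is skipped when building the context embeddings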
+
+        embs = []
+        for i in xrange(window_size):
+            if i == label_word:
+                continue
+
+            emb = layers.embedding(
+                input=words[i],
+                size=[dict_size, 32],
+                param_attr='emb.w',
+                is_sparse=True)
+
+            embs.append(emb)
+
+        embs = layers.concat(input=embs, axis=1)
+        loss = layers.nce(input=embs,
+                          label=words[label_word],
+                          num_total_classes=dict_size,
+                          param_attr='nce.w',
+                          bias_attr='nce.b')
+        avg_loss = layers.mean(x=loss)
+        self.assertIsNotNone(avg_loss)
+        print(str(default_main_program()))
+
+    def test_row_conv(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[16], dtype='float32', lod_level=1)
+            out = layers.row_conv(input=x, future_context_size=2)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+    def test_multiplex(self):
+        program = Program()
+        with program_guard(program):
+            x1 = layers.data(name='x1', shape=[4], dtype='float32')
+            x2 = layers.data(name='x2', shape=[4], dtype='float32')
+            index = layers.data(name='index', shape=[1], dtype='int32')
+            out = layers.multiplex(inputs=[x1, x2], index=index)
+            self.assertIsNotNone(out)
+        print(str(program))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_learning_rate_decay.py b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
new file mode 100644
index 0000000000..dc348cf2d2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_learning_rate_decay.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import math
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.learning_rate_decay as lr_decay
+
+
+def exponential_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * decay_rate**exponent
+
+
+def natural_exp_decay(learning_rate,
+                      global_step,
+                      decay_steps,
+                      decay_rate,
+                      staircase=False):
+    exponent = float(global_step) / float(decay_steps)
+    if staircase:
+        exponent = math.floor(exponent)
+    return learning_rate * math.exp(-1 * decay_rate * exponent)
+
+
+def inverse_time_decay(learning_rate,
+                       global_step,
+                       decay_steps,
+                       decay_rate,
+                       staircase=False):
+    temp = float(global_step) / float(decay_steps)
+    if staircase:
+        temp = math.floor(temp)
+    return learning_rate / (1 + decay_rate * temp)
+
+
+class TestLearningRateDecay(unittest.TestCase):
+    def check_decay(self, python_decay_fn, fluid_decay_fn, staircase):
+        init_lr = 1.0
+        decay_steps = 5
+        decay_rate = 0.5
+
+        global_step = layers.create_global_var(
+            shape=[1], value=0.0, dtype='float32', persistable=True)
+
+        decayed_lr = fluid_decay_fn(
+            learning_rate=init_lr,
+            global_step=global_step,
+            decay_steps=decay_steps,
+            decay_rate=decay_rate,
+            staircase=staircase)
+        layers.increment(global_step, 1.0)
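+        # the increment op runs as part of the same program, so global_step
+        # advances by one on every executor run; the loop variable `step`
+        # below mirrors its value before the increment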
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+
+        exe.run(fluid.default_startup_program())
+        for step in range(10):
+            step_val, lr_val = exe.run(fluid.default_main_program(),
+                                       feed=[],
+                                       fetch_list=[global_step, decayed_lr])
+            python_decayed_lr = python_decay_fn(
+                learning_rate=init_lr,
+                global_step=step,
+                decay_steps=decay_steps,
+                decay_rate=decay_rate,
+                staircase=staircase)
+            self.assertAlmostEqual(python_decayed_lr, lr_val[0])
+
+    def test_decay(self):
+        decay_fns = [
+            (exponential_decay, lr_decay.exponential_decay, True),
+            (exponential_decay, lr_decay.exponential_decay, False),
+            (natural_exp_decay, lr_decay.natural_exp_decay, True),
+            (natural_exp_decay, lr_decay.natural_exp_decay, False),
+            (inverse_time_decay, lr_decay.inverse_time_decay, True),
+            (inverse_time_decay, lr_decay.inverse_time_decay, False),
+        ]
+
+        for py_decay_fn, fluid_decay_fn, staircase in decay_fns:
+            print("decay_fn=" + str(py_decay_fn) + " staircase=" + str(
+                staircase))
+            main_program = framework.Program()
+            startup_program = framework.Program()
+            with framework.program_guard(main_program, startup_program):
+                self.check_decay(py_decay_fn, fluid_decay_fn, staircase)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
new file mode 100644
index 0000000000..cbfd9d5e5b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_linear_chain_crf_op.py
@@ -0,0 +1,156 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import random
+import numpy as np
+
+from op_test import OpTest
+
+
+class LinearChainCrfForward(object):
+    def __init__(self, seq_start_positions, emission_weights, emission_row_max,
+                 emission_exps, transition_weights, transition_exps, labels):
+        self.tag_num = emission_weights.shape[1]
+        self.seq_num = len(seq_start_positions) - 1
+
+        self.seq_start_positions = seq_start_positions
+        self.labels = labels
+        self.x = emission_weights
+
+        self.x_row_max = emission_row_max
+        self.x_exps = emission_exps
+
+        # unnormalized logits of the transition weights for the start mark.
+        self.a = transition_weights[0, :]
+        self.a_exps = transition_exps[0, :]
+        # unnormalized logits of the transition weights for the end mark.
+        self.b = transition_weights[1, :]
+        self.b_exps = transition_exps[1, :]
+        # unnormalized logits of the transition weights for all the other tags.
+        self.w = transition_weights[2:, :]
+        self.w_exps = transition_exps[2:, :]
+
+        # The outputs of the linear_chain_crf operator.
+        # alpha is the memo table of the dynamic-programming forward pass,
+        # used to calculate the normalization factor.
+        self.alpha = np.zeros(
+            (seq_start_positions[-1], self.tag_num), dtype="float64")
+        self.log_likelihood = np.zeros((self.seq_num, 1))
+
+    def _l1_norm(self, x):
+        s = np.sum(x)
+        x /= s
+        return s
+
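+    # The negative log-likelihood of one sequence equals the log normalization
+    # factor (computed by the forward recursion below) minus the score of the
+    # gold label path; alpha is L1-normalized at every step and the dropped
+    # factors are accumulated in log space for numerical stability.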
+    def _forward_a_sequence(self, x, x_row_max, x_exps, label, alpha):
+        seq_len = x_row_max.shape[0]
+        log_likelihood = 0.
+
+        for i in range(self.tag_num):
+            alpha[0, i] = self.a_exps[i] * x_exps[0, i]
+        log_likelihood = -x_row_max[0] - np.log(self._l1_norm(alpha[0, :]))
+
+        # forward recursion: update alpha and accumulate the log of the
+        # normalization factor.
+        for k in range(1, seq_len):
+            for i in range(self.tag_num):
+                s = 0.
+                for j in range(self.tag_num):
+                    s += alpha[k - 1, j] * self.w_exps[j, i]
+                alpha[k, i] = x_exps[k, i] * s
+            log_likelihood -= x_row_max[k] + np.log(self._l1_norm(alpha[k, :]))
+        s = 0.
+        for i in range(self.tag_num):
+            s += alpha[-1, i] * self.b_exps[i]
+        log_likelihood -= np.log(s)
+
+        # calculate the numerator: the unnormalized score of the gold label path.
+        log_likelihood += (
+            self.a[label[0]] + x[0, label[0]] + self.b[label[-1]])
+
+        for k in range(1, seq_len):
+            log_likelihood += (x[k, label[k]] + self.w[label[k - 1], label[k]])
+        return -log_likelihood
+
+    def crf_forward_compute(self):
+        for i in range(self.seq_num):
+            start = self.seq_start_positions[i]
+            end = self.seq_start_positions[i + 1]
+
+            self.log_likelihood[i] = self._forward_a_sequence(
+                self.x[start:end, :], self.x_row_max[start:end, :],
+                self.x_exps[start:end, :], self.labels[start:end, :],
+                self.alpha[start:end, :])
+        return self.alpha, self.log_likelihood
+
+
+class TestLinearChainCrfOp(OpTest):
+    def set_test_data(self):
+        # TODO(caoying) Fix the unittest by adding boundary cases for
+        # sequence lengths of 1, 2, and 3.
+
+        SEQ_NUM = 3
+        TAG_NUM = 17
+        MAX_SEQ_LEN = 5
+
+        # the linear_chain_crf operator only supports sequences (LoD level = 1)
+        lod = [[0]]
+        for i in range(SEQ_NUM):
+            lod[-1].append(lod[-1][-1] + random.randint(1, MAX_SEQ_LEN))
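+        # lod[-1] now holds cumulative offsets: [0, len_1, len_1 + len_2, ...]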
+        emission = np.random.uniform(-1, 1,
+                                     [lod[-1][-1], TAG_NUM]).astype("float64")
+        emission_row_max = np.amax(emission, axis=1, keepdims=True)
+        emission_exps = np.exp(emission - emission_row_max)
+
+        transition = np.random.uniform(-0.5, 0.5,
+                                       [TAG_NUM + 2, TAG_NUM]).astype("float64")
+        transition_exps = np.exp(transition)
+
+        labels = np.random.randint(
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
+
+        self.inputs = {
+            "Emission": (emission, lod),
+            "Transition": transition,
+            "Label": (labels, lod)
+        }
+        crf = LinearChainCrfForward(lod[0], emission, emission_row_max,
+                                    emission_exps, transition, transition_exps,
+                                    labels)
+        alpha, log_likelihood = crf.crf_forward_compute()
+
+        self.outputs = {
+            "Alpha": alpha,
+            "EmissionExps": emission_exps,
+            "TransitionExps": transition_exps,
+            "LogLikelihood": log_likelihood
+        }
+
+    def setUp(self):
+        self.op_type = "linear_chain_crf"
+        self.set_test_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Emission", "Transition"], "LogLikelihood")
+
+    def test_check_grad_ignore_transition(self):
+        self.check_grad(
+            ["Emission"], "LogLikelihood", no_grad_set=set("Transition"))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
new file mode 100644
index 0000000000..eff28368f1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestLoDArrayLength(unittest.TestCase):
+    def test_array_length(self):
+        tmp = layers.zeros(shape=[10], dtype='int32')
+        i = layers.fill_constant(shape=[1], dtype='int64', value=10)
+        arr = layers.array_write(tmp, i=i)
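+        # writing at index 10 grows the array to 11 entries (indices 0..10)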
+        arr_len = layers.array_length(arr)
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        result = exe.run(fetch_list=[arr_len])[0]
+        self.assertEqual(11, result[0])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
new file mode 100644
index 0000000000..eb0392e8bf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
@@ -0,0 +1,41 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.v2.fluid.layers import lod_rank_table, data
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+import numpy
+import unittest
+
+
+class TestLoDRankTable(unittest.TestCase):
+    def test_lod_rank_table(self):
+        x = data(name='x', shape=[100])
+        cpu = core.CPUPlace()
+        rank_table = lod_rank_table(x=x, level=1)
+        rank_table.persistable = True
+        exe = Executor(cpu)
+        scope = core.Scope()
+
+        tensor = core.LoDTensor()
+        tensor.set(numpy.random.random(size=(17, 100)), cpu)
+        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
+        exe.run(scope=scope, feed={'x': tensor})
+        var = scope.find_var(rank_table.name)
+        table = var.get_lod_rank_table()
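+        # level-1 LoD [0, 5, 6, 7] gives sequence lengths [5, 1, 1]; the rank
+        # table holds (index, length) pairs sorted by length in descending order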
+        self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_reset_op.py b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
new file mode 100644
index 0000000000..4ee360403e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_reset_op.py
@@ -0,0 +1,78 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLodResetOpByAttr(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 7, 10]
+        self.inputs = {'X': (x, lod)}
+        self.attrs = {'target_lod': target_lod_0}
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestLodResetOpByInput(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0 = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array([target_lod_0]).astype('int32')
+        }
+        self.outputs = {'Out': (x, [target_lod_0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+class TestLodResetOpBoth(OpTest):
+    def setUp(self):
+        self.op_type = "lod_reset"
+        x = np.random.random((10, 20)).astype("float32")
+        lod = [[0, 3, 5, 10]]
+        target_lod_0_attr = [0, 7, 10]
+        target_lod_0_in = [0, 4, 7, 10]
+        self.inputs = {
+            'X': (x, lod),
+            'TargetLoD': np.array(target_lod_0_in).astype('int32')
+        }
+        self.attrs = {'target_lod': target_lod_0_attr}
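+        # when both are given, the TargetLoD input overrides the attribute,
+        # so the expected output carries target_lod_0_in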
+        self.outputs = {'Out': (x, [target_lod_0_in])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", no_grad_set=set("TargetLoD"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
new file mode 100644
index 0000000000..0f3ac3c03d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestLoDTensorArray(unittest.TestCase):
+    def test_get_set(self):
+        scope = core.Scope()
+        arr = scope.var('tmp_lod_tensor_array')
+        tensor_array = arr.get_lod_tensor_array()
+        self.assertEqual(0, len(tensor_array))
+        cpu = core.CPUPlace()
+        for i in xrange(10):
+            t = core.LoDTensor()
+            t.set(numpy.array([i], dtype='float32'), cpu)
+            t.set_lod([[0, 1]])
+            tensor_array.append(t)
+
+        self.assertEqual(10, len(tensor_array))
+
+        for i in xrange(10):
+            t = tensor_array[i]
+            self.assertEqual(numpy.array(t), numpy.array([i], dtype='float32'))
+            self.assertEqual([[0, 1]], t.lod())
+
+            t = core.LoDTensor()
+            t.set(numpy.array([i + 10], dtype='float32'), cpu)
+            t.set_lod([[0, 2]])
+            tensor_array[i] = t
+            t = tensor_array[i]
+            self.assertEqual(
+                numpy.array(t), numpy.array(
+                    [i + 10], dtype='float32'))
+            self.assertEqual([[0, 2]], t.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
new file mode 100644
index 0000000000..c2d04db99b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py
@@ -0,0 +1,209 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+import numpy
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_lod_tensor_to_array_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
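+        # sequences are rows [0:3], [3:9], [9:10]; array slot k gathers the
+        # k-th element of each sequence, longest sequence first, e.g. [3, 0, 9]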
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[[]] * 6,  # note: `[] * 6` is `[]` and skips the LoD check
+            expect_max_len=6)
+
+    def test_lod_tensor_to_array_level_0_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 9, 10]])
+        expect = map(lambda x: numpy.array(x).astype('int32'),
+                     [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]])
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=[[]] * 6,  # note: `[] * 6` is `[]` and skips the LoD check
+            expect_max_len=6)
+
+    def test_lod_tensor_to_array_level_1(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(20).reshape(20, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5], [0, 3, 9, 11, 17, 20]])
+
+        expect = [
+            numpy.array(
+                [9, 10, 0, 1, 2], dtype='int32'), numpy.array(
+                    [11, 12, 13, 14, 15, 16, 3, 4, 5, 6, 7, 8], dtype='int32'),
+            numpy.array(
+                [17, 18, 19], dtype='int32')
+        ]
+
+        lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
+
+    def test_lod_tensor_to_array_level_1_empty_seq(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(31).reshape(31, 1).astype('int32'), self.place())
+
+        tensor.set_lod([[0, 3, 5, 9, 11],
+                        [0, 3, 7, 11, 11, 12, 17, 19, 21, 23, 30, 31]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[
+                12, 13, 14, 15, 16, 0, 1, 2, 23, 24, 25, 26, 27, 28, 29
+            ], [17, 18, 3, 4, 5, 6, 11, 30], [19, 20, 7, 8, 9, 10], [21, 22]]
+        ]
+
+        lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=4)
+
+    def test_lod_tensor_to_array_level_2(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+
+        expect = [
+            numpy.array(
+                item, dtype='int32')
+            for item in [[21, 0, 1, 2, 3, 4, 5, 6, 46, 47, 48, 49], range(
+                22, 39) + range(7, 21), range(39, 46)]
+        ]
+        lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]],
+               [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]]
+        self.main(
+            tensor=tensor,
+            expect_array=expect,
+            expect_lod=lod,
+            expect_max_len=3)
+
+    def test_lod_tensor_to_array_level_2_skip_level(self):
+        tensor = core.LoDTensor()
+        tensor.set(
+            numpy.arange(50).reshape(50, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13],
+                        [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]])
+        self.main(
+            tensor=tensor,
+            expect_array=None,
+            expect_lod=None,
+            expect_max_len=4,
+            level=1)
+
+    def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0):
+        place = self.place()
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[10])
+            x.persistable = True
+            table = layers.lod_rank_table(x, level=level)
+            max_len = layers.max_sequence_len(table)
+            max_len.persistable = True
+            array = layers.lod_tensor_to_array(x, table)
+            array.persistable = True
+
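+            # converting the array back through the same rank table must
+            # reconstruct the original tensor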
+            result = layers.array_to_lod_tensor(array, table)
+            result.persistable = True
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program, feed={'x': tensor}, scope=scope)
+        var = scope.find_var(array.name)
+        array = var.get_lod_tensor_array()
+        if expect_array is not None and expect_lod is not None:
+            self.check_array_same(array, expect_array, expect_lod)
+        self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor)
+
+        self.assertEqual(
+            numpy.array(scope.find_var(max_len.name).get_tensor())[0],
+            expect_max_len)
+
+    def check_array_same(self, array, expect_tensor, expect_lod):
+        self.assertEqual(len(expect_tensor), len(array))
+        for i, exp in enumerate(zip(expect_tensor, expect_lod)):
+            exp_tensor, exp_lod = exp
+            exp_tensor = numpy.expand_dims(exp_tensor, axis=1)
+            self.assertTrue(numpy.allclose(exp_tensor, numpy.array(array[i])))
+            self.assertEqual(exp_lod, array[i].lod())
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(
+            numpy.allclose(numpy.array(actual), numpy.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPULoDTensorArrayOpGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            table = layers.lod_rank_table(x, level=0)
+            array = layers.lod_tensor_to_array(x, table)
+            result = layers.array_to_lod_tensor(array, table)
+
+            mean = layers.mean(x=result)
+
+            append_backward(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(numpy.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+
+        exe = Executor(place)
+        g_out = [
+            numpy.array(item).sum()
+            for item in exe.run(program,
+                                feed={'x': tensor},
+                                fetch_list=[g_vars],
+                                return_numpy=False)
+        ]
+        g_out_sum = numpy.array(g_out).sum()
+
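+        # the mean over the 10 input elements gives each a gradient of 1/10,
+        # so the gradients sum to 1.0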
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py
new file mode 100644
index 0000000000..338355d0c4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLogLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'log_loss'
+        samples_num = 32
+
+        predicted = np.random.uniform(0.1, 1.0,
+                                      (samples_num, 1)).astype("float32")
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
+        epsilon = 1e-4
+        self.inputs = {
+            'Predicted': predicted,
+            'Labels': labels,
+        }
+
+        self.attrs = {'epsilon': epsilon}
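+        # binary log loss, with epsilon inside the logarithms for numerical
+        # stability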
+        loss = -labels * np.log(predicted + epsilon) - (
+            1 - labels) * np.log(1 - predicted + epsilon)
+        self.outputs = {'Loss': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_logical_op.py b/python/paddle/v2/fluid/tests/test_logical_op.py
new file mode 100644
index 0000000000..dd67dc561b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_logical_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import op_test
+import unittest
+import numpy as np
+
+
+def create_test_class(op_type, callback, binary_op=True):
+    class Cls(op_test.OpTest):
+        def setUp(self):
+            a = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+            if binary_op:
+                b = np.random.choice(a=[True, False], size=(10, 7)).astype(bool)
+                c = callback(a, b)
+            else:
+                c = callback(a)
+            self.outputs = {'Out': c}
+            self.op_type = op_type
+            if binary_op:
+                self.inputs = {'X': a, 'Y': b}
+            else:
+                self.inputs = {'X': a}
+
+        def test_output(self):
+            self.check_output()
+
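+    # register the generated class in the module namespace under the op's
+    # name so that unittest test discovery picks it up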
+    Cls.__name__ = op_type
+    globals()[op_type] = Cls
+
+
+create_test_class('logical_and', lambda _a, _b: np.logical_and(_a, _b))
+create_test_class('logical_or', lambda _a, _b: np.logical_or(_a, _b))
+create_test_class('logical_not', lambda _a: np.logical_not(_a), False)
+create_test_class('logical_xor', lambda _a, _b: np.logical_xor(_a, _b))
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lookup_table_op.py b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
new file mode 100644
index 0000000000..0c566c76c9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lookup_table_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLookupTableOp(OpTest):
+    def setUp(self):
+        self.op_type = "lookup_table"
+        table = np.random.random((17, 31)).astype("float32")
+        ids = np.random.randint(0, 17, 4).astype("int64")
+        ids_expand = np.expand_dims(ids, axis=1)
+        self.inputs = {'W': table, 'Ids': ids_expand}
+        self.outputs = {'Out': table[ids]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+
+
+class TestLookupTableOpWithPadding(TestLookupTableOp):
+    def test_check_output(self):
+        ids = np.squeeze(self.inputs['Ids'])
+        padding_idx = np.random.choice(ids, 1)[0]
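+        # rows looked up with padding_idx are expected to come back as zeros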
+        self.outputs['Out'][ids == padding_idx] = np.zeros(31)
+        self.attrs = {'padding_idx': long(padding_idx)}
+        self.check_output()
+
+    def test_check_grad(self):
+        # Since the paddings are not trainable and are fixed in the forward
+        # pass, the gradient of the paddings makes no sense; it is not tested.
+        pass
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py
new file mode 100644
index 0000000000..a841dcf79f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
@@ -0,0 +1,91 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLRNOp(OpTest):
+    def get_input(self):
+        ''' TODO(gongweibao): why is its gradient diff so large?
+        x = np.ndarray(
+            shape=(self.N, self.C, self.H, self.W), dtype=float, order='C')
+        for m in range(0, self.N):
+            for i in range(0, self.C):
+                for h in range(0, self.H):
+                    for w in range(0, self.W):
+                        x[m][i][h][w] = m * self.C * self.H * self.W +  \
+                                        i * self.H * self.W +  \
+                                        h * self.W + w + 1
+        '''
+        x = np.random.rand(self.N, self.C, self.H, self.W).astype("float32")
+        return x + 1
+
+    def get_out(self):
+        start = -(self.n - 1) / 2
+        end = start + self.n
+
+        mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32")
+        mid.fill(self.k)
+        for m in range(0, self.N):
+            for i in range(0, self.C):
+                for c in range(start, end + 1):
+                    ch = i + c
+                    if ch < 0 or ch >= self.C:
+                        continue
+
+                    s = mid[m][i][:][:]
+                    r = self.x[m][ch][:][:]
+                    s += np.square(r) * self.alpha
+
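+        # out = x * (k + alpha * sum of squares over the channel window)^(-beta)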
+        mid2 = np.power(mid, -self.beta)
+        return np.multiply(self.x, mid2), mid
+
+    def get_attrs(self):
+        attrs = {
+            'n': self.n,
+            'k': self.k,
+            'alpha': self.alpha,
+            'beta': self.beta
+        }
+        return attrs
+
+    def setUp(self):
+        self.op_type = "lrn"
+        self.N = 2
+        self.C = 3
+        self.H = 5
+        self.W = 5
+
+        self.n = 5
+        self.k = 2.0
+        self.alpha = 0.0001
+        self.beta = 0.75
+        self.x = self.get_input()
+        self.out, self.mid_out = self.get_out()
+
+        self.inputs = {'X': self.x}
+        self.outputs = {'Out': self.out, 'MidOut': self.mid_out}
+        self.attrs = self.get_attrs()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lstm_op.py b/python/paddle/v2/fluid/tests/test_lstm_op.py
new file mode 100644
index 0000000000..3e79f9d8e1
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lstm_op.py
@@ -0,0 +1,300 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
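+# clamping thresholds for the reference sigmoid/tanh below; they keep the
+# arguments of np.exp in a range that cannot overflow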
+SIGMOID_THRESHOLD_MIN = -40.0
+SIGMOID_THRESHOLD_MAX = 13.0
+EXP_MAX_INPUT = 40.0
+
+
+def identity(x):
+    return x
+
+
+def sigmoid(x):
+    y = np.copy(x)
+    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
+    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
+    return 1. / (1. + np.exp(-y))
+
+
+def tanh(x):
+    y = -2. * x
+    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT
+    return (2. / (1. + np.exp(y))) - 1.
+
+
+def relu(x):
+    return np.maximum(x, 0)
+
+
+ACTIVATION = {
+    'identity': identity,
+    'sigmoid': sigmoid,
+    'tanh': tanh,
+    'relu': relu
+}
+
+
+def lstm(
+        input,  # T x 4D
+        lod,  # 1 x N
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_h=None,  # D x 4D
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand):
+        g = np.dot(h_pre, w_h)  # 1 x 4D
+        g = g + x
+        g = np.reshape(g, (1, g.size))
+        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
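+        # gate layout along axis 1: candidate, input gate, forget gate, output gate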
+        if w_c is None:
+            g_i = act_gate(g_i)  # 1 x D
+            g_f = act_gate(g_f)  # 1 x D
+        else:
+            w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
+            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
+            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
+        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
+
+        if w_c is None:
+            g_o = act_gate(g_o)  # 1 x D
+        else:
+            _, _, w_oc = np.split(w_c, 3, axis=1)
+            g_o = act_gate(g_o + w_oc * c)  # 1 x D
+        h = g_o * act_cell(c)
+        return h, c
+
+    def _reverse(x, lod):
+        y = np.zeros_like(x)
+        for i in range(len(lod) - 1):
+            b, e = lod[i], lod[i + 1]
+            y[b:e, :] = np.flip(x[b:e, :], 0)
+        return y
+
+    offset = lod[0]
+    batch_size = len(offset) - 1
+    hidden = []
+    cell = []
+    input = _reverse(input, offset) if is_reverse else input
+    if w_b is not None:
+        input = input + np.tile(w_b, (offset[-1], 1))
+    for i in range(batch_size):
+        # compute one sequence
+        seq_len = offset[i + 1] - offset[i]
+        x = input[offset[i]:offset[i + 1], :]
+        h_pre = h0[i]  # 1 x D
+        c_pre = c0[i]  # 1 x D
+        for j in range(seq_len):
+            # compute one step
+            h_pre, c_pre = _step(x[j], w_h, w_c, h_pre, c_pre, act_gate,
+                                 act_cell, act_cand)
+            hidden.append(h_pre.flatten())
+            cell.append(c_pre.flatten())
+
+    hidden = np.array(hidden).astype('float64')
+    cell = np.array(cell).astype('float64')
+
+    hidden = _reverse(hidden, offset) if is_reverse else hidden
+    cell = _reverse(cell, offset) if is_reverse else cell
+
+    assert hidden.shape == (input.shape[0], input.shape[1] / 4)
+    assert cell.shape == (input.shape[0], input.shape[1] / 4)
+    return hidden, cell
+
+
+class TestLstmOp(OpTest):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = False
+        self.use_peepholes = True
+
+    def setUp(self):
+        self.set_argument()
+        self.op_type = 'lstm'
+
+        T = self.lod[0][-1]
+        N = len(self.lod[0]) - 1
+
+        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
+        w = np.random.normal(size=(self.D, 4 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+
+        w_b = b[:, 0:4 * self.D]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+        h, c = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
+                    ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                    ACTIVATION[self.act_cand])
+
+        self.inputs = {'Input': (x, self.lod), 'Weight': w}
+
+        self.inputs['Bias'] = b
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Hidden': (h, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove the following lines once check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias'], ['Hidden'], max_relative_error=5e-4)
+
+
+class TestLstmOpHasInitial(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = True
+        self.is_reverse = True
+        self.use_peepholes = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove the following lines once check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0', 'C0'], ['Hidden'],
+            max_relative_error=5e-4)
+
+    def test_check_grad_ignore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ignore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ignore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'Bias'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ignore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'C0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ignore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias', 'H0'], ['Hidden'],
+            max_relative_error=5e-4,
+            no_grad_set=set('C0'))
+
+
+class TestLstmOpReverse(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = True
+
+
+class TestLstmOpNotUsePeepholes(TestLstmOp):
+    def set_argument(self):
+        self.lod = [[0, 2, 5, 7]]
+        self.D = 16
+
+        self.act_gate = 'sigmoid'
+        self.act_cell = 'tanh'
+        self.act_cand = 'tanh'
+
+        self.has_initial_state = False
+        self.is_reverse = True
+        self.use_peepholes = False
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lstm_unit_op.py b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
new file mode 100644
index 0000000000..d6348ea0ec
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lstm_unit_op.py
@@ -0,0 +1,54 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sigmoid_np(x):
+    return 1. / (1. + np.exp(-x))
+
+
+def tanh_np(x):
+    return 2 * sigmoid_np(2. * x) - 1.
+
+
+class LstmUnitTest(OpTest):
+    def setUp(self):
+        self.op_type = "lstm_unit"
+        x_np = np.random.normal(size=(5, 16)).astype("float64")
+        c_np = np.random.normal(size=(5, 4)).astype("float64")
+        i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
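+        # split the 16 columns into input, forget, output, and candidate (j) gates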
+        forget_bias_np = 0.
+        self.attrs = {'forget_bias': 0.}
+
+        new_c = c_np * sigmoid_np(f_np + forget_bias_np) + sigmoid_np(
+            i_np) * tanh_np(j_np)
+        new_h = tanh_np(new_c) * sigmoid_np(o_np)
+
+        self.inputs = {'X': x_np, 'C_prev': c_np}
+        self.outputs = {'C': new_c, 'H': new_h}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'C_prev'], ['C', 'H'])
+
+
+if __name__ == "__main__":
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185
+    exit(0)
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_lstmp_op.py b/python/paddle/v2/fluid/tests/test_lstmp_op.py
new file mode 100644
index 0000000000..92a954a9aa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_lstmp_op.py
@@ -0,0 +1,286 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import test_lstm_op as LstmTest
+
+ACTIVATION = {
+    'identity': LstmTest.identity,
+    'sigmoid': LstmTest.sigmoid,
+    'tanh': LstmTest.tanh,
+    'relu': LstmTest.relu
+}
+
+
+# LSTM with recurrent projection Layer
+def lstmp(
+        input,  # T x 4D
+        lod,  # 1 x N
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_r=None,  # P x 4D
+        w_rh=None,  # D x P
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None,
+        act_proj=None):
+    def _step(x, w_r, w_rh, w_c, r_pre, c_pre, act_gate, act_cell, act_cand,
+              act_proj):
+        g = np.dot(r_pre, w_r)  # 1 x 4D
+        g = g + x
+        g = np.reshape(g, (1, g.size))
+        c, g_i, g_f, g_o = np.split(g, 4, axis=1)
+        if w_c is None:
+            g_i = act_gate(g_i)  # 1 x D
+            g_f = act_gate(g_f)  # 1 x D
+        else:
+            w_ic, w_fc, _ = np.split(w_c, 3, axis=1)
+            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
+            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
+        c = g_f * c_pre + g_i * act_cand(c)  # 1 x D
+
+        if w_c is None:
+            g_o = act_gate(g_o)  # 1 x D
+        else:
+            _, _, w_oc = np.split(w_c, 3, axis=1)
+            g_o = act_gate(g_o + w_oc * c)  # 1 x D
+        h = g_o * act_cell(c)
+        # projection
+        r = np.dot(h, w_rh)
+        r = act_proj(r)
+        return r, c
+
+    def _reverse(x, lod):
+        y = np.zeros_like(x)
+        for i in range(len(lod) - 1):
+            b, e = lod[i], lod[i + 1]
+            y[b:e, :] = np.flip(x[b:e, :], 0)
+        return y
+
+    offset = lod[0]
+    batch_size = len(offset) - 1
+    # recurrent projection state
+    projection = []
+    cell = []
+    input = _reverse(input, offset) if is_reverse else input
+    if w_b is not None:
+        input = input + np.tile(w_b, (offset[-1], 1))
+    for i in range(batch_size):
+        # compute one sequence
+        seq_len = offset[i + 1] - offset[i]
+        x = input[offset[i]:offset[i + 1], :]
+        r_pre = np.dot(h0[i], w_rh)  # 1 x P
+        r_pre = act_proj(r_pre)
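+        # the initial recurrent state is the activated projection of h0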
+        c_pre = c0[i]  # 1 x D
+        for j in range(seq_len):
+            # compute one step
+            r_pre, c_pre = _step(x[j], w_r, w_rh, w_c, r_pre, c_pre, act_gate,
+                                 act_cell, act_cand, act_proj)
+            projection.append(r_pre.flatten())
+            cell.append(c_pre.flatten())
+
+    projection = np.array(projection).astype('float64')
+    cell = np.array(cell).astype('float64')
+
+    projection = _reverse(projection, offset) if is_reverse else projection
+    cell = _reverse(cell, offset) if is_reverse else cell
+
+    assert projection.shape == (input.shape[0], w_r.shape[0])  # T x P
+    assert cell.shape == (input.shape[0], input.shape[1] // 4)  # T x D
+    return projection, cell
+
+
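+# A minimal sketch (not part of the operator tests) of how the reference
+# implementation above can be exercised; the tiny sizes below are
+# illustrative assumptions, not values used by the test cases.
+def _lstmp_reference_example():
+    T, D, P = 4, 3, 2  # total time steps, cell size, projection size
+    lod = [[0, T]]  # a single sequence spanning all T steps
+    x = np.random.normal(size=(T, 4 * D)).astype('float64')
+    h0 = np.zeros((1, D)).astype('float64')
+    c0 = np.zeros((1, D)).astype('float64')
+    w_r = np.random.normal(size=(P, 4 * D)).astype('float64')
+    w_rh = np.random.normal(size=(D, P)).astype('float64')
+    r, c = lstmp(x, lod, h0, c0, w_r, w_rh, None, None, False,
+                 ACTIVATION['sigmoid'], ACTIVATION['tanh'],
+                 ACTIVATION['tanh'], ACTIVATION['tanh'])
+    return r.shape, c.shape  # (T, P) and (T, D)
+
+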
+class TestLstmpOp(LstmTest.TestLstmOp):
+    def reset_argument(self):
+        pass
+
+    def setUp(self):
+        self.set_argument()
+        # projection size
+        self.P = 10
+        self.act_proj = self.act_cell
+
+        self.reset_argument()
+        self.op_type = 'lstmp'
+
+        T = self.lod[0][-1]
+        N = len(self.lod[0]) - 1
+
+        x = np.random.normal(size=(T, 4 * self.D)).astype('float64')
+        if self.has_initial_state:
+            h0 = np.random.normal(size=(N, self.D)).astype('float64')
+            c0 = np.random.normal(size=(N, self.D)).astype('float64')
+        else:
+            h0 = np.zeros((N, self.D)).astype('float64')
+            c0 = np.zeros((N, self.D)).astype('float64')
+        w = np.random.normal(size=(self.P, 4 * self.D)).astype('float64')
+        if self.use_peepholes:
+            b = np.random.normal(size=(1, 7 * self.D)).astype('float64')
+        else:
+            b = np.random.normal(size=(1, 4 * self.D)).astype('float64')
+
+        w_b = b[:, 0:4 * self.D]
+        w_c = b[:, 4 * self.D:] if self.use_peepholes else None
+        w_rh = np.random.normal(size=(self.D, self.P)).astype('float64')
+        r, c = lstmp(x, self.lod, h0, c0, w, w_rh, w_b, w_c, self.is_reverse,
+                     ACTIVATION[self.act_gate], ACTIVATION[self.act_cell],
+                     ACTIVATION[self.act_cand], ACTIVATION[self.act_proj])
+
+        self.inputs = {'Input': (x, self.lod), 'Weight': w, 'ProjWeight': w_rh}
+
+        self.inputs['Bias'] = b
+
+        if self.has_initial_state:
+            self.inputs['H0'] = h0
+            self.inputs['C0'] = c0
+
+        self.outputs = {
+            'Projection': (r, self.lod),
+            'Cell': (c, self.lod),
+        }
+        self.attrs = {
+            'use_peepholes': self.use_peepholes,
+            'is_reverse': self.is_reverse,
+            'gate_activation': self.act_gate,
+            'cell_activation': self.act_cell,
+            'candidate_activation': self.act_cand,
+            'proj_activation': self.act_proj
+        }
+
+    def test_check_output(self):
+        self.check_output(atol=1e-8)
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove following lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2)
+
+
+class TestLstmpOpHasInitial(TestLstmpOp):
+    def reset_argument(self):
+        self.has_initial_state = True
+
+    def test_check_grad(self):
+        # TODO(qingqing) remove following lines after the check_grad is refined.
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0', 'C0'],
+            ['Projection'],
+            max_relative_error=1e-2)
+
+    def test_check_grad_ignore_bias(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'ProjWeight', 'Weight'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Bias'))
+
+    def test_check_grad_ignore_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Weight'))
+
+    def test_check_grad_ignore_proj_weight(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('ProjWeight'))
+
+    def test_check_grad_ignore_input(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Weight', 'ProjWeight', 'Bias'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('Input'))
+
+    def test_check_grad_ignore_h0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'C0'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('H0'))
+
+    def test_check_grad_ignore_c0(self):
+        N = len(self.lod[0]) - 1
+        self.outputs['OrderedP0'] = np.zeros((N, self.P)).astype('float64')
+        self.outputs['BatchGate'] = np.zeros((N, 4 * self.D)).astype('float64')
+        self.outputs['BatchHidden'] = np.zeros((N, self.D)).astype('float64')
+        self.outputs['BatchCellPreAct'] = np.zeros(
+            (N, self.D)).astype('float64')
+        self.check_grad(
+            ['Input', 'Weight', 'ProjWeight', 'Bias', 'H0'], ['Projection'],
+            max_relative_error=1e-2,
+            no_grad_set=set('C0'))
+
+
+class TestLstmpOpReverse(TestLstmpOp):
+    def reset_argument(self):
+        self.is_reverse = True
+
+
+class TestLstmpOpNotUsePeepholes(TestLstmpOp):
+    def reset_argument(self):
+        self.use_peepholes = False
+
+
+class TestLstmpOpLinearProjection(TestLstmpOp):
+    def reset_argument(self):
+        self.act_proj = 'identity'
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
new file mode 100644
index 0000000000..694ce20712
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_margin_rank_loss_op.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMarginRankLossOp(OpTest):
+    def setUp(self):
+        self.op_type = "margin_rank_loss"
+        batch_size = 5
+        margin = 0.5
+        # labels_{i} = {-1, 1}
+        label = 2 * np.random.randint(
+            0, 2, size=(batch_size, 1)).astype("float32") - 1
+        x1 = np.random.random((batch_size, 1)).astype("float32")
+        x2 = np.random.random((batch_size, 1)).astype("float32")
+        # loss = max(0, -label * (x1 - x2) + margin)
+        loss = -label * (x1 - x2) + margin
+        loss = np.where(loss > 0, loss, 0)
+        act = np.where(loss > 0, 1., 0.)
+
+        self.attrs = {'margin': margin}
+        self.inputs = {'Label': label, 'X1': x1, 'X2': x2}
+        self.outputs = {'Activated': act, 'Out': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X1", "X2"], "Out")
+
+    def test_check_grad_ignore_x1(self):
+        self.check_grad(["X2"], "Out", no_grad_set=set('X1'))
+
+    def test_check_grad_ignore_x2(self):
+        self.check_grad(["X1"], "Out", no_grad_set=set('X2'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_math_op_patch.py b/python/paddle/v2/fluid/tests/test_math_op_patch.py
new file mode 100644
index 0000000000..2e77639a4c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_math_op_patch.py
@@ -0,0 +1,181 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import decorators
+import paddle.v2.fluid as fluid
+import numpy
+
+
+class TestMathOpPatches(unittest.TestCase):
+    @decorators.prog_scope()
+    def test_add_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = a + 10
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np + 10, b_np))
+
+    @decorators.prog_scope()
+    def test_radd_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = 10 + a
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np + 10, b_np))
+
+    @decorators.prog_scope()
+    def test_sub_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = a - 10
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np - 10, b_np))
+
+    @decorators.prog_scope()
+    def test_rsub_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = 10 - a
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(10 - a_np, b_np))
+
+    @decorators.prog_scope()
+    def test_mul_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = a * 10
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np * 10, b_np))
+
+    @decorators.prog_scope()
+    def test_rmul_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = 10 * a
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(10 * a_np, b_np))
+
+    @decorators.prog_scope()
+    def test_div_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = a / 10
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(a_np / 10, b_np))
+
+    @decorators.prog_scope()
+    def test_rdiv_scalar(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = 10 / a
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2
+
+        b_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np},
+                       fetch_list=[b])
+        self.assertTrue(numpy.allclose(10 / a_np, b_np))
+
+    @decorators.prog_scope()
+    def test_div_two_tensor(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = fluid.layers.data(name="b", shape=[1])
+        c = a / b
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = numpy.random.random(size=[10, 1]).astype('float32') + 1e-2
+        c_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np,
+                             'b': b_np},
+                       fetch_list=[c])
+        self.assertTrue(numpy.allclose(a_np / b_np, c_np))
+
+    @decorators.prog_scope()
+    def test_mul_two_tensor(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = fluid.layers.data(name="b", shape=[1])
+        c = a * b
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = numpy.random.random(size=[10, 1]).astype('float32')
+        c_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np,
+                             'b': b_np},
+                       fetch_list=[c])
+        self.assertTrue(numpy.allclose(a_np * b_np, c_np))
+
+    @decorators.prog_scope()
+    def test_add_two_tensor(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = fluid.layers.data(name="b", shape=[1])
+        c = a + b
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = numpy.random.random(size=[10, 1]).astype('float32')
+        c_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np,
+                             'b': b_np},
+                       fetch_list=[c])
+        self.assertTrue(numpy.allclose(a_np + b_np, c_np))
+
+    @decorators.prog_scope()
+    def test_sub_two_tensor(self):
+        a = fluid.layers.data(name="a", shape=[1])
+        b = fluid.layers.data(name="b", shape=[1])
+        c = a - b
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        a_np = numpy.random.random(size=[10, 1]).astype('float32')
+        b_np = numpy.random.random(size=[10, 1]).astype('float32')
+        c_np = exe.run(fluid.default_main_program(),
+                       feed={"a": a_np,
+                             'b': b_np},
+                       fetch_list=[c])
+        self.assertTrue(numpy.allclose(a_np - b_np, c_np))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_matmul_op.py b/python/paddle/v2/fluid/tests/test_matmul_op.py
new file mode 100644
index 0000000000..5138af38f4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_matmul_op.py
@@ -0,0 +1,169 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
+    BATCH_SIZE = 2
+    M = 3
+    N = 4
+    K = 5
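+    # When a 1-D operand is transposed it acts as a row/column vector, so
+    # the contracted dimension K must collapse to 1 for shapes to match.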
+    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
+        K = 1
+    if dim_X == 1:
+        if transpose_X:
+            shape_X = [M]
+        else:
+            shape_X = [K]
+    if dim_Y == 1:
+        if transpose_Y:
+            shape_Y = [N]
+        else:
+            shape_Y = [K]
+    if dim_X >= 2:
+        if transpose_X:
+            shape_X = [K, M]
+        else:
+            shape_X = [M, K]
+    if dim_X == 3:
+        shape_X = [BATCH_SIZE] + shape_X
+    if dim_Y >= 2:
+        if transpose_Y:
+            shape_Y = [N, K]
+        else:
+            shape_Y = [K, N]
+    if dim_Y == 3:
+        shape_Y = [BATCH_SIZE] + shape_Y
+    return shape_X, shape_Y
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, 1))
+        elif X.ndim == 2:
+            X = X.T
+        else:
+            dim = [i for i in range(len(X.shape))]
+            dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1]
+            X = np.transpose(X, tuple(dim))
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((1, Y.size))
+        else:
+            dim = [i for i in range(len(Y.shape))]
+            dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1]
+            Y = np.transpose(Y, tuple(dim))
+
+    Out = np.matmul(X, Y)
+    if not Out.shape:
+        # We do not support 0-dimensional Tensors (scalars). So where
+        # np.matmul outputs a scalar, we must convert to a Tensor of
+        # shape (1, ) instead.
+        # Everywhere else, we are compatible with np.matmul.
+        Out = np.array([Out], dtype="float32")
+    return Out
+
+
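+# An illustrative sketch (not used by the generated cases): a transposed
+# 1-D X of size M behaves as an (M, 1) column and a transposed 1-D Y of
+# size N as a (1, N) row, so their product is an (M, N) outer product.
+def _reference_matmul_example():
+    X = np.ones(3).astype("float32")  # M = 3
+    Y = np.ones(4).astype("float32")  # N = 4
+    Out = reference_matmul(X, Y, transpose_X=True, transpose_Y=True)
+    return Out.shape  # (3, 4)
+
+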
+class Generator(object):
+    def setUp(self):
+        self.op_type = "matmul"
+        X = np.random.random(self.shape_X).astype("float32")
+        Y = np.random.random(self.shape_Y).astype("float32")
+        Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
+        self.inputs = {'X': X, 'Y': Y}
+        self.attrs = {
+            'transpose_X': self.transpose_X,
+            'transpose_Y': self.transpose_Y
+        }
+        self.outputs = {'Out': Out}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-3)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y'))
+
+
+# Generate test cases for all possibilities
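+# (each combination below becomes a dynamically created subclass of
+# Generator and OpTest, registered in globals() so that the unittest
+# runner discovers it like a hand-written test class)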
+for dim_X in [1, 2, 3]:
+    for dim_Y in [1, 2, 3]:
+        for transpose_X in [False, True]:
+            for transpose_Y in [False, True]:
+                test_name = (
+                    'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+                        dim_X, dim_Y, transpose_X, transpose_Y))
+                shape_X, shape_Y = generate_compatible_shapes(
+                    dim_X, dim_Y, transpose_X, transpose_Y)
+                globals()[test_name] = type(test_name, (Generator, OpTest), {
+                    'shape_X': shape_X,
+                    'shape_Y': shape_Y,
+                    'transpose_X': transpose_X,
+                    'transpose_Y': transpose_Y,
+                })
+
+
+# Test case n-dim
+def generate_compatible_shapes(dim, transpose_X, transpose_Y):
+    M = 2
+    N = 4
+    K = 3
+    shape_X = [2 for _ in range(dim - 2)]
+    shape_Y = [2 for _ in range(dim - 2)]
+
+    if transpose_X:
+        shape_X += [K, M]
+    else:
+        shape_X += [M, K]
+
+    if transpose_Y:
+        shape_Y += [N, K]
+    else:
+        shape_Y += [K, N]
+
+    return shape_X, shape_Y
+
+
+# Test case n-dim
+for dim in [4]:
+    for transpose_X in [False, True]:
+        for transpose_Y in [False, True]:
+            test_name = (
+                'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+                    dim, dim, transpose_X, transpose_Y))
+            shape_X, shape_Y = generate_compatible_shapes(dim, transpose_X,
+                                                          transpose_Y)
+            globals()[test_name] = type(test_name, (Generator, OpTest), {
+                'shape_X': shape_X,
+                'shape_Y': shape_Y,
+                'transpose_X': transpose_X,
+                'transpose_Y': transpose_Y,
+            })
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py
new file mode 100644
index 0000000000..5cd7fbde84
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def maxout_forward_naive(input, groups):
+    s0, s1, s2, s3 = input.shape
+    return np.ndarray([s0, s1 // groups, groups, s2, s3],
+                      buffer=input, dtype=input.dtype).max(axis=2)
+
+
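+# A small sketch (illustrative only): the naive maxout above splits the
+# channel axis into blocks of `groups` channels and keeps the per-block
+# maximum, shrinking C by a factor of `groups`.
+def _maxout_example():
+    x = np.arange(12, dtype="float32").reshape(1, 4, 1, 3)  # NCHW, C = 4
+    y = maxout_forward_naive(x, 2)  # groups = 2 -> output C = 2
+    return y.shape  # (1, 2, 1, 3)
+
+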
+class TestMaxOutOp(OpTest):
+    def setUp(self):
+        self.op_type = "maxout"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.MaxOut_forward_naive(input, self.groups).astype("float32")
+
+        self.inputs = {'X': input}
+        self.attrs = {'groups': self.groups}
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.MaxOut_forward_naive = maxout_forward_naive
+        self.shape = [100, 6, 2, 2]
+        self.groups = 2
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_mean_op.py b/python/paddle/v2/fluid/tests/test_mean_op.py
new file mode 100644
index 0000000000..81e8421635
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mean_op.py
@@ -0,0 +1,34 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMeanOp(OpTest):
+    def setUp(self):
+        self.op_type = "mean"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.outputs = {'Out': np.mean(self.inputs["X"])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py
new file mode 100644
index 0000000000..2e9ed78ffd
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_memory_optimization_transpiler.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.memory_optimization_transpiler import memory_optimize
+
+
+class TestControlFlowGraph(unittest.TestCase):
+    def setUp(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+            opt = optimizer.SGD(learning_rate=0.001)
+            opt = opt.minimize(avg_cost)
+
+        self.program = program
+
+    def test_control_flow_graph(self):
+        print("before optimization")
+        print(str(self.program))
+        result_program = memory_optimize(self.program)
+        print("after optimization")
+        print(str(result_program))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_minus_op.py b/python/paddle/v2/fluid/tests/test_minus_op.py
new file mode 100644
index 0000000000..aee909f56c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_minus_op.py
@@ -0,0 +1,37 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMinusOp(OpTest):
+    def setUp(self):
+        self.op_type = "minus"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((32, 84)).astype("float32")
+        }
+        self.outputs = {'Out': (self.inputs['X'] - self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
new file mode 100644
index 0000000000..3288a0f007
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program, program_guard, default_main_program, default_startup_program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.optimizer import MomentumOptimizer
+import paddle.v2.fluid.core as core
+import paddle.v2 as paddle
+import unittest
+import numpy as np
+
+
+class TestMNISTIfElseOp(unittest.TestCase):
+    def test_raw_api(self):
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')
+
+            label = layers.data(name='y', shape=[1], dtype='int64')
+
+            limit = layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            true_image, false_image = layers.split_lod_tensor(
+                input=image, mask=cond)
+
+            true_out = layers.create_tensor(dtype='float32')
+            true_cond = layers.ConditionalBlock([true_image])
+
+            with true_cond.block():
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=true_out)
+
+            false_out = layers.create_tensor(dtype='float32')
+            false_cond = layers.ConditionalBlock([false_image])
+
+            with false_cond.block():
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                layers.assign(input=prob, output=false_out)
+
+            prob = layers.merge_lod_tensor(
+                in_true=true_out, in_false=false_out, mask=cond, x=image)
+            loss = layers.cross_entropy(input=prob, label=label)
+            avg_loss = layers.mean(x=loss)
+
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)
+
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(startup_prog)
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = np.expand_dims(y_data, axis=1)
+
+                outs = exe.run(prog,
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print(outs[0])
+                if outs[0] < 1.0:
+                    return
+        self.fail('loss never dropped below 1.0')
+
+    def test_ifelse(self):
+        prog = Program()
+        startup_prog = Program()
+        with program_guard(prog, startup_prog):
+            image = layers.data(name='x', shape=[784], dtype='float32')
+
+            label = layers.data(name='y', shape=[1], dtype='int64')
+
+            limit = layers.fill_constant_batch_size_like(
+                input=label, dtype='int64', shape=[1], value=5.0)
+            cond = layers.less_than(x=label, y=limit)
+            ie = layers.IfElse(cond)
+
+            with ie.true_block():
+                true_image = ie.input(image)
+                hidden = layers.fc(input=true_image, size=100, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            with ie.false_block():
+                false_image = ie.input(image)
+                hidden = layers.fc(input=false_image, size=200, act='tanh')
+                prob = layers.fc(input=hidden, size=10, act='softmax')
+                ie.output(prob)
+
+            prob = ie()
+            loss = layers.cross_entropy(input=prob[0], label=label)
+            avg_loss = layers.mean(x=loss)
+
+            optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
+            optimizer.minimize(avg_loss, startup_prog)
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=200)
+
+        place = core.CPUPlace()
+        exe = Executor(place)
+
+        exe.run(startup_prog)
+        PASS_NUM = 100
+        for pass_id in range(PASS_NUM):
+            for data in train_reader():
+                x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape((y_data.shape[0], 1))
+
+                outs = exe.run(prog,
+                               feed={'x': x_data,
+                                     'y': y_data},
+                               fetch_list=[avg_loss])
+                print(outs[0])
+                if outs[0] < 1.0:
+                    return
+        self.fail('loss never dropped below 1.0')
+
+
+if __name__ == '__main__':
+    # temporarily disable the IfElse unittest since it could be buggy.
+    exit(0)
diff --git a/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
new file mode 100644
index 0000000000..eb3873b9ea
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_modified_huber_loss_op.py
@@ -0,0 +1,64 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def modified_huber_loss_forward(val):
+    if val < -1:
+        return -4. * val
+    elif val < 1:
+        return (1. - val) * (1. - val)
+    else:
+        return 0.
+
+
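+# Worked values for the three branches above (a quick reference sketch):
+#   val = -2.0 -> -4 * val = 8.0        (linear branch, val < -1)
+#   val =  0.0 -> (1 - val) ** 2 = 1.0  (quadratic branch, -1 <= val < 1)
+#   val =  2.0 -> 0.0                   (zero branch, val >= 1)
+
+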
+class TestModifiedHuberLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'modified_huber_loss'
+        samples_num = 32
+
+        x_np = np.random.uniform(-2., 2., (samples_num, 1)).astype('float32')
+        y_np = np.random.choice([0, 1], samples_num).reshape(
+            (samples_num, 1)).astype('float32')
+        product_res = x_np * (2. * y_np - 1.)
+        # keep away from the junction of piecewise function
+        for pos, val in np.ndenumerate(product_res):
+            while abs(val - 1.) < 0.05:
+                x_np[pos] = np.random.uniform(-2., 2.)
+                y_np[pos] = np.random.choice([0, 1])
+                product_res[pos] = x_np[pos] * (2 * y_np[pos] - 1)
+                val = product_res[pos]
+
+        self.inputs = {'X': x_np, 'Y': y_np}
+        loss = np.vectorize(modified_huber_loss_forward)(product_res)
+
+        self.outputs = {
+            'IntermediateVal': product_res.astype('float32'),
+            'Out': loss.reshape((samples_num, 1)).astype('float32')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.01)
+
+
+if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
+    exit(0)
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_momentum_op.py b/python/paddle/v2/fluid/tests/test_momentum_op.py
new file mode 100644
index 0000000000..048eaae06b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_momentum_op.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMomentumOp1(OpTest):
+    def setUp(self):
+        self.op_type = "momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        use_nesterov = False
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {'mu': mu}
+
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMomentumOp2(OpTest):
+    '''Test Momentum with Nesterov momentum enabled
+    '''
+
+    def setUp(self):
+        self.op_type = "momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        use_nesterov = True
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {'mu': mu, 'use_nesterov': use_nesterov}
+
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_mul_op.py b/python/paddle/v2/fluid/tests/test_mul_op.py
new file mode 100644
index 0000000000..83715f0e27
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mul_op.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMulOp(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.inputs = {
+            'X': np.random.random((32, 84)).astype("float32"),
+            'Y': np.random.random((84, 100)).astype("float32")
+        }
+        self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+class TestMulOp2(OpTest):
+    def setUp(self):
+        self.op_type = "mul"
+        self.inputs = {
+            'X': np.random.random((15, 4, 12, 10)).astype("float32"),
+            'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
+        }
+        self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
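+        # x_num_col_dims=2 flattens X from (15, 4, 12, 10) to a
+        # (15*4, 12*10) matrix and y_num_col_dims=2 flattens Y from
+        # (4, 30, 8, 2, 9) to (4*30, 8*2*9); the matrix product is then
+        # reshaped back to (15, 4, 8, 2, 9) below.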
+        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
+                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)
+        self.outputs = {'Out': result}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_multihead_attention.py b/python/paddle/v2/fluid/tests/test_multihead_attention.py
new file mode 100644
index 0000000000..a2b300a645
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_multihead_attention.py
@@ -0,0 +1,98 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import numpy as np
+
+
+class TestMultiheadAttention(unittest.TestCase):
+    def gen_random_input(self):
+        """Generate random input data.
+        """
+        # batch_size, max_sequence_length, hidden dimension
+        self.input_shape = (3, 13, 16)
+        self.queries = np.random.random(size=self.input_shape).astype("float32")
+        self.keys = np.random.random(size=self.input_shape).astype("float32")
+
+    def set_program(self):
+        """Build the test program.
+        """
+        queries = fluid.layers.data(
+            name="queries",
+            shape=self.input_shape,
+            dtype="float32",
+            append_batch_size=False)
+        queries.stop_gradient = False
+        keys = fluid.layers.data(
+            name="keys",
+            shape=self.input_shape,
+            dtype="float32",
+            append_batch_size=False)
+        keys.stop_gradient = False
+
+        contexts = fluid.nets.scaled_dot_product_attention(
+            queries=queries,
+            keys=keys,
+            values=keys,
+            num_heads=8,
+            dropout_rate=0.)
+        out = fluid.layers.reduce_sum(contexts, dim=None)
+        fluid.backward.append_backward(loss=out)
+
+        self.fetch_list = [contexts]
+
+    def run_program(self):
+        """Run the test program.
+        """
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+
+            exe.run(fluid.default_startup_program())
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=True)
+            self.op_output = output
+
+    def set_inputs(self, place):
+        """Set the randomly generated data to the test program.
+        """
+        self.inputs = {}
+        queries = fluid.Tensor()
+        queries.set(self.queries, place)
+
+        keys = fluid.Tensor()
+        keys.set(self.keys, place)
+
+        self.inputs["keys"] = keys
+        self.inputs["queries"] = queries
+
+    def test_multihead_attention(self):
+        self.gen_random_input()
+
+        self.set_program()
+        self.run_program()
+
+        # FIXME(caoying) add more meaningful unittests.
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_multiplex_op.py b/python/paddle/v2/fluid/tests/test_multiplex_op.py
new file mode 100644
index 0000000000..a06aef94a5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_multiplex_op.py
@@ -0,0 +1,59 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMultiplexOp(OpTest):
+    def setUp(self):
+        self.op_type = "multiplex"
+        rows = 4
+        index = np.arange(0, rows).astype('int32')
+        np.random.shuffle(index)
+        index = np.reshape(index, (rows, 1))
+        ins1 = np.random.random((rows, 10)).astype("float32")
+        ins2 = np.random.random((rows, 10)).astype("float32")
+        ins3 = np.random.random((rows, 10)).astype("float32")
+        ins4 = np.random.random((rows, 10)).astype("float32")
+        self.inputs = {
+            'Ids': index,
+            'X': [('x1', ins1), ('x2', ins2), ('x3', ins3), ('x4', ins4)]
+        }
+        # multiplex output
+        output = np.zeros_like(ins1)
+        for i in range(0, rows):
+            k = index[i][0]
+            output[i] = self.inputs['X'][k][1][i]
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x1', 'x2', 'x3', 'x4'], 'Out')
+
+    def test_check_grad_ignore_x1(self):
+        self.check_grad(['x2', 'x3', 'x4'], 'Out', no_grad_set=set('x1'))
+
+    def test_check_grad_ignore_x1_x2(self):
+        self.check_grad(['x3', 'x4'], 'Out', no_grad_set=set(['x1', 'x2']))
+
+    def test_check_grad_ignore_x3(self):
+        self.check_grad(['x1', 'x2', 'x4'], 'Out', no_grad_set=set('x3'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py
new file mode 100644
index 0000000000..9a51c1f612
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_nce.py
@@ -0,0 +1,114 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def nce(input, weight, bias, sample_weight, labels, num_classes,
+        num_sample_class):
+    samples = []
+    sample_labels = []
+    batch_size = input.shape[0]
+    num_true_class = labels.shape[1]
+    for i in range(batch_size):
+        w = 1 if sample_weight is None else sample_weight[i]
+        for label in labels[i]:
+            samples.append((i, label, True, w))
+            sample_labels.append(label)
+        for num in range(num_sample_class):
+            samples.append((i, num, False, w))
+            sample_labels.append(num)
+    # forward bias
+    sample_out = np.zeros(len(samples)).astype(np.float32)
+    if bias is not None:
+        for i in range(len(samples)):
+            sample_out[i] = bias[samples[i][1]]
+    # forward weight
+    for i in range(len(samples)):
+        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
+
+    # forward activation
+    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
+    # forward cost
+    out = np.zeros(batch_size).astype(np.float32)
+    b = 1.0 / num_classes * num_sample_class
+    for i in range(len(samples)):
+        o = sample_out[i]
+        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
+        out[samples[i][0]] += cost * samples[i][3]
+    return (out[:, np.newaxis], np.array(sample_out).reshape(
+        batch_size, num_sample_class + num_true_class),
+            np.array(sample_labels).reshape(batch_size,
+                                            num_sample_class + num_true_class))
+
+
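+# A minimal sketch of the reference cost above (shapes are illustrative
+# assumptions): each example contributes -log(o / (o + b)) for its true
+# classes and -log(b / (o + b)) for each sampled negative, where b is the
+# uniform noise probability num_sample_class / num_classes.
+def _nce_reference_example():
+    batch, dim, classes, negs = 2, 3, 4, 2
+    inp = np.random.randn(batch, dim).astype(np.float32)
+    weight = np.random.randn(classes, dim).astype(np.float32)
+    bias = np.random.randn(classes).astype(np.float32)
+    labels = np.random.randint(0, classes, (batch, 1))
+    cost, logits, sample_labels = nce(inp, weight, bias, None, labels,
+                                      classes, negs)
+    return cost.shape  # (batch, 1)
+
+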
+class TestNCE(OpTest):
+    def generate_data(self, dim, batch_size, num_classes, num_true_class,
+                      num_neg_samples):
+        input = np.random.randn(batch_size, dim).astype(np.float32)
+        weight = np.random.randn(num_classes, dim).astype(np.float32)
+        bias = np.random.randn(num_classes).astype(np.float32)
+        sample_weight = np.random.randn(batch_size).astype(np.float32)
+        labels = np.random.randint(0, num_classes, (batch_size, num_true_class))
+        self.attrs = {
+            'num_total_classes': num_classes,
+            'num_neg_samples': num_neg_samples,
+            'custom_neg_classes': range(num_neg_samples)
+        }
+        self.inputs = {
+            'Input': input,
+            'Label': labels,
+            'Weight': weight,
+            'Bias': bias,
+            'SampleWeight': sample_weight
+        }
+
+    def set_data(self):
+        self.generate_data(5, 5, 4, 1, 2)
+
+    def compute(self):
+        out = nce(self.inputs['Input'], self.inputs['Weight'],
+                  self.inputs['Bias'], self.inputs['SampleWeight'],
+                  self.inputs['Label'], self.attrs['num_total_classes'],
+                  self.attrs['num_neg_samples'])
+        self.outputs = {
+            'Cost': out[0],
+            'SampleLogits': out[1],
+            'SampleLabels': out[2]
+        }
+
+    def setUp(self):
+        self.op_type = 'nce'
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
+
+
+class TestNCECase1(TestNCE):
+    def set_data(self):
+        self.generate_data(10, 20, 10, 2, 5)
+
+
+if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/7778
+    exit(0)
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_net.py b/python/paddle/v2/fluid/tests/test_net.py
new file mode 100644
index 0000000000..69d95d4f70
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_net.py
@@ -0,0 +1,53 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+import unittest
+
+
+def fc(X, W, Y):
+    ret_v = core.Net.create()
+
+    ret_v.append_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
+    ret_v.append_op(Operator("sigmoid", X="pre_activation", Out=Y))
+    ret_v.complete_add_op(True)
+    return ret_v
+
+
+class TestNet(unittest.TestCase):
+    def test_net_all(self):
+        net = core.Net.create()
+        op1 = Operator("sum", X=["X", "Y"], Out="Out")
+        net.append_op(op1)
+
+        net2 = core.Net.create()
+        net2.append_op(fc(X="X", W="w", Y="fc.out"))
+        net2.complete_add_op(True)
+        net.append_op(net2)
+        net.complete_add_op(True)
+
+        expected = '''
+Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
+    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
+    Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
+        Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
+            Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
+            Op(sigmoid), inputs:{X[pre_activation]}, outputs:{Out[fc.out]}.
+'''
+        self.assertEqual(expected, "\n" + str(net))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_norm_op.py b/python/paddle/v2/fluid/tests/test_norm_op.py
new file mode 100644
index 0000000000..dd1cd5a31c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_norm_op.py
@@ -0,0 +1,71 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def norm(input, scale, epsilon):
+    s0, s1, s2, s3 = input.shape
+    x_square = input * input
+    for i in xrange(s0):
+        input_batch = input[i:i + 1, :, :, :]
+        input_batch = input_batch.reshape(s1, s2 * s3)
+        x_square_batch = x_square[i:i + 1, :, :, :]
+        x_square_batch = x_square_batch.reshape(s1, s2 * s3)
+        square_colsum = x_square_batch.sum(axis=0) + epsilon
+        tmp = pow(square_colsum, 0.5)
+        tmp = np.reciprocal(tmp)
+        tmp_tile = np.tile(tmp, s1)
+        tmp_tile = tmp_tile.reshape(s1, s2 * s3)
+        scale_tile = np.tile(scale, (1, s2 * s3))
+        scale_tile = scale_tile.reshape(s1, s2 * s3)
+        out_batch = input_batch * tmp_tile * scale_tile
+        out_batch = out_batch.reshape(1, s1, s2, s3)
+        if i == 0:
+            out = out_batch
+        else:
+            out = np.concatenate((out, out_batch), 0)
+    out = out.reshape(s0, s1, s2, s3)
+    return out
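+
+
+# A minimal vectorized sketch of the same computation (illustrative only, not
+# used by the tests below): divide each element by the square root of the sum
+# of channel-wise squares (plus epsilon) and apply the per-channel scale.
+def norm_vectorized(input, scale, epsilon):
+    # Sum of squares across the channel axis, broadcast back over (N, C, H, W).
+    norm = np.sqrt((input * input).sum(axis=1, keepdims=True) + epsilon)
+    return input / norm * scale.reshape(1, -1, 1, 1)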
+
+
+class TestNormOp(OpTest):
+    def setUp(self):
+        self.op_type = "norm"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        scale = np.array([10, 10, 10])
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Scale': scale.astype('float32')
+        }
+        self.attrs = {'epsilon': self.epsilon}
+        output = norm(input, scale, self.epsilon)
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.shape = [2, 3, 2, 2]
+        self.epsilon = 1e-6
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_normalization_wrapper.py b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
new file mode 100644
index 0000000000..6b71f2a923
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_normalization_wrapper.py
@@ -0,0 +1,96 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import numpy as np
+
+
+class TestNormalization(unittest.TestCase):
+    data_desc = {"name": "input", "shape": (2, 3, 7)}
+
+    def gen_random_input(self):
+        """Generate random input data.
+        """
+        self.data = np.random.random(
+            size=self.data_desc["shape"]).astype("float32")
+
+    def set_program(self, axis, epsilon):
+        """Build the test program.
+        """
+        data = fluid.layers.data(
+            name=self.data_desc["name"],
+            shape=self.data_desc["shape"],
+            dtype="float32",
+            append_batch_size=False)
+        data.stop_gradient = False
+        l2_norm = fluid.layers.l2_normalize(x=data, axis=axis, epsilon=epsilon)
+        out = fluid.layers.reduce_sum(l2_norm, dim=None)
+
+        fluid.backward.append_backward(loss=out)
+        self.fetch_list = [l2_norm]
+
+    def run_program(self):
+        """Run the test program.
+        """
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=True)
+            self.op_output = output
+
+    def set_inputs(self, place):
+        """Set the randomly generated data to the test program.
+        """
+        self.inputs = {}
+        tensor = fluid.Tensor()
+        tensor.set(self.data, place)
+        self.inputs[self.data_desc["name"]] = tensor
+
+    def l2_normalize(self, data, axis, epsilon):
+        """ Compute the groundtruth.
+        """
+        output = data * np.reciprocal(
+            np.sum(np.square(data), axis=axis, keepdims=True))
+        return output
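+
+    # Worked example (illustrative): normalizing the row [3., 4.] along
+    # axis=1 gives an L2 norm of 5.0 and a result of [0.6, 0.8].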
+
+    def test_l2_normalize(self):
+        """ Test the python wrapper for l2_normalize.
+        """
+        axis = 1
+        #TODO(caoying) epsilon is not supported due to lack of a maximum_op.
+        epsilon = 1e-6
+
+        self.gen_random_input()
+
+        self.set_program(axis, epsilon)
+        self.run_program()
+
+        expect_output = self.l2_normalize(self.data, axis, epsilon)
+
+        # check output
+        self.assertTrue(np.allclose(self.op_output, expect_output, atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_one_hot_op.py b/python/paddle/v2/fluid/tests/test_one_hot_op.py
new file mode 100644
index 0000000000..e51ea27d14
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_one_hot_op.py
@@ -0,0 +1,110 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.framework as framework
+from paddle.v2.fluid.framework import Program, program_guard
+
+
+class TestOneHotOp(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        depth = 10
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
+        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]),
+                              depth)).astype('float32')
+
+        for i in xrange(np.product(x.shape)):
+            out[i, x[i]] = 1.0
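+        # For example, with depth = 10 an index value of 2 becomes the row
+        # [0., 0., 1., 0., 0., 0., 0., 0., 0., 0.].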
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth, 'dtype': int(core.DataType.FP32)}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestOneHotOp_default_dtype(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        depth = 10
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = [np.random.randint(0, depth - 1) for i in xrange(x_lod[0][-1])]
+        x = np.array(x).astype('int').reshape([x_lod[0][-1], 1])
+
+        out = np.zeros(shape=(np.product(x.shape[:-1]),
+                              depth)).astype('float32')
+
+        for i in xrange(np.product(x.shape)):
+            out[i, x[i]] = 1.0
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'depth': depth}
+        self.outputs = {'Out': (out, x_lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestOneHotOp_exception(OpTest):
+    def setUp(self):
+        self.op_type = 'one_hot'
+        self.depth = 10
+        self.place = core.CPUPlace()
+        self.dimension = 12
+        self.x = core.LoDTensor()
+        x_lod = [[0, 4, 5, 8, 11]]
+        data = [np.random.randint(11, 20) for i in xrange(x_lod[0][-1])]
+        data = np.array(data).astype('int').reshape([x_lod[0][-1], 1])
+        self.x.set(data, self.place)
+        self.x.set_lod(x_lod)
+
+    def test_check_output(self):
+        program = Program()
+        with program_guard(program):
+            x = fluid.layers.data(
+                name='x', shape=[self.dimension], dtype='float32', lod_level=1)
+            block = program.current_block()
+            one_hot_out = block.create_var(
+                name="one_hot_out",
+                type=core.VarDesc.VarType.LOD_TENSOR,
+                dtype='float32')
+            block.append_op(
+                type='one_hot',
+                inputs={'X': x},
+                attrs={'depth': self.depth},
+                outputs={'Out': one_hot_out})
+            exe = fluid.Executor(self.place)
+
+            def run():
+                exe.run(feed={'x': self.x},
+                        fetch_list=[one_hot_out],
+                        return_numpy=False)
+
+            self.assertRaises(core.EnforceNotMet, run)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/v1_api_demo/quick_start/cluster/pserver.sh b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
old mode 100755
new mode 100644
similarity index 61%
rename from v1_api_demo/quick_start/cluster/pserver.sh
rename to python/paddle/v2/fluid/tests/test_op_support_gpu.py
index b187c1d9b9..7de02a8fda
--- a/v1_api_demo/quick_start/cluster/pserver.sh
+++ b/python/paddle/v2/fluid/tests/test_op_support_gpu.py
@@ -1,5 +1,4 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,15 +11,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-set -e
-bin_dir=$(cd `dirname $0`; pwd)
-source "$bin_dir/env.sh"
 
-paddle pserver \
-  --nics=`get_nics` \
-  --port=7164 \
-  --ports_num=1 \
-  --ports_num_for_sparse=1 \
-  --num_gradient_servers=1 \
-  --comment="paddle_pserver" \
-  2>&1 | tee 'pserver.log'
+import unittest
+import paddle.v2.fluid.core as core
+
+
+class TestOpSupportGPU(unittest.TestCase):
+    def test_case(self):
+        self.assertEqual(core.is_compiled_with_cuda(),
+                         core.op_support_gpu("sum"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_operator.py b/python/paddle/v2/fluid/tests/test_operator.py
new file mode 100644
index 0000000000..b82cf580e8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_operator.py
@@ -0,0 +1,218 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.op as op
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+
+
+class TestGetAllProtos(unittest.TestCase):
+    def test_all(self):
+        all_protos = op.get_all_op_protos()
+        self.assertNotEqual(0, len(all_protos))
+
+        for each in all_protos:
+            self.assertTrue(each.IsInitialized())
+
+
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op_proto = framework_pb2.OpProto()
+        op_proto.type = "test"
+        ipt = op_proto.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "not matter"
+
+        ipt = op_proto.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "not matter"
+
+        opt = op_proto.outputs.add()
+        opt.name = "Z"
+        opt.comment = "not matter"
+
+        op_proto.comment = "not matter"
+
+        self.assertTrue(op_proto.IsInitialized())
+
+        method = op.OpDescCreationMethod(op_proto)
+        output = method(X="a", Y="b", Z="c")
+        expected = framework_pb2.OpDesc()
+        expected.type = "test"
+        ipt_0 = expected.inputs.add()
+        ipt_0.parameter = "X"
+        ipt_0.arguments.extend(["a"])
+        ipt_1 = expected.inputs.add()
+        ipt_1.parameter = 'Y'
+        ipt_1.arguments.extend(['b'])
+        opt = expected.outputs.add()
+        opt.parameter = "Z"
+        opt.arguments.extend(["c"])
+
+        self.assertEqual(expected, output)
+
+    def test_multiple_input_plain_output(self):
+        op_proto = framework_pb2.OpProto()
+        op_proto.type = "fc"
+        ipt = op_proto.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.duplicable = True
+
+        ipt = op_proto.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.duplicable = True
+
+        ipt = op_proto.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op_proto.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op_proto.comment = ""
+        self.assertTrue(op_proto.IsInitialized())
+        method = op.OpDescCreationMethod(op_proto)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = framework_pb2.OpDesc()
+        tmp = expected1.inputs.add()
+        tmp.parameter = "X"
+        tmp.arguments.extend(['x'])
+
+        tmp = expected1.inputs.add()
+        tmp.parameter = 'W'
+        tmp.arguments.extend(['w'])
+
+        tmp = expected1.inputs.add()
+        tmp.parameter = 'b'
+        tmp.arguments.extend(['b'])
+
+        tmp = expected1.outputs.add()
+        tmp.parameter = 'Y'
+        tmp.arguments.extend(['y'])
+        expected1.type = 'fc'
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = framework_pb2.OpDesc()
+
+        tmp = expected2.inputs.add()
+        tmp.parameter = "X"
+        tmp.arguments.extend(['x1', 'x2', 'x3'])
+
+        tmp = expected2.inputs.add()
+        tmp.parameter = 'W'
+        tmp.arguments.extend(['w1', 'w2', 'w3'])
+
+        tmp = expected2.inputs.add()
+        tmp.parameter = 'b'
+        tmp.arguments.extend(['b'])
+
+        tmp = expected2.outputs.add()
+        tmp.parameter = 'Y'
+        tmp.arguments.extend(['y'])
+
+        expected2.type = 'fc'
+        self.assertEqual(expected2, generated2)
+
+    def test_attrs(self):
+        op_proto = framework_pb2.OpProto()
+        op_proto.type = "test"
+        ipt = op_proto.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op_proto.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", framework_pb2.INT)
+        __add_attr__("float_attr", framework_pb2.FLOAT)
+        __add_attr__("string_attr", framework_pb2.STRING)
+        __add_attr__("ints_attr", framework_pb2.INTS)
+        __add_attr__("floats_attr", framework_pb2.FLOATS)
+        __add_attr__("strings_attr", framework_pb2.STRINGS)
+
+        op_proto.comment = ""
+        self.assertTrue(op_proto.IsInitialized())
+
+        method = op.OpDescCreationMethod(op_proto)
+
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+
+        expected = framework_pb2.OpDesc()
+        expected.type = "test"
+
+        ipt = expected.inputs.add()
+        ipt.parameter = "X"
+        ipt.arguments.extend(['a'])
+
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = framework_pb2.INT
+        attr.i = 10
+
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = framework_pb2.FLOAT
+        attr.f = 3.2
+
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = framework_pb2.STRING
+        attr.s = "test_str"
+
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = framework_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = framework_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = framework_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+
+        self.assertEqual(expected, generated)
+
+
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = op.Operator("sum", X=["a", "b"], Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
+                         str(add_op))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py
new file mode 100644
index 0000000000..2c8665ffa2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
@@ -0,0 +1,100 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.core as core
+
+from paddle.v2.fluid.framework import Program, default_startup_program
+
+main_program = default_startup_program()
+
+
+class TestOperator(unittest.TestCase):
+    def test_error_type(self):
+        block = main_program.create_block()
+        try:
+            block.append_op()
+            self.fail()
+        except ValueError as v_err:
+            self.assertEqual(
+                v_err.message,
+                "`type` to initilized an Operator can not be None.")
+        try:
+            block.append_op(type="no_such_op")
+            self.fail()
+        except ValueError as a_err:
+            self.assertEqual(a_err.message,
+                             "Operator \"no_such_op\" has not been registered.")
+
+    def test_op_desc_creation(self):
+        program = Program()
+        block = program.current_block()
+        mul_x = block.create_var(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_op = block.append_op(
+            type="mul",
+            inputs={"X": [mul_x],
+                    "Y": mul_y},
+            outputs={"Out": [mul_out]},
+            attrs={"x_num_col_dims": 1})
+
+        self.assertNotEqual(str(mul_op), "")
+        self.assertEqual(mul_op.type, "mul")
+        self.assertEqual(mul_op.input_names, ["X", "Y"])
+        self.assertEqual(mul_op.input("X"), ["mul.x"])
+        self.assertEqual(mul_op.input("Y"), ["mul.y"])
+        self.assertEqual(mul_op.output_names, ["Out"])
+        self.assertEqual(mul_op.output("Out"), ["mul.out"])
+        self.assertEqual(
+            set(mul_op.attr_names), set(["x_num_col_dims", "y_num_col_dims"]))
+        self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
+        self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
+        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
+        self.assertEqual(mul_op.has_attr("y_num_col_dims"), True)
+        self.assertEqual(mul_op.attr_type("y_num_col_dims"), core.AttrType.INT)
+        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
+        self.assertEqual(mul_op.idx, 0)
+        self.assertEqual(mul_out.op, mul_op)
+
+    def test_mult_input(self):
+        program = Program()
+        block = program.current_block()
+        sum_x1 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x1")
+        sum_x2 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x2")
+        sum_x3 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x3")
+        sum_out = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.out")
+        sum_op = block.append_op(
+            type="sum",
+            inputs={"X": [sum_x1, sum_x2, sum_x3]},
+            outputs={"Out": sum_out})
+        self.assertEqual(sum_op.type, "sum")
+        self.assertEqual(sum_op.input_names, ["X"])
+        self.assertEqual(sum_op.input("X"), ["sum.x1", "sum.x2", "sum.x3"])
+        self.assertEqual(sum_op.output_names, ["Out"])
+        self.assertEqual(sum_op.output("Out"), ["sum.out"])
+        self.assertEqual(sum_op.idx, 0)
+        self.assertEqual(sum_out.op, sum_op)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_optimizer.py b/python/paddle/v2/fluid/tests/test_optimizer.py
new file mode 100644
index 0000000000..480ee70915
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_optimizer.py
@@ -0,0 +1,435 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+from paddle.v2.fluid.backward import append_backward
+
+
+class TestOptimizer(unittest.TestCase):
+    def test_sgd_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "sgd")
+
+    def test_sgd_optimizer_with_global_step(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        global_step = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="step")
+        learning_rate = 0.01
+        sgd_optimizer = optimizer.SGDOptimizer(
+            learning_rate=learning_rate, global_step=global_step)
+        opts, _ = sgd_optimizer.minimize(mean_out, init_program)
+        self.assertEqual(len(opts), 2)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "sgd")
+        increment_op = opts[1]
+        self.assertEqual(increment_op.type, "increment")
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 1)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
+class TestMomentumOptimizer(unittest.TestCase):
+    class MockMomentum(optimizer.MomentumOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_velocity_str(self):
+            return self._velocity_acc_str
+
+    def test_vanilla_momentum_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        learning_rate = 0.01
+        momentum_optimizer = self.MockMomentum(
+            learning_rate=learning_rate, momentum=0.2)
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "momentum")
+        self.assertFalse(sgd_op.attr('use_nesterov'))
+
+        # Check accumulators
+        accumulators = momentum_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
+        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
+        self.assertEqual(len(velocity_acc), 1)
+        self.assertTrue(mul_x.name in velocity_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
+    def test_nesterov_momentum_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        momentum_optimizer = self.MockMomentum(
+            learning_rate=learning_rate, momentum=0.2, use_nesterov=True)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
+        opts = momentum_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "momentum")
+        self.assertTrue(sgd_op.attr('use_nesterov'))
+
+        # Check accumulators
+        accumulators = momentum_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
+        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
+        self.assertEqual(len(velocity_acc), 1)
+        self.assertTrue(mul_x.name in velocity_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
+
+class TestAdagradOptimizer(unittest.TestCase):
+    class MockAdagrad(optimizer.AdagradOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+    def test_adagrad_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        adagrad_optimizer = self.MockAdagrad(
+            learning_rate=learning_rate, epsilon=1.0e-6)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
+        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                          init_program)
+        self.assertEqual(len(opts), 1)
+        adagrad_op = opts[0]
+        self.assertEqual(adagrad_op.type, "adagrad")
+
+        # Check accumulators
+        accumulators = adagrad_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
+        moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
+
+class TestAdamOptimizer(unittest.TestCase):
+    class MockAdam(optimizer.AdamOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment1_str(self):
+            return self._moment1_acc_str
+
+        def get_moment2_str(self):
+            return self._moment2_acc_str
+
+    def test_adam_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        adam_optimizer = self.MockAdam(
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
+        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                       init_program)
+        self.assertEqual(len(opts), 3)
+        adam_op = opts[0]
+        self.assertEqual(adam_op.type, "adam")
+
+        # Check accumulators
+        accumulators = adam_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
+        self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
+        moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
+        moment2_acc = accumulators[adam_optimizer.get_moment2_str()]
+        self.assertEqual(len(moment1_acc), 1)
+        self.assertEqual(len(moment2_acc), 1)
+        self.assertTrue(mul_x.name in moment1_acc)
+        self.assertTrue(mul_x.name in moment2_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 5)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
+class TestAdamaxOptimizer(unittest.TestCase):
+    class MockAdamax(optimizer.AdamaxOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+        def get_inf_norm_str(self):
+            return self._inf_norm_acc_str
+
+    def test_adamax_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        adamax_optimizer = self.MockAdamax(
+            learning_rate=learning_rate, beta1=0.9, beta2=0.999)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
+        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+                                                         init_program)
+        self.assertEqual(len(opts), 2)
+        adam_op = opts[0]
+        self.assertEqual(adam_op.type, "adamax")
+
+        # Check accumulators
+        accumulators = adamax_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(adamax_optimizer.get_moment_str() in accumulators)
+        self.assertTrue(adamax_optimizer.get_inf_norm_str() in accumulators)
+        moment_acc = accumulators[adamax_optimizer.get_moment_str()]
+        inf_norm_acc = accumulators[adamax_optimizer.get_inf_norm_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertEqual(len(inf_norm_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+        self.assertTrue(mul_x.name in inf_norm_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 4)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+
+
+class TestDecayedAdagradOptimizer(unittest.TestCase):
+    class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+    def test_decayed_adagrad_optimizer(self):
+        init_program = framework.Program()
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        learning_rate = 0.01
+        decayed_adagrad_optimizer = self.MockDecayedAdagrad(
+            learning_rate=learning_rate, decay=0.95, epsilon=1.0e-6)
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
+        opts = decayed_adagrad_optimizer.create_optimization_pass(
+            params_grads, mul_out, init_program)
+        self.assertEqual(len(opts), 1)
+        decayed_adagrad_op = opts[0]
+        self.assertEqual(decayed_adagrad_op.type, "decayed_adagrad")
+
+        # Check accumulators
+        accumulators = decayed_adagrad_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(
+            decayed_adagrad_optimizer.get_moment_str() in accumulators)
+        moment_acc = accumulators[decayed_adagrad_optimizer.get_moment_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+
+        # Check init_program
+        init_ops = init_program.global_block().ops
+        self.assertEqual(len(init_ops), 2)
+        self.assertEqual(init_ops[0].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
+        self.assertEqual(init_ops[1].type, "fill_constant")
+        self.assertAlmostEqual(init_ops[1].attr('value'), 0.0)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_pad_op.py b/python/paddle/v2/fluid/tests/test_pad_op.py
new file mode 100644
index 0000000000..0bd4800055
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_pad_op.py
@@ -0,0 +1,69 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestPadOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "pad"
+        self.inputs = {'X': np.random.random(self.shape).astype("float32"), }
+        self.attrs = {}
+        self.attrs['paddings'] = np.array(self.paddings).flatten()
+        self.attrs['pad_value'] = self.pad_value
+        self.outputs = {
+            'Out': np.pad(self.inputs['X'],
+                          self.paddings,
+                          mode='constant',
+                          constant_values=self.pad_value)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X'], 'Out', max_relative_error=0.006)
+
+    def initTestCase(self):
+        self.shape = (16, 16)
+        self.paddings = [(0, 1), (2, 3)]
+        self.pad_value = 0.0
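+        # With these paddings, np.pad adds 0 rows before / 1 row after axis 0
+        # and 2 columns before / 3 columns after axis 1, so the 16x16 input
+        # becomes a 17x21 output.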
+
+
+class TestCase1(TestPadOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 4)
+        self.paddings = [(0, 1), (2, 3), (2, 1), (1, 1)]
+        self.pad_value = 0.5
+
+
+class TestCase2(TestPadOp):
+    def initTestCase(self):
+        self.shape = (2, 2, 2)
+        self.paddings = [(0, 0), (0, 0), (1, 2)]
+        self.pad_value = 1.0
+
+
+class TestCase3(TestPadOp):
+    def initTestCase(self):
+        self.shape = (8, )
+        self.paddings = [(0, 1)]
+        self.pad_value = 0.9
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
new file mode 100644
index 0000000000..367cc8b1aa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -0,0 +1,201 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid as fluid
+import numpy
+
+
+class BaseParallelForTest(unittest.TestCase):
+    def run_test(self, callback, feed, fetch):
+        """
+        Run the unittest for ParallelDo.
+        Args:
+            callback(callable): A callable that returns a generator. The
+                generator yields twice: the first yield returns the data
+                layers, and the second yield returns the loss. The (possibly
+                modified) data variables are sent back into the generator at
+                the first yield.
+
+            feed(dict): The executor feeding dictionary.
+            fetch(list|basestring): The list of fetch names.
+
+        Returns:
+            None
+
+        Raises:
+            AssertionError: If the results of plain CPU, ParallelDo on CPU,
+                plain GPU, and ParallelDo on GPU differ from each other.
+
+        """
+        cpu = fluid.CPUPlace()
+        result_cpu = self._run_test_impl_(
+            callback=callback,
+            feed=feed,
+            fetch=fetch,
+            place=cpu,
+            use_parallel=False)
+        result_cpu_parallel = self._run_test_impl_(
+            callback=callback,
+            feed=feed,
+            fetch=fetch,
+            place=cpu,
+            use_parallel=True)
+        if fluid.core.is_compiled_with_cuda():
+            gpu = fluid.CUDAPlace(0)
+            result_gpu = self._run_test_impl_(
+                callback=callback,
+                feed=feed,
+                fetch=fetch,
+                place=gpu,
+                use_parallel=False)
+            result_gpu_parallel = self._run_test_impl_(
+                callback=callback,
+                feed=feed,
+                fetch=fetch,
+                place=gpu,
+                use_parallel=True)
+            self._assert_same_(fetch, result_cpu, result_cpu_parallel,
+                               result_gpu, result_gpu_parallel)
+        else:
+            self._assert_same_(fetch, result_cpu, result_cpu_parallel)
+
+    def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False):
+        """
+        Run a single configuration and return the fetch values.
+        Args:
+            place(Place): The computation place.
+            use_parallel(bool): Whether to wrap the network in ParallelDo.
+
+        Returns:
+            Fetched numpy arrays.
+
+        """
+        if isinstance(fetch, basestring):
+            fetch = [fetch]
+        main = fluid.Program()
+        startup = fluid.Program()
+        # Fix seed
+        main.random_seed = 10
+        startup.random_seed = 10
+
+        with fluid.program_guard(main, startup):
+            generator = callback()
+            # Automatically insert parallel do if use_parallel = True
+            if use_parallel:
+                places = fluid.layers.get_places()
+                pd = fluid.layers.ParallelDo(places)
+                data = next(generator)
+
+                if isinstance(data, fluid.Variable):
+                    data = [data]
+
+                with pd.do():
+                    ins = map(pd.read_input, data)
+                    if len(ins) == 1:
+                        ins = ins[0]
+                    loss = generator.send(ins)  # patch input
+                    pd.write_output(loss)
+
+                loss = pd()
+            else:
+                data = next(generator)
+                loss = generator.send(data)
+            self.assertIsNotNone(loss)
+            avg_loss = fluid.layers.mean(x=loss)
+            fluid.backward.append_backward(loss=avg_loss)
+
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        return exe.run(main, feed=feed, fetch_list=fetch)
+
+    def _assert_same_(self, fetch, *args):
+        """
+        Assert that the return values of `run_test` are the same.
+        Args:
+            fetch: The fetch list, used to build the error message.
+            *args: The fetch result lists of each configuration.
+
+        Returns:
+            None
+
+        Raises:
+            AssertionError
+
+        """
+
+        def _impl_(a, b, fetch_id, item_id):
+            item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU']
+            flag = numpy.allclose(a, b, rtol=0.1)
+            self.assertTrue(flag, "The {0} are different in {1}".format(
+                fetch[fetch_id], item_str[item_id]))
+
+        for i, items in enumerate(zip(*args)):
+            self.assertGreater(len(items), 0)
+            for j in range(1, len(items)):
+                _impl_(items[0], items[j], fetch_id=i, item_id=j)
+
+
+class ParallelOpTest(BaseParallelForTest):
+    @staticmethod
+    def __network__():
+        x = fluid.layers.data(shape=[784], dtype='float32', name='img')
+        x = yield x
+        hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+        loss = fluid.layers.mean(x=hidden)
+        yield loss
+
+    def test_simple_fc(self):
+        self.run_test(
+            callback=self.__network__,
+            feed={
+                'img': numpy.random.random(size=(51, 784)).astype('float32')
+            },
+            fetch=['fc1.w@GRAD'])
+
+    def test_fc_with_tiny_data(self):
+        self.run_test(
+            callback=self.__network__,
+            feed={'img': numpy.random.random(size=(1, 784)).astype('float32')},
+            fetch=['fc1.w@GRAD'])
+
+
+class ParallelOpTestMultipleInput(BaseParallelForTest):
+    @staticmethod
+    def __network__():
+        x = fluid.layers.data(
+            shape=[784], dtype='float32', name='img1', stop_gradient=False)
+        y = fluid.layers.data(
+            shape=[784], dtype='float32', name='img2', stop_gradient=False)
+        yield [x, y]
+        x = x + y
+        hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+        hidden2 = fluid.layers.fc(input=hidden1, size=200, param_attr='fc2.w')
+        hidden3 = fluid.layers.fc(input=hidden2, size=200, param_attr='fc3.w')
+        loss = fluid.layers.mean(x=hidden3)
+        yield loss
+
+    def test_simple_fc(self):
+        self.run_test(
+            callback=self.__network__,
+            feed={
+                'img1': numpy.random.random(size=(51, 784)).astype('float32'),
+                'img2': numpy.random.random(size=(51, 784)).astype('float32')
+            },
+            fetch=['fc1.w@GRAD', 'fc2.w@GRAD', 'fc3.w@GRAD'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py
new file mode 100644
index 0000000000..dfecdf939b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from paddle.v2.fluid.framework import default_main_program
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.io as io
+from paddle.v2.fluid.initializer import ConstantInitializer
+import numpy as np
+
+main_program = default_main_program()
+
+
+class TestParameter(unittest.TestCase):
+    def test_param(self):
+        shape = [784, 100]
+        val = 1.0625
+        b = main_program.global_block()
+        param = b.create_parameter(
+            name='fc.w',
+            shape=shape,
+            dtype='float32',
+            initializer=ConstantInitializer(val))
+        self.assertIsNotNone(param)
+        self.assertEqual('fc.w', param.name)
+        self.assertEqual((784, 100), param.shape)
+        self.assertEqual(core.DataType.FP32, param.dtype)
+        self.assertEqual(0, param.block.idx)
+        exe = Executor(core.CPUPlace())
+        p = exe.run(main_program, fetch_list=[param])[0]
+        self.assertTrue(np.allclose(p, np.ones(shape) * val))
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
+        self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_pool2d_op.py b/python/paddle/v2/fluid/tests/test_pool2d_op.py
new file mode 100644
index 0000000000..2f43be8a0f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_pool2d_op.py
@@ -0,0 +1,221 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+
+
+def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+
+
+class TestPool2d_Op(OpTest):
+    def setUp(self):
+        self.use_cudnn = False
+        self.init_test_case()
+        self.init_global_pool()
+        self.init_op_type()
+        self.init_pool_type()
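+        # Global pooling covers the whole spatial extent, so paddings are
+        # zeroed to keep the naive reference consistent with the operator.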
+        if self.global_pool:
+            self.paddings = [0 for _ in range(len(self.paddings))]
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings,
+                                           self.global_pool).astype("float32")
+        self.inputs = {'X': input}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad(self):
+        if self.use_cudnn and self.pool_type != "max":
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, set(['X']), 'Out', max_relative_error=0.07)
+        elif self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def init_test_case(self):
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
+
+
+class TestCase1(TestPool2d_Op):
+    def init_test_case(self):
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase2(TestPool2d_Op):
+    def init_test_case(self):
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase3(TestPool2d_Op):
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+class TestCase4(TestCase1):
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+class TestCase5(TestCase2):
+    def init_op_type(self):
+        self.op_type = "pool2d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+
+
+#--------------------test pool2d with cuDNN--------------------
+class TestCUDNNCase1(TestPool2d_Op):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+class TestCUDNNCase2(TestCase1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+class TestCUDNNCase3(TestCase2):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+class TestCUDNNCase4(TestCase3):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+class TestCUDNNCase5(TestCase4):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+class TestCUDNNCase6(TestCase5):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool2d"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_pool3d_op.py b/python/paddle/v2/fluid/tests/test_pool3d_op.py
new file mode 100644
index 0000000000..c93711e051
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_pool3d_op.py
@@ -0,0 +1,231 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+import paddle.v2.fluid.core as core
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
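+    # ksize/strides/paddings are ordered [depth, height, width]; each output
+    # dimension is (in - ksize + 2 * padding) / stride + 1.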
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+    return out
+
+
+def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
+                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+    return out
+
+
+class TestPool3d_Op(OpTest):
+    def setUp(self):
+        self.use_cudnn = False
+        self.init_test_case()
+        self.init_global_pool()
+        self.init_op_type()
+        self.init_pool_type()
+
+        if self.global_pool:
+            self.paddings = [0 for _ in range(len(self.paddings))]
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings,
+                                           self.global_pool).astype("float32")
+        self.inputs = {'X': input}
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+            'use_cudnn': self.use_cudnn,
+            'data_format': 'AnyLayout'  # TODO(dzhwinter): should be fixed later
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        if self.use_cudnn:
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=1e-5)
+        else:
+            self.check_output()
+
+    def test_check_grad(self):
+        if self.use_cudnn and self.pool_type != "max":
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place, set(['X']), 'Out', max_relative_error=0.07)
+        elif self.pool_type != "max":
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def init_test_case(self):
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = True
+
+
+class TestCase1(TestPool3d_Op):
+    def init_test_case(self):
+        self.op_type = "pool3d"
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase2(TestPool3d_Op):
+    def init_test_case(self):
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+
+    def init_global_pool(self):
+        self.global_pool = False
+
+
+class TestCase3(TestPool3d_Op):
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+
+
+class TestCase4(TestCase1):
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+
+
+class TestCase5(TestCase2):
+    def init_op_type(self):
+        self.op_type = "pool3d"
+
+    def init_pool_type(self):
+        self.pool_type = "max"
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+
+
+#--------------------test pool3d with cuDNN--------------------
+class TestCUDNNCase1(TestPool3d_Op):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+class TestCUDNNCase2(TestCase1):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+class TestCUDNNCase3(TestCase2):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+class TestCUDNNCase4(TestCase3):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+class TestCUDNNCase5(TestCase4):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+class TestCUDNNCase6(TestCase5):
+    def init_op_type(self):
+        self.use_cudnn = True
+        self.op_type = "pool3d"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_pool_max_op.py b/python/paddle/v2/fluid/tests/test_pool_max_op.py
new file mode 100644
index 0000000000..330ad24bd4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_pool_max_op.py
@@ -0,0 +1,192 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+
+    N, C, D, H, W = x.shape
+    if global_pool:
+        ksize = [D, H, W]
+        paddings = [0, 0, 0]
+
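+    # Besides the pooled values, this reference also records the argmax
+    # positions ("mask") that the operator returns alongside the output.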
+    D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    mask = np.zeros((N, C, D_out, H_out, W_out))
+    for k in xrange(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in xrange(H_out):
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in xrange(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+
+                for n in xrange(N):
+                    for c in xrange(C):
+                        arr = x_masked[n, c, :, :, :]
+                        index = np.where(arr == np.max(arr))
+                        sub_deep = index[0][0]
+                        sub_row = index[1][0]
+                        sub_col = index[2][0]
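+                        # Flat row-major index of the argmax inside the full
+                        # input volume: d * H * W + h * W + w.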
+                        index = ((d_start + sub_deep) * H +
+                                 (h_start + sub_row)) * W + w_start + sub_col
+                        mask[n, c, k, i, j] = index
+
+    return out, mask
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False):
+
+    N, C, H, W = x.shape
+    if global_pool:
+        ksize = [H, W]
+        paddings = [0, 0]
+
+    H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    mask = np.zeros((N, C, H_out, W_out))
+    for i in xrange(H_out):
+        for j in xrange(W_out):
+            r_start = np.max((i * strides[0] - paddings[0], 0))
+            r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+
+            for n in xrange(N):
+                for c in xrange(C):
+                    arr = x_masked[n, c, :, :]
+                    index = np.where(arr == np.max(arr))
+                    sub_row = index[0][0]
+                    sub_col = index[1][0]
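+                    # Flat row-major index of the argmax inside the input
+                    # plane: r * W + c.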
+                    index = (r_start + sub_row) * W + c_start + sub_col
+                    mask[n, c, i, j] = index
+
+    return out, mask
+
+
+class TestMaxPoolWithIndex_Op(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.init_global()
+
+        input = np.random.random(self.shape).astype("float32")
+        output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
+                                               self.paddings, self.global_pool)
+        output = output.astype("float32")
+        mask = mask.astype("int32")
+
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'global_pooling': self.global_pool,
+        }
+
+        self.inputs = {'X': input}
+        self.outputs = {'Out': output, "Mask": mask}
+
+    def test_check_output(self):
+        self.check_output()
+
+    # def test_check_grad(self):
+    #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)
+
+    def init_test_case(self):
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+    def init_global(self):
+        self.global_pool = False
+
+
+class TestCase1(TestMaxPoolWithIndex_Op):
+    def init_global(self):
+        self.global_pool = True
+
+
+class TestCase2(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [2, 2, 2]
+        self.paddings = [0, 0, 0]
+
+    def init_global(self):
+        self.global_pool = True
+
+
+class TestCase3(TestCase2):
+    def init_global(self):
+        self.global_pool = False
+
+
+#----------------max_pool2d_with_index----------------
+class TestCase4(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+    def init_global(self):
+        self.global_pool = True
+
+
+class TestCase5(TestCase4):
+    def init_global(self):
+        self.global_pool = False
+
+
+class TestCase6(TestMaxPoolWithIndex_Op):
+    def init_test_case(self):
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+    def init_global(self):
+        self.global_pool = True
+
+
+class TestCase7(TestCase6):
+    def init_global(self):
+        self.global_pool = False
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
new file mode 100644
index 0000000000..9b5e544655
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_positive_negative_pair_op.py
@@ -0,0 +1,120 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import itertools
+import numpy as np
+from op_test import OpTest
+
+
+def py_pnpair_op(score, label, query, column=-1, weight=None):
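+    # Within each query, a pair with distinct labels counts as positive when
+    # the score order matches the label order, negative when it is reversed,
+    # and neutral on a score tie; each pair is weighted by its mean weight.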
+    # group by query id
+    predictions = {}
+    batch_size = label.shape[0]
+    if weight is None:
+        weight = np.ones(shape=(batch_size, 1)).astype('float32')
+    for s, l, q, w in zip(score, label, query, weight):
+        s, l, q, w = s[column], l[0], q[0], w[0]
+        if q not in predictions:
+            predictions[q] = []
+        predictions[q].append((s, l, w))
+
+    # accumulate statistics
+    pos, neg, neu = 0, 0, 0
+    for _, ranks in predictions.items():
+        for e1, e2 in itertools.combinations(ranks, 2):
+            s1, s2, l1, l2, w1, w2 = e1[0], e2[0], e1[1], e2[1], e1[2], e2[2]
+            w = (w1 + w2) * 0.5
+            if l1 == l2:
+                continue
+            if s1 == s2:
+                neu += w
+            elif (s1 - s2) * (l1 - l2) > 0:
+                pos += w
+            else:
+                neg += w
+
+    return np.array(pos).astype('float32'), np.array(neg).astype(
+        'float32'), np.array(neu).astype('float32')
+
+
+class TestPositiveNegativePairOp(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        score = np.random.normal(size=(batch_size, 1)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+
+        pos, neg, neu = py_pnpair_op(score, label, query)
+        self.inputs = {'Score': score, 'Label': label, 'QueryID': query}
+        self.attrs = {'column': -1}
+        self.outputs = {
+            'PositivePair': pos,
+            'NegativePair': neg,
+            'NeutralPair': neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestPositiveNegativePairOpAccumulateWeight(OpTest):
+    def setUp(self):
+        self.op_type = 'positive_negative_pair'
+        batch_size = 20
+        max_query_id = 5
+        max_random_num = 2 << 15
+        score_dim = 2
+        score = np.random.normal(size=(batch_size, score_dim)).astype('float32')
+        label = np.random.normal(size=(batch_size, 1)).astype('float32')
+        weight = np.random.normal(size=(batch_size, 1)).astype('float32')
+        query = np.array(
+            [np.random.randint(max_query_id) for i in range(batch_size)])
+        query = np.reshape(query, newshape=(batch_size, 1)).astype('int64')
+        acc_pos = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neg = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        acc_neu = np.reshape(
+            np.random.randint(max_random_num), newshape=(1)).astype('float32')
+        column = np.random.randint(score_dim)
+
+        pos, neg, neu = py_pnpair_op(
+            score, label, query, column=column, weight=weight)
+        self.inputs = {
+            'Score': score,
+            'Label': label,
+            'QueryID': query,
+            'AccumulatePositivePair': acc_pos,
+            'AccumulateNegativePair': acc_neg,
+            'AccumulateNeutralPair': acc_neu,
+            'Weight': weight
+        }
+        self.attrs = {'column': column}
+        self.outputs = {
+            'PositivePair': pos + acc_pos,
+            'NegativePair': neg + acc_neg,
+            'NeutralPair': neu + acc_neu
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_precision_recall_op.py b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
new file mode 100644
index 0000000000..188b7af559
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_precision_recall_op.py
@@ -0,0 +1,187 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def calc_precision(tp_count, fp_count):
+    if tp_count > 0.0 or fp_count > 0.0:
+        return tp_count / (tp_count + fp_count)
+    return 1.0
+
+
+def calc_recall(tp_count, fn_count):
+    if tp_count > 0.0 or fn_count > 0.0:
+        return tp_count / (tp_count + fn_count)
+    return 1.0
+
+
+def calc_f1_score(precision, recall):
+    if precision > 0.0 or recall > 0.0:
+        return 2 * precision * recall / (precision + recall)
+    return 0.0
+
+
+def get_states(idxs, labels, cls_num, weights=None):
+    ins_num = idxs.shape[0]
+    # TP FP TN FN
+    states = np.zeros((cls_num, 4)).astype('float32')
+    for i in xrange(ins_num):
+        w = weights[i] if weights is not None else 1.0
+        idx = idxs[i][0]
+        label = labels[i][0]
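+        # A correct prediction adds TP for the class and TN for all others;
+        # a wrong one adds FP for the predicted class, FN for the true class,
+        # and TN for the rest.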
+        if idx == label:
+            states[idx][0] += w
+            for j in xrange(cls_num):
+                states[j][2] += w
+            states[idx][2] -= w
+        else:
+            states[label][3] += w
+            states[idx][1] += w
+            for j in xrange(cls_num):
+                states[j][2] += w
+            states[label][2] -= w
+            states[idx][2] -= w
+    return states
+
+
+def compute_metrics(states, cls_num):
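+    # Macro metrics average the per-class precision/recall before computing
+    # F1; micro metrics pool the TP/FP/FN counts across classes first.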
+    total_tp_count = 0.0
+    total_fp_count = 0.0
+    total_fn_count = 0.0
+    macro_avg_precision = 0.0
+    macro_avg_recall = 0.0
+    for i in xrange(cls_num):
+        total_tp_count += states[i][0]
+        total_fp_count += states[i][1]
+        total_fn_count += states[i][3]
+        macro_avg_precision += calc_precision(states[i][0], states[i][1])
+        macro_avg_recall += calc_recall(states[i][0], states[i][3])
+    metrics = []
+    macro_avg_precision /= cls_num
+    macro_avg_recall /= cls_num
+    metrics.append(macro_avg_precision)
+    metrics.append(macro_avg_recall)
+    metrics.append(calc_f1_score(macro_avg_precision, macro_avg_recall))
+    micro_avg_precision = calc_precision(total_tp_count, total_fp_count)
+    metrics.append(micro_avg_precision)
+    micro_avg_recall = calc_recall(total_tp_count, total_fn_count)
+    metrics.append(micro_avg_recall)
+    metrics.append(calc_f1_score(micro_avg_precision, micro_avg_recall))
+    return np.array(metrics).astype('float32')
+
+
+class TestPrecisionRecallOp_0(OpTest):
+    def setUp(self):
+        self.op_type = "precision_recall"
+        ins_num = 64
+        cls_num = 10
+        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
+        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+        states = get_states(idxs, labels, cls_num)
+        metrics = compute_metrics(states, cls_num)
+
+        self.attrs = {'class_number': cls_num}
+
+        self.inputs = {'MaxProbs': max_probs, 'Indices': idxs, 'Labels': labels}
+
+        self.outputs = {
+            'BatchMetrics': metrics,
+            'AccumMetrics': metrics,
+            'AccumStatesInfo': states
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestPrecisionRecallOp_1(OpTest):
+    def setUp(self):
+        self.op_type = "precision_recall"
+        ins_num = 64
+        cls_num = 10
+        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
+        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+        weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
+        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+
+        states = get_states(idxs, labels, cls_num, weights)
+        metrics = compute_metrics(states, cls_num)
+
+        self.attrs = {'class_number': cls_num}
+
+        self.inputs = {
+            'MaxProbs': max_probs,
+            'Indices': idxs,
+            'Labels': labels,
+            'Weights': weights
+        }
+
+        self.outputs = {
+            'BatchMetrics': metrics,
+            'AccumMetrics': metrics,
+            'AccumStatesInfo': states
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestPrecisionRecallOp_2(OpTest):
+    def setUp(self):
+        self.op_type = "precision_recall"
+        ins_num = 64
+        cls_num = 10
+        max_probs = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
+        idxs = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+        weights = np.random.uniform(0, 1.0, (ins_num, 1)).astype('float32')
+        labels = np.random.choice(xrange(cls_num), ins_num).reshape(
+            (ins_num, 1)).astype('int32')
+        states = np.random.randint(0, 30, (cls_num, 4)).astype('float32')
+
+        accum_states = get_states(idxs, labels, cls_num, weights)
+        batch_metrics = compute_metrics(accum_states, cls_num)
+        accum_states += states
+        accum_metrics = compute_metrics(accum_states, cls_num)
+
+        self.attrs = {'class_number': cls_num}
+
+        self.inputs = {
+            'MaxProbs': max_probs,
+            'Indices': idxs,
+            'Labels': labels,
+            'Weights': weights,
+            'StatesInfo': states
+        }
+
+        self.outputs = {
+            'BatchMetrics': batch_metrics,
+            'AccumMetrics': accum_metrics,
+            'AccumStatesInfo': accum_states
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_prelu_op.py b/python/paddle/v2/fluid/tests/test_prelu_op.py
new file mode 100644
index 0000000000..848036234c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_prelu_op.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class PReluTest(OpTest):
+    def setUp(self):
+        self.op_type = "prelu"
+        x_np = np.random.normal(size=(10, 10)).astype("float32")
+
+        for pos, val in np.ndenumerate(x_np):
+            # PReLU is not differentiable at zero, so re-sample any value
+            # that falls too close to it.
+            while abs(val) < 1e-3:
+                x_np[pos] = np.random.normal()
+                val = x_np[pos]
+
+        x_np_sign = np.sign(x_np)
+        x_np = x_np_sign * np.maximum(x_np, .005)
+        alpha_np = np.array([.1], dtype="float32")
+        self.inputs = {'X': x_np, 'Alpha': alpha_np}
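+        # PReLU: out = max(x, 0) + alpha * min(x, 0), with a single shared
+        # alpha here.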
+        out_np = np.maximum(self.inputs['X'], 0.)
+        out_np = out_np + np.minimum(self.inputs['X'],
+                                     0.) * self.inputs['Alpha']
+        assert out_np is not self.inputs['X']
+        self.outputs = {'Out': out_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_print_op.py b/python/paddle/v2/fluid/tests/test_print_op.py
new file mode 100644
index 0000000000..3177700dfa
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_print_op.py
@@ -0,0 +1,69 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward
+from paddle.v2.fluid.framework import switch_main_program
+from paddle.v2.fluid.framework import Program
+import numpy as np
+
+
+class TestPrintOpCPU(unittest.TestCase):
+    def setUp(self):
+        self.place = core.CPUPlace()
+        self.x_tensor = core.LoDTensor()
+        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_lod([[0, 1, 1]])
+
+    def build_network(self, only_forward, **kargs):
+        x = layers.data('x', shape=[3], dtype='float32', lod_level=1)
+        x.stop_gradient = False
+        printed = layers.Print(input=x, **kargs)
+        if only_forward: return printed
+        loss = layers.mean(x=printed)
+        append_backward(loss=loss)
+        return loss
+
+    def test_forward(self):
+        switch_main_program(Program())
+        printed = self.build_network(True, print_phase='forward')
+        exe = Executor(self.place)
+        outs = exe.run(feed={'x': self.x_tensor},
+                       fetch_list=[printed],
+                       return_numpy=False)
+
+    def test_backward(self):
+        switch_main_program(Program())
+        loss = self.build_network(False, print_phase='backward')
+        exe = Executor(self.place)
+        outs = exe.run(feed={'x': self.x_tensor},
+                       fetch_list=[loss],
+                       return_numpy=False)
+
+
+class TestPrintOpGPU(TestPrintOpCPU):
+    def setUp(self):
+        self.place = core.CUDAPlace(0)
+        self.x_tensor = core.LoDTensor()
+        tensor_np = np.random.random(size=(2, 3)).astype('float32')
+        self.x_tensor.set(tensor_np, self.place)
+        self.x_tensor.set_lod([[0, 1, 1]])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_prior_box_op.py b/python/paddle/v2/fluid/tests/test_prior_box_op.py
new file mode 100644
index 0000000000..ca8d2bca74
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_prior_box_op.py
@@ -0,0 +1,148 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestPriorBoxOp(OpTest):
+    def set_data(self):
+        self.init_test_params()
+        self.init_test_input()
+        self.init_test_output()
+        self.inputs = {'Input': self.input, 'Image': self.image}
+
+        self.attrs = {
+            'min_sizes': self.min_sizes,
+            'max_sizes': self.max_sizes,
+            'aspect_ratios': self.aspect_ratios,
+            'variances': self.variances,
+            'flip': self.flip,
+            'clip': self.clip,
+            'step_w': self.step_w,
+            'step_h': self.step_h,
+            'offset': self.offset
+        }
+
+        self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
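+        # The boxes depend only on shapes and attributes, not on the input
+        # values, so there is no gradient to check.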
+        return
+
+    def setUp(self):
+        self.op_type = "prior_box"
+        self.set_data()
+
+    def init_test_params(self):
+        self.layer_w = 4
+        self.layer_h = 4
+
+        self.image_w = 20
+        self.image_h = 20
+
+        self.step_w = float(self.image_w) / float(self.layer_w)
+        self.step_h = float(self.image_h) / float(self.layer_h)
+
+        self.input_channels = 2
+        self.image_channels = 3
+        self.batch_size = 10
+
+        self.min_sizes = [2, 4]
+        self.min_sizes = np.array(self.min_sizes).astype('int64')
+        self.max_sizes = [5, 10]
+        self.max_sizes = np.array(self.max_sizes).astype('int64')
+        self.aspect_ratios = [2.0, 3.0]
+        self.flip = True
+        self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0]
+        self.aspect_ratios = np.array(
+            self.aspect_ratios, dtype=np.float).flatten()
+        self.variances = [0.1, 0.1, 0.2, 0.2]
+        self.variances = np.array(self.variances, dtype=np.float).flatten()
+
+        self.clip = True
+
+        self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes)
+        if len(self.max_sizes) > 1:
+            self.num_priors += len(self.max_sizes)
+        self.offset = 0.5
+
+    def init_test_input(self):
+        self.image = np.random.random(
+            (self.batch_size, self.image_channels, self.image_w,
+             self.image_h)).astype('float32')
+
+        self.input = np.random.random(
+            (self.batch_size, self.input_channels, self.layer_w,
+             self.layer_h)).astype('float32')
+
+    def init_test_output(self):
+        out_dim = (self.layer_h, self.layer_w, self.num_priors, 4)
+        out_boxes = np.zeros(out_dim).astype('float32')
+        out_var = np.zeros(out_dim).astype('float32')
+
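+        # Each feature-map cell emits num_priors boxes centered at the cell
+        # (plus offset) scaled by the step size; corner coordinates are
+        # normalized by the image size.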
+        idx = 0
+        for h in range(self.layer_h):
+            for w in range(self.layer_w):
+                c_x = (w + self.offset) * self.step_w
+                c_y = (h + self.offset) * self.step_h
+                idx = 0
+                for s in range(len(self.min_sizes)):
+                    min_size = self.min_sizes[s]
+                    c_w = c_h = min_size / 2.
+                    out_boxes[h, w, idx, :] = [
+                        (c_x - c_w) / self.image_w, (c_y - c_h) / self.image_h,
+                        (c_x + c_w) / self.image_w, (c_y + c_h) / self.image_h
+                    ]
+                    idx += 1
+
+                    if len(self.max_sizes) > 0:
+                        max_size = self.max_sizes[s]
+                        # second prior: aspect_ratio = 1,
+                        c_w = c_h = math.sqrt(min_size * max_size) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
+
+                    # rest of priors
+                    for r in range(len(self.real_aspect_ratios)):
+                        ar = self.real_aspect_ratios[r]
+                        if math.fabs(ar - 1.) < 1e-6:
+                            continue
+                        c_w = min_size * math.sqrt(ar) / 2
+                        c_h = (min_size / math.sqrt(ar)) / 2
+                        out_boxes[h, w, idx, :] = [(c_x - c_w) / self.image_w,
+                                                   (c_y - c_h) / self.image_h,
+                                                   (c_x + c_w) / self.image_w,
+                                                   (c_y + c_h) / self.image_h]
+                        idx += 1
+        # Clip the prior boxes' coordinates so that they lie within [0, 1].
+        if self.clip:
+            out_boxes = np.clip(out_boxes, 0.0, 1.0)
+        # set the variance.
+        out_var = np.tile(self.variances, (self.layer_h, self.layer_w,
+                                           self.num_priors, 1))
+        self.out_boxes = out_boxes.astype('float32')
+        self.out_var = out_var.astype('float32')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py
new file mode 100644
index 0000000000..09b2d08401
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import os
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid.core as core
+
+
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compiled_with_cuda():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+
+        place = fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+
+        output_file = 'cuda_profiler.txt'
+        with profiler.cuda_profiler(output_file, 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+        os.remove(output_file)
+
+    def net_profiler(self, state):
+        if state == 'GPU' and not core.is_compiled_with_cuda():
+            return
+        startup_program = fluid.Program()
+        main_program = fluid.Program()
+
+        with fluid.program_guard(main_program, startup_program):
+            image = fluid.layers.data(name='x', shape=[784], dtype='float32')
+            hidden1 = fluid.layers.fc(input=image, size=128, act='relu')
+            hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu')
+            predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
+            label = fluid.layers.data(name='y', shape=[1], dtype='int64')
+            cost = fluid.layers.cross_entropy(input=predict, label=label)
+            avg_cost = fluid.layers.mean(x=cost)
+            accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+        optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
+        opts = optimizer.minimize(avg_cost, startup_program=startup_program)
+
+        place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(startup_program)
+
+        accuracy.reset(exe)
+        with profiler.profiler(state, 'total') as prof:
+            for iter in range(10):
+                if iter == 2:
+                    profiler.reset_profiler()
+                x = np.random.random((32, 784)).astype("float32")
+                y = np.random.randint(0, 10, (32, 1)).astype("int64")
+
+                outs = exe.run(main_program,
+                               feed={'x': x,
+                                     'y': y},
+                               fetch_list=[avg_cost] + accuracy.metrics)
+                acc = np.array(outs[1])
+                pass_acc = accuracy.eval(exe)
+
+    def test_cpu_profiler(self):
+        self.net_profiler('CPU')
+
+    def test_cuda_profiler(self):
+        self.net_profiler('GPU')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py
new file mode 100644
index 0000000000..9967da1593
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_program.py
@@ -0,0 +1,154 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+
+from paddle.v2.fluid.framework import Program, default_main_program, program_guard, grad_var_name
+import paddle.v2.fluid.layers as layers
+
+main_program = default_main_program()
+
+
+class TestProgram(unittest.TestCase):
+    def test_program(self):
+        b = main_program.current_block()
+        self.assertEqual(-1, b.parent_idx)
+        self.assertEqual(0, b.idx)
+
+        b = main_program.create_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+        b = main_program.create_block()
+        self.assertEqual(2, b.idx)
+        self.assertEqual(1, b.parent_idx)
+
+        main_program.rollback()
+
+        b = main_program.current_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+        b = main_program.create_block()
+        self.assertEqual(3, b.idx)
+        self.assertEqual(1, b.parent_idx)
+
+        main_program.rollback()
+        b = main_program.current_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+    def test_program_clone(self):
+        prog = Program()
+
+        x = prog.global_block().create_var(
+            name='X', shape=[1000, 784], dtype='float32')
+
+        y = prog.global_block().create_var(
+            name='Y', shape=[784, 100], dtype='float32')
+        out = prog.global_block().create_var(name='Out', dtype='float32')
+        prog.global_block().append_op(
+            type="mul", inputs={'X': [x],
+                                'Y': [y]}, outputs={'Out': [out]})
+
+        # FIXME(yuyang18): We manually compare the output strings, since the
+        # order of the variables could be changed.
+        print(prog)
+        print(prog.clone())
+
+    def test_parse_program_from_string(self):
+        prog = Program()
+
+        x = prog.global_block().create_var(
+            name='X', shape=[1000, 784], dtype='float32')
+
+        y = prog.global_block().create_var(
+            name='Y', shape=[784, 100], dtype='float32')
+        out = prog.global_block().create_var(name='Out', dtype='float32')
+        prog.global_block().append_op(
+            type="mul", inputs={'X': [x],
+                                'Y': [y]}, outputs={'Out': [out]})
+
+        binary_str = prog.desc.serialize_to_string()
+        prog_restored = Program.parse_from_string(binary_str)
+
+        print(prog)
+        print(prog_restored)
+
+    def test_append_backward(self):
+        prog = Program()
+        block = prog.global_block()
+
+        mul_x = block.create_var(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_op = block.append_op(
+            type="mul",
+            inputs={"X": [mul_x],
+                    "Y": mul_y},
+            outputs={"Out": [mul_out]},
+            attrs={"x_num_col_dims": 1})
+
+        add_y = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
+        add_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
+        add_op = block.append_op(
+            type="elementwise_add",
+            inputs={"X": mul_out,
+                    "Y": add_y},
+            outputs={"Out": add_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": add_out}, outputs={"Out": mean_out})
+
+        self.assertEqual(mul_op.idx, 0)
+        self.assertEqual(add_op.idx, 1)
+        param_to_grad = prog.append_backward(mean_out, set())
+
+        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out",
+                         "mean.out"):
+            self.assertEqual(param_to_grad[var_name][0],
+                             grad_var_name(var_name))
+            self.assertEqual(param_to_grad[var_name][1], 0)
+
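+        # append_backward adds a fill_constant op (the initial gradient of
+        # mean.out) and then the grad ops in reverse forward order.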
+        expect_ops = [
+            "mul", "elementwise_add", "mean", "fill_constant", "mean_grad",
+            "elementwise_add_grad", "mul_grad"
+        ]
+        actual_ops = []
+        for op in block.ops:
+            actual_ops.append(op.type)
+        self.assertEqual(actual_ops, expect_ops)
+
+    def test_program_clone_with_parameter(self):
+        main_program = Program()
+        startup_program = Program()
+        with program_guard(main_program, startup_program):
+            d = layers.data(name='x', shape=[784], dtype='float32')
+            hidden = layers.fc(input=d, size=100)
+            layers.fc(input=hidden, size=100)
+
+        new_program = main_program.clone()
+        self.assertNotEqual(0, len(new_program.blocks[0].all_parameters()))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf.py b/python/paddle/v2/fluid/tests/test_protobuf.py
new file mode 100644
index 0000000000..48e6dedc58
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_protobuf.py
@@ -0,0 +1,41 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
+import unittest
+
+
+class TestFrameworkProto(unittest.TestCase):
+    def test_all(self):
+        op_proto = framework_pb2.OpProto()
+        ipt0 = op_proto.inputs.add()
+        ipt0.name = "a"
+        ipt0.comment = "the input of cosine op"
+        ipt1 = op_proto.inputs.add()
+        ipt1.name = "b"
+        ipt1.comment = "the other input of cosine op"
+        opt = op_proto.outputs.add()
+        opt.name = "output"
+        opt.comment = "the output of cosine op"
+        op_proto.comment = "cosine op, output = scale*cos(a, b)"
+        attr = op_proto.attrs.add()
+        attr.name = "scale"
+        attr.comment = "scale of cosine op"
+        attr.type = framework_pb2.FLOAT
+        op_proto.type = "cos"
+        self.assertTrue(op_proto.IsInitialized())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
new file mode 100644
index 0000000000..9034b2f4ef
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -0,0 +1,157 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+
+
+class TestOpDesc(unittest.TestCase):
+    def test_op_desc(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+        op = block.append_op()
+        self.assertIsNotNone(op)
+        op.set_type("test")
+        self.assertEqual("test", op.type())
+        op.set_input("X", ["a", "b", "c"])
+        self.assertEqual(["a", "b", "c"], op.input("X"))
+        self.assertEqual(["X"], op.input_names())
+
+        op.set_output("Out", ["z"])
+        self.assertEqual(['z'], op.output("Out"))
+        self.assertEqual(["Out"], op.output_names())
+
+        op.set_attr("int_attr", 1)
+        self.assertEqual(1, op.attr("int_attr"))
+        self.assertTrue(op.has_attr("int_attr"))
+        self.assertEqual(core.AttrType.INT, op.attr_type("int_attr"))
+
+        op.set_attr("float_attr", -1.32)
+        self.assertAlmostEqual(-1.32, op.attr("float_attr"), delta=1e-4)
+        self.assertTrue(op.has_attr("float_attr"))
+
+        op.set_attr("bool_attr", False)
+        self.assertFalse(op.attr("bool_attr"))
+
+        op.set_attr("string_attr", "abc")
+        self.assertEqual("abc", op.attr("string_attr"))
+        self.assertTrue(op.has_attr("string_attr"))
+
+        op.set_attr("ints_attr", [1, 2, 3])
+        self.assertEqual([1, 2, 3], op.attr("ints_attr"))
+
+        expected = [1.2, 2.3, 3.4]
+        op.set_attr("floats_attr", expected)
+        for e, a in zip(expected, op.attr("floats_attr")):
+            self.assertAlmostEqual(e, a, delta=1e-4)
+
+        op.set_attr("strings_attr", ["a", "b", "c"])
+        self.assertEqual(["a", "b", "c"], op.attr("strings_attr"))
+
+        op.set_attr("bools_attr", [True, False, True])
+        self.assertEqual([True, False, True], op.attr("bools_attr"))
+
+        self.assertEqual(8, len(op.attr_names()))
+
+        op.set_block_attr("block_attr", prog.block(0))
+        self.assertEqual(0, op.block_attr("block_attr"))
+
+        mul_op = block.append_op()
+        mul_op.set_type("mul")
+        mul_op.check_attrs()
+        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
+        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
+
+
+class TestProgramDesc(unittest.TestCase):
+    def test_instance(self):
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        del program_desc
+        program_desc = core.ProgramDesc()
+        self.assertIsNotNone(program_desc)
+        self.assertIsNotNone(program_desc.block(0))
+        del program_desc
+
+    def test_append_block(self):
+        prog_desc = core.ProgramDesc()
+        self.assertIsNotNone(prog_desc)
+        block_root = prog_desc.block(0)
+        self.assertIsNotNone(block_root)
+        self.assertEqual(block_root.id, 0)
+        block1 = prog_desc.append_block(block_root)
+        block2 = prog_desc.append_block(block1)
+        self.assertIsNotNone(block1)
+        self.assertEqual(block1.id, block2.parent)
+        self.assertEqual(block_root.id, block1.parent)
+        block3 = prog_desc.append_block(block_root)
+        self.assertEqual(block3.parent, block_root.id)
+        self.assertEqual(prog_desc.block(1).id, 1)
+        self.assertEqual(4, prog_desc.num_blocks())
+
+
+class TestVarDesc(unittest.TestCase):
+    def test_shape(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_var')
+        var.set_type(core.VarDesc.VarType.SELECTED_ROWS)
+        src_shape = [3, 2, 10, 8]
+        var.set_shape(src_shape)
+        res_shape = var.shape()
+        self.assertEqual(src_shape, res_shape)
+        self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
+
+    def test_dtype(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_var')
+        var.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        var.set_dtype(core.DataType.INT32)
+        self.assertEqual(core.DataType.INT32, var.dtype())
+        self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
+
+
+class TestBlockDesc(unittest.TestCase):
+    def test_add_var(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+        var1 = block.var("var1")
+        var2 = block.var("var2")
+        var3 = block.var("var3")
+        all_vars = block.all_vars()
+        self.assertEqual(set(all_vars), {var1, var2, var3})
+        var2_re = block.find_var("var2")
+        self.assertEqual(var2_re, var2)
+
+    def test_add_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+        op1 = block.append_op()
+        op2 = block.append_op()
+        op0 = block.prepend_op()
+        all_ops = []
+        for idx in range(0, block.op_size()):
+            all_ops.append(block.op(idx))
+        self.assertEqual(all_ops, [op0, op1, op2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
new file mode 100644
index 0000000000..744d71bdcf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_proximal_adagrad_op.py
@@ -0,0 +1,50 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestProximalAdagradOp(OpTest):
+    def setUp(self):
+        self.op_type = "proximal_adagrad"
+        w = np.random.random((102, 105)).astype("float32")
+        m = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        lr = np.array([0.1]).astype("float32")
+        l1 = 0.1
+        l2 = 0.2
+
+        self.inputs = {'Param': w, 'Grad': g, 'Moment': m, 'LearningRate': lr}
+        self.attrs = {'l1': l1, 'l2': l2}
+        param_out = 0.0
+
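+        # Reference update computed below (mirroring what the operator is
+        # expected to do):
+        #   moment_out = moment + grad * grad
+        #   prox_param = param - lr * grad / sqrt(moment_out)
+        #   param_out  = sign(prox_param) * max(|prox_param| - lr * l1, 0)
+        #                / (1 + lr * l2)   (soft-thresholding when l1 > 0)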
+        moment_out = m + g * g
+        prox_param = w - lr * g / np.sqrt(moment_out)
+        if l1 > 0.0:
+            x = np.abs(prox_param) - lr * l1
+            x[x < 0] = 0
+            param_out = np.sign(prox_param) * (x / (1.0 + lr * l2))
+        else:
+            param_out = prox_param / (1.0 + lr * l2)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_proximal_gd_op.py b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
new file mode 100644
index 0000000000..96540cf6cf
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_proximal_gd_op.py
@@ -0,0 +1,47 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestProximalGDOp(OpTest):
+    def setUp(self):
+        self.op_type = "proximal_gd"
+        w = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        lr = np.array([0.1]).astype("float32")
+        l1 = 0.1
+        l2 = 0.2
+
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.attrs = {'l1': l1, 'l2': l2}
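+        # Reference proximal gradient descent update, mirrored from the code
+        # below:
+        #   prox_param = param - lr * grad
+        #   param_out  = sign(prox_param) * max(|prox_param| - lr * l1, 0)
+        #                / (1 + lr * l2)   (plain shrinkage when l1 == 0)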
+        prox_param = w - lr * g
+        param_out = 0.0
+        if l1 > 0.0:
+            x = np.abs(prox_param) - lr * l1
+            x[x < 0] = 0
+            param_out = np.sign(prox_param) * (x / (1.0 + lr * l2))
+        else:
+            param_out = prox_param / (1.0 + lr * l2)
+
+        self.outputs = {'ParamOut': param_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_rank_loss_op.py b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
new file mode 100644
index 0000000000..f31a2c2681
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_rank_loss_op.py
@@ -0,0 +1,46 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRankLossOp(OpTest):
+    def setUp(self):
+        self.op_type = "rank_loss"
+        batch_size = 5
+        # labels_{i} = {0, 1.0} or {0, 0.5, 1.0}
+        label = np.random.randint(0, 2, size=(batch_size, 1)).astype("float32")
+        left = np.random.random((batch_size, 1)).astype("float32")
+        right = np.random.random((batch_size, 1)).astype("float32")
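+        # Pairwise rank loss (RankNet-style): with o = left - right,
+        #   loss = log(1 + exp(o)) - label * o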
+        loss = np.log(1.0 + np.exp(left - right)) - label * (left - right)
+        self.inputs = {'Label': label, 'Left': left, 'Right': right}
+        self.outputs = {'Out': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Left", "Right"], "Out")
+
+    def test_check_grad_ignore_left(self):
+        self.check_grad(["Right"], "Out", no_grad_set=set('Left'))
+
+    def test_check_grad_ignore_right(self):
+        self.check_grad(["Left"], "Out", no_grad_set=set('Right'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py
new file mode 100644
index 0000000000..6d59e199e2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -0,0 +1,473 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program, grad_var_name
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+import numpy as np
+import paddle.v2.fluid.core as core
+
+
+class PyRNNBase(object):
+    def __init__(self, input_shape, output_shape):
+        self.x = np.ones(shape=input_shape).astype("float32")
+        self.y = np.zeros(shape=output_shape).astype("float32")
+
+    def step(self, step_id, x):
+        raise NotImplementedError
+
+    def forward(self):
+        for step_id in range(self.x.shape[0]):
+            self.step(step_id, self.x[step_id])
+        return np.array([np.mean(self.y)])
+
+    def segment_inputs(self):
+        return [self.x[i] for i in range(self.x.shape[0])]
+
+
+class PySimpleRNN1(PyRNNBase):
+    def __init__(self, input_shape, output_shape):
+        super(PySimpleRNN1, self).__init__(input_shape, output_shape)
+
+        seq_len, batch_size, input_dim = input_shape
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")
+
+        self.scale = 1.0 / 2.0
+        mem_dim = (seq_len, batch_size, input_dim)
+        self.mems = np.zeros(shape=mem_dim).astype("float32")
+
+    def step(self, step_id, x):
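+        # Recurrence under test: h_t = (x_t + h_{t-1}) * scale, where
+        # h_{-1} is taken from h_boot.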
+        if step_id == 0:
+            pre_mem = self.h_boot
+        else:
+            pre_mem = self.mems[step_id - 1]
+        self.mems[step_id] = (pre_mem + x) * self.scale
+        self.y[step_id] = self.mems[step_id]
+
+
+class PySimpleRNN2(PyRNNBase):
+    def __init__(self, input_shape, output_shape):
+        super(PySimpleRNN2, self).__init__(input_shape, output_shape)
+
+        seq_len, batch_size, input_dim = input_shape
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.ones(shape=(batch_size, input_dim)).astype("float32")
+
+        mem_dim = (seq_len, batch_size, input_dim)
+        self.mems = np.zeros(shape=mem_dim).astype("float32")
+
+    def step(self, step_id, x):
+        if step_id > 0:
+            pre_mem = self.mems[step_id - 1]
+        else:
+            pre_mem = self.h_boot
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")
+
+        def py_sigmoid(x):
+            return 1. / (1. + np.exp(-x))
+
+        self.mems[step_id] = py_sigmoid(xW + hU)
+        self.y[step_id] = self.mems[step_id]
+
+
+def create_tensor(np_data, place):
+    tensor = core.LoDTensor()
+    tensor.set(np_data, place)
+    return tensor
+
+
+class RecurrentOpTest1(unittest.TestCase):
+    '''
+    Test RNNOp
+    equation:
+        h_t = ( x_t + h_{t-1} ) / scale
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    input_dim = 2
+    batch_size = 1
+    sent_len = 1
+
+    def setup_program(self):
+        self.main_program = Program()
+        self.startup_program = Program()
+        self.p_info = {
+            "main_program": self.main_program,
+            "startup_program": self.startup_program
+        }
+        self.place = core.CPUPlace()
+
+    def setUp(self):
+        self.setup_program()
+        self.data_field = {"x", "h_boot"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape)
+
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+        h_boot = layers.data(
+            shape=[self.input_dim],
+            dtype='float32',
+            name='h_boot',
+            **self.p_info)
+        h_boot.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            h_pre = rnn.memory(init=h_boot)
+            x_t = rnn.step_input(x)
+
+            h = layers.scale(
+                x=layers.elementwise_add(
+                    x=h_pre, y=x_t, **self.p_info),
+                scale=self.py_rnn.scale,
+                **self.p_info)
+
+            rnn.update_memory(h_pre, h)
+            rnn.output(h)
+
+        return rnn()
+
+    def forward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        exe = Executor(self.place)
+        out = exe.run(self.main_program,
+                      feed=self.feed_map,
+                      fetch_list=[self.output])
+
+        return out[0]
+
+    def backward(self):
+        self.feed_map = {
+            x: create_tensor(getattr(self.py_rnn, x), self.place)
+            for x in self.data_field
+        }
+        fetch_list = [
+            self.main_program.global_block().var(grad_var_name(x))
+            for x in self.data_field
+        ]
+
+        exe = Executor(self.place)
+        return exe.run(self.main_program,
+                       feed=self.feed_map,
+                       fetch_list=fetch_list,
+                       return_numpy=False)
+
+    def test_backward(self):
+        self.check_forward()
+
+        append_backward(self.output)
+
+        ana_grad = [np.array(x) for x in self.backward()]
+
+        num_grad = self.get_numerical_gradient()
+        for idx, name in enumerate(self.data_field):
+            self.assertEqual(num_grad[idx].shape, ana_grad[idx].shape)
+            self.assertTrue(
+                np.isclose(
+                    num_grad[idx], ana_grad[idx], rtol=0.1).all())
+
+    def check_forward(self):
+        print('test recurrent op forward')
+        pd_output = self.forward()
+        py_output = self.py_rnn.forward()
+        print('pd_output', pd_output)
+        print()
+        print('py_output', py_output)
+        self.assertEqual(pd_output.shape, py_output.shape)
+        self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all())
+
+    def get_numerical_gradient(self, delta=0.005):
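+        # Central finite differences: perturb each feed element by +/- delta
+        # and approximate d(loss)/d(feed) as (y_pos - y_neg) / (2 * delta).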
+        dloss_dout = 1.0
+        feed_list = [getattr(self.py_rnn, x) for x in self.data_field]
+        grad_list = [np.zeros_like(x) for x in feed_list]
+        for feed, grad in zip(feed_list, grad_list):
+            for f, g in np.nditer([feed, grad], op_flags=['readwrite']):
+                o = float(f)
+                f[...] = o + delta
+                y_pos = self.forward()
+
+                f[...] = o - delta
+                y_neg = self.forward()
+
+                f[...] = o
+                dout_dfeed = (y_pos - y_neg) / (delta * 2)
+                g[...] = dout_dfeed[0]
+
+        return grad_list
+
+
+class RecurrentOpTest2(RecurrentOpTest1):
+    '''
+    Test RNNOp
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    input_dim = 2
+    batch_size = 10
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x", "h_boot", "W", "U"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape)
+
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+        h_boot = layers.data(
+            shape=[self.input_dim],
+            dtype='float32',
+            name='h_boot',
+            **self.p_info)
+        h_boot.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            h_pre = rnn.memory(init=h_boot)
+            x_t = rnn.step_input(x)
+
+            temp_l = layers.fc(input=x_t,
+                               size=self.input_dim,
+                               param_attr='W',
+                               bias_attr=False,
+                               **self.p_info)
+            temp_r = layers.fc(input=h_pre,
+                               size=self.input_dim,
+                               param_attr='U',
+                               bias_attr=False,
+                               **self.p_info)
+
+            h = layers.sigmoid(
+                x=layers.elementwise_add(
+                    x=temp_l, y=temp_r, **self.p_info),
+                **self.p_info)
+
+            rnn.update_memory(h_pre, h)
+            rnn.output(h)
+
+        return rnn()
+
+
+class RecurrentOpMultipleMemoryTest(RecurrentOpTest1):
+    '''
+    Test RNNOp with two memories
+    equation:
+        h_1 = h_pre_1
+        h_2 = h_pre_2
+        y = h_1 + h_2
+    vars:
+        - x
+    memories:
+        - h_1, h_2
+    outputs:
+        - y
+    '''
+
+    class PySimpleRNN3(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(RecurrentOpMultipleMemoryTest.PySimpleRNN3, self).__init__(
+                input_shape, output_shape)
+
+            seq_len, batch_size, input_dim = input_shape
+            self.h_boot1 = np.random.normal(size=(batch_size,
+                                                  input_dim)).astype("float32")
+            self.h_boot2 = np.random.normal(size=(batch_size,
+                                                  input_dim)).astype("float32")
+
+            mem_dim = (seq_len, batch_size, input_dim)
+            self.mems1 = np.zeros(shape=mem_dim).astype("float32")
+            self.mems2 = np.zeros(shape=mem_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem1 = self.h_boot1
+                pre_mem2 = self.h_boot2
+            else:
+                pre_mem1 = self.mems1[step_id - 1]
+                pre_mem2 = self.mems2[step_id - 1]
+            self.mems1[step_id] = pre_mem1
+            self.mems2[step_id] = pre_mem2
+            self.y[step_id] = self.mems1[step_id] + self.mems2[step_id] + x
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x", "h_boot1", "h_boot2"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3(
+            self.input_shape, self.output_shape)
+
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+        h_boot1 = layers.data(
+            shape=[self.batch_size, self.input_dim],
+            dtype='float32',
+            name='h_boot1',
+            append_batch_size=False,
+            **self.p_info)
+        h_boot1.stop_gradient = False
+        h_boot2 = layers.data(
+            shape=[self.batch_size, self.input_dim],
+            dtype='float32',
+            name='h_boot2',
+            append_batch_size=False,
+            **self.p_info)
+        h_boot2.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            h_pre1 = rnn.memory(init=h_boot1)
+            h_pre2 = rnn.memory(init=h_boot2)
+            x_t = rnn.step_input(x)
+
+            mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info)
+            mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info)
+            out = layers.sums(input=[mem1, x_t, mem2], **self.p_info)
+
+            rnn.update_memory(h_pre1, mem1)
+            rnn.update_memory(h_pre2, mem2)
+            rnn.output(out)
+
+        return rnn()
+
+
+class RecurrentOpNoMemBootTest(RecurrentOpTest1):
+    '''
+    Test RNNOp without memory boot
+    equation:
+        mem = x + mem_pre
+        y = mem
+    vars:
+        - x
+    memories:
+        - mem
+    outputs:
+        - y
+    '''
+
+    class PySimpleRNN4(PyRNNBase):
+        def __init__(self, input_shape, output_shape):
+            super(RecurrentOpNoMemBootTest.PySimpleRNN4, self).__init__(
+                input_shape, output_shape)
+            mem_dim = input_shape
+            self.mems = np.zeros(shape=mem_dim).astype("float32")
+
+        def step(self, step_id, x):
+            if step_id == 0:
+                pre_mem = np.zeros_like(x)
+            else:
+                pre_mem = self.mems[step_id - 1]
+            self.mems[step_id] = pre_mem + x
+            self.y[step_id] = self.mems[step_id]
+
+    input_dim = 1
+    batch_size = 1
+    sent_len = 2
+
+    def setUp(self):
+        self.setup_program()
+
+        self.data_field = {"x"}
+
+        self.input_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.output_shape = (self.sent_len, self.batch_size, self.input_dim)
+        self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape,
+                                                            self.output_shape)
+        self.output = layers.mean(x=self.create_rnn_op(), **self.p_info)
+        print(self.main_program)
+
+    def create_rnn_op(self):
+        x = layers.data(
+            shape=[self.sent_len, self.batch_size, self.input_dim],
+            dtype='float32',
+            name='x',
+            append_batch_size=False,
+            **self.p_info)
+        x.stop_gradient = False
+
+        rnn = layers.StaticRNN(main_program=self.main_program)
+        with rnn.step():
+            mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x)
+            x_t = rnn.step_input(x)
+            mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info)
+            rnn.update_memory(mem_pre, mem)
+            rnn.output(mem)
+
+        return rnn()
+
+
+if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
+    exit(0)
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_recv_op.py b/python/paddle/v2/fluid/tests/test_recv_op.py
new file mode 100644
index 0000000000..5c4cec028d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_recv_op.py
@@ -0,0 +1,68 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import numpy
+from multiprocessing import Process
+import os
+
+
+class TestRecvOp(unittest.TestCase):
+    def test_send(self):
+        # Run init_serv in a thread
+        place = fluid.CPUPlace()
+        p = Process(target=self.init_serv, args=(place, ))
+        p.daemon = True
+        p.start()
+        self.init_client(place)
+        # FIXME(typhoonzero): find a way to gracefully shutdown the server.
+        os.system("kill -9 %d" % p.pid)
+        p.join()
+
+    def init_serv(self, place):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name="X",
+                append_batch_size=False)
+            fluid.initializer.Constant(value=1.0)(x, main.global_block())
+            serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
+            with serv.do():
+                o = layers.scale(x=x, scale=10.0)
+            main.global_block().create_var(
+                name=o.name, persistable=False, dtype=o.dtype, shape=o.shape)
+        exe = fluid.Executor(place)
+        exe.run(main)
+
+    def init_client(self, place):
+        main = fluid.Program()
+        with fluid.program_guard(main):
+            x = layers.data(
+                shape=[32, 32],
+                dtype='float32',
+                name='X',
+                append_batch_size=False)
+            fluid.initializer.Constant(value=1.0)(x, main.global_block())
+            layers.Send("127.0.0.1:6174", [x], [x])
+        exe = fluid.Executor(place)
+        exe.run(main)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reduce_op.py b/python/paddle/v2/fluid/tests/test_reduce_op.py
new file mode 100644
index 0000000000..c669f73a7c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_reduce_op.py
@@ -0,0 +1,117 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSumOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestMeanOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
+        self.attrs = {'dim': 1}
+        self.outputs = {'Out': self.inputs['X'].mean(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestMaxOp(OpTest):
+    """Remove Max with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_max"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': -1}
+        self.outputs = {'Out': self.inputs['X'].max(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMinOp(OpTest):
+    """Remove Min with subgradient from gradient check to confirm the success of CI."""
+
+    def setUp(self):
+        self.op_type = "reduce_min"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': 2}
+        self.outputs = {'Out': self.inputs['X'].min(axis=self.attrs['dim'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestKeepDimReduce(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 10)).astype("float64")}
+        self.attrs = {'dim': -2, 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=self.attrs['dim'], keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class Test1DReduce(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random(20).astype("float64")}
+        self.outputs = {'Out': self.inputs['X'].sum(axis=0)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceAll(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((5, 6, 2, 10)).astype("float64")}
+        self.attrs = {'reduce_all': True}
+        self.outputs = {'Out': self.inputs['X'].sum()}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py
new file mode 100644
index 0000000000..44e50ca55a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_registry.py
@@ -0,0 +1,31 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.fluid as fluid
+import numpy as np
+import decorators
+
+
+class TestRegistry(unittest.TestCase):
+    @decorators.prog_scope()
+    def test_registry_layer(self):
+        x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32')
+        output = fluid.layers.mean(x=x)
+
+        place = fluid.CPUPlace()
+        exe = fluid.Executor(place)
+        X = np.random.random((10, 10)).astype("float32")
+        mean_out = exe.run(feed={"X": X}, fetch_list=[output])
+        self.assertAlmostEqual(np.mean(X), mean_out[0])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_regularizer.py b/python/paddle/v2/fluid/tests/test_regularizer.py
new file mode 100644
index 0000000000..b33817fa41
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_regularizer.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import paddle.v2.fluid.framework as framework
+import paddle.v2.fluid.optimizer as optimizer
+import paddle.v2.fluid.regularizer as regularizer
+from paddle.v2.fluid.backward import append_backward
+
+
+class TestL2DecayRegularizer(unittest.TestCase):
+    def test_l2decay_regularizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            regularizer=regularizer.L2DecayRegularizer(0.5))
+        self.assertTrue(mul_x.regularizer is not None)
+        self.assertTrue(
+            isinstance(mul_x.regularizer, regularizer.L2DecayRegularizer))
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        count_ops = len(block.ops)
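+        # L2 decay is expected to append two ops per parameter: scale(param)
+        # followed by elementwise_add into the gradient.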
+        params_grads = optimizer.append_regularization_ops(params_grads)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(block.ops), count_ops + 2)
+        self.assertEqual(block.ops[-1].type, 'elementwise_add')
+        self.assertEqual(block.ops[-2].type, 'scale')
+
+
+class TestL1DecayRegularizer(unittest.TestCase):
+    def test_l2decay_regularizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32",
+            shape=[5, 10],
+            lod_level=0,
+            name="mul.x",
+            regularizer=regularizer.L1DecayRegularizer(0.5))
+        self.assertTrue(mul_x.regularizer is not None)
+        self.assertTrue(
+            isinstance(mul_x.regularizer, regularizer.L1DecayRegularizer))
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        mean_out = block.create_var(
+            dtype="float32", shape=[1], lod_level=0, name="mean.out")
+        block.append_op(
+            type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+        params_grads = append_backward(mean_out)
+        self.assertEqual(len(params_grads), 1)
+        count_ops = len(block.ops)
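+        # L1 decay is expected to append three ops per parameter: sign(param),
+        # then scale, then elementwise_add into the gradient.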
+        params_grads = optimizer.append_regularization_ops(params_grads)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(block.ops), count_ops + 3)
+        self.assertEqual(block.ops[-1].type, 'elementwise_add')
+        self.assertEqual(block.ops[-2].type, 'scale')
+        self.assertEqual(block.ops[-3].type, 'sign')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
new file mode 100644
index 0000000000..0a223bac0c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_reorder_lod_tensor.py
@@ -0,0 +1,199 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import numpy
+
+
+class TestReorderLoDTensor(unittest.TestCase):
+    num_seq = 5
+    # [name, shape, lod_level] pair indicating data info of source and target
+    data_desc = (['input', [9], 0], ['ref', [5], 1])
+
+    @classmethod
+    def setUpClass(cls):
+        cls.set_program()
+
+    @classmethod
+    def set_program(cls):
+        dat = fluid.layers.data(
+            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        dat.stop_gradient = False
+        rank_dat = fluid.layers.data(
+            name=cls.data_desc[1][0], shape=cls.data_desc[1][1])
+        table = fluid.layers.lod_rank_table(rank_dat)
+        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
+            x=dat, rank_table=table)
+        loss = fluid.layers.reduce_sum(new_dat)
+        fluid.backward.append_backward(loss=loss)
+        cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD']
+
+    def run_program(self):
+        outputs = []
+        input_grads = []
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+            output, input_grad = exe.run(fluid.default_main_program(),
+                                         feed=self.inputs,
+                                         fetch_list=self.fetch_list,
+                                         return_numpy=False)
+            outputs.append(output)
+            input_grads.append(input_grad)
+        self.actual_outputs = outputs
+        self.actual_grads = input_grads
+
+    def set_data(self):
+        self.data = {}
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_lod_level = desc[2]
+            data_lod = []
+            for i in range(data_lod_level):
+                lod_level_i = numpy.random.randint(
+                    low=1,
+                    high=5,
+                    size=self.num_seq if i == 0 else lod_level_i[-1])
+                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                data_lod.append(lod_level_i)
+            data_value = numpy.random.random(
+                size=[data_lod[-1][-1] if data_lod else self.num_seq
+                      ] + data_shape).astype('float32')
+            self.data[data_name] = (data_value, data_lod)
+
+    def set_inputs(self, place):
+        self.inputs = {}
+        for desc in self.data_desc:
+            tensor = fluid.Tensor()
+            tensor.set(self.data[desc[0]][0], place)
+            if self.data[desc[0]][1]:
+                tensor.set_lod(self.data[desc[0]][1])
+            self.inputs[desc[0]] = tensor
+
+    def reorder(self):
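+        # Python reference for reorder_lod_tensor_by_rank: sort the reference
+        # sequences by length in descending order, then permute the input's
+        # top-level sequences accordingly, carrying any sub-LoD along.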
+        level = 0
+
+        # compute the rank_table according to ref_lod
+        ref_lod = self.data[self.data_desc[1][0]][1][level]
+        rank_table = []  # list of (index, length)
+        for i in range(len(ref_lod) - 1):
+            rank_table.append((i, ref_lod[i + 1] - ref_lod[i]))
+        rank_table = sorted(rank_table, key=lambda x: x[1], reverse=True)
+
+        # compute the input sequence info according to input_lod
+        input_value, input_lod = self.data[self.data_desc[0][0]]
+
+        input_table = []  # list of (offset, length, sub_lod)
+        if input_lod:
+            for i in range(len(input_lod[level]) - 1):
+                start_idx = i
+                end_idx = i + 1
+                sub_lod = []
+                for lod_level_i in input_lod[level:]:
+                    sub_lod_i = []
+                    for idx in range(start_idx, end_idx):
+                        sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
+                            idx])
+                    sub_lod.append(sub_lod_i)
+                    start_idx = lod_level_i[start_idx]
+                    end_idx = lod_level_i[end_idx]
+                input_table.append((start_idx, end_idx - start_idx, sub_lod))
+        else:
+            input_table = [(i, 1, []) for i in range(len(rank_table))]
+
+        # reorder by rank_table
+        output_value = numpy.zeros_like(input_value)
+        output_lod = []
+        offset = 0
+        for index, length in rank_table:
+            input_seq_start = input_table[index][0]
+            input_seq_len = input_table[index][1]
+            input_seq_end = input_seq_start + input_seq_len
+            output_value[offset:offset + input_seq_len] = input_value[
+                input_seq_start:input_seq_end]
+            offset += input_seq_len
+
+            input_seq_sub_lod = input_table[index][2]
+            if len(output_lod) == 0:
+                output_lod = [[0] for i in input_seq_sub_lod]
+            for i, sub_lod_i in enumerate(input_seq_sub_lod):
+                for idx_sub in sub_lod_i:
+                    output_lod[i].append(output_lod[i][-1] + idx_sub)
+        return output_value, output_lod
+
+    def test_reorder_lod_tensor(self):
+        self.data_desc[0][-1] = 2  # input is lod_tensor
+        self.set_data()
+        self.run_program()
+        # check output
+        expect_output, expect_output_lod = self.reorder()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))
+            self.assertEqual(expect_output_lod, actual_output.lod())
+        # check gradient
+        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
+        expect_grad_lod = self.data[self.data_desc[0][0]][1]
+        for actual_grad in self.actual_grads:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_grad), expect_grad, atol=0.001))
+            self.assertEqual(expect_grad_lod, actual_grad.lod())
+
+    def test_reorder_tensor(self):
+        self.data_desc[0][-1] = 0  # input is tensor
+        self.set_data()
+        self.run_program()
+        # check output
+        expect_output, expect_output_lod = self.reorder()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))
+            self.assertEqual(expect_output_lod, actual_output.lod())
+        # check gradient
+        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
+        expect_grad_lod = self.data[self.data_desc[0][0]][1]
+        for actual_grad in self.actual_grads:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_grad), expect_grad, atol=0.001))
+            self.assertEqual(expect_grad_lod, actual_grad.lod())
+
+        # compare outputs between LodTensors with explicit and implicit lod
+        # use the same data but set the input lod explicitly
+        input_lod = [[
+            i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1)
+        ]]
+        self.inputs[self.data_desc[0][0]].set_lod(input_lod)
+        # preserve the output of LodTensor with implicit lod to compare
+        expect_output = [
+            numpy.array(actual_output) for actual_output in self.actual_outputs
+        ]
+        self.run_program()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_reshape_op.py b/python/paddle/v2/fluid/tests/test_reshape_op.py
new file mode 100644
index 0000000000..2cc0b36460
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_reshape_op.py
@@ -0,0 +1,49 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestReshapeOp(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [10 * 20]}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestReshapeOpDimInfer(OpTest):
+    def setUp(self):
+        self.op_type = "reshape"
+        self.inputs = {'X': np.random.random((10, 20)).astype("float32")}
+        self.attrs = {'shape': [4, -1, 5]}
+        self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_rmsprop_op.py b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
new file mode 100644
index 0000000000..b6d7c69800
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_rmsprop_op.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRmspropOp1(OpTest):
+    ''' Test RMSProp with explicit inputs
+    '''
+
+    def setUp(self):
+        self.op_type = "rmsprop"
+
+        param = np.random.random((123, 321)).astype("float32")
+        mean_square = np.random.random((123, 321)).astype("float32")
+        learning_rate = np.array([0.01]).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+
+        epsilon = 1e-6
+        decay = 0.9
+        momentum = 0.0
+
+        self.inputs = {
+            'Param': param,
+            'MeanSquare': mean_square,
+            'LearningRate': learning_rate,
+            'Grad': grad,
+            'Moment': moment,
+        }
+
+        self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum}
+
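+        # Reference RMSProp update, mirrored from the code below:
+        #   ms_out     = decay * mean_square + (1 - decay) * grad^2
+        #   moment_out = momentum * moment + lr * grad / sqrt(ms_out + eps)
+        #   param_out  = param - moment_out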
+        ms_out = decay * mean_square + (1 - decay) * grad * grad
+        moment_out = momentum * moment + \
+            learning_rate * grad / np.sqrt(ms_out + epsilon)
+        param_out = param - moment_out
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'MeanSquareOut': ms_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRmspropOp2(OpTest):
+    '''Test RMSProp with default values for attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "rmsprop"
+
+        param = np.random.random((123, 321)).astype("float32")
+        mean_square = np.random.random((123, 321)).astype("float32")
+        learning_rate = np.array([0.01]).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+
+        epsilon = 1.0e-10
+        decay = 0.9
+        momentum = 0.0
+
+        self.inputs = {
+            'Param': param,
+            'MeanSquare': mean_square,
+            'LearningRate': learning_rate,
+            'Grad': grad,
+            'Moment': moment,
+        }
+
+        ms_out = decay * mean_square + (1 - decay) * grad * grad
+        moment_out = momentum * moment + \
+            learning_rate * grad / np.sqrt(ms_out + epsilon)
+        param_out = param - moment_out
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'MeanSquareOut': ms_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
new file mode 100644
index 0000000000..82b54bbd1a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py
@@ -0,0 +1,135 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+import numpy as np
+import paddle.v2.fluid.core as core
+
+
+class RNNMemoryHelperOpTest(unittest.TestCase):
+    def setUp(self):
+        self.program = Program()
+        self.place = core.CPUPlace()
+
+        self.X = self.program.global_block().create_var(
+            name='X', shape=[2, 3], dtype='float32')
+        self.Out = self.program.global_block().create_var(
+            name='Out', shape=[2, 3], dtype='float32')
+        self.program.global_block().append_op(
+            type='rnn_memory_helper',
+            inputs={"X": self.X},
+            outputs={"Out": self.Out},
+            attrs={})
+
+    def test_forward(self):
+        x_np = np.random.normal(size=(2, 3)).astype("float32")
+        self.feed_map = {'X': x_np}
+        self.fetch_list = [self.Out]
+        exe = Executor(self.place)
+        out = exe.run(self.program,
+                      feed=self.feed_map,
+                      fetch_list=self.fetch_list)
+        self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5))
+
+
+class RNNMemoryHelperGradOpTest(unittest.TestCase):
+    def setUp(self):
+        self.program = Program()
+        self.place = core.CPUPlace()
+
+        self.input_names = ['X', 'Out', 'Out@GRAD']
+        self.input_vars = {
+            name: self.program.global_block().create_var(
+                name=name, shape=[2, 3], dtype='float32')
+            for name in self.input_names
+        }
+
+        self.output_names = ['X@GRAD']
+        self.output_vars = {
+            name: self.program.global_block().create_var(
+                name=name, shape=[2, 3], dtype='float32')
+            for name in self.output_names
+        }
+
+        self.program.global_block().append_op(
+            type='rnn_memory_helper_grad',
+            inputs=self.input_vars,
+            outputs=self.output_vars,
+            attrs={})
+
+    def test_backward(self):
+        self.feed_map = {
+            name: np.random.normal(size=(2, 3)).astype("float32")
+            for name in self.input_names
+        }
+        self.fetch_list = [self.output_vars['X@GRAD']]
+
+        exe = Executor(self.place)
+        out = exe.run(self.program,
+                      feed=self.feed_map,
+                      fetch_list=self.fetch_list)
+        self.assertTrue(
+            np.allclose(
+                out[0], self.feed_map['Out@GRAD'], rtol=1e-5))
+
+
+class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase):
+    def setUp(self):
+        self.program = Program()
+        self.fake_program = Program()
+        self.place = core.CPUPlace()
+
+        self.input_names = ['X', 'Out']
+        self.input_vars = {
+            name: self.program.global_block().create_var(
+                name=name, shape=[2, 3], dtype='float32')
+            for name in self.input_names
+        }
+        self.input_vars["Out@GRAD"] = \
+            self.fake_program.global_block().create_var(
+                name="Out@GRAD", shape=[2, 3], dtype='float32')
+
+        self.output_names = ['X@GRAD']
+        self.output_vars = {
+            name: self.program.global_block().create_var(
+                name=name, shape=[2, 3], dtype='float32')
+            for name in self.output_names
+        }
+
+        self.program.global_block().append_op(
+            type='rnn_memory_helper_grad',
+            inputs=self.input_vars,
+            outputs=self.output_vars,
+            attrs={})
+
+    def test_backward(self):
+        self.feed_map = {
+            name: np.random.normal(size=(2, 3)).astype("float32")
+            for name in ['X', 'Out']
+        }
+        self.fetch_list = [self.output_vars['X@GRAD']]
+
+        exe = Executor(self.place)
+        out = exe.run(self.program,
+                      feed=self.feed_map,
+                      fetch_list=self.fetch_list)
+        self.assertTrue(
+            np.allclose(
+                out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
new file mode 100644
index 0000000000..af48848dcd
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
@@ -0,0 +1,137 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+import sys
+from op_test import OpTest
+
+
+class TestROIPoolOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        self.make_rois()
+        self.calc_roi_pool()
+
+        self.inputs = {'X': self.x, 'ROIs': self.rois}
+
+        self.attrs = {
+            'spatial_scale': self.spatial_scale,
+            'pooled_height': self.pooled_height,
+            'pooled_width': self.pooled_width
+        }
+
+        self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes}
+
+    def init_test_case(self):
+        self.batch_size = 5
+        self.channels = 3
+        self.height = 6
+        self.width = 4
+
+        # n, c, h, w
+        self.x_dim = (self.batch_size, self.channels, self.height, self.width)
+
+        self.spatial_scale = 1.0 / 4.0
+        self.pooled_height = 2
+        self.pooled_width = 2
+        self.rois_num = 2
+
+        self.x = np.random.random(self.x_dim).astype('float32')
+
+    def calc_roi_pool(self):
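+        # Reference ROI max pooling: scale each ROI onto the feature map,
+        # split it into pooled_height x pooled_width bins, and record the max
+        # value (and its flattened argmax) within every bin.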
+        out_data = np.zeros((self.rois_num, self.channels, self.pooled_height,
+                             self.pooled_width))
+        argmax_data = np.zeros((self.rois_num, self.channels,
+                                self.pooled_height, self.pooled_width))
+
+        for i in range(self.rois_num):
+            roi = self.rois[i]
+            roi_batch_id = roi[0]
+            roi_start_w = int(round(roi[1] * self.spatial_scale))
+            roi_start_h = int(round(roi[2] * self.spatial_scale))
+            roi_end_w = int(round(roi[3] * self.spatial_scale))
+            roi_end_h = int(round(roi[4] * self.spatial_scale))
+
+            roi_height = int(max(roi_end_h - roi_start_h + 1, 1))
+            roi_width = int(max(roi_end_w - roi_start_w + 1, 1))
+
+            x_i = self.x[roi_batch_id]
+
+            bin_size_h = float(roi_height) / float(self.pooled_height)
+            bin_size_w = float(roi_width) / float(self.pooled_width)
+
+            for c in range(self.channels):
+                for ph in range(self.pooled_height):
+                    for pw in range(self.pooled_width):
+                        hstart = int(math.floor(ph * bin_size_h))
+                        wstart = int(math.floor(pw * bin_size_w))
+                        hend = int(math.ceil((ph + 1) * bin_size_h))
+                        wend = int(math.ceil((pw + 1) * bin_size_w))
+
+                        hstart = min(max(hstart + roi_start_h, 0), self.height)
+                        hend = min(max(hend + roi_start_h, 0), self.height)
+                        wstart = min(max(wstart + roi_start_w, 0), self.width)
+                        wend = min(max(wend + roi_start_w, 0), self.width)
+
+                        is_empty = (hend <= hstart) or (wend <= wstart)
+                        if is_empty:
+                            out_data[i, c, ph, pw] = 0
+                        else:
+                            out_data[i, c, ph, pw] = -sys.float_info.max
+
+                        argmax_data[i, c, ph, pw] = -1
+
+                        for h in range(hstart, hend):
+                            for w in range(wstart, wend):
+                                if x_i[c, h, w] > out_data[i, c, ph, pw]:
+                                    out_data[i, c, ph, pw] = x_i[c, h, w]
+                                    argmax_data[i, c, ph, pw] = h * \
+                                        self.width + w
+
+        self.outs = out_data.astype('float32')
+        self.argmaxes = argmax_data.astype('int64')
+
+    def make_rois(self):
+        rois = []
+        batch_ids = np.random.randint(0, self.batch_size, size=self.rois_num)
+        for i in range(self.rois_num):
+            x1 = np.random.random_integers(
+                0, self.width / self.spatial_scale - self.pooled_width)
+            y1 = np.random.random_integers(
+                0, self.height / self.spatial_scale - self.pooled_height)
+
+            x2 = np.random.random_integers(x1 + self.pooled_width,
+                                           self.width / self.spatial_scale)
+            y2 = np.random.random_integers(y1 + self.pooled_height,
+                                           self.height / self.spatial_scale)
+
+            roi = [batch_ids[i], x1, y1, x2, y2]
+            rois.append(roi)
+        self.rois = np.array(rois).astype("int64")
+
+    def setUp(self):
+        self.op_type = "roi_pool"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py b/python/paddle/v2/fluid/tests/test_row_conv_op.py
new file mode 100644
index 0000000000..580b08f75e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py
@@ -0,0 +1,109 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
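+# Row ("lookahead") convolution reference: within each sequence,
+# out[j, :] = sum over k < context_length of x[j + k, :] * wt[k, :],
+# truncated at the sequence end; lod[0] holds the absolute start offsets
+# of the sequences.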
+def row_conv_forward(x, lod, wt):
+    out = np.zeros_like(x)
+    seq_info = lod[0]
+    num_sequences = len(seq_info) - 1
+    context_length = wt.shape[0]
+
+    for i in range(num_sequences):  # loop over number of sequences
+        start = seq_info[i]
+        end = seq_info[i + 1]
+        curinput = x[start:end, :]
+        curoutput = out[start:end, :]
+
+        cur_timesteps = end - start
+        for j in range(cur_timesteps):  # loop over different timesteps
+            for k in range(context_length):
+
+                if j + k >= cur_timesteps:
+                    continue
+                curoutput[j, :] += curinput[j + k, :] * wt[k, :]
+
+    return out
+
+
+class TestRowConvOp1(OpTest):
+    def setUp(self):
+
+        self.op_type = "row_conv"
+        lod = [[0, 2, 5, 7]]
+        T = lod[0][-1]
+        D = 16
+        context_length = 2
+
+        x = np.random.random((T, D)).astype("float32")
+        wt = np.random.random((context_length, D)).astype("float32")
+        self.inputs = {'X': (x, lod), 'Filter': wt}
+
+        out = row_conv_forward(x, lod, wt)
+        self.outputs = {'Out': (out, lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_wt(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter'))
+
+
+class TestRowConvOp2(OpTest):
+    def setUp(self):
+
+        self.op_type = "row_conv"
+        lod = [[0, 20, 50, 100]]
+        T = lod[0][-1]
+        D = 35
+        context_length = 35
+
+        x = np.random.random((T, D)).astype("float32")
+        wt = np.random.random((context_length, D)).astype("float32")
+        self.inputs = {'X': (x, lod), 'Filter': wt}
+
+        out = row_conv_forward(x, lod, wt)
+        self.outputs = {'Out': (out, lod)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    # max_relative_error is increased from 0.05 to 0.06 because, for
+    # higher-dimensional input, dX on CPU slightly exceeds a relative error
+    # of 0.05 for some values.
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X'))
+
+    def test_check_grad_ignore_wt(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_scale_op.py b/python/paddle/v2/fluid/tests/test_scale_op.py
new file mode 100644
index 0000000000..95cd935dda
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_scale_op.py
@@ -0,0 +1,35 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestScaleOp(OpTest):
+    def setUp(self):
+        self.op_type = "scale"
+        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
+        self.attrs = {'scale': -2.3}
+        self.outputs = {'Out': self.inputs['X'] * self.attrs['scale']}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_scatter_op.py b/python/paddle/v2/fluid/tests/test_scatter_op.py
new file mode 100644
index 0000000000..f2936e19ae
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_scatter_op.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
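+# scatter overwrites rows of Ref at the positions named by Index:
+# Out[Index[i], :] = Updates[i, :]; all other rows keep Ref's values, which
+# the NumPy fancy-indexing assignment below mirrors.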
+class TestScatterOp(OpTest):
+    def setUp(self):
+        self.op_type = "scatter"
+        ref_np = np.ones((3, 3)).astype("float32")
+        index_np = np.array([1, 2]).astype("int32")
+        updates_np = np.random.random((2, 3)).astype("float32")
+        output_np = np.copy(ref_np)
+        output_np[index_np] = updates_np
+        self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
+        self.outputs = {'Out': output_np}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Updates'], 'Out', in_place=True)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_scope.py b/python/paddle/v2/fluid/tests/test_scope.py
new file mode 100644
index 0000000000..566a11abbe
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_scope.py
@@ -0,0 +1,51 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core
+import unittest
+
+
+class TestScope(unittest.TestCase):
+    def test_create_destroy(self):
+        paddle_c = paddle.v2.fluid.core
+        scope = paddle_c.Scope()
+        self.assertIsNotNone(scope)
+        scope_with_parent = scope.new_scope()
+        self.assertIsNotNone(scope_with_parent)
+
+    def test_none_variable(self):
+        paddle_c = paddle.v2.fluid.core
+        scope = paddle_c.Scope()
+        self.assertIsNone(scope.find_var("test"))
+
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.fluid.core
+        scope = paddle_c.Scope()
+        var_a = scope.var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.find_var('var_a'))
+        scope2 = scope.new_scope()
+        self.assertIsNotNone(scope2.find_var('var_a'))
+
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.fluid.core
+        scope = paddle_c.Scope()
+        var = scope.var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_selected_rows.py b/python/paddle/v2/fluid/tests/test_selected_rows.py
new file mode 100644
index 0000000000..65ddf1f8f5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_selected_rows.py
@@ -0,0 +1,52 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+import unittest
+import numpy as np
+
+
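+# SelectedRows is a sparse row-wise format: `rows` names the occupied rows
+# of a conceptual (height x row_numel) tensor, and the attached dense tensor
+# stores only those rows, in order.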
+class TestSelectedRows(unittest.TestCase):
+    def test_selected_rows(self):
+        place = core.CPUPlace()
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        selected_rows = core.SelectedRows(rows, height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+        tensor = selected_rows.get_tensor()
+        tensor.set(np_array, place)
+
+        # compare rows
+        self.assertEqual(0, selected_rows.rows()[0])
+        self.assertEqual(4, selected_rows.rows()[1])
+        self.assertEqual(7, selected_rows.rows()[2])
+
+        # compare height
+        self.assertEqual(10, selected_rows.height())
+
+        # compare tensor
+        self.assertAlmostEqual(2.0,
+                               selected_rows.get_tensor().get_float_element(0))
+        self.assertAlmostEqual(1.0,
+                               selected_rows.get_tensor().get_float_element(1))
+        self.assertAlmostEqual(
+            4.0,
+            selected_rows.get_tensor().get_float_element(2 * row_numel + 8))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_seq_concat_op.py b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
new file mode 100644
index 0000000000..ba2bb075e6
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_seq_concat_op.py
@@ -0,0 +1,118 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+# NOTE: this module-level exit(0) aborts the import and disables every test
+# in this file; remove it to re-enable the sequence_concat tests.
+exit(0)
+
+
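+# A LoD (level-of-detail) is a list of offset vectors, one per nesting
+# level; to_abs_lod resolves a top-level entry to absolute row offsets by
+# indexing into the next level, so sub-sequences can be sliced out of x
+# directly.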
+def to_abs_lod(lod):
+    if len(lod) == 0 or len(lod) == 1:
+        return lod
+    import copy
+    new_lod = copy.deepcopy(lod)
+    for idx, val in enumerate(lod[0]):
+        new_lod[0][idx] = lod[1][val]
+    return new_lod
+
+
+def seq_concat(inputs, level):
+    lod0 = inputs['X'][0][1][1]
+    lod1 = inputs['X'][1][1][1]
+    x0 = inputs['X'][0][1][0]
+    x1 = inputs['X'][1][1][0]
+    level_idx = len(lod0) - level - 1
+    outs = []
+    for i in range(len(lod0[level_idx]) - 1):
+        sub_x0 = x0[to_abs_lod(lod0)[level_idx][i]:to_abs_lod(lod0)[level_idx][
+            i + 1], :]
+        sub_x1 = x1[to_abs_lod(lod1)[level_idx][i]:to_abs_lod(lod1)[level_idx][
+            i + 1], :]
+        outs.append(np.concatenate((sub_x0, sub_x1), axis=0))
+    return np.concatenate(outs, axis=0)
+
+
+class TestSeqConcatOp(OpTest):
+    def set_data(self):
+        # two-level LoD with two top-level sequences
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((4, 8, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        axis = 1
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        self.outputs = {'Out': (np.concatenate([x0, x1], axis=1), lod0)}
+
+    def setUp(self):
+        self.op_type = "sequence_concat"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+class TestSeqConcatOpLevelZeroNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two-level LoD with two top-level sequences
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 3, 5, 7]]
+        axis = 0
+        level = 0
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 2, 4], [0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+class TestSeqConcatOpLevelOneNestedSequence(TestSeqConcatOp):
+    def set_data(self):
+        # two-level LoD with two top-level sequences
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 6, 3)).astype('float32')
+        lod1 = [[0, 3, 4], [0, 1, 3, 5, 7]]
+        axis = 0
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 5, 8], [0, 1, 2, 3, 5, 7, 8, 9, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+class TestSeqConcatOpLevelZeroSequence(TestSeqConcatOp):
+    def set_data(self):
+        # one-level LoD with four sequences
+        x0 = np.random.random((4, 3, 4)).astype('float32')
+        lod0 = [[0, 1, 2, 3, 4]]
+        x1 = np.random.random((7, 3, 4)).astype('float32')
+        lod1 = [[0, 1, 3, 5, 7]]
+        axis = 0
+        level = 0
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        out_lod = [[0, 2, 5, 8, 11]]
+        self.outputs = {'Out': (seq_concat(self.inputs, level), out_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_seq_conv.py b/python/paddle/v2/fluid/tests/test_seq_conv.py
new file mode 100644
index 0000000000..674a2e1694
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_seq_conv.py
@@ -0,0 +1,212 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import random
+from op_test import OpTest
+
+
+class TestSeqProject(OpTest):
+    def setUp(self):
+        self.init_test_case()
+        self.op_type = 'sequence_conv'
+
+        if self.context_length == 1 \
+                and self.context_start == 0 \
+                and self.padding_trainable:
+            print "If context_start is 0 " \
+                  "and context_length is 1," \
+                  " padding_trainable should be false."
+            return
+
+        # one-level LoD input
+        x = np.random.uniform(0.1, 1, [self.input_size[0],
+                                       self.input_size[1]]).astype('float32')
+        w = np.random.uniform(0.1, 1, [
+            self.context_length * self.input_size[1], self.output_representation
+        ]).astype('float32')
+
+        begin_pad = np.max([0, -self.context_start])
+        end_pad = np.max([0, self.context_start + self.context_length - 1])
+        total_pad = begin_pad + end_pad
+        padding_data = np.random.uniform(
+            0.1, 1, [total_pad, self.input_size[1]]).astype('float32')
+        self.pad_data = padding_data
+        self.inputs = {
+            'X': (x, self.lod),
+            'Filter': w,
+        }
+        self.inputs_val = ['X', 'Filter']
+        self.inputs_val_no_x = ['Filter']
+        self.inputs_val_no_f = ['X']
+
+        if total_pad != 0:
+            self.inputs['PaddingData'] = padding_data
+            self.inputs_val = ['X', 'PaddingData', 'Filter']
+            self.inputs_val_no_x = ['PaddingData', 'Filter']
+            self.inputs_val_no_f = ['PaddingData', 'X']
+
+        self.attrs = {
+            'contextStart': self.context_start,
+            'contextLength': self.context_length,
+            'paddingTrainable': self.padding_trainable,
+            'contextStride': self.context_stride
+        }
+        out = np.zeros(
+            (self.input_size[0], self.output_representation)).astype('float32')
+        self.outputs = {'Out': out}
+        self.compute()
+
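+    # The reference assembles the context-expanded matrix block by block:
+    # column block j is the input shifted by (context_start + j) rows, with
+    # trainable padding rows substituted where the window crosses a sequence
+    # boundary; the expanded matrix is then multiplied by the filter.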
+    def compute(self):
+        x, lod = self.inputs['X']
+        filter = self.inputs['Filter']
+        padding_data = self.pad_data
+        out = np.zeros((self.input_size[0], self.context_length *
+                        self.input_size[1])).astype('float32')
+        lod = lod[0]
+        begin_pad = np.max([0, -self.context_start])
+
+        for i in range(len(lod) - 1):
+            for j in range(self.context_length):
+                in_begin = lod[i] + self.context_start + j
+                in_end = lod[i + 1] + self.context_start + j
+                out_begin = lod[i]
+                out_end = lod[i + 1]
+                if in_begin < lod[i]:
+                    pad_size = np.min([lod[i] - in_begin, lod[i + 1] - lod[i]])
+                    if self.padding_trainable:
+                        sub_w = padding_data[j:j + pad_size, :]
+                        out[lod[i]:lod[i] + pad_size, j * self.input_size[1]:(
+                            j + 1) * self.input_size[1]] = sub_w
+                    out_begin = lod[i] + pad_size
+                    in_begin = lod[i]
+
+                if in_end > lod[i + 1]:
+                    pad_size = np.min(
+                        [in_end - lod[i + 1], lod[i + 1] - lod[i]])
+                    if self.padding_trainable:
+                        sub_w = padding_data[begin_pad + self.context_start + j -
+                                            pad_size:begin_pad +
+                                            self.context_start + j, :]
+                        out[lod[i + 1] - pad_size:lod[i + 1], j * self.
+                            input_size[1]:(j + 1) * self.input_size[1]] = sub_w
+                    in_end = lod[i + 1]
+                    out_end = lod[i + 1] - pad_size
+                if in_end <= in_begin:
+                    continue
+
+                in_sub = x[in_begin:in_end, :]
+                out[out_begin:out_end, j * self.input_size[1]:(j + 1) *
+                    self.input_size[1]] += in_sub
+
+        np.dot(out, filter, out=self.outputs['Out'])
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.padding_trainable:
+            self.check_grad(
+                set(self.inputs_val), 'Out', max_relative_error=0.05)
+
+    def test_check_grad_input(self):
+        self.check_grad(
+            ['X'],
+            'Out',
+            max_relative_error=0.05,
+            no_grad_set=set(self.inputs_val_no_x))
+
+    def test_check_grad_padding_data(self):
+        if self.padding_trainable:
+            self.check_grad(
+                ['PaddingData'],
+                'Out',
+                max_relative_error=0.05,
+                no_grad_set=set(['X', 'Filter']))
+
+    def test_check_grad_Filter(self):
+        self.check_grad(
+            ['Filter'],
+            'Out',
+            max_relative_error=0.05,
+            no_grad_set=set(self.inputs_val_no_f))
+
+    def test_check_grad_input_filter(self):
+        if self.padding_trainable:
+            self.check_grad(
+                ['X', 'Filter'],
+                'Out',
+                max_relative_error=0.05,
+                no_grad_set=set(['PaddingData']))
+
+    def test_check_grad_padding_input(self):
+        if self.padding_trainable:
+            self.check_grad(
+                self.inputs_val_no_f,
+                'Out',
+                max_relative_error=0.05,
+                no_grad_set=set(['Filter']))
+
+    def test_check_grad_padding_filter(self):
+        if self.padding_trainable:
+            self.check_grad(
+                self.inputs_val_no_x,
+                'Out',
+                max_relative_error=0.05,
+                no_grad_set=set(['X']))
+
+    def init_test_case(self):
+        self.input_row = 11
+        self.context_start = 0
+        self.context_length = 1
+        self.padding_trainable = False
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 23]
+        self.lod = [[0, 4, 5, 8, self.input_row]]
+        self.output_representation = 8  # output feature size
+
+
+class TestSeqProjectCase1(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 11
+        self.context_start = -1
+        self.context_length = 3
+        self.padding_trainable = True
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 23]
+        self.lod = [[0, 4, 5, 8, self.input_row]]
+        self.output_representation = 8  # output feature size
+
+
+class TestSeqProjectCase2(TestSeqProject):
+    def init_test_case(self):
+        self.input_row = 25
+        self.context_start = 2
+        self.context_length = 3
+        self.padding_trainable = True
+        self.context_stride = 1
+
+        self.input_size = [self.input_row, 23]
+        idx = range(self.input_size[0])
+        del idx[0]
+        self.lod = [[0] + np.sort(random.sample(idx, 8)).tolist() +
+                    [self.input_size[0]]]
+        self.output_representation = 8  # output feature size
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_seq_pool.py b/python/paddle/v2/fluid/tests/test_seq_pool.py
new file mode 100644
index 0000000000..9dd6b2a087
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_seq_pool.py
@@ -0,0 +1,187 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
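+# sequence_pool reduces every LoD sequence to one row; the subclasses below
+# exercise AVERAGE, SUM, SQRT (sum scaled by 1 / sqrt(len)), MAX, LAST and
+# FIRST pooling against NumPy references.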
+class TestSeqAvgPool(OpTest):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        # one level, batch size is 4
+        x = np.random.uniform(0.1, 1, [11, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 11]]
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "AVERAGE"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x.mean(axis=0)
+
+    def setUp(self):
+        x, lod, out = self.set_data()
+        self.compute(x, lod, out)
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
+        self.check_grad(["X"], "Out")
+
+
+class TestSeqAvgPool2D(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        # one level, batch size is 4
+        x = np.random.uniform(0.1, 1, [13, 3, 17]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 3, 17)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "AVERAGE"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x.mean(axis=0), (3, 17))
+
+
+class TestSeqSumPool(TestSeqAvgPool):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "SUM"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x.sum(axis=0)
+
+
+class TestSeqSumPool2D(TestSeqAvgPool2D):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "SUM"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
+
+
+class TestSeqSqrtPool(TestSeqAvgPool):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "SQRT"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            seq_len = lod[0][i + 1] - lod[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(seq_len)
+
+
+class TestSeqSqrtPool2D(TestSeqAvgPool2D):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "SQRT"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            seq_len = lod[0][i + 1] - lod[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(seq_len), (3, 17))
+
+    def test_check_grad(self):
+        # Remove MaxIndex after check_grad is refined.
+        self.outputs['MaxIndex'] = \
+            np.zeros(self.outputs['Out'].shape).astype('int32')
+        self.check_grad(["X"], "Out", max_relative_error=0.06)
+
+
+class TestSeqMaxPool(TestSeqAvgPool):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 23]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        for i in range(4):
+            seq_len = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(seq_len), :] += 2.0
+
+        self.inputs = {'X': (x, lod)}
+
+        out = np.zeros((4, 23)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "MAX"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = np.amax(sub_x, axis=0)
+
+
+class TestSeqMaxPool2D(TestSeqAvgPool2D):
+    def set_data(self):
+        self.op_type = 'sequence_pool'
+        x = np.random.uniform(0.1, 1, [13, 3, 11]).astype('float32')
+        lod = [[0, 4, 5, 8, 13]]
+        self.inputs = {'X': (x, lod)}
+        for i in range(4):
+            seq_len = lod[0][i + 1] - lod[0][i]
+            x[lod[0][i] + np.random.randint(seq_len), :] += 1.0
+
+        out = np.zeros((4, 3, 11)).astype('float32')
+        self.outputs = {'Out': out}
+        return x, lod, out
+
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "MAX"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 11))
+            out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11))
+
+
+class TestSeqLastPool(TestSeqAvgPool):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "LAST"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[-1, :]
+
+
+class TestSeqLastPool2D(TestSeqAvgPool2D):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "LAST"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[-1, :], (3, 17))
+
+
+class TestSeqFirstPool(TestSeqAvgPool):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "FIRST"}
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[0, :]
+
+
+class TestSeqFirstPool2D(TestSeqAvgPool2D):
+    def compute(self, x, lod, out):
+        self.attrs = {'pooltype': "FIRST"}
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[0, :], (3, 17))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
new file mode 100644
index 0000000000..4823836ba9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
@@ -0,0 +1,79 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
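+# Drops every occurrence of the given tokens from each sequence and rebuilds
+# the level-0 LoD offsets from the surviving element counts.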
+def sequence_erase(in_seq, lod0, tokens):
+    new_lod0 = [0]
+    out_seq = []
+    for i in range(0, len(lod0) - 1):
+        num_out = 0
+        for dat in in_seq[lod0[i]:lod0[i + 1]]:
+            if dat not in tokens:
+                out_seq.append(dat)
+                num_out += 1
+        new_lod0.append(new_lod0[-1] + num_out)
+    return np.array(out_seq).astype("int32"), new_lod0
+
+
+class TestSequenceEraseOpInt32(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = [2, 3, 5]
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceEraseOpInt64(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int64")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = [2, 3, 5]
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestSequenceEraseOpEmpty(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[0, 9, 13, 24, 30]]
+        tokens = []
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_expand.py b/python/paddle/v2/fluid/tests/test_sequence_expand.py
new file mode 100644
index 0000000000..6fc045125f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_expand.py
@@ -0,0 +1,77 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
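+# sequence_expand tiles each row (or sub-sequence) of X according to Y's
+# last-level LoD: entry i is repeated y_lod[-1][i + 1] - y_lod[-1][i] times.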
+class TestSequenceExpand(OpTest):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [3, 1]).astype('float32')
+        y_data = np.random.uniform(0.1, 1, [8, 1]).astype('float32')
+        y_lod = [[0, 1, 4, 8]]
+        self.inputs = {'X': x_data, 'Y': (y_data, y_lod)}
+
+    def compute(self):
+        x = self.inputs['X']
+        x_data, x_lod = x if type(x) == tuple else (x, None)
+        n = 1 + x_data.shape[0] if not x_lod else len(x_lod[0])
+        y_data, y_lod = self.inputs['Y']
+        repeats = [((y_lod[-1][i + 1] - y_lod[-1][i]))
+                   for i in range(len(y_lod[-1]) - 1)]
+        out = x_data.repeat(repeats, axis=0)
+        self.outputs = {'Out': out}
+
+    def setUp(self):
+        self.op_type = 'sequence_expand'
+        self.set_data()
+        self.compute()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceExpandCase1(TestSequenceExpand):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [5, 1]).astype('float32')
+        x_lod = [[0, 2, 5]]
+        y_data = np.random.uniform(0.1, 1, [13, 1]).astype('float32')
+        y_lod = [[0, 2, 5], [0, 2, 4, 7, 10, 13]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+
+class TestSequenceExpandCase2(TestSequenceExpand):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [1, 2, 2]).astype('float32')
+        x_lod = [[0, 1]]
+        y_data = np.random.uniform(0.1, 1, [2, 2, 2]).astype('float32')
+        y_lod = [[0, 2]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+
+class TestSequenceExpandCase3(TestSequenceExpand):
+    def set_data(self):
+        x_data = np.random.uniform(0.1, 1, [4, 1]).astype('float32')
+        x_lod = [[0, 1, 2, 3, 4]]
+        y_data = np.random.uniform(0.1, 1, [6, 1]).astype('float32')
+        y_lod = [[0, 2, 4, 4, 6]]
+        self.inputs = {'X': (x_data, x_lod), 'Y': (y_data, y_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_reshape.py b/python/paddle/v2/fluid/tests/test_sequence_reshape.py
new file mode 100644
index 0000000000..06d5af8f5e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_reshape.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import math
+from op_test import OpTest
+
+
+class TestSequenceReshape(OpTest):
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        dimension = 12
+        x_lod = [[0, 4, 5, 8, 11]]
+        x = np.random.uniform(0.1, 1, [11, 24]).astype('float32')
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+
+        self.outputs = {'Out': (out, out_lod)}
+
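+    # Each sequence keeps its elements but changes row width: the element
+    # count per sequence (seq_len * x_width) must divide evenly by new_dim,
+    # and the output LoD is rebuilt from the resulting row counts.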
+    def compute_output(self, x, x_lod, dimension):
+        x_width = x.shape[1]
+        out_lod = [[0]]
+        for i in xrange(len(x_lod[0]) - 1):
+            seq_len = x_lod[0][i + 1] - x_lod[0][i]
+            offset = (seq_len * x_width) / dimension
+            assert int(offset) * dimension == seq_len * x_width
+            out_lod[0].append(out_lod[0][-1] + int(offset))
+        out = np.zeros(shape=(out_lod[0][-1], dimension)).astype('float32')
+        out.ravel()[:] = x.ravel()[:]
+        return out, out_lod
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+
+
+class TestSequenceReshape_reduce(TestSequenceReshape):
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        dimension = 24
+        x_lod = [[0, 4, 6, 8, 12]]
+        x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+
+        self.outputs = {'Out': (out, out_lod)}
+
+
+class TestSequenceReshape_same(TestSequenceReshape):
+    def setUp(self):
+        self.op_type = 'sequence_reshape'
+        dimension = 12
+        x_lod = [[0, 4, 6, 8, 12]]
+        x = np.random.uniform(0.1, 1, [12, 12]).astype('float32')
+
+        self.inputs = {'X': (x, x_lod)}
+        self.attrs = {'new_dim': dimension}
+
+        out, out_lod = self.compute_output(x, x_lod, dimension)
+
+        self.outputs = {'Out': (out, out_lod)}
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_slice_op.py b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
new file mode 100644
index 0000000000..bf1f21bcde
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_slice_op.py
@@ -0,0 +1,61 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+
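+# sequence_slice keeps, for each sequence i, the rows
+# [lod[0][i] + offset[i], lod[0][i] + offset[i] + length[i]) and
+# concatenates them, deriving a fresh one-level LoD from the slice lengths.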
+class TestSequenceSliceOp(OpTest):
+    def set_data(self):
+        self.init_test_case()
+        # only supports one-level LoD
+        x = np.random.random(self.x_dim).astype('float32')
+        lod = self.x_lod
+        offset = np.array(self.offset).astype("int64")
+        length = np.array(self.length).astype("int64")
+
+        self.inputs = {'X': (x, lod), 'Offset': offset, 'Length': length}
+        outs = []
+        out_lod = [[0]]
+        out_lod_offset = 0
+        for i in range(len(offset)):
+            sub_x = x[lod[0][i] + offset[i, 0]:lod[0][i] + offset[i, 0] +
+                      length[i, 0], :]
+            out_lod_offset = out_lod_offset + len(sub_x)
+            outs.append(sub_x)
+            out_lod[0].append(out_lod_offset)
+        outs = np.concatenate(outs, axis=0)
+        self.outputs = {'Out': (outs, out_lod)}
+
+    def init_test_case(self):
+        self.x_dim = (100, 3, 2)
+        self.x_lod = [[0, 20, 40, 60, 80, 100]]
+        self.offset = [[1], [2], [3], [4], [5]]
+        self.length = [[10], [8], [6], [4], [2]]
+
+    def setUp(self):
+        self.op_type = "sequence_slice"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
new file mode 100644
index 0000000000..5bd780f6b5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_softmax_op.py
@@ -0,0 +1,46 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
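+# sequence_softmax normalizes each LoD sequence of the (N, 1) input
+# independently, so the entries of every sequence sum to one.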
+class TestSequenceSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_softmax"
+        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
+        lod = [[0, 4, 5, 8, 11]]
+
+        out = np.zeros((11, 1)).astype("float32")
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+            sub_out = stable_softmax(sub_x)
+            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
+                lod[0][i + 1] - lod[0][i], 1)
+
+        self.inputs = {"X": (x, lod)}
+        self.outputs = {"Out": out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py
new file mode 100644
index 0000000000..ba2ca1683f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sgd_op.py
@@ -0,0 +1,101 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.op import Operator
+from op_test import OpTest
+
+
+class TestSGDOp(OpTest):
+    def setUp(self):
+        self.op_type = "sgd"
+        w = np.random.random((102, 105)).astype("float32")
+        g = np.random.random((102, 105)).astype("float32")
+        lr = np.array([0.1]).astype("float32")
+
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.outputs = {'ParamOut': w - lr * g}
+
+    def test_check_output(self):
+        self.check_output()
+
+
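+# The sparse path feeds the gradient as SelectedRows: only the listed rows
+# are updated via param[r] -= lr * grad_row; every other row of the dense
+# parameter is left untouched.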
+class TestSparseSGDOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Grad Variable
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and run sgd operator
+        sgd_op = Operator(
+            "sgd",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        sgd_op.run(scope, place)
+
+        # get and compare result
+        result_array = np.array(param)
+
+        # rows[0] = 0, 5.0 - 2.0 * 2.0
+        self.assertAlmostEqual(1.0, result_array[rows[0], 0])
+        # rows[0] = 0, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[0], 2])
+        # 5.0 - 2.0 * 0.0
+        self.assertAlmostEqual(5.0, result_array[1, 0])
+        # rows[1] = 4, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[1], 10])
+        # 5.0 - 2.0 * 0.0
+        self.assertAlmostEqual(5.0, result_array[5, 8])
+        # rows[2] = 7, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[2], 1])
+        # rows[2] = 7, 5.0 - 2.0 * 4.0
+        self.assertAlmostEqual(-3.0, result_array[rows[2], 8])
+
+    def test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
new file mode 100644
index 0000000000..4578211bac
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -0,0 +1,103 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.backward import append_backward
+from paddle.v2.fluid.framework import default_main_program, switch_main_program
+from paddle.v2.fluid.framework import Program
+import numpy as np
+
+
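+# shrink_memory trims the RNN memory at step i down to the sequences that
+# are still active according to the LoD rank table, so the three chained
+# calls below yield progressively fewer rows.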
+class TestShrinkRNNMemoryBase(unittest.TestCase):
+    def setUp(self):
+        self.main_program = Program()
+        switch_main_program(self.main_program)
+        x = layers.data('x', shape=[100], dtype='float32')
+        x.stop_gradient = False
+        rank_table_tensor = layers.data(
+            'rank_table_tensor', shape=[1], dtype='float32', lod_level=1)
+        table = layers.lod_rank_table(x=rank_table_tensor)
+        i = layers.zeros(dtype='int64', shape=[1])
+        self.mem1 = layers.shrink_memory(x=x, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        self.mem2 = layers.shrink_memory(x=self.mem1, i=i, table=table)
+        i = layers.increment(x=i)
+        i.stop_gradient = True
+        self.mem3 = layers.shrink_memory(x=self.mem2, i=i, table=table)
+        mem3_mean = layers.mean(x=self.mem3)
+        append_backward(loss=mem3_mean)
+        self.x_grad = self.main_program.global_block().var('x@GRAD')
+
+    def sum_lodtensor(self, tensor):
+        sum_res = 0.0
+        for i in xrange(np.product(tensor.get_dims())):
+            sum_res += tensor.get_float_element(i)
+        return sum_res
+
+
+class TestShrinkRNNMemoryReferLoD(TestShrinkRNNMemoryBase):
+    def test_refer_lod(self):
+        cpu = core.CPUPlace()
+        x_tensor = core.LoDTensor()
+        x_tensor.set_lod([[0, 2, 5, 6]])
+        tensor_np = np.random.random(size=(6, 100)).astype('float32')
+        x_tensor.set(tensor_np, cpu)
+
+        rank_table_tensor = core.LoDTensor()
+        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
+                              cpu)
+
+        exe = Executor(cpu)
+        outs = exe.run(
+            feed={'x': x_tensor,
+                  'rank_table_tensor': rank_table_tensor},
+            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
+            return_numpy=False)
+        self.assertTrue(np.allclose(tensor_np[0:6], outs[0]))
+        self.assertTrue(np.allclose(tensor_np[0:5], outs[1]))
+        self.assertTrue(np.allclose(tensor_np[0:2], outs[2]))
+        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)
+
+
+class TestShrinkRNNMemoryNoLoD(TestShrinkRNNMemoryBase):
+    def test_no_lod(self):
+        cpu = core.CPUPlace()
+        x_tensor = core.LoDTensor()
+        tensor_np = np.random.random(size=(3, 100)).astype('float32')
+        x_tensor.set(tensor_np, cpu)
+
+        rank_table_tensor = core.LoDTensor()
+        rank_table_tensor.set_lod([[0, 1, 3, 6]])
+        rank_table_tensor.set(np.random.random(size=(6, 1)).astype('float32'),
+                              cpu)
+
+        exe = Executor(cpu)
+        outs = exe.run(
+            feed={'x': x_tensor,
+                  'rank_table_tensor': rank_table_tensor},
+            fetch_list=[self.mem1, self.mem2, self.mem3, self.x_grad],
+            return_numpy=False)
+        self.assertTrue(np.allclose(tensor_np[0:3], outs[0]))
+        self.assertTrue(np.allclose(tensor_np[0:2], outs[1]))
+        self.assertTrue(np.allclose(tensor_np[0:1], outs[2]))
+        self.assertAlmostEqual(1.0, self.sum_lodtensor(outs[3]), delta=0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
new file mode 100644
index 0000000000..f88fa62119
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+from op_test import OpTest
+from scipy.special import logit
+from scipy.special import expit
+import unittest
+
+
+class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with binary label
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.randint(0, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+
+        # Fw Pass is implemented as elementwise sigmoid followed by
+        # elementwise logistic loss
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
+    """
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
+            .astype("float32")
+        }
+
+        # The forward pass is an elementwise sigmoid followed by the
+        # elementwise logistic loss:
+        # Label * -log(sigmoid(X)) + (1 - Label) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sign_op.py b/python/paddle/v2/fluid/tests/test_sign_op.py
new file mode 100644
index 0000000000..c1dfa7f45d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sign_op.py
@@ -0,0 +1,36 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSignOp(OpTest):
+    def setUp(self):
+        self.op_type = "sign"
+        self.inputs = {
+            'X': np.random.uniform(-10, 10, (10, 10)).astype("float32")
+        }
+        self.outputs = {'Out': np.sign(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
new file mode 100644
index 0000000000..5a388bb7b3
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_smooth_l1_loss_op.py
@@ -0,0 +1,107 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def smooth_l1_loss_forward(val, sigma2):
+    abs_val = abs(val)
+    if abs_val < 1.0 / sigma2:
+        return 0.5 * val * val * sigma2
+    else:
+        return abs_val - 0.5 / sigma2
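+# Closed form of the function above (the Huber-style loss popularized by
+# Fast R-CNN):
+#     f(x) = 0.5 * (sigma * x)^2   if |x| < 1 / sigma^2,
+#            |x| - 0.5 / sigma^2   otherwise.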
+
+
+class TestSmoothL1LossOp1(OpTest):
+    def setUp(self):
+        self.op_type = "smooth_l1_loss"
+        dims = (5, 10)
+        self.inputs = {
+            'X': np.random.random(dims).astype("float32"),
+            'Y': np.random.random(dims).astype("float32")
+        }
+        sigma = 3.0
+        self.attrs = {'sigma': sigma}
+        sigma2 = sigma * sigma
+        diff = self.inputs['X'] - self.inputs['Y']
+        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2).sum(1)
+        loss = loss.reshape((dims[0], 1))
+        self.outputs = {
+            'Diff': diff.astype('float32'),
+            'Out': loss.astype('float32')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.02)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.03, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.03, no_grad_set=set('Y'))
+
+
+class TestSmoothL1LossOp2(OpTest):
+    def setUp(self):
+        self.op_type = "smooth_l1_loss"
+        dims = (5, 10)
+        self.inputs = {
+            'X': np.random.random(dims).astype("float32"),
+            'Y': np.random.random(dims).astype("float32"),
+            'InsideWeight': np.random.random(dims).astype("float32"),
+            'OutsideWeight': np.random.random(dims).astype("float32")
+        }
+        sigma = 3.0
+        self.attrs = {'sigma': sigma}
+        sigma2 = sigma * sigma
+        diff = self.inputs['X'] - self.inputs['Y']
+        diff = diff * self.inputs['InsideWeight']
+        loss = np.vectorize(smooth_l1_loss_forward)(diff, sigma2)
+        loss = loss * self.inputs['OutsideWeight']
+        loss = loss.sum(1).reshape((dims[0], 1))
+        self.outputs = {
+            'Diff': diff.astype('float32'),
+            'Out': loss.astype('float32')
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.03)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set(['X', 'InsideWeight', 'OutsideWeight']))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'],
+            'Out',
+            max_relative_error=0.03,
+            no_grad_set=set(['Y', 'InsideWeight', 'OutsideWeight']))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_softmax_op.py b/python/paddle/v2/fluid/tests/test_softmax_op.py
new file mode 100644
index 0000000000..cf43e676c5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_softmax_op.py
@@ -0,0 +1,45 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    shiftx = x - np.max(x).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
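+# Softmax is shift-invariant (softmax(x - c) == softmax(x)), so subtracting
+# the (clipped) max changes nothing mathematically but keeps np.exp from
+# overflowing.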
+
+
+class TestSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "softmax"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [10, 10]).astype("float32")
+        }
+        self.outputs = {
+            'Out': np.apply_along_axis(stable_softmax, 1, self.inputs['X'])
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
new file mode 100644
index 0000000000..626f34f0e0
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_softmax_with_cross_entropy_op.py
@@ -0,0 +1,90 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+
+class TestSoftmaxWithCrossEntropyOp(OpTest):
+    """
+    Test softmax with cross entropy operator with discrete one-hot labels.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 2
+        class_num = 37
+
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
+
+        cross_entropy = np.asmatrix(
+            [[-np.log(softmax[i][labels[i][0]])]
+             for i in range(softmax.shape[0])],
+            dtype="float64")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss")
+
+
+class TestSoftmaxWithCrossEntropyOp2(OpTest):
+    """
+    Test softmax with cross entropy operator with soft labels.
+    """
+
+    def setUp(self):
+        self.op_type = "softmax_with_cross_entropy"
+        batch_size = 2
+        class_num = 37
+
+        logits = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float64")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        labels = np.random.uniform(0.1, 1.0,
+                                   [batch_size, class_num]).astype("float64")
+        labels /= np.sum(labels, axis=1, keepdims=True)
+
+        cross_entropy = (-labels * np.log(softmax)).sum(
+            axis=1, keepdims=True).astype("float64")
+
+        self.inputs = {"Logits": logits, "Label": labels}
+        self.outputs = {
+            "Softmax": softmax.astype("float64"),
+            "Loss": cross_entropy.astype("float64")
+        }
+        self.attrs = {"soft_label": True}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["Logits"], "Loss")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
new file mode 100644
index 0000000000..bc541298ed
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py
@@ -0,0 +1,183 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.executor import Executor
+from paddle.v2.fluid.backward import append_backward
+
+
+class TestCPULoDTensorArrayOps(unittest.TestCase):
+    def place(self):
+        return core.CPUPlace()
+
+    def test_split_and_merge_lod_tensor_no_lod(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+
+        mask_np = np.array([0, 0, 1, 1, 1, 1, 0, 0, 0, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([2, 3, 4, 5]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+
+        expect_false_tensor = np.array([0, 1, 6, 7, 8, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def test_split_and_merge_lod_tensor_level_0(self):
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('int32'), self.place())
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, self.place())
+
+        expect_true_tensor = np.array([3, 4, 5, 6, 7, 8]).astype('int32')
+        expect_true_tensor = np.expand_dims(expect_true_tensor, axis=1)
+        expect_true = core.LoDTensor()
+        expect_true.set(expect_true_tensor, self.place())
+        expect_true.set_lod([[0, 6]])
+
+        expect_false_tensor = np.array([0, 1, 2, 9]).astype('int32')
+        expect_false_tensor = np.expand_dims(expect_false_tensor, axis=1)
+        expect_false_lod = [[0, 3, 4]]
+
+        expect_false = core.LoDTensor()
+        expect_false.set(expect_false_tensor, self.place())
+        expect_false.set_lod(expect_false_lod)
+
+        self.main(
+            tensor=tensor,
+            mask=mask,
+            expect_true=expect_true,
+            expect_false=expect_false,
+            expect_out=tensor)
+
+    def main(self, tensor, mask, expect_true, expect_false, expect_out,
+             level=0):
+        place = self.place()
+        program = Program()
+        with program_guard(program):
+            x = layers.data(name='x', shape=[1])
+            x.persistable = True
+
+            y = layers.data(name='y', shape=[1])
+            y.persistable = True
+
+            out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+            out_true.persistable = True
+            out_false.persistable = True
+
+            out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+
+            out.persistable = True
+
+        exe = Executor(place)
+        scope = core.Scope()
+        exe.run(program,
+                feed={'x': tensor,
+                      'y': mask},
+                scope=scope,
+                return_numpy=False)
+
+        var_true = scope.find_var(out_true.name).get_tensor()
+
+        var_false = scope.find_var(out_false.name).get_tensor()
+
+        var_out = scope.find_var(out.name).get_tensor()
+
+        self.check_tensor_same(var_true, expect_true)
+        self.check_tensor_same(var_false, expect_false)
+        self.check_tensor_same(var_out, expect_out)
+
+    def check_tensor_same(self, actual, expect):
+        self.assertTrue(np.allclose(np.array(actual), np.array(expect)))
+        self.assertEqual(actual.lod(), expect.lod())
+
+
+class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase):
+    def test_grad(self):
+        place = core.CPUPlace()
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x', shape=[1], dtype='float32', stop_gradient=False)
+            y = layers.data(
+                name='y', shape=[1], dtype='bool', stop_gradient=False)
+
+            level = 0
+
+            out_true, out_false = layers.split_lod_tensor(
+                input=x, mask=y, level=level)
+            out = layers.merge_lod_tensor(
+                in_true=out_true, in_false=out_false, mask=y, x=x, level=level)
+            mean = layers.mean(x=out)
+
+            append_backward(mean)
+
+        tensor = core.LoDTensor()
+        tensor.set(np.arange(10).reshape(10, 1).astype('float32'), place)
+        tensor.set_lod([[0, 3, 9, 10]])
+
+        mask_np = np.array([0, 1, 0]).astype('bool')
+        mask_np = np.expand_dims(mask_np, axis=1)
+
+        mask = core.LoDTensor()
+        mask.set(mask_np, place)
+
+        exe = Executor(place)
+        scope = core.Scope()
+
+        g_vars = program.global_block().var(x.name + "@GRAD")
+        g_out = [
+            item.sum()
+            for item in map(np.array,
+                            exe.run(program,
+                                    feed={'x': tensor,
+                                          'y': mask},
+                                    fetch_list=[g_vars],
+                                    scope=scope,
+                                    return_numpy=False))
+        ]
+
+        g_out_sum = np.array(g_out).sum()
+
+        self.assertAlmostEqual(1.0, g_out_sum, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_split_op.py b/python/paddle/v2/fluid/tests/test_split_op.py
new file mode 100644
index 0000000000..b80b64c41b
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_op.py
@@ -0,0 +1,39 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSplitOp(OpTest):
+    def setUp(self):
+        self.op_type = "split"
+        axis = 0
+        x = np.random.random((4, 2, 5)).astype('float32')
+        out = np.split(x, [1, 3], axis)
+        self.inputs = {'X': x}
+        self.attrs = {'axis': axis, 'sections': [1, 2, 1]}
+        self.outputs = {'Out': [('out%d' % i, out[i])
+                                for i in xrange(len(out))]}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], ['out0', 'out1', 'out2'])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
new file mode 100644
index 0000000000..343aa20066
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_split_selected_rows_op.py
@@ -0,0 +1,130 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.core as core
+import numpy as np
+from paddle.v2.fluid.op import Operator
+
+
+class TestSplitSelectedRows(unittest.TestCase):
+    def get_places(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        return places
+
+    def test_check_output(self):
+        for place in self.get_places():
+            self.check_with_place(place)
+
+    def test_check_grad(self):
+        for place in self.get_places():
+            self.check_grad_with_place(place)
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        rows = [0, 5, 7, 4, 20]
+        height = 20
+        row_numel = 2
+
+        # initialize input variable X
+        x = scope.var('X').get_selected_rows()
+        x.set_rows(rows)
+        x.set_height(height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 1] = 4.0
+        np_array[4, 1] = 8.0
+        x_tensor = x.get_tensor()
+        x_tensor.set(np_array, place)
+
+        height_sections = [5, 5, 5, 5, 3]
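+        # height_sections splits the row space of X into the ranges
+        # [0, 5), [5, 10), [10, 15), [15, 20) and [20, 23); each row id
+        # of X falls into exactly one range, which determines the output
+        # it is routed to (see the expected rows below).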
+
+        # initialize output variables [out0, out1, out2, out3, out4]
+        outs_name = ["out%d" % i for i in xrange(len(height_sections))]
+        outs = [
+            scope.var(var_name).get_selected_rows() for var_name in outs_name
+        ]
+
+        # expected output selected rows
+        expected_out0_rows = [0, 4]
+        expected_out1_rows = [5, 7]
+        expected_out4_rows = [20]
+
+        op = Operator(
+            "split_selected_rows",
+            X="X",
+            Out=outs_name,
+            height_sections=height_sections)
+
+        op.run(scope, place)
+
+        self.assertEqual(outs[0].rows(), expected_out0_rows)
+        self.assertEqual(outs[1].rows(), expected_out1_rows)
+        self.assertEqual(outs[4].rows(), expected_out4_rows)
+
+        self.assertEqual(outs[0].height(), height_sections[0])
+        self.assertEqual(outs[4].height(), height_sections[4])
+
+        self.assertAlmostEqual(2.0, np.array(outs[0].get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(outs[1].get_tensor())[1, 1])
+        self.assertAlmostEqual(8.0, np.array(outs[4].get_tensor())[0, 1])
+
+    def check_grad_with_place(self, place):
+        scope = core.Scope()
+        height = 10
+        row_numel = 2
+
+        # attr
+        height_sections = [5, 5]
+
+        # initialize the gradient inputs out0@GRAD and out1@GRAD
+        out0_grad = scope.var("out0@GRAD").get_selected_rows()
+        rows0 = [0, 5]
+        out0_grad.set_rows(rows0)
+        out0_grad.set_height(height)
+        out0_grad_tensor = out0_grad.get_tensor()
+        np_array = np.ones((len(rows0), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        out0_grad_tensor.set(np_array, place)
+
+        out1_grad = scope.var("out1@GRAD").get_selected_rows()
+        rows1 = [7, 5]
+        out1_grad.set_rows(rows1)
+        out1_grad.set_height(height)
+        out1_grad_tensor = out1_grad.get_tensor()
+        np_array = np.ones((len(rows1), row_numel)).astype("float32")
+        np_array[0, 1] = 4.0
+        out1_grad_tensor.set(np_array, place)
+
+        x_grad = scope.var("X@GRAD").get_selected_rows()
+
+        grad_op = Operator(
+            "sum",
+            X=["out0@GRAD", "out1@GRAD"],
+            Out="X@GRAD",
+            height_sections=height_sections)
+
+        grad_op.run(scope, place)
+
+        self.assertEqual(x_grad.rows(), rows0 + rows1)
+        self.assertEqual(x_grad.height(), height)
+
+        self.assertAlmostEqual(2.0, np.array(x_grad.get_tensor())[0, 0])
+        self.assertAlmostEqual(4.0, np.array(x_grad.get_tensor())[2, 1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_spp_op.py b/python/paddle/v2/fluid/tests/test_spp_op.py
new file mode 100644
index 0000000000..e912b56de5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_spp_op.py
@@ -0,0 +1,82 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_pool2d_op import max_pool2D_forward_naive
+from test_pool2d_op import avg_pool2D_forward_naive
+
+
+class TestSppOp(OpTest):
+    def setUp(self):
+        self.op_type = "spp"
+        self.init_test_case()
+        input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = input.shape
+        out_level_flatten = []
+        for i in xrange(self.pyramid_height):
+            bins = np.power(2, i)
+            kernel_size = [0, 0]
+            padding = [0, 0]
+            kernel_size[0] = np.ceil(hsize /
+                                     bins.astype("double")).astype("int32")
+            padding[0] = (
+                (kernel_size[0] * bins - hsize + 1) / 2).astype("int32")
+
+            kernel_size[1] = np.ceil(wsize /
+                                     bins.astype("double")).astype("int32")
+            padding[1] = (
+                (kernel_size[1] * bins - wsize + 1) / 2).astype("int32")
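+            # At pyramid level i the feature map is pooled into a fixed
+            # bins x bins grid (bins = 2^i); kernel_size and padding are
+            # chosen so that the pooling windows cover the whole input,
+            # as in the SPP-net scheme.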
+            out_level = self.pool2D_forward_naive(input, kernel_size,
+                                                  kernel_size, padding)
+            out_level_flatten.append(
+                out_level.reshape(nsize, bins * bins * csize))
+            if i == 0:
+                output = out_level_flatten[i]
+            else:
+                output = np.concatenate((output, out_level_flatten[i]), 1)
+        self.inputs = {'X': input.astype('float32'), }
+        self.attrs = {
+            'pyramid_height': self.pyramid_height,
+            'pooling_type': self.pool_type
+        }
+
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "avg":
+            self.check_grad(['X'], 'Out', max_relative_error=0.05)
+
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.pool_type = "max"
+
+
+class TestCase2(TestSppOp):
+    def init_test_case(self):
+        self.shape = [3, 2, 4, 4]
+        self.pyramid_height = 3
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.pool_type = "avg"
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
new file mode 100644
index 0000000000..8171207cd9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_squared_l2_distance_op.py
@@ -0,0 +1,85 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSquaredL2DistanceOp_f0(OpTest):
+    def setUp(self):
+        self.op_type = "squared_l2_distance"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"),
+            'Y': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32")
+        }
+        sub_res = self.inputs['X'] - self.inputs['Y']
+        output = sub_res * sub_res
+        self.outputs = {
+            'sub_result': sub_res,
+            'Out': np.expand_dims(output.sum(1), 1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+class TestSquaredL2DistanceOp_f1(OpTest):
+    def setUp(self):
+        self.op_type = "squared_l2_distance"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 0.6, (2, 3)).astype("float32"),
+            'Y': np.random.uniform(0.1, 0.6, (1, 3)).astype("float32")
+        }
+        sub_res = self.inputs['X'] - self.inputs['Y']
+        output = sub_res * sub_res
+        self.outputs = {
+            'sub_result': sub_res,
+            'Out': np.expand_dims(output.sum(1), 1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+class TestSquaredL2DistanceOp_f2(OpTest):
+    def setUp(self):
+        self.op_type = "squared_l2_distance"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 0.6, (2, 3, 4)).astype("float32"),
+            'Y': np.random.uniform(0.1, 0.6, (1, 3, 4)).astype("float32")
+        }
+        sub_res = self.inputs['X'] - self.inputs['Y']
+        sub_res = sub_res.reshape((2, 3 * 4))
+        output = sub_res * sub_res
+        self.outputs = {
+            'sub_result': sub_res,
+            'Out': np.expand_dims(output.sum(1), 1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X', 'Y'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
new file mode 100644
index 0000000000..b7575cb4d2
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_squared_l2_norm_op.py
@@ -0,0 +1,43 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import unittest
+from numpy import linalg as LA
+from op_test import OpTest
+
+
+class TestL2LossOp(OpTest):
+    """Test squared_l2_norm
+    """
+
+    def setUp(self):
+        self.op_type = "squared_l2_norm"
+        self.max_relative_error = 0.05
+
+        X = np.random.uniform(-1, 1, (13, 19)).astype("float32")
+        X[np.abs(X) < self.max_relative_error] = 0.1
+        self.inputs = {'X': X}
+        self.outputs = {'Out': np.square(LA.norm(X))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=self.max_relative_error)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sum_op.py b/python/paddle/v2/fluid/tests/test_sum_op.py
new file mode 100644
index 0000000000..0a15a9485d
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sum_op.py
@@ -0,0 +1,38 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestSumOp(OpTest):
+    def setUp(self):
+        self.op_type = "sum"
+        x0 = np.random.random((3, 4)).astype('float32')
+        x1 = np.random.random((3, 4)).astype('float32')
+        x2 = np.random.random((3, 4)).astype('float32')
+        self.inputs = {"X": [("x0", x0), ("x1", x1), ("x2", x2)]}
+        y = x0 + x1 + x2
+        self.outputs = {'Out': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_tensor.py b/python/paddle/v2/fluid/tests/test_tensor.py
new file mode 100644
index 0000000000..d5cc235f58
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_tensor.py
@@ -0,0 +1,127 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.fluid.core as core
+import unittest
+import numpy
+
+
+class TestTensor(unittest.TestCase):
+    def test_int_tensor(self):
+        scope = core.Scope()
+        var = scope.var("test_tensor")
+        place = core.CPUPlace()
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_int(place)
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1
+        tensor_array[19, 11] = 2
+        tensor.set(tensor_array, place)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertEqual(1, tensor_array_2[3, 9])
+        self.assertEqual(2, tensor_array_2[19, 11])
+
+    def test_float_tensor(self):
+        scope = core.Scope()
+        var = scope.var("test_tensor")
+        place = core.CPUPlace()
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_float(place)
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1.0
+        tensor_array[19, 11] = 2.0
+        tensor.set(tensor_array, place)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
+        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
+
+    def test_int_lod_tensor(self):
+        place = core.CPUPlace()
+        scope = core.Scope()
+        var_lod = scope.var("test_lod_tensor")
+        lod_tensor = var_lod.get_tensor()
+
+        lod_tensor.set_dims([4, 4, 6])
+        lod_tensor.alloc_int(place)
+        array = numpy.array(lod_tensor)
+        array[0, 0, 0] = 3
+        array[3, 3, 5] = 10
+        lod_tensor.set(array, place)
+        lod_tensor.set_lod([[0, 2, 4]])
+
+        lod_v = numpy.array(lod_tensor)
+        self.assertTrue(numpy.alltrue(array == lod_v))
+
+        lod = lod_tensor.lod()
+        self.assertEqual(0, lod[0][0])
+        self.assertEqual(2, lod[0][1])
+        self.assertEqual(4, lod[0][2])
+
+    def test_float_lod_tensor(self):
+        place = core.CPUPlace()
+        scope = core.Scope()
+        var_lod = scope.var("test_lod_tensor")
+
+        lod_tensor = var_lod.get_tensor()
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.alloc_float(place)
+
+        tensor_array = numpy.array(lod_tensor)
+        self.assertEqual((5, 2, 3, 4), tensor_array.shape)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
+
+        lod_v = numpy.array(lod_tensor)
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertEqual(len(lod_tensor.lod()), 0)
+
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_tensor.set_lod(lod_py)
+        lod = lod_tensor.lod()
+        self.assertListEqual(lod_py, lod)
+
+    def test_lod_tensor_init(self):
+        scope = core.Scope()
+        place = core.CPUPlace()
+        lod_py = [[0, 2, 5], [0, 2, 4, 5]]
+        lod_tensor = core.LoDTensor(lod_py)
+
+        lod_tensor.set_dims([5, 2, 3, 4])
+        lod_tensor.alloc_float(place)
+        tensor_array = numpy.array(lod_tensor)
+        tensor_array[0, 0, 0, 0] = 1.0
+        tensor_array[0, 0, 0, 1] = 2.0
+        lod_tensor.set(tensor_array, place)
+
+        lod_v = numpy.array(lod_tensor)
+        self.assertAlmostEqual(1.0, lod_v[0, 0, 0, 0])
+        self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
+        self.assertListEqual(lod_py, lod_tensor.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_top_k_op.py b/python/paddle/v2/fluid/tests/test_top_k_op.py
new file mode 100644
index 0000000000..a50faf0fff
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_top_k_op.py
@@ -0,0 +1,67 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestTopkOp(OpTest):
+    def setUp(self):
+        self.op_type = "top_k"
+        k = 1
+        input = np.random.random((32, 84)).astype("float32")
+        output = np.ndarray((32, k))
+        indices = np.ndarray((32, k)).astype("int64")
+
+        self.inputs = {'X': input}
+        self.attrs = {'k': k}
+
+        for rowid in xrange(32):
+            row = input[rowid]
+            output[rowid] = np.sort(row)[-k:]
+            indices[rowid] = row.argsort()[-k:]
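+            # np.sort/argsort are ascending, so the last k entries give
+            # the top-k values and their indices.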
+
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestTopkOp3d(OpTest):
+    def setUp(self):
+        self.op_type = "top_k"
+        k = 1
+        input = np.random.random((32, 2, 84)).astype("float32")
+        input_flat_2d = input.reshape(64, 84)
+        output = np.ndarray((64, k))
+        indices = np.ndarray((64, k)).astype("int64")
+
+        # FIXME: should use 'X': input for a 3d input
+        self.inputs = {'X': input_flat_2d}
+        self.attrs = {'k': k}
+
+        for rowid in xrange(64):
+            row = input_flat_2d[rowid]
+            output[rowid] = np.sort(row)[-k:]
+            indices[rowid] = row.argsort()[-k:]
+
+        self.outputs = {'Out': output, 'Indices': indices}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_transpose_op.py b/python/paddle/v2/fluid/tests/test_transpose_op.py
new file mode 100644
index 0000000000..a16de1416f
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_transpose_op.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestTransposeOp(OpTest):
+    def setUp(self):
+        self.initTestCase()
+        self.op_type = "transpose"
+        self.inputs = {'X': np.random.random(self.shape).astype("float32")}
+        self.attrs = {'axis': list(self.axis)}
+        self.outputs = {'Out': self.inputs['X'].transpose(self.axis)}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def initTestCase(self):
+        self.shape = (3, 4)
+        self.axis = (1, 0)
+
+
+class TestCase0(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (3, )
+        self.axis = (0, )
+
+
+class TestCase1(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (3, 4, 5)
+        self.axis = (0, 2, 1)
+
+
+class TestCase2(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5)
+        self.axis = (0, 2, 3, 1)
+
+
+class TestCase3(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6)
+        self.axis = (4, 2, 3, 1, 0)
+
+
+class TestCase4(TestTransposeOp):
+    def initTestCase(self):
+        self.shape = (2, 3, 4, 5, 6, 1)
+        self.axis = (4, 2, 3, 1, 0, 5)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
new file mode 100644
index 0000000000..94cf416fad
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py
@@ -0,0 +1,65 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+
+from paddle.v2.fluid.op import Operator
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid as fluid
+
+
+class TestUniformRandomOp(unittest.TestCase):
+    def setUp(self):
+        self.op_type = "uniform_random"
+        self.inputs = {}
+        self.attrs = {
+            "shape": [1000, 784],
+            "min": -5.0,
+            "max": 10.0,
+            "seed": 10
+        }
+        self.outputs = ["Out"]
+
+    def test_cpu(self):
+        self.uniform_random_test(place=core.CPUPlace())
+
+    def test_gpu(self):
+        if core.is_compiled_with_cuda():
+            self.uniform_random_test(place=core.CUDAPlace(0))
+
+    def uniform_random_test(self, place):
+        program = fluid.Program()
+        block = program.global_block()
+        vout = block.create_var(name="Out")
+        op = block.append_op(
+            type=self.op_type, outputs={"Out": vout}, attrs=self.attrs)
+
+        op.desc.infer_var_type(block.desc)
+        op.desc.infer_shape(block.desc)
+
+        fetch_list = []
+        for var_name in self.outputs:
+            fetch_list.append(block.var(var_name))
+
+        exe = fluid.Executor(place)
+        outs = exe.run(program, fetch_list=fetch_list)
+
+        tensor = outs[0]
+
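+        # The mean of U(-5, 10) is (min + max) / 2 = 2.5; the statistical
+        # check below verifies it within a small tolerance.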
+        self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py
new file mode 100644
index 0000000000..3dd43f9ba4
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -0,0 +1,97 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                            input[nidx, cidx, h, w]
+
+    return out
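+
+# Note: each `indices` entry is the argmax position flattened over the output
+# H x W plane in row-major order, so hidx = index // out_wsize and
+# widx = index % out_wsize (as computed above) recover the 2-D location.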
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg / self.ksize[1]) * wsize + \
+                                c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
new file mode 100644
index 0000000000..9f9748ca4e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -0,0 +1,62 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
+import paddle.v2.fluid.core as core
+import numpy as np
+
+
+class TestVariable(unittest.TestCase):
+    def test_np_dtype_convert(self):
+        DT = core.DataType
+        convert = convert_np_dtype_to_dtype_
+        self.assertEqual(DT.FP32, convert(np.float32))
+        self.assertEqual(DT.FP16, convert("float16"))
+        self.assertEqual(DT.FP64, convert("float64"))
+        self.assertEqual(DT.INT32, convert("int32"))
+        self.assertEqual(DT.INT16, convert("int16"))
+        self.assertEqual(DT.INT64, convert("int64"))
+        self.assertEqual(DT.BOOL, convert("bool"))
+        self.assertRaises(ValueError, lambda: convert("int8"))
+
+    def test_var(self):
+        b = default_main_program().current_block()
+        w = b.create_var(
+            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
+        self.assertNotEqual(str(w), "")
+        self.assertEqual(core.DataType.FP64, w.dtype)
+        self.assertEqual((784, 100), w.shape)
+        self.assertEqual("fc.w", w.name)
+        self.assertEqual(0, w.lod_level)
+
+        w = b.create_var(name='fc.w')
+        self.assertEqual(core.DataType.FP64, w.dtype)
+        self.assertEqual((784, 100), w.shape)
+        self.assertEqual("fc.w", w.name)
+        self.assertEqual(0, w.lod_level)
+
+        self.assertRaises(ValueError,
+                          lambda: b.create_var(name="fc.w", shape=(24, 100)))
+
+    def test_step_scopes(self):
+        prog = Program()
+        b = prog.current_block()
+        var = b.create_var(
+            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
+        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_warpctc_op.py b/python/paddle/v2/fluid/tests/test_warpctc_op.py
new file mode 100644
index 0000000000..55d1c73262
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_warpctc_op.py
@@ -0,0 +1,232 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import unittest
+import numpy as np
+from op_test import OpTest
+from test_softmax_op import stable_softmax
+
+CUDA_BLOCK_SIZE = 512
+
+
+class CTCForward(object):
+    def __init__(self, softmax, softmax_lod, labels, labels_lod, blank,
+                 norm_by_times):
+        self.softmax = softmax
+        self.softmax_lod = softmax_lod
+        assert labels.shape[1] == 1
+        self.labels = labels
+        self.labels_lod = labels_lod
+        self.blank = blank
+        self.norm_by_times = norm_by_times
+
+        self.level = 0
+        self.num_classes = softmax.shape[1]
+        self.batch_size = len(softmax_lod[self.level]) - 1
+        assert self.batch_size == len(labels_lod[self.level]) - 1
+
+        self.loss = np.zeros([self.batch_size, 1], dtype="float32")
+        self.gradient = np.zeros(self.softmax.shape, dtype="float32")
+
+        # float64
+        self.EXP_MAX = sys.float_info.max
+        self.EXP_MIN = sys.float_info.min
+        self.LOG_ZERO = np.log(self.EXP_MIN)
+        self.LOG_INFINITY = np.log(self.EXP_MAX)
+
+    def safe_exp(self, x):
+        if x <= self.LOG_ZERO:
+            return 0.0
+        if x >= self.LOG_INFINITY:
+            return self.EXP_MAX
+        return np.exp(x)
+
+    def safe_log(self, x):
+        if x <= self.EXP_MIN:
+            return self.LOG_ZERO
+        return np.log(x)
+
+    # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb
+    def log_div(self, x, y):
+        res = x - y
+        if res <= self.LOG_ZERO:
+            return self.LOG_ZERO
+        if res >= self.LOG_INFINITY:
+            return self.LOG_INFINITY
+        return res
+
+    # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb
+    def log_mul(self, x, y):
+        res = x + y
+        if res <= self.LOG_ZERO:
+            return self.LOG_ZERO
+        if res >= self.LOG_INFINITY:
+            return self.LOG_INFINITY
+        return res
+
+    # x = lna and y = lnb are in log scale,
+    # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a
+    def log_add(self, x, y):
+        if x < y:
+            t = y
+            y = x
+            x = t
+        return x + self.safe_log(1 + self.safe_exp(y - x))
+
+    def segment_range(self, time, total_times, total_segments):
+        start = max(0, total_segments - (2 * (total_times - time)))
+        end = min(total_segments, 2 * (time + 1))
+        return start, end
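+
+    # segment_range implements the usual CTC pruning: at time step `time`
+    # only the label segments that are reachable from the start and can
+    # still reach the end of the extended label sequence get updated.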
+
+    def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence):
+        total_times = softmax_a_sequence.shape[0]
+        total_segments = labels_a_sequence.shape[0] * 2 + 1
+
+        required_times = labels_a_sequence.shape[0]
+        old_label = -1
+        for i in range(labels_a_sequence.shape[0]):
+            # two contiguous labels with the same value
+            if labels_a_sequence[i, 0] == old_label:
+                required_times = required_times + 1
+            old_label = labels_a_sequence[i, 0]
+
+        if total_times < required_times:
+            return 0
+
+        # calculate the forward variables; see Chapter 7.3 of Alex Graves,
+        # "Supervised Sequence Labelling with Recurrent Neural Networks"
+        log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
+        for i in range(total_times):
+            for j in range(self.num_classes):
+                log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])
+
+        # calculate the forward variables
+        forward_vars = np.zeros([total_times, total_segments], dtype="float32")
+        for i in range(total_times):
+            for j in range(total_segments):
+                forward_vars[i, j] = self.LOG_ZERO
+
+        for i in range(total_times):
+            # dp initialization at t0
+            if i == 0:
+                forward_vars[i, 0] = log_acts[0, self.blank]
+                if total_segments > 1:
+                    forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]]
+                continue
+
+            # dp from t1
+            start, end = self.segment_range(i, total_times, total_segments)
+            for k in range(end - start):
+                j = k + start
+                if j & 1 == 1:
+                    label_idx = j / 2
+                    label_val = labels_a_sequence[label_idx, 0]
+                    fv = self.log_add(forward_vars[i - 1, j],
+                                      forward_vars[i - 1, j - 1])
+                    if j > 1 and label_val != labels_a_sequence[label_idx - 1,
+                                                                0]:
+                        fv = self.log_add(fv, forward_vars[i - 1, j - 2])
+                    fv = self.log_mul(fv, log_acts[i, label_val])
+                else:
+                    fv = forward_vars[i - 1, j]
+                    if j > 0:
+                        fv = self.log_add(fv, forward_vars[i - 1, j - 1])
+                    fv = self.log_mul(fv, log_acts[i, self.blank])
+                forward_vars[i, j] = fv
+
+        # sum the last two values as log_prob
+        log_prob = forward_vars[total_times - 1, total_segments - 1]
+        if total_segments > 1:
+            log_prob = self.log_add(
+                log_prob, forward_vars[total_times - 1, total_segments - 2])
+
+        return -log_prob
+
+    def forward(self):
+        for i in range(self.batch_size):
+            softmax_start_i = self.softmax_lod[self.level][i]
+            softmax_end_i = self.softmax_lod[self.level][i + 1]
+            labels_start_i = self.labels_lod[self.level][i]
+            labels_end_i = self.labels_lod[self.level][i + 1]
+
+            softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
+            labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
+            self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
+                                                   labels_a_sequence)
+        return self.loss
+
+
+class TestWarpCTCOp(OpTest):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = 8
+        self.logits_lod = [[0, 4, 5, 8, 11]]
+        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.blank = self.num_classes - 1
+        self.norm_by_times = False
+
+    def setUp(self):
+        self.op_type = "warpctc"
+        self.config()
+
+        logits = np.random.uniform(
+            0.1, 1.0,
+            [self.logits_lod[0][-1], self.num_classes]).astype("float32")
+        softmax = np.apply_along_axis(stable_softmax, 1, logits)
+        # labels should not be blank
+        labels = np.random.randint(
+            0, self.num_classes - 1, [self.labels_lod[0][-1], 1], dtype="int32")
+
+        ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod,
+                         self.blank, self.norm_by_times)
+        loss = ctc.forward()
+
+        max_sequence_length = 0
+        for i in range(self.batch_size):
+            max_sequence_length = max(
+                max_sequence_length,
+                self.logits_lod[0][i + 1] - self.logits_lod[0][i])
+        self.gradient = np.zeros(
+            [max_sequence_length, self.batch_size, self.num_classes],
+            dtype="float32")
+
+        self.inputs = {
+            "Logits": (logits, self.logits_lod),
+            "Label": (labels, self.labels_lod)
+        }
+        self.outputs = {"Loss": loss}
+        self.attrs = {"blank": self.blank, "norm_by_times": self.norm_by_times}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.outputs['WarpCTCGrad'] = self.gradient
+        self.check_grad(["Logits"], "Loss", max_relative_error=0.007)
+
+
+class TestWarpCTCOpCase1(TestWarpCTCOp):
+    def config(self):
+        self.batch_size = 4
+        self.num_classes = CUDA_BLOCK_SIZE + 2
+        self.logits_lod = [[0, 4, 5, 8, 11]]
+        self.labels_lod = [[0, 3, 4, 8, 12]]
+        self.blank = 0
+        self.norm_by_times = False
+
+
+if __name__ == "__main__":
+    unittest.main()
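
For reference, the numerical trick behind `log_add` above is the standard log-sum-exp identity; a minimal standalone sketch (assuming only NumPy, with `np.log1p` in place of the `safe_log`/`safe_exp` guards):

```python
import numpy as np

def log_add(x, y):
    # ln(a + b) = ln(a) + ln(1 + exp(ln(b) - ln(a))); anchoring on the
    # larger of the two log values keeps exp() from over/underflowing
    if x < y:
        x, y = y, x
    return x + np.log1p(np.exp(y - x))

x, y = -1000.0, -1001.0                  # two log-probabilities
print(log_add(x, y))                     # ~ -999.6867
print(np.log(np.exp(x) + np.exp(y)))     # -inf: the naive form underflows
```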
diff --git a/python/paddle/v2/fluid/tests/test_weight_normalization.py b/python/paddle/v2/fluid/tests/test_weight_normalization.py
new file mode 100644
index 0000000000..80ad8285d8
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_weight_normalization.py
@@ -0,0 +1,121 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy
+import collections
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.initializer import ConstantInitializer
+from paddle.v2.fluid.param_attr import WeightNormParamAttr
+
+
+class TestWeightNormalization(unittest.TestCase):
+    batch_size = 3
+    hidden_size = 5
+    data_desc = (['x', [10], 0], )
+
+    @classmethod
+    def setUpClass(cls):
+        cls.set_program()
+
+    @classmethod
+    def set_program(cls):
+        data = fluid.layers.data(
+            name=cls.data_desc[0][0], shape=cls.data_desc[0][1])
+        out = fluid.layers.fc(input=data,
+                              size=cls.hidden_size,
+                              param_attr=WeightNormParamAttr(
+                                  dim=None,
+                                  name='weight_norm_param',
+                                  initializer=ConstantInitializer(1.0)),
+                              bias_attr=False,
+                              act=None)
+        loss = fluid.layers.reduce_sum(out)
+        fluid.backward.append_backward(loss=loss)
+        cls.fetch_list = [
+            'weight_norm_param_g', 'weight_norm_param_v',
+            'weight_norm_param_g@GRAD'
+        ]
+
+    def run_program(self):
+        outputs = []
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+            exe.run(fluid.default_startup_program())
+            output = exe.run(fluid.default_main_program(),
+                             feed=self.inputs,
+                             fetch_list=self.fetch_list,
+                             return_numpy=False)
+            outputs.append(output)
+        self.actual_outputs = outputs
+
+    def set_data(self):
+        self.data = collections.OrderedDict()
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_shape = desc[1]
+            data_lod_level = desc[2]
+            data_lod = []
+            for i in range(data_lod_level):
+                lod_level_i = numpy.random.randint(
+                    low=1,
+                    high=5,
+                    size=self.batch_size if i == 0 else lod_level_i[-1])
+                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                data_lod.append(lod_level_i)
+            data_value = numpy.random.random(
+                size=[data_lod[-1][-1] if data_lod else self.batch_size
+                      ] + data_shape).astype('float32')
+            self.data[data_name] = (data_value, data_lod)
+
+    def set_inputs(self, place):
+        self.inputs = {}
+        for desc in self.data_desc:
+            tensor = fluid.Tensor()
+            tensor.set(self.data[desc[0]][0], place)
+            if self.data[desc[0]][1]:
+                tensor.set_lod(self.data[desc[0]][1])
+            self.inputs[desc[0]] = tensor
+
+    def weight_normalize(self):
+        v = numpy.ones((self.data[self.data_desc[0][0]][0].shape[-1],
+                        self.hidden_size))
+        g = numpy.linalg.norm(v, axis=None, keepdims=True)
+        w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)
+        x = self.data[self.data_desc[0][0]][0]
+        out = numpy.dot(x, w)
+        g_grad = (numpy.dot(x.T, numpy.ones_like(out)) * (v / numpy.linalg.norm(
+            v, axis=None, keepdims=True))).sum(axis=None, keepdims=True)
+        return g, v, g_grad
+
+    def test_weight_normalization(self):
+        self.set_data()
+        self.run_program()
+        expect_output = self.weight_normalize()
+        for actual_output in self.actual_outputs:
+            [
+                self.assertTrue(
+                    numpy.allclose(
+                        numpy.array(actual), expect, atol=0.001))
+                for expect, actual in zip(expect_output, actual_output)
+            ]
+
+
+if __name__ == '__main__':
+    unittest.main()
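
The reference computation in `weight_normalize` follows the reparameterization w = g * v / ||v||; a minimal NumPy sketch of that identity (shapes chosen to match the test, with g initialized to ||v|| so w equals v at the start):

```python
import numpy

v = numpy.ones((10, 5), dtype='float32')            # direction parameter
g = numpy.linalg.norm(v, axis=None, keepdims=True)  # scale parameter, ||v||
w = g * v / numpy.linalg.norm(v, axis=None, keepdims=True)

x = numpy.random.random((3, 10)).astype('float32')
out = numpy.dot(x, w)

# gradient of sum(out) w.r.t. g, by the chain rule through w
g_grad = (numpy.dot(x.T, numpy.ones_like(out)) *
          (v / numpy.linalg.norm(v, axis=None, keepdims=True))).sum(
              axis=None, keepdims=True)
print(g_grad.shape)  # (1, 1)
```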
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
new file mode 100644
index 0000000000..9f5e1b668c
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -0,0 +1,80 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import paddle.v2.fluid.layers as layers
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+from paddle.v2.fluid.backward import append_backward
+import numpy
+
+
+class TestWhileOp(unittest.TestCase):
+    def test_simple_forward(self):
+        d0 = layers.data(
+            "d0", shape=[10], append_batch_size=False, dtype='float32')
+        d1 = layers.data(
+            "d1", shape=[10], append_batch_size=False, dtype='float32')
+        d2 = layers.data(
+            "d2", shape=[10], append_batch_size=False, dtype='float32')
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+        init = layers.zeros(shape=[10], dtype='float32')
+        mem_array = layers.array_write(x=init, i=i)
+        data_array = layers.array_write(x=d0, i=i)
+
+        i = layers.increment(i)
+        layers.array_write(d1, i, array=data_array)
+
+        i = layers.increment(i)
+        layers.array_write(d2, i, array=data_array)
+
+        i = layers.zeros(shape=[1], dtype='int64')
+        i.stop_gradient = True
+
+        array_len = layers.fill_constant(shape=[1], dtype='int64', value=3)
+        array_len.stop_gradient = True
+        cond = layers.less_than(x=i, y=array_len)
+
+        while_op = layers.While(cond=cond)
+        with while_op.block():
+            d = layers.array_read(array=data_array, i=i)
+            prev = layers.array_read(array=mem_array, i=i)
+            result = layers.sums(input=[d, prev])
+
+            i = layers.increment(x=i, in_place=True)
+            layers.array_write(result, i=i, array=mem_array)
+            layers.less_than(x=i, y=array_len, cond=cond)
+
+        sum_result = layers.array_read(array=mem_array, i=i)
+        loss = layers.mean(x=sum_result)
+
+        append_backward(loss)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+        d = []
+
+        for i in xrange(3):
+            d.append(numpy.random.random(size=[10]).astype('float32'))
+
+        outs = exe.run(feed={'d0': d[0],
+                             'd1': d[1],
+                             'd2': d[2]},
+                       fetch_list=[sum_result])
+        self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
+
+
+if __name__ == '__main__':
+    unittest.main()
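
What the while loop in `test_simple_forward` computes can be checked directly: each iteration reads `d[i]`, adds the previous memory, and writes the result back, so the final read is the running sum. A plain NumPy sketch of the expected value:

```python
import numpy

d = [numpy.random.random(10).astype('float32') for _ in range(3)]
mem = numpy.zeros(10, dtype='float32')
for x in d:                     # mem[i] = mem[i-1] + d[i]
    mem = mem + x
print(numpy.allclose(mem, d[0] + d[1] + d[2]))  # True
```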
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
index 0d648e9ae6..e5000e440c 100644
--- a/python/paddle/v2/image.py
+++ b/python/paddle/v2/image.py
@@ -1,33 +1,48 @@
-import numpy as np
-try:
-    import cv2
-except ImportError:
-    cv2 = None
-import os
-import tarfile
-import cPickle
-
-__all__ = [
-    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
-    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
-    "batch_images_from_tar"
-]
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 This file contains some common interfaces for image preprocess.
 Many users are confused about the image layout. We introduce
 the image layout as follows.
 
 - CHW Layout
+
   - The abbreviations: C=channel, H=Height, W=Width
   - The default layout of image opened by cv2 or PIL is HWC.
     PaddlePaddle only supports the CHW layout. And CHW is simply
     a transpose of HWC. It must transpose the input image.
 
 - Color format: RGB or BGR
+
   OpenCV use BGR color format. PIL use RGB color format. Both
   formats can be used for training. Noted that, the format should
   be keep consistent between the training and inference peroid.
 """
+import numpy as np
+try:
+    import cv2
+except ImportError:
+    cv2 = None
+import os
+import tarfile
+import cPickle
+
+__all__ = [
+    "load_image_bytes", "load_image", "resize_short", "to_chw", "center_crop",
+    "random_crop", "left_right_flip", "simple_transform", "load_and_transform",
+    "batch_images_from_tar"
+]
 
 
 def batch_images_from_tar(data_file,
@@ -36,17 +51,18 @@ def batch_images_from_tar(data_file,
                           num_per_batch=1024):
     """
     Read images from tar file and batch them into batch file.
-    param data_file: path of image tar file
-    type data_file: string
-    param dataset_name: 'train','test' or 'valid'
-    type dataset_name: string
-    param img2label: a dic with image file name as key 
+
+    :param data_file: path of image tar file
+    :type data_file: string
+    :param dataset_name: 'train','test' or 'valid'
+    :type dataset_name: string
+    :param img2label: a dict with image file name as key
                     and image's label as value
-    type img2label: dic
-    param num_per_batch: image number per batch file
-    type num_per_batch: int
-    return: path of list file containing paths of batch file
-    rtype: string
+    :type img2label: dict
+    :param num_per_batch: image number per batch file
+    :type num_per_batch: int
+    :return: path of list file containing paths of batch file
+    :rtype: string
     """
     batch_dir = data_file + "_batch"
     out_path = "%s/%s" % (batch_dir, dataset_name)
@@ -99,14 +115,16 @@ def load_image_bytes(bytes, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         with open('cat.jpg') as f:
             im = load_image_bytes(f.read())
 
     :param bytes: the input image bytes array.
-    :type file: str
+    :type bytes: str
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     flag = 1 if is_color else 0
     file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
@@ -121,6 +139,7 @@ def load_image(file, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
 
     :param file: the input image path.
@@ -128,6 +147,7 @@ def load_image(file, is_color=True):
     :param is_color: If set is_color True, it will load and
                      return a color image. Otherwise, it will
                      load and return a gray image.
+    :type is_color: bool
     """
     # cv2.IMAGE_COLOR for OpenCV3
     # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
@@ -147,6 +167,7 @@ def resize_short(im, size):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
     
@@ -155,7 +176,6 @@ def resize_short(im, size):
     :param size: the shorter edge size of image after resizing.
     :type size: int
     """
-    assert im.shape[-1] == 1 or im.shape[-1] == 3
     h, w = im.shape[:2]
     h_new, w_new = size, size
     if h > w:
@@ -175,6 +195,7 @@ def to_chw(im, order=(2, 0, 1)):
     Example usage:
     
     .. code-block:: python
+
         im = load_image('cat.jpg')
         im = resize_short(im, 256)
         im = to_chw(im)
@@ -196,6 +217,7 @@ def center_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = center_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -223,6 +245,7 @@ def random_crop(im, size, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = random_crop(im, 224)
     
     :param im: the input image with HWC layout.
@@ -243,7 +266,7 @@ def random_crop(im, size, is_color=True):
     return im
 
 
-def left_right_flip(im):
+def left_right_flip(im, is_color=True):
     """
     Flip an image along the horizontal direction.
     Return the flipped image.
@@ -251,18 +274,26 @@ def left_right_flip(im):
     Example usage:
     
     .. code-block:: python
+
         im = left_right_flip(im)
     
-    :paam im: input image with HWC layout
+    :param im: input image with HWC layout or HW layout for gray image
     :type im: ndarray
+    :param is_color: whether input image is color or not
+    :type is_color: bool
     """
-    if len(im.shape) == 3:
+    if len(im.shape) == 3 and is_color:
         return im[:, ::-1, :]
     else:
-        return im[:, ::-1, :]
+        return im[:, ::-1]
 
 
-def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
+def simple_transform(im,
+                     resize_size,
+                     crop_size,
+                     is_train,
+                     is_color=True,
+                     mean=None):
     """
     Simply data argumentation for training. These operations include
     resizing, croping and flipping.
@@ -270,6 +301,7 @@ def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
     Example usage:
     
     .. code-block:: python
+
         im = simple_transform(im, 256, 224, True)
 
     :param im: The input image with HWC layout.
@@ -280,15 +312,35 @@ def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
     im = resize_short(im, resize_size)
     if is_train:
-        im = random_crop(im, crop_size)
+        im = random_crop(im, crop_size, is_color=is_color)
         if np.random.randint(2) == 0:
-            im = left_right_flip(im)
+            im = left_right_flip(im, is_color)
     else:
-        im = center_crop(im, crop_size)
-    im = to_chw(im)
+        im = center_crop(im, crop_size, is_color=is_color)
+    if len(im.shape) == 3:
+        im = to_chw(im)
+
+    im = im.astype('float32')
+    if mean is not None:
+        mean = np.array(mean, dtype=np.float32)
+        # mean may be one value per channel or an element-wise array
+        if mean.ndim == 1 and is_color:
+            # reshape to (C, 1, 1) so it broadcasts over H and W
+            mean = mean[:, np.newaxis, np.newaxis]
+        elif mean.ndim != 1:
+            # element-wise mean must match the image shape
+            assert mean.shape == im.shape
+        im -= mean
 
     return im
 
@@ -297,7 +349,8 @@ def load_and_transform(filename,
                        resize_size,
                        crop_size,
                        is_train,
-                       is_color=True):
+                       is_color=True,
+                       mean=None):
     """
     Load image from the input file `filename` and transform image for
     data argumentation. Please refer to the `simple_transform` interface
@@ -306,6 +359,7 @@ def load_and_transform(filename,
     Example usage:
     
     .. code-block:: python
+
         im = load_and_transform('cat.jpg', 256, 224, True)
 
     :param filename: The file name of input image.
@@ -316,7 +370,12 @@ def load_and_transform(filename,
     :type crop_size: int
     :param is_train: Whether it is training or not.
     :type is_train: bool
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :param mean: the mean values, which can be element-wise mean values or 
+                 mean values per channel.
+    :type mean: numpy array | list
     """
-    im = load_image(filename)
-    im = simple_transform(im, resize_size, crop_size, is_train, is_color)
+    im = load_image(filename, is_color)
+    im = simple_transform(im, resize_size, crop_size, is_train, is_color, mean)
     return im
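
The per-channel branch added to `simple_transform` relies on NumPy broadcasting; a short sketch of the reshape it performs (hypothetical BGR mean values):

```python
import numpy as np

im = np.random.random((3, 224, 224)).astype('float32')    # CHW layout
mean = np.array([104.0, 117.0, 124.0], dtype=np.float32)  # one value per channel
im -= mean[:, np.newaxis, np.newaxis]   # (3,) -> (3, 1, 1), broadcasts over H, W
```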
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 34b7308601..78bf9807da 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -1,9 +1,22 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy
-import py_paddle.swig_paddle as api
 import collections
 import topology
 import minibatch
-from data_feeder import DataFeeder
+import cPickle
 
 __all__ = ['infer', 'Inference']
 
@@ -27,19 +40,39 @@ class Inference(object):
     :type parameters: paddle.v2.parameters.Parameters
     """
 
-    def __init__(self, output_layer, parameters):
-        topo = topology.Topology(output_layer)
-        gm = api.GradientMachine.createFromConfigProto(
-            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+    def __init__(self, parameters, output_layer=None, fileobj=None):
+        import py_paddle.swig_paddle as api
+
+        if output_layer is not None:
+            topo = topology.Topology(output_layer)
+            gm = api.GradientMachine.createFromConfigProto(
+                topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+            self.__data_types__ = topo.data_type()
+        elif fileobj is not None:
+            tmp = cPickle.load(fileobj)
+            gm = api.GradientMachine.createByConfigProtoStr(
+                tmp['protobin'], api.CREATE_MODE_TESTING,
+                [api.PARAMETER_VALUE])
+            self.__data_types__ = tmp['data_type']
+        else:
+            raise ValueError("Either output_layer or fileobj must be set")
+
         for param in gm.getParameters():
             val = param.getBuf(api.PARAMETER_VALUE)
             name = param.getName()
             assert isinstance(val, api.Vector)
             val.copyFromNumpyArray(parameters.get(name).flatten())
+            # setValueUpdated is called by the randomize, zeroMem and
+            # load functions in paddle/parameter/Parameter.cpp. In
+            # inference mode it is never called, which means the parameter
+            # would not be dispatched in MultiGradientMachine for
+            # multi-GPU. So setValueUpdated is called here, though it
+            # would be better to call this function in one place.
+            param.setValueUpdated()
         self.__gradient_machine__ = gm
-        self.__data_types__ = topo.data_type()
 
     def iter_infer(self, input, feeding=None):
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         batch_size = len(input)
 
@@ -63,7 +96,7 @@ class Inference(object):
                 item = [each_result[each_field] for each_field in field]
                 yield item
 
-    def infer(self, input, field='value', **kwargs):
+    def infer(self, input, field='value', flatten_result=True, **kwargs):
         """
         Infer a data by model.
         :param input: input data batch. Should be python iterable object.
@@ -76,7 +109,13 @@ class Inference(object):
                 retv = [[] for i in xrange(len(result))]
             for i, item in enumerate(result):
                 retv[i].append(item)
-        retv = [numpy.concatenate(out) for out in retv]
+
+        if retv is None:
+            return []
+
+        if flatten_result:
+            retv = [numpy.concatenate(out) for out in retv]
+
         if len(retv) == 1:
             return retv[0]
         else:
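
The `flatten_result` flag added to `infer` controls whether per-batch results are concatenated into one array or returned as a list; a small sketch of the difference (toy shapes):

```python
import numpy

retv = [[numpy.ones((2, 4)), numpy.ones((3, 4))]]  # one field, two batches
flat = [numpy.concatenate(out) for out in retv]    # flatten_result=True
print(flat[0].shape)   # (5, 4): batches merged along axis 0
print(len(retv[0]))    # 2: kept separate when flatten_result=False
```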
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 815635f5dd..6a2bb8d337 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """
 `paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
-we want to make Paddle a plain Python package. The model config package defined
+we want to make Paddle a plain Python package. The model config package defines
 the way how to configure a neural network topology in Paddle Python code.
 
 The primary usage shows below.
@@ -30,7 +30,6 @@ The primary usage shows below.
     # use prediction instance where needed.
     parameters = paddle.parameters.create(cost)
 """
-
 import collections
 import copy
 import re
@@ -44,16 +43,19 @@ __all__ = ['data', 'parse_network']
 
 
 def __need_to_keep__(name):
-    if name in ['StaticInput', 'LayerType', 'layer_support']:
-        return False
-    return True
+    return name in [
+        'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
+        'layer_support', 'BaseGeneratedInput'
+    ]
 
 
 def __need_to_wrap__(name):
-    return name not in ['AggregateLevel', 'ExpandLevel']
+    return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput']
 
 
 def __convert_name__(inname):
+    if __need_to_keep__(inname):
+        return inname
     if inname == 'maxid_layer':
         return 'max_id'
     elif inname.endswith('memory') or inname.endswith(
@@ -74,8 +76,6 @@ def __convert_name__(inname):
 
 for name in v1_layers.__all__:
     obj = getattr(v1_layers, name)
-    if not __need_to_keep__(name):
-        continue
     new_name = __convert_name__(name)
     if callable(obj) and __need_to_wrap__(name):
         globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
@@ -107,7 +107,7 @@ __data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
 data = __convert_to_v2__(__data_layer__, 'name', __name__)
 
 
-def __get_used_layers__(output_layers, extra_layers=None):
+def __get_used_layers__(output_layers):
     layer_names = set()
     parents = {}
 
@@ -132,6 +132,13 @@ def __get_used_layers__(output_layers, extra_layers=None):
                     add_parent(mem.layer_name, mem.boot_layer_name)
                 add_parent(mem.link_name, mem.layer_name)
 
+            if sub_model.HasField('generator'):
+                # according to the implementation of text generation
+                # in recurrent layer group, the generated word must be
+                # the first out link
+                add_parent(sub_model.out_links[0].layer_name,
+                           sub_model.generator.eos_layer_name)
+
     def dfs_travel(layer_name):
         if layer_name in layer_names:
             return
@@ -192,6 +199,15 @@ def __get_used_submodels__(layer_names):
     return submodel_names
 
 
+def __get_submodel_data_out_links__():
+    data_links = set()
+    for submodel in cp.g_config.model_config.sub_models:
+        for link in submodel.out_links:
+            if cp.g_layer_map[link.link_name].type == 'data':
+                data_links.add(link.link_name)
+    return data_links
+
+
 def __get_used_evaluators__(layer_names):
     evaluator_names = set()
     for e in cp.g_config.model_config.evaluators:
@@ -247,33 +263,45 @@ def __trim_submodel__(old_submodel, layer_names, input_layer_names,
 def parse_network(output_layers, extra_layers=None):
     if not isinstance(output_layers, collections.Sequence):
         output_layers = [output_layers]
-    if extra_layers is not None and not isinstance(extra_layers,
-                                                   collections.Sequence):
-        extra_layers = [extra_layers]
+    if extra_layers is not None:
+        if not isinstance(extra_layers, collections.Sequence):
+            extra_layers = [extra_layers]
     else:
         extra_layers = []
 
-    layer_names = __get_used_layers__(output_layers + extra_layers)
+    layer_names = __get_used_layers__(list(output_layers) + list(extra_layers))
     submodel_names = __get_used_submodels__(layer_names)
     submodel_names.add('root')
     evaluator_names = __get_used_evaluators__(layer_names)
+    data_out_links = __get_submodel_data_out_links__()
     input_layer_names = set()
     output_layer_names = set()
 
     model_config = ModelConfig()
     model_config.type = cp.g_config.model_config.type
+
+    for layer in output_layers:
+        model_config.output_layer_names.append(layer.full_name)
+        output_layer_names.add(layer.full_name)
+
     for l in cp.g_config.model_config.layers:
         if l.name not in layer_names:
             continue
         model_config.layers.extend([l])
         if l.type == 'data':
+            if l.name in data_out_links:
+                """
+                In text generation, the outlink that saves the generated word
+                indices is a data_layer defined in recurrent_group. Such a
+                data_layer is certainly an output of the network in a text
+                generation task, so this statement excludes this special
+                data_layer from the inputs of the network; otherwise an error
+                will occur during data feeding.
+                """
+                continue
             model_config.input_layer_names.append(l.name)
             input_layer_names.add(l.name)
 
-    for layer in output_layers:
-        model_config.output_layer_names.append(layer.full_name)
-        output_layer_names.add(layer.full_name)
-
     for e in cp.g_config.model_config.evaluators:
         if e.name in evaluator_names:
             model_config.evaluators.extend([e])
@@ -296,6 +324,3 @@ def parse_network(output_layers, extra_layers=None):
 
 def get_layer(name):
     return config_base.__layer_map__.get(name)
-
-
-cp.begin_parse()
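
The pruning in `__get_used_layers__` is a depth-first walk over a child-to-parents map starting from the output layers; a toy standalone version with hypothetical layer names:

```python
parents = {'cost': ['fc'], 'fc': ['data'], 'unused': ['data']}

def dfs_travel(layer_name, parents, layer_names):
    # collect every layer reachable from layer_name through its parents
    if layer_name in layer_names:
        return
    layer_names.add(layer_name)
    for parent in parents.get(layer_name, []):
        dfs_travel(parent, parents, layer_names)

used = set()
dfs_travel('cost', parents, used)
print(sorted(used))  # ['cost', 'data', 'fc'] -- 'unused' is pruned away
```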
diff --git a/python/paddle/v2/master/.gitignore b/python/paddle/v2/master/.gitignore
new file mode 100644
index 0000000000..a3ac6e1a33
--- /dev/null
+++ b/python/paddle/v2/master/.gitignore
@@ -0,0 +1,3 @@
+*.whl
+*.so
+*.pyc
diff --git a/python/paddle/v2/master/__init__.py b/python/paddle/v2/master/__init__.py
new file mode 100644
index 0000000000..494e4baf20
--- /dev/null
+++ b/python/paddle/v2/master/__init__.py
@@ -0,0 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from client import *
+
+__all__ = ['client']
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
new file mode 100644
index 0000000000..b3c790e39d
--- /dev/null
+++ b/python/paddle/v2/master/client.py
@@ -0,0 +1,95 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import ctypes
+import os
+
+__lib__ = None
+
+
+def get_c_lib():
+    global __lib__
+    if __lib__ is None:
+        path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
+        __lib__ = ctypes.cdll.LoadLibrary(path)
+    return __lib__
+
+
+class client(object):
+    """
+    client is a client to the master server.
+    """
+
+    def __init__(self, etcd_endpoints, timeout_sec, buf_size=0):
+        self.c = get_c_lib().paddle_new_etcd_master_client(
+            etcd_endpoints, timeout_sec, buf_size)
+
+    def request_save_model(self, trainer_id, block_ms):
+        """request to save model
+
+        Conventionally the 0-th trainer saves the model. But in
+        distributed training, any trainer could be killed. This
+        function asks the master server whether the trainer should
+        proceed with saving the model.
+
+        :param trainer_id: trainer id.
+        :param block_ms: number of milliseconds for which other save-model
+        requests will be blocked if this request succeeds.
+
+        Returns:
+            int: 1 if the save-model request is approved, 0 if the
+            request is rejected because another trainer is saving the
+            model, -1 if an error happened.
+
+        """
+        return get_c_lib().paddle_request_save_model(self.c, trainer_id,
+                                                     block_ms)
+
+    def release(self):
+        get_c_lib().paddle_release_master_client(self.c)
+        self.c = None
+
+    def set_dataset(self, paths):
+        holder_type = ctypes.c_char_p * len(paths)
+        holder = holder_type()
+        for idx, path in enumerate(paths):
+            c_ptr = ctypes.c_char_p(path)
+            holder[idx] = c_ptr
+        get_c_lib().paddle_set_dataset(self.c, holder, len(paths))
+
+    def next_record(self):
+        """gets next record for training
+
+        Returns:
+            string: the record.
+            int: error code, 0 if successful, < 0 otherwise.
+        """
+        p = ctypes.c_char_p()
+        ret = ctypes.pointer(p)
+        size = get_c_lib().paddle_next_record(self.c, ret)
+        if size < 0:
+            # Error
+            return None, size
+
+        if size == 0:
+            # Empty record
+            return "", 0
+
+        record = ret.contents.value[:size]
+        # Memory created from C should be freed.
+        get_c_lib().mem_free(ret.contents)
+        return record, 0
+
+    def paddle_start_get_records(self, pass_id):
+        get_c_lib().paddle_start_get_records(self.c, pass_id)
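
`client.set_dataset` marshals a Python list of paths into a C `char**` through a ctypes array type; the pattern in isolation (no master library needed, paths are hypothetical):

```python
import ctypes

paths = ["/data/part-00000", "/data/part-00001"]  # hypothetical shard paths
holder_type = ctypes.c_char_p * len(paths)        # array type: char *[2]
holder = holder_type()
for idx, path in enumerate(paths):
    holder[idx] = ctypes.c_char_p(path)
print(holder[0])  # the C side receives this as a char** of length len(paths)
```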
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 5e99d4a241..caef5f484e 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -1,12 +1,20 @@
-import py_paddle.swig_paddle as swig_api
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
-"""
-Optimizers(update equation) for SGD method.
-
-TODO(yuyang18): Complete comments.
-"""
+from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig
 
 __all__ = [
     'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
@@ -16,6 +24,7 @@ __all__ = [
 
 class Optimizer(object):
     def __init__(self, **kwargs):
+        import py_paddle.swig_paddle as swig_api
         if 'batch_size' in kwargs:
             del kwargs['batch_size']  # not important for python library.
 
@@ -34,18 +43,27 @@ class Optimizer(object):
         For each optimizer(SGD, Adam), GradientMachine should enable different
         buffers.
         """
+        import py_paddle.swig_paddle as swig_api
         tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
         assert isinstance(tmp, swig_api.ParameterOptimizer)
         return tmp.getParameterTypes()
 
     def __create_local_updater__(self):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
 
     def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createRemoteUpdater(
             self.__opt_conf__, pass_num, use_sparse_updater)
 
-    def create_updater(self, is_local, num_passes, use_sparse_updater):
+    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
+        import py_paddle.swig_paddle as swig_api
+        return swig_api.ParameterUpdater.createNewRemoteUpdater(
+            self.__opt_conf__, pserver_spec, use_etcd)
+
+    def create_updater(self, is_local, num_passes, use_sparse_updater,
+                       pserver_spec, use_etcd):
         """
         create proper parameter_updater by configuration.
         :param is_local: create local or remote parameter updater
@@ -59,44 +77,56 @@ class Optimizer(object):
             if use_sparse_remote_updater:
                         gradient_machine.prefetch(in_args)
                         parameter_updater.getParametersRemote()
+
+        :param pserver_spec: pserver location, e.g. localhost:3000; if etcd is
+        used, pserver_spec should be the etcd endpoints, e.g. http://localhost:2379
         :return: parameter_updater
         """
         if is_local:
             parameter_updater = self.__create_local_updater__()
         else:
-            parameter_updater = self.__create_remote_updater__(
-                num_passes, use_sparse_updater)
+            if pserver_spec is None:
+                parameter_updater = self.__create_remote_updater__(
+                    num_passes, use_sparse_updater)
+            else:
+                parameter_updater = self.__create_new_remote_updater__(
+                    pserver_spec, use_etcd)
         return parameter_updater
 
 
 class Momentum(Optimizer):
     """
-    SGD Optimizer.
+    Momentum Optimizer.
 
-    SGD is an optimization method, trying to find a neural network that
-    minimize the "cost/error" of it by iteration. In paddle's implementation
-    SGD Optimizer is synchronized, which means all gradients will be wait to
-    calculate and reduced into one gradient, then do optimize operation.
-
-    The neural network consider the learning problem of minimizing an objective
-    function, that has the form of a sum
+    When sparse=False, the momentum update formula is as follows:
 
     ..  math::
 
-        Q(w) = \\sum_{i}^{n} Q_i(w)
+        v_{t} &= k * v_{t-1} - \\gamma_t (g_{t} + \\lambda w_{t-1}) \\\\
+        w_{t} &= w_{t-1} + v_{t} \\\\
 
-    The value of function Q sometimes is the cost of neural network (Mean
-    Square Error between prediction and label for example). The function Q is
-    parametrised by w, the weight/bias of neural network. And weights is what to
-    be learned. The i is the i-th observation in (trainning) data.
+    where :math:`k` is the momentum factor, :math:`\\lambda` is the decay rate,
+    :math:`\\gamma_t` is the learning rate at the t'th iteration,
+    :math:`w_{t}` is the weight at the t'th iteration,
+    and :math:`v_{t}` is the history momentum variable.
 
-    So, the SGD method will optimize the weight by
+    When sparse=True, the update scheme is:
 
     ..  math::
 
-        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
-
-    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+        \\alpha_t &= \\alpha_{t-1} / k \\\\
+        \\beta_t &= \\beta_{t-1} / (1 + \\lambda \\gamma_t) \\\\
+        u_t &= u_{t-1} - \\alpha_t \\gamma_t g_t \\\\
+        v_t &= v_{t-1} + \\tau_{t-1} \\alpha_t \\gamma_t g_t \\\\
+        \\tau_t &= \\tau_{t-1} + \\beta_t / \\alpha_t
+    
+    where :math:`k` is momentum, :math:`\\lambda` is decay rate, 
+    :math:`\\gamma_t` is learning rate at the t'th iteration.
+
+    :param momentum: the momentum factor.
+    :type momentum: float
+    :param sparse: with sparse support or not, False by default.
+    :type sparse: bool
     """
 
     def __init__(self, momentum=None, sparse=False, **kwargs):
@@ -116,7 +146,7 @@ class Adam(Optimizer):
 
         m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
         v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
-        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+        w & = w - \\frac{\\eta m(w, t)}{\\sqrt{v(w,t) + \\epsilon}}
 
     :param beta1: the :math:`\\beta_1` in equation.
     :type beta1: float
@@ -255,6 +285,7 @@ ModelAverage = v1_optimizers.ModelAverage
 L2Regularization = v1_optimizers.L2Regularization
 
 if __name__ == '__main__':
+    import py_paddle.swig_paddle as swig_api
     swig_api.initPaddle('--use_gpu=false')
     for opt in [
             Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
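
One dense step of the Momentum update documented above, worked through in NumPy (hypothetical values for the momentum factor, learning rate and decay rate):

```python
import numpy as np

k, lr, decay = 0.9, 0.01, 1e-4     # momentum k, gamma_t, lambda
w = np.array([1.0, -2.0])          # w_{t-1}
v = np.zeros_like(w)               # v_{t-1}
g = np.array([0.5, 0.5])           # gradient g_t

v = k * v - lr * (g + decay * w)   # v_t = k*v_{t-1} - gamma_t*(g_t + lambda*w_{t-1})
w = w + v                          # w_t = w_{t-1} + v_t
print(w)                           # [ 0.994999 -2.004998]
```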
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 64805d0c50..7b7d1a1d16 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,6 +1,21 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
-import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+from collections import OrderedDict
+import paddle.trainer.config_parser as cp
 import struct
 import tarfile
 import cStringIO
@@ -18,16 +33,35 @@ def create(layers):
     """
     topology = Topology(layers)
     pool = Parameters()
+    initializers = cp.g_parameter_initializer_map
     for param in topology.proto().parameters:
         pool.__append_config__(param)
+        if param.name in initializers:
+            pool[param.name] = initializers[param.name](param.name)
     return pool
 
 
 class Parameters(object):
     """
-    Parameters is a dictionary contains Paddle's parameter. The key of
-    Parameters is the name of parameter. The value of Parameters is a plain
-    :code:`numpy.ndarry` .
+    `Parameters` manages all the learnable parameters in a neural network.
+    It stores parameters' information in an OrderedDict. The key is
+    the name of a parameter, and the value is the parameter's configuration (in
+    protobuf format), such as its initialization mean and std, its size, and
+    whether it is a static parameter.
+
+    :param __param_conf__: stores the configurations of the learnable
+        parameters in the network in an OrderedDict. Parameters are added one
+        by one into the dict following their creation order in the network:
+        parameters of the earlier layers in a network are created first. You
+        can visit the parameters from bottom to top by iterating over this
+        dict.
+    :type __param_conf__: OrderedDict
+    :param __gradient_machines__: all of the parameters in a neural network are
+        appended to a PaddlePaddle gradient machine, which is used internally to
+        copy parameter values between C++ and Python end.
+    :type __gradient_machines__: list
+    :param __tmp_params__: a dict to store dummy parameters if no
+        __gradient_machines__ is appended to `Parameters`.
+    :type __tmp_params__: dict
 
     Basically usage is
 
@@ -45,9 +79,9 @@ class Parameters(object):
     """
 
     def __init__(self):
-        self.__param_conf__ = dict()
+        self.__param_conf__ = OrderedDict()
         self.__gradient_machines__ = []
-        self.__tmp_params__ = []
+        self.__tmp_params__ = dict()
 
     def __append_config__(self, param_conf):
         """
@@ -67,6 +101,10 @@ class Parameters(object):
 
         self.__param_conf__[param_conf.name] = param_conf
 
+    def update_param_conf(self, model_config):
+        for p in model_config.parameters:
+            self.__param_conf__[p.name] = p
+
     def keys(self):
         """
         keys are the names of each parameter.
@@ -110,34 +148,23 @@ class Parameters(object):
         """
         return iter(self.__param_conf__)
 
-    def __getitem__(self, key):
-        """
-        Get parameter by parameter name. It uses Python dict syntax.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :return: parameter value
-        :rtype: np.ndarray
-        """
+    def __getter_inner(self, key, param_type):
+        import py_paddle.swig_paddle as api
         shape = self.get_shape(key)
 
         if len(self.__gradient_machines__) == 0:
             # create new parameter in python numpy.
-            if len(self.__tmp_params__) != 0:
-                ret_list = [
-                    mat for name, mat in self.__tmp_params__ if name == key
-                ]
-                if len(ret_list) == 1:
-                    return ret_list[0]
-            return np.ndarray(shape=shape, dtype=np.float32)
+            if key in self.__tmp_params__:
+                return self.__tmp_params__[key]
+            else:
+                return np.ndarray(shape=shape, dtype=np.float32)
         else:
             for each_gradient_machine in self.__gradient_machines__:
                 param = __get_parameter_in_gradient_machine__(
                     each_gradient_machine, key)
                 # for simplify implementation now, we always copy from C++
                 assert isinstance(param, api.Parameter)
-                val = param.getBuf(api.PARAMETER_VALUE)
+                val = param.getBuf(param_type)
                 assert isinstance(val, api.Vector)
                 val = val.copyToNumpyArray()
                 return val
@@ -145,6 +172,19 @@ class Parameters(object):
 
             raise RuntimeError("Unexpected branch")
 
+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        return self.__getter_inner(key, api.PARAMETER_VALUE)
+
     def get_shape(self, key):
         """
         get shape of the parameter.
@@ -183,7 +223,7 @@ class Parameters(object):
                              (shape, value.shape))
 
         if len(self.__gradient_machines__) == 0:
-            self.__tmp_params__.append((key, value))
+            self.__tmp_params__[key] = value
         else:
             for each_gradient_machine in self.__gradient_machines__:
                 __copy_parameter_to_gradient_machine__(each_gradient_machine,
@@ -201,6 +241,22 @@ class Parameters(object):
         """
         return self.__getitem__(key=parameter_name)
 
+    def get_grad(self, key):
+        """
+        Get gradient by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: parameter name
+        :type key: basestring
+        :return: The gradient matrix.
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        if self.__param_conf__[key].is_static:
+            return np.zeros(self.__param_conf__[key].size, dtype=np.float32)
+
+        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
+
     def set(self, parameter_name, value):
         """
         Set parameter by parameter name & matrix.
@@ -218,16 +274,16 @@ class Parameters(object):
         append gradient machine to parameters. This method is used internally in
         Trainer.train.
 
-        :param gradient_machine: Paddle C++ GradientMachine object.
+        :param gradient_machine: PaddlePaddle C++ GradientMachine object.
         :type gradient_machine: api.GradientMachine
         :return:
         """
-
+        import py_paddle.swig_paddle as api
         if not isinstance(gradient_machine, api.GradientMachine):
             raise ValueError("gradient_machine should be api.GradientMachine")
 
         if len(self.__tmp_params__) != 0:
-            for name, val in self.__tmp_params__:
+            for name, val in self.__tmp_params__.iteritems():
                 try:
                     __copy_parameter_to_gradient_machine__(gradient_machine,
                                                            name, val)
@@ -249,7 +305,13 @@ class Parameters(object):
         size = reduce(lambda a, b: a * b, param.shape)
         f.write(struct.pack("IIQ", 0, 4, size))
         param = param.astype(np.float32)
-        f.write(param.tostring())
+        s = param.tostring()
+        wrote_size = 0
+        buf = buffer(s, wrote_size, 65535)
+        while buf:  # f.write crashes with big data blob.
+            f.write(buf)
+            wrote_size += 65535
+            buf = buffer(s, wrote_size, 65535)
 
     def deserialize(self, name, f):
         """
@@ -264,6 +326,17 @@ class Parameters(object):
         self.set(name, arr.reshape(self.get_shape(name)))
 
     def to_tar(self, f):
+        """
+        Save parameters to a tar file.
+
+        WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)`
+            to save parameters most of the time. Otherwise, some settings such
+            as model average will not take effect.
+
+        :param f:
+        :type f: file
+        :return:
+        """
         tar = tarfile.TarFile(fileobj=f, mode='w')
         for nm in self.names():
             buf = cStringIO.StringIO()
@@ -283,6 +356,18 @@ class Parameters(object):
 
     @staticmethod
     def from_tar(f):
+        """
+        Create a `Parameters` object from the given file. The
+        `Parameters` object only contains the parameters in this
+        file. It assumes that the parameters in the given file
+        match those defined in the network. For example, it
+        can be used in inference.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :return: A Parameters object.
+        :rtype: Parameters.
+        """
         params = Parameters()
         tar = tarfile.TarFile(fileobj=f, mode='r')
         for finfo in tar:
@@ -298,6 +383,24 @@ class Parameters(object):
             params.deserialize(param_name, f)
         return params
 
+    def init_from_tar(self, f, exclude_params=[]):
+        """
+        Different from `from_tar`, this interface can be used to
+        init partial network parameters from another saved model.
+
+        :param f: the initialized model file.
+        :type f: tar file
+        :param exclude_params: the names of parameters that should  
+            not be initialized from the model file.
+        :type exclude_params: list of strings
+        :return: Nothing.
+        """
+
+        tar_param = Parameters.from_tar(f)
+        for pname in tar_param.names():
+            if pname in self.names() and pname not in exclude_params:
+                self.set(pname, tar_param.get(pname))
+
 
 def __get_parameter_in_gradient_machine__(gradient_machine, name):
     """
@@ -331,6 +434,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
     :return:
     :rtype: api.Parameter
     """
+    import py_paddle.swig_paddle as api
     param = __get_parameter_in_gradient_machine__(gradient_machine, name)
     vec = param.getBuf(api.PARAMETER_VALUE)
     assert isinstance(vec, api.Vector)
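
The chunked write in `serialize` leans on a Python 2 detail: `buffer(s, offset, size)` clamps at the end of the string, and an empty buffer is falsy, so the loop terminates by itself. The pattern in isolation (Python 2, as in the surrounding code):

```python
import cStringIO

s = "x" * 200000                      # pretend this is param.tostring()
f = cStringIO.StringIO()
wrote_size = 0
buf = buffer(s, wrote_size, 65535)
while buf:                            # empty buffer past the end is falsy
    f.write(buf)
    wrote_size += 65535
    buf = buffer(s, wrote_size, 65535)
assert f.getvalue() == s
```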
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
index 6f7bd039b0..c18e63dd5f 100644
--- a/python/paddle/v2/plot/plot.py
+++ b/python/paddle/v2/plot/plot.py
@@ -56,7 +56,7 @@ class Ploter(object):
         assert isinstance(data, PlotData)
         data.append(step, value)
 
-    def plot(self):
+    def plot(self, path=None):
         if self.__plot_is_disabled__():
             return
 
@@ -68,8 +68,11 @@ class Ploter(object):
                 titles.append(title)
                 self.plt.plot(data.step, data.value)
         self.plt.legend(titles, loc='upper left')
-        self.display.clear_output(wait=True)
-        self.display.display(self.plt.gcf())
+        if path is None:
+            self.display.clear_output(wait=True)
+            self.display.display(self.plt.gcf())
+        else:
+            self.plt.savefig(path)
         self.plt.gcf().clear()
 
     def reset(self):
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
index da5cd76488..4b6c1c8096 100644
--- a/python/paddle/v2/plot/tests/CMakeLists.txt
+++ b/python/paddle/v2/plot/tests/CMakeLists.txt
@@ -1,5 +1,5 @@
 if (NOT APPLE)
   # The Mac OS X backend will not be able to function correctly if Python is
   # not installed as a framework.
-  add_python_test(test_ploter test_ploter.py)
+  py_test(test_ploter SRCS test_ploter.py)
 endif()
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 07142056f8..421f6c933d 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Creator package contains some simple reader creator, which could be used in user
-program.
+Creator package contains some simple reader creators, which could
+be used in user programs.
 """
 
-__all__ = ['np_array', 'text_file']
+__all__ = ['np_array', 'text_file', 'cloud_reader']
 
 
 def np_array(x):
@@ -55,3 +55,76 @@ def text_file(path):
         f.close()
 
     return reader
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader from the given RecordIO file paths, separated
+        by ","; glob patterns are supported.
+    :param paths: path of recordio files, can be a string or a string list.
+    :returns: data reader of recordio files.
+    """
+
+    import recordio as rec
+    import paddle.v2.reader.decorator as dec
+    import cPickle as pickle
+
+    def reader():
+        if isinstance(paths, basestring):
+            path = paths
+        else:
+            path = ",".join(paths)
+        f = rec.reader(path)
+        while True:
+            r = f.read()
+            if r is None:
+                break
+            yield pickle.loads(r)
+        f.close()
+
+    return dec.buffered(reader, buf_size)
+
+
+# Module-level pass counter shared by all cloud readers in this process; it
+# is incremented each time a reader is iterated, so the master client can
+# serve records for the next pass.
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
+    """
+    Create a data reader that yields records one by one from
+        the given paths.
+    :paths: path of recordio files, can be a string or a list of strings.
+    :etcd_endpoints: the endpoints of the etcd cluster.
+    :returns: data reader of recordio files.
+
+    ..  code-block:: python
+
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
+        )
+    """
+    import os
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+
+    if isinstance(paths, basestring):
+        path = [paths]
+    else:
+        path = paths
+    c.set_dataset(path)
+
+    def reader():
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
+
+        while True:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
+                break
+            yield pickle.loads(r)
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
index c76faa596c..44a6e34463 100644
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,13 +14,16 @@
 
 __all__ = [
     'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader'
 ]
 
+from threading import Thread
+import subprocess
+
+from Queue import Queue
 import itertools
 import random
-from Queue import Queue
-from threading import Thread
+import zlib
 
 
 def map_readers(func, *readers):
@@ -166,12 +169,12 @@ def buffered(reader, size):
     The buffered data reader will read and save data entries into a
     buffer. Reading from the buffered data reader will proceed as long
     as the buffer is not empty.
-    
+
     :param reader: the data reader to read from.
     :type reader: callable
     :param size: max buffer size.
     :type size: int
-    
+
     :returns: the buffered data reader.
     """
 
@@ -230,7 +233,7 @@ class XmapEndSignal():
     pass
 
 
-def xmap_readers(mapper, reader, process_num, buffer_size):
+def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
     """
     Use multiprocess to map samples from reader by a mapper defined by user.
     And this function contains a buffered decorator.
@@ -238,16 +241,16 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
     :type mapper: callable
     :param reader: the data reader to read from
     :type reader: callable
-    :param process_num: process number to handle original sample 
+    :param process_num: process number to handle original sample
     :type process_num: int
     :param buffer_size: max buffer size
     :type buffer_size: int
+    :param order: whether to keep samples in the order produced by reader
+    :type order: bool
     :return: the decarated reader
     :rtype: callable
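+
+    A usage sketch (some_reader and consume below are placeholders):
+
+    .. code-block:: python
+
+        def double(sample):
+            return sample * 2
+
+        reader = xmap_readers(double, some_reader, process_num=4,
+                              buffer_size=16, order=True)
+        for sample in reader():
+            consume(sample)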
     """
     end = XmapEndSignal()
-    in_queue = Queue(buffer_size)
-    out_queue = Queue(buffer_size)
 
     # define a worker to read samples from reader to in_queue
     def read_worker(reader, in_queue):
@@ -255,10 +258,13 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
             in_queue.put(i)
         in_queue.put(end)
 
-    # start a read worker in a thread
-    t = Thread(target=read_worker, args=(reader, in_queue))
-    t.daemon = True
-    t.start()
+    # define a worker to read samples from reader to in_queue with order flag
+    def order_read_worker(reader, in_queue):
+        in_order = 0
+        for i in reader():
+            in_queue.put((in_order, i))
+            in_order += 1
+        in_queue.put(end)
 
     # define a worker to handle samples from in_queue by mapper
     # and put mapped samples into out_queue
@@ -271,17 +277,42 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
         in_queue.put(end)
         out_queue.put(end)
 
-    # start several handle_workers
-    workers = []
-    for i in xrange(process_num):
-        worker = Thread(
-            target=handle_worker, args=(in_queue, out_queue, mapper))
-        worker.daemon = True
-        workers.append(worker)
-    for w in workers:
-        w.start()
+    # define a worker to handle samples from in_queue by mapper
+    # and put mapped samples into out_queue by order
+    def order_handle_worker(in_queue, out_queue, mapper, out_order):
+        ins = in_queue.get()
+        while not isinstance(ins, XmapEndSignal):
+            order, sample = ins
+            r = mapper(sample)
+            # Busy-wait until it is this sample's turn, so that samples are
+            # emitted in their original order.
+            while order != out_order[0]:
+                pass
+            out_queue.put(r)
+            out_order[0] += 1
+            ins = in_queue.get()
+        in_queue.put(end)
+        out_queue.put(end)
 
     def xreader():
+        in_queue = Queue(buffer_size)
+        out_queue = Queue(buffer_size)
+        out_order = [0]
+        # start a read worker in a thread
+        target = order_read_worker if order else read_worker
+        t = Thread(target=target, args=(reader, in_queue))
+        t.daemon = True
+        t.start()
+        # start several handle_workers
+        target = order_handle_worker if order else handle_worker
+        args = (in_queue, out_queue, mapper, out_order) if order else (
+            in_queue, out_queue, mapper)
+        workers = []
+        for i in xrange(process_num):
+            worker = Thread(target=target, args=args)
+            worker.daemon = True
+            workers.append(worker)
+        for w in workers:
+            w.start()
+
         sample = out_queue.get()
         while not isinstance(sample, XmapEndSignal):
             yield sample
@@ -295,3 +326,80 @@ def xmap_readers(mapper, reader, process_num, buffer_size):
                 yield sample
 
     return xreader
+
+
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    # Split the buffer into complete lines; the trailing partial line is
+    # returned separately so it can be prepended to the next buffer.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+
+
+class PipeReader:
+    """
+        PipeReader reads data streamed from a command: it captures the
+        command's stdout in a pipe buffer and redirects it to the parser,
+        which then yields the data in the desired format.
+
+        You can use a standard Linux command or call another program
+        to read data from HDFS, Ceph, a URL, AWS S3, etc.:
+
+        .. code-block:: python
+
+           cmd = "hadoop fs -cat /path/to/some/file"
+           cmd = "cat sample_file.tar.gz"
+           cmd = "curl http://someurl"
+           cmd = "python print_s3_bucket.py"
+
+        An example:
+
+        .. code-block:: python
+
+           def example_reader():
+               for f in myfiles:
+                   pr = PipeReader("cat %s"%f)
+                   for l in pr.get_line():
+                       sample = l.split(" ")
+                       yield sample
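+
+        A gzip sketch (the file name and process() are hypothetical):
+
+        .. code-block:: python
+
+           pr = PipeReader("cat sample.gz", file_type="gzip")
+           for line in pr.get_line():
+               process(line)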
+    """
+
+    def __init__(self, command, bufsize=8192, file_type="plain"):
+        if not isinstance(command, str):
+            raise TypeError("left_cmd must be a string")
+        if file_type == "gzip":
+            self.dec = zlib.decompressobj(
+                32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+        self.file_type = file_type
+        self.bufsize = bufsize
+        self.process = subprocess.Popen(
+            command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+
+    def get_line(self, cut_lines=True, line_break="\n"):
+        """
+        :param cut_lines: cut buffer to lines
+        :type cut_lines: bool
+        :param line_break: line break of the file, like '\\n' or '\\r'
+        :type line_break: string
+
+        :return: one line or a buffer of bytes
+        :rtype: string
+        """
+        remained = ""
+        while True:
+            buff = self.process.stdout.read(self.bufsize)
+            if buff:
+                if self.file_type == "gzip":
+                    decomp_buff = self.dec.decompress(buff)
+                elif self.file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" %
+                                    self.file_type)
+
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    for line in lines:
+                        yield line
+                else:
+                    yield decomp_buff
+            else:
+                break
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
index 6a1d337b23..107d5912e1 100644
--- a/python/paddle/v2/reader/tests/CMakeLists.txt
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -1 +1,2 @@
-add_python_test(reader_tests creator_test.py decorator_test.py)
+py_test(creator_test SRCS creator_test.py)
+py_test(decorator_test SRCS decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
index e69de29bb2..b94a21a7e4 100644
--- a/python/paddle/v2/reader/tests/__init__.py
+++ b/python/paddle/v2/reader/tests/__init__.py
@@ -0,0 +1,13 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index 9f8d7133b8..ac6cd4e9b6 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 # Copyright PaddlePaddle contributors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -13,9 +27,7 @@
 # limitations under the License.
 import os
 import unittest
-
 import numpy as np
-
 import paddle.v2.reader.creator
 
 
@@ -36,5 +48,27 @@ class TestTextFile(unittest.TestCase):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
+class TestRecordIO(unittest.TestCase):
+    def do_test(self, path):
+        reader = paddle.v2.reader.creator.recordio(path)
+        idx = 0
+        for e in reader():
+            if idx == 0:
+                self.assertEqual(e, (1, 2, 3))
+            elif idx == 1:
+                self.assertEqual(e, (4, 5, 6))
+            idx += 1
+        self.assertEqual(idx, 2)
+
+    def test_recordIO(self):
+        self.do_test(
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat"))
+        self.do_test([
+            os.path.join(
+                os.path.dirname(__file__), "test_reader_recordio.dat")
+        ])
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
index 734154b979..e41e9c78a0 100644
--- a/python/paddle/v2/reader/tests/decorator_test.py
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import time
 import unittest
 
@@ -121,5 +122,57 @@ class TestShuffle(unittest.TestCase):
             self.assertEqual(total, 10)
 
 
+class TestXmap(unittest.TestCase):
+    def test_xmap(self):
+        def mapper(x):
+            return (x + 1)
+
+        orders = (True, False)
+        thread_nums = (1, 2, 4, 8, 16)
+        buffered_size = (1, 2, 4, 8, 16)
+        for order in orders:
+            for tNum in thread_nums:
+                for size in buffered_size:
+                    reader = paddle.v2.reader.xmap_readers(mapper,
+                                                           reader_creator_10(0),
+                                                           tNum, size, order)
+                    for n in xrange(3):
+                        result = []
+                        for i in reader():
+                            result.append(i)
+                        if not order:
+                            result.sort()
+                        for idx, e in enumerate(result):
+                            self.assertEqual(e, mapper(idx))
+
+
+class TestPipeReader(unittest.TestCase):
+    def test_pipe_reader(self):
+        def example_reader(myfiles):
+            for f in myfiles:
+                pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128)
+                for l in pr.get_line():
+                    yield l
+
+        import tempfile
+
+        records = [str(i) for i in xrange(5)]
+        temp = tempfile.NamedTemporaryFile()
+        try:
+            with open(temp.name, 'w') as f:
+                for r in records:
+                    f.write('%s\n' % r)
+
+            result = []
+            for r in example_reader([temp.name]):
+                result.append(r)
+
+            for idx, e in enumerate(records):
+                self.assertEqual(e, result[idx])
+        finally:
+            # delete the temporary file
+            temp.close()
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/v2/reader/tests/test_reader_recordio.dat
new file mode 100644
index 0000000000..a99a35bb82
Binary files /dev/null and b/python/paddle/v2/reader/tests/test_reader_recordio.dat differ
diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat
new file mode 100644
index 0000000000..17aa89b679
Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index 058f22befd..b4333ed530 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -1,2 +1,8 @@
-add_python_test(test_v2_api test_data_feeder.py test_op.py test_parameters.py
-test_layer.py test_rnn_layer.py test_topology.py test_image.py)
+py_test(test_op SRCS test_op.py)
+py_test(test_image SRCS test_image.py)
+py_test(test_layer SRCS test_layer.py)
+py_test(test_topology SRCS test_topology.py)
+py_test(test_rnn_layer SRCS test_rnn_layer.py)
+py_test(test_parameters SRCS test_parameters.py)
+py_test(test_data_feeder SRCS test_data_feeder.py)
+py_test(test_paramconf_order SRCS test_paramconf_order.py)
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
index 83da678da3..63905c04cf 100644
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -97,7 +97,7 @@ class DataFeederTest(unittest.TestCase):
             each_sample.append(zip(a, b))
             data.append(each_sample)
 
-        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+        feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
                             {'input': 0})
         arg = feeder(data)
         output = arg.getSlotValue(0)
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
index b2d773510d..2b0444bb03 100644
--- a/python/paddle/v2/tests/test_image.py
+++ b/python/paddle/v2/tests/test_image.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import unittest
 import numpy as np
 
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
index f2097e195f..710e8135f2 100644
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import unittest
 
 import paddle.v2.activation as activation
@@ -134,19 +135,21 @@ class CostLayerTest(unittest.TestCase):
         cost3 = layer.cross_entropy_cost(input=inference, label=label)
         cost4 = layer.cross_entropy_with_selfnorm_cost(
             input=inference, label=label)
-        cost5 = layer.mse_cost(input=inference, label=label)
-        cost6 = layer.mse_cost(input=inference, label=label, weight=weight)
+        cost5 = layer.square_error_cost(input=inference, label=label)
+        cost6 = layer.square_error_cost(
+            input=inference, label=label, weight=weight)
         cost7 = layer.multi_binary_label_cross_entropy_cost(
             input=inference, label=label)
         cost8 = layer.rank_cost(left=score, right=score, label=score)
         cost9 = layer.lambda_cost(input=inference, score=score)
         cost10 = layer.sum_cost(input=inference)
-        cost11 = layer.huber_cost(input=score, label=label)
+        cost11 = layer.huber_regression_cost(input=score, label=label)
+        cost12 = layer.huber_classification_cost(input=score, label=label)
 
         print layer.parse_network([cost1, cost2])
         print layer.parse_network([cost3, cost4])
         print layer.parse_network([cost5, cost6])
-        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11, cost12])
 
         crf = layer.crf(input=inference, label=label)
         crf_decoding = layer.crf_decoding(input=inference, size=3)
diff --git a/python/paddle/v2/tests/test_op.py b/python/paddle/v2/tests/test_op.py
index 69acccddf4..dd04cc4ab6 100644
--- a/python/paddle/v2/tests/test_op.py
+++ b/python/paddle/v2/tests/test_op.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import unittest
 
 import paddle.v2.data_type as data_type
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
new file mode 100644
index 0000000000..33c240b8f5
--- /dev/null
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -0,0 +1,99 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import math
+import paddle.v2 as paddle
+
+
+def wordemb(inlayer):
+    wordemb = paddle.layer.table_projection(
+        input=inlayer,
+        size=5,
+        param_attr=paddle.attr.Param(
+            name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0))
+    return wordemb
+
+
+def train():
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    # Every layer takes integer value of range [0, dict_size)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(name="fc1",
+                              input=contextemb,
+                              size=128,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(5 * 8),
+                                  learning_rate=1,
+                                  l2_rate=6e-4))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    return paddle.layer.classification_cost(input=predictword, label=nextword)
+
+
+class TestParamConfOrder(unittest.TestCase):
+    def test_param_conf_order(self):
+        paddle.init()
+        cost = train()
+        parameters = paddle.parameters.create(cost)
+        adagrad = paddle.optimizer.AdaGrad(
+            learning_rate=3e-3,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+        trainer = paddle.trainer.SGD(cost, parameters, adagrad)
+        for p in trainer.get_topology_proto().parameters:
+            if p.name == "_fc1.w0":
+                self.assertEqual(p.decay_rate, 6e-4)
+            else:
+                self.assertEqual(p.decay_rate, 8e-4)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
index ebb182caab..1fe1f09b9d 100644
--- a/python/paddle/v2/tests/test_parameters.py
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import unittest
 import sys
 
@@ -11,20 +25,26 @@ except ImportError:
     sys.exit(0)
 
 import paddle.v2.parameters as parameters
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.v2.attr import ParamAttr
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import random
 import cStringIO
 import numpy
 
 
-def __rand_param_config__(name):
+def __rand_param_config__(name, psize=None):
     conf = ParameterConfig()
     conf.name = name
     size = 1
-    for i in xrange(2):
-        dim = random.randint(1, 1000)
-        conf.dims.append(dim)
-        size *= dim
+    if psize is None:
+        for i in xrange(2):
+            dim = random.randint(1, 1000)
+            conf.dims.append(dim)
+            size *= dim
+    else:
+        size = psize
     conf.size = size
     assert conf.IsInitialized()
     return conf
@@ -55,6 +75,69 @@ class TestParameters(unittest.TestCase):
             p1 = params_dup.get(name)
             self.assertTrue(numpy.isclose(p0, p1).all())
 
+    def test_initializer(self):
+        def initializer(name):
+            assert name == "fc.w"
+            mat = numpy.ones((3, 2), dtype=numpy.float32)
+            mat[1, 1] = 2
+            return mat
+
+        x = layer.data(name="x", type=data_type.dense_vector(3))
+        y = layer.fc(x,
+                     size=2,
+                     bias_attr=False,
+                     param_attr=ParamAttr(
+                         name="fc.w", initializer=initializer))
+        params = parameters.create(y)
+        val = params["fc.w"]
+        assert val.shape == (3, 2)
+        expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32)
+        assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6))
+
+    def test_init_from_tar(self):
+        def get_param(names, size):
+            p = parameters.Parameters()
+            for k, v in zip(names, size):
+                p.__append_config__(__rand_param_config__(k, v))
+            for name in p.names():
+                param = p.get(name)
+                param[:] = numpy.random.uniform(
+                    -1.0, 1.0, size=p.get_shape(name))
+                p.set(name, param)
+            return p
+
+        def get_params():
+            name1 = ['param_0', 'param_1']
+            size1 = [128, 256]
+            p1 = get_param(name1, size1)
+            file1 = cStringIO.StringIO()
+            p1.to_tar(file1)
+            file1.seek(0)
+
+            name2 = ['param_0', 'param_1', 'param_2']
+            size2 = [128, 256, 288]
+            p2 = get_param(name2, size2)
+            file2 = cStringIO.StringIO()
+            p2.to_tar(file2)
+            file2.seek(0)
+            return p1, file1, p2, file2
+
+        p1, file1, p2, file2 = get_params()
+        p2.init_from_tar(file1)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+
+        p1, file1, p2, file2 = get_params()
+        p1.init_from_tar(file2)
+        for name in p1.names():
+            self.assertEqual(p1.get_shape(name), p2.get_shape(name))
+            v1 = p1.get(name)
+            v2 = p2.get(name)
+            self.assertTrue(numpy.isclose(v1, v2).all())
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
index 192b0ee678..7920e342e1 100644
--- a/python/paddle/v2/tests/test_rnn_layer.py
+++ b/python/paddle/v2/tests/test_rnn_layer.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import difflib
 import unittest
 
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
index 7fd2ee82fd..11b4154eed 100644
--- a/python/paddle/v2/tests/test_topology.py
+++ b/python/paddle/v2/tests/test_topology.py
@@ -1,4 +1,4 @@
-# Copyright PaddlePaddle contributors. All Rights Reserved
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
 import unittest
 import paddle.v2.layer as layer
 import paddle.v2.topology as topology
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index f3bb4d5f10..923ccecb0b 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -18,6 +18,8 @@ from paddle.proto.ModelConfig_pb2 import ModelConfig
 import paddle.trainer_config_helpers as conf_helps
 import layer as v2_layer
 import config_base
+import cPickle
+from paddle.trainer import config_parser as cp
 
 __all__ = ['Topology']
 
@@ -31,7 +33,6 @@ class Topology(object):
     def __init__(self, layers, extra_layers=None):
         def __check__(layers):
             if not isinstance(layers, collections.Sequence):
-                __check_layer_type__(layers)
                 layers = [layers]
             for layer in layers:
                 __check_layer_type__(layer)
@@ -50,6 +51,35 @@ class Topology(object):
 
         assert isinstance(self.__model_config__, ModelConfig)
 
+    def update_from_default(self):
+        # HACK(typhoonzero): update ParameterConfig(proto) in case of
+        # optimizers are defined after layers, or between layers.
+        # Must be called from trainer.__init__()
+        for parameter in self.__model_config__.parameters:
+            if parameter.momentum == 0.0 and cp.g_default_momentum:
+                parameter.momentum = cp.g_default_momentum
+            if parameter.decay_rate == 0.0 and cp.g_default_decay_rate:
+                parameter.decay_rate = cp.g_default_decay_rate
+            if parameter.initial_mean == 0.0:
+                parameter.initial_mean = cp.g_default_initial_mean
+            if parameter.initial_std == 0.01:
+                parameter.initial_std = cp.g_default_initial_std
+            if parameter.initial_strategy == 0:
+                parameter.initial_strategy = cp.g_default_initial_strategy
+            if parameter.initial_smart == False:
+                parameter.initial_smart = cp.g_default_initial_smart
+            if parameter.num_batches_regularization == 1 and \
+                cp.g_default_num_batches_regularization:
+                parameter.num_batches_regularization = \
+                    cp.g_default_num_batches_regularization
+            if parameter.gradient_clipping_threshold == 0.0 and \
+                cp.g_default_gradient_clipping_threshold:
+                parameter.gradient_clipping_threshold = \
+                    cp.g_default_gradient_clipping_threshold
+            if parameter.device == -1 and cp.g_default_device:
+                parameter.device = cp.g_default_device
+            # FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func
+
     def use_sparse_updater(self):
         """
         check if any parameter require to use sparse_update
@@ -91,6 +121,7 @@ class Topology(object):
         [('image', dense_vector(768)), ('label', integer_value(10))]
         """
         data_layers = self.data_layers()
+
         return [(nm, data_layers[nm].data_type)
                 for nm in self.proto().input_layer_names]
 
@@ -100,6 +131,14 @@ class Topology(object):
                 return layer
         return None
 
+    def serialize_for_inference(self, stream):
+        # Dump the topology protobuf together with the input data types, so
+        # an inference program can restore both from a single pickle stream.
+        protobin = self.proto().SerializeToString()
+        data_type = self.data_type()
+        cPickle.dump({
+            'protobin': protobin,
+            'data_type': data_type
+        }, stream, cPickle.HIGHEST_PROTOCOL)
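+        # A hypothetical consumer of the stream (names are illustrative):
+        #
+        #   with open('inference_topology.pkl') as f:
+        #       obj = cPickle.load(f)
+        #       proto_bin, data_type = obj['protobin'], obj['data_type']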
+
 
 def __check_layer_type__(layer):
     if not isinstance(layer, config_base.Layer):
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 8fdb67cc26..a0060bf227 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -1,13 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Module Trainer
 """
 import collections
-import gzip
-import os
-
-import py_paddle.swig_paddle as api
-
-from data_feeder import DataFeeder
 from topology import Topology
 from . import event as v2_event
 from . import optimizer as v2_optimizer
@@ -33,15 +40,24 @@ class SGD(object):
     SGD Trainer combines data reader, network topolopy and update_equation together
     to train/test a neural network.
 
-    :param update_equation: The optimizer object.
-    :type update_equation: paddle.v2.optimizer.Optimizer
     :param cost: Target cost that neural network should be optimized.
     :type cost: paddle.v2.config_base.Layer
     :param parameters: The parameters dictionary.
     :type parameters: paddle.v2.parameters.Parameters
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
     :type extra_layers: paddle.v2.config_base.Layer
+    :param is_local: Whether to run training locally.
+    :type is_local: bool
+    :param pserver_spec: comma-separated string of pserver locations,
+                         e.g. 127.10.0.10:3000,127.10.0.11:3000;
+                         this parameter is only used for fault-tolerant
+                         mode cluster training.
+    :type pserver_spec: string
+    :param use_etcd: Whether to use an etcd-based pserver.
+    :type use_etcd: bool
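+
+    A construction sketch (cost, parameters, and optimizer are assumed
+    to be created beforehand):
+
+    ..  code-block:: python
+
+        trainer = paddle.v2.trainer.SGD(cost=cost,
+                                        parameters=parameters,
+                                        update_equation=optimizer)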
     """
 
     def __init__(self,
@@ -49,7 +65,9 @@ class SGD(object):
                  parameters,
                  update_equation,
                  extra_layers=None,
-                 is_local=True):
+                 is_local=True,
+                 pserver_spec=None,
+                 use_etcd=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -57,12 +75,20 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
+        import py_paddle.swig_paddle as api
         topology = Topology(cost, extra_layers=extra_layers)
+        # HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers
+        # are defined after layers, or between layers.
+        topology.update_from_default()
+        parameters.update_param_conf(topology.proto())
+
         self.__optimizer__ = update_equation
         self.__topology__ = topology
         self.__parameters__ = parameters
         self.__topology_in_proto__ = topology.proto()
         self.__is_local__ = is_local
+        self.__pserver_spec__ = pserver_spec
+        self.__use_etcd__ = use_etcd
 
         self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
         # # In local mode, disable sparse_remote_update.
@@ -83,6 +109,9 @@ class SGD(object):
         self.__parameters__.append_gradient_machine(gm)
         self.__parameter_updater__ = None
 
+    def get_topology_proto(self):
+        return self.__topology_in_proto__
+
     def __use_remote_sparse_updater__(self):
         return self.__use_sparse_updater__ and not self.__is_local__
 
@@ -121,12 +150,15 @@ class SGD(object):
         :type feeding: dict|list
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         if event_handler is None:
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
         self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__)
+            self.__is_local__, num_passes, self.__use_sparse_updater__,
+            self.__pserver_spec__, self.__use_etcd__)
         self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
@@ -153,6 +185,11 @@ class SGD(object):
                                                           pass_type)
                 self.__gradient_machine__.eval(pass_evaluator)
                 self.__gradient_machine__.eval(batch_evaluator)
+                event_handler(
+                    v2_event.EndForwardBackward(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        gm=self.__gradient_machine__))
                 for each_param in self.__gradient_machine__.getNonStaticParameters(
                 ):
                     self.__parameter_updater__.update(each_param)
@@ -165,24 +202,32 @@ class SGD(object):
                         pass_id=pass_id,
                         batch_id=batch_id,
                         cost=cost,
-                        evaluator=batch_evaluator))
+                        evaluator=batch_evaluator,
+                        gm=self.__gradient_machine__))
 
             self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
-            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+            event_handler(
+                v2_event.EndPass(
+                    pass_id,
+                    evaluator=pass_evaluator,
+                    gm=self.__gradient_machine__))
         self.__gradient_machine__.finish()
 
     def test(self, reader, feeding=None):
         """
         Testing method. Will test input data.
 
-        :param reader: A reader that reads and yeilds data items.
+        :param reader: A batch reader that reads and yields data items;
+                       it should be a paddle.v2.batch.
         :type reader: collections.Iterable
         :param feeding: Feeding is a map of neural network input name and array
                         index that reader returns.
         :type feeding: dict
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
diff --git a/python/requirements.txt b/python/requirements.txt
new file mode 100644
index 0000000000..daf3f368b9
--- /dev/null
+++ b/python/requirements.txt
@@ -0,0 +1,10 @@
+requests==2.9.2
+numpy>=1.12
+protobuf==3.1
+recordio>=0.1.0
+matplotlib
+rarfile
+scipy>=0.19.0
+Pillow
+nltk>=3.2.2
+graphviz
diff --git a/python/setup.py.in b/python/setup.py.in
index 93724f9188..65ec58ecf9 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,4 +1,64 @@
-from setuptools import setup
+from setuptools import setup, Distribution, Extension
+import subprocess
+class BinaryDistribution(Distribution):
+    # Force a platform-specific (non-pure) wheel, since the package ships
+    # compiled shared libraries.
+    def has_ext_modules(self):
+        return True
+
+MAJOR   = 0
+MINOR   = 11
+PATCH   = 0
+RC      = 0
+ISTAGED = False
+
+
+
+def git_commit():
+    try:
+        cmd = ['git', 'rev-parse', 'HEAD']
+        git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip()
+    except:
+        git_commit = 'Unknown'
+    return git_commit
+
+def write_version_py(filename='paddle/version.py'):
+    cnt = '''
+# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY
+#
+full_version    = '%(major)d.%(minor)d.%(patch)d'
+major           = '%(major)d'
+minor           = '%(minor)d'
+patch           = '%(patch)d'
+rc              = '%(rc)d'
+istaged         = %(istaged)s
+commit          = '%(commit)s'
+with_mkl        = '%(with_mkl)s'
+
+def show():
+    if istaged:
+        print 'full_version:', full_version
+        print 'major:', major
+        print 'minor:', minor
+        print 'patch:', patch
+        print 'rc:', rc
+    else:
+        print 'commit:', commit
+
+def mkl():
+    return with_mkl
+'''
+    commit = git_commit()
+    with open(filename, 'w') as f:
+        f.write(cnt % {
+            'major': MAJOR,
+            'minor': MINOR,
+            'patch': PATCH,
+            'rc': RC,
+            'version': '${PADDLE_VERSION}',
+            'commit': commit,
+            'istaged': ISTAGED,
+            'with_mkl': '@WITH_MKL@'})
+
+write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py')
 
 
 packages=['paddle',
@@ -9,23 +69,49 @@ packages=['paddle',
           'paddle.v2',
           'paddle.v2.dataset',
           'paddle.v2.reader',
-          'paddle.v2.plot']
+          'paddle.v2.master',
+          'paddle.v2.plot',
+          'paddle.v2.fluid',
+          'paddle.v2.fluid.proto',
+          'paddle.v2.fluid.layers',
+          'py_paddle']
 
-setup_requires=["requests",
-                "numpy",
-                "protobuf==3.1",
-                "matplotlib",
-                "rarfile"]
+with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
+    setup_requires = f.read().splitlines()
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
-    setup_requires+=["opencv-python"]
+    setup_requires+=['opencv-python']
+
+# The paths below are relative to sys.prefix, which should always be /usr.
+paddle_bin_dir = 'opt/paddle/bin'
+paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+               '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+               '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main',
+               '${PADDLE_BINARY_DIR}/paddle/scripts/paddle']
+
+paddle_rt_lib_dir = 'lib'
+paddle_rt_libs = ['${WARPCTC_LIBRARIES}']
+if '${MKL_SHARED_LIBS}' != '':
+  paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';')
 
-setup(name='paddle',
+setup(name='${PACKAGE_NAME}',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
+      # A dummy extension so the generated wheel is marked platform-specific.
+      ext_modules=[Extension('_foo', ['stub.cc'])],
+      package_data={
+        'paddle.v2.master': ['libpaddle_master.so'],
+        'paddle.v2.fluid': ['core.so'],
+        'py_paddle':['*.py','_swig_paddle.so']
+      },
       package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}'
-      }
+          '': '${CMAKE_CURRENT_SOURCE_DIR}',
+          # The paddle.v2.fluid.proto will be generated while compiling.
+          # So that package points to other directory.
+          'paddle.v2.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
+      },
+      scripts=paddle_bins,
+      data_files=[(paddle_rt_lib_dir, paddle_rt_libs)]
 )
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
new file mode 100644
index 0000000000..b6cae228a0
--- /dev/null
+++ b/tools/manylinux1/Dockerfile.android
@@ -0,0 +1,55 @@
+FROM ubuntu:16.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+# ENV variables
+ARG ANDROID_ABI
+ARG ANDROID_API
+
+ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
+ENV ANDROID_API=${ANDROID_API:-21}
+
+ENV HOME=/root \
+    ANDROID_NDK_HOME=/opt/android-ndk-linux \
+    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
+
+RUN apt-get update && \
+    apt-get install -y \
+    git python-dev python-pip python-numpy \
+    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
+    apt-get clean -y
+
+# Install Go and glide
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# must not be on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel sphinx && \
+    pip install pre-commit
+
+# Android NDK
+RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
+    mkdir -p /opt/android-ndk-tmp && \
+    cd /opt/android-ndk-tmp && \
+    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
+    unzip -q android-ndk-r14b-linux-x86_64.zip && \
+    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
+    rm -rf /opt/android-ndk-tmp
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
+
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
new file mode 100644
index 0000000000..0f1b833130
--- /dev/null
+++ b/tools/manylinux1/Dockerfile.x64
@@ -0,0 +1,54 @@
+# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in
+# order to satisfy the build of capnproto library (a nupic.core dependency),
+# which requires some headers and symbols not present on CentOS-5 (e.g.,
+# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See
+# https://github.com/sandstorm-io/capnproto/issues/350.
+FROM nvidia/cuda:<baseimg>
+MAINTAINER Numenta, based on the ManyLinux project
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
+ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
+
+COPY build_scripts /build_scripts
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# for paddle
+RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+
+
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
+
+# protobuf 3.1.0
+RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
+    tar xzf protobuf-cpp-3.1.0.tar.gz && \
+    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
+
+
+RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
+
+RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    go get github.com/Masterminds/glide && \
+    rm -rf /root/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
+    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
+
+RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
+    make -j `nproc` install <NCCL_MAKE_OPTS>  && cd .. && rm -rf nccl
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
new file mode 100644
index 0000000000..cb0a9ac22c
--- /dev/null
+++ b/tools/manylinux1/README.md
@@ -0,0 +1,30 @@
+# buildtools
+
+We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
+which we hope to publish as wheel packages on PyPI, so we need
+to make sure that the build follows the
+[manylinux1](https://www.python.org/dev/peps/pep-0513/) standard.
+
+The manylinux standard suggests building Python modules on an old
+system, because a module inevitably depends on some shared libraries,
+and Linux's shared library conventions mean that binaries built with
+newer compilers cannot work with older library versions. The suggested
+build environment is as old as CentOS 5. However, PaddlePaddle relies
+on CUDA, and the earliest version of
+[CentOS that works with CUDA is 6](https://hub.docker.com/r/nvidia/cuda/).
+So here we provide a Docker image based on CentOS 6 and CUDA for
+building PaddlePaddle, making the release "as manylinux as possible"
+or "sufficiently many Linux" according to [this
+discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
+
+The build output of our Docker image includes multiple wheel files --
+some contain the CPU-only binary, others support CUDA; some are
+compatible with the cp27m Python ABI, others with cp27mu.
+
+To build these wheels, please run the following commands:
+
+```bash
+git clone https://github.com/paddlepaddle/paddle
+cd paddle/tools/manylinux1
+REPO=[yourrepo] ./build_all.sh
+```
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
new file mode 100755
index 0000000000..097bedb526
--- /dev/null
+++ b/tools/manylinux1/build_all.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -xe
+
+REPO="${REPO:-typhoon1986}"
+
+# NOTE: the CUDA/cuDNN base image versions and NVCC gencode flags below are matched deliberately!
+sed 's/<baseimg>/7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
+
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7
+
+sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
new file mode 100644
index 0000000000..93591fa9dd
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Top-level build script called from Dockerfile
+
+# Stop at any error, show all commands
+set -ex
+
+# Python versions to be installed in /opt/$VERSION_NO
+# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so
+# remove others to expedite build and reduce docker image size. The original
+# manylinux docker image project builds many python versions.
+# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
+CPYTHON_VERSIONS="2.7.11 3.5.1"
+
+# openssl version to build, with expected sha256 hash of .tar.gz
+# archive
+OPENSSL_ROOT=openssl-1.0.2l
+OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
+EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
+DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
+PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
+CURL_ROOT=curl-7.49.1
+CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
+AUTOCONF_ROOT=autoconf-2.69
+AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
+
+# Dependencies for compiling Python that we want to remove from
+# the final image after compiling Python
+PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
+
+# Libraries that are allowed as part of the manylinux1 profile
+MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+
+# Get build utilities
+MY_DIR=$(dirname "${BASH_SOURCE[0]}")
+source $MY_DIR/build_utils.sh
+
+# EPEL support
+yum -y install wget curl
+curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
+check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH
+
+# Dev toolset (for LLVM and other projects requiring C++11 support)
+curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo
+check_sha256sum devtools-2.repo $DEVTOOLS_HASH
+mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo
+rpm -Uvh --replacepkgs epel-release-6*.rpm
+rm -f epel-release-6*.rpm
+
+# Development tools and libraries
+yum -y install bzip2 make git patch unzip bison yasm diffutils \
+    automake which file \
+    kernel-devel-`uname -r` \
+    devtoolset-2-binutils devtoolset-2-gcc \
+    devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \
+    ${PYTHON_COMPILE_DEPS}
+
+# Install a more recent version of CMake
+# curl -O https://cmake.org/files/v3.8/cmake-3.8.1-Linux-x86_64.sh
+# /bin/sh cmake-3.8.1-Linux-x86_64.sh --prefix=/usr/local --skip-license
+# rm cmake-3.8.1-Linux-x86_64.sh
+
+wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
+cd cmake-3.5.2 && ./bootstrap && \
+make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz
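+# NOTE Only the tarball is removed above; the extracted cmake-3.5.2 source
+# tree is left behind in the image.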
+
+
+# Install newest autoconf
+build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
+autoconf --version
+
+# Compile the latest Python releases.
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+build_openssl $OPENSSL_ROOT $OPENSSL_HASH
+mkdir -p /opt/python
+build_cpythons $CPYTHON_VERSIONS
+
+PY35_BIN=/opt/python/cp35-cp35m/bin
+# NOTE Since our custom manylinux image builds Pythons with a shared
+# libpython, we need to add libpython's directory to LD_LIBRARY_PATH before
+# running python.
+ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
+
+# Our statically linked openssl doesn't know how to find the system CA
+# trust store (https://github.com/pypa/manylinux/issues/53), and it's not
+# clear how up to date that store is anyway, so we just use the same CA
+# bundle that pip and everyone else uses: certifi.
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi
+ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \
+      /opt/_internal/certs.pem
+# If you modify this line, you also have to modify the versions in the
+# Dockerfiles:
+export SSL_CERT_FILE=/opt/_internal/certs.pem
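+# Sanity check (a hypothetical one-liner, assuming network access at build
+# time; run with the same LD_LIBRARY_PATH prefix as above):
+#   $PY35_BIN/python -c "from urllib.request import urlopen; urlopen('https://pypi.org')"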
+
+# Install newest curl
+build_curl $CURL_ROOT $CURL_HASH
+rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
+hash -r
+curl --version
+curl-config --features
+
+# Now we can delete our built SSL
+rm -rf /usr/local/ssl
+
+# Install patchelf (latest with unreleased bug fixes)
+curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
+check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
+tar -xzf patchelf-0.9njs2.tar.gz
+(cd patchelf-0.9njs2 && ./configure && make && make install)
+rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+
+# Install latest pypi release of auditwheel
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
+ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
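+# Example (hypothetical wheel name): 'auditwheel show pkg-0.1-cp27-cp27mu-linux_x86_64.whl'
+# prints the wheel's platform tag and its external shared-library dependencies.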
+
+# Clean up development headers and other unnecessary stuff for
+# final image
+yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+    avahi freetype bitstream-vera-fonts \
+    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1
+yum -y install ${MANYLINUX1_DEPS}
+yum -y clean all > /dev/null 2>&1
+yum list installed
+# We don't need the static libpython*.a archives, and they're many megabytes.
+find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
+# Strip what we can -- and ignore errors, because this just attempts to strip
+# *everything*, including non-ELF files:
+find /opt/_internal -type f -print0 \
+    | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
+# We do not need the Python test suites, or indeed the precompiled .pyc and
+# .pyo files. Partially cribbed from:
+#    https://github.com/docker-library/python/blob/master/3.4/slim/Dockerfile
+find /opt/_internal \
+     \( -type d -a -name test -o -name tests \) \
+  -o \( -type f -a -name '*.pyc' -o -name '*.pyo' \) \
+  -print0 | xargs -0 rm -f
+
+for PYTHON in /opt/python/*/bin/python; do
+    # Add matching directory of libpython shared library to library lookup path
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
+
+    # Smoke test to make sure that our Pythons work, and do indeed detect as
+    # being manylinux compatible:
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
+    # Make sure that SSL cert checking works
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
+done
+
+# Restore LD_LIBRARY_PATH
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
new file mode 100755
index 0000000000..10422ae3bd
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Helper utilities for build
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+# XXX: the official https server at www.openssl.org cannot be reached
+# with the old versions of openssl and curl in CentOS 5.11, hence the
+# fallback to the ftp mirror:
+# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source
+# Ditto the curl sources
+CURL_DOWNLOAD_URL=http://curl.askapache.com/download
+
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf
+
+
+function check_var {
+    if [ -z "$1" ]; then
+        echo "required variable not defined"
+        exit 1
+    fi
+}
+
+
+function lex_pyver {
+    # Echoes Python version string padded with zeros
+    # Thus:
+    # 3.2.1 -> 003002001
+    # 3     -> 003000000
+    echo $1 | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}'
+}
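+# Example: $(lex_pyver 2.7.11) yields 002007011 and $(lex_pyver 3.3) yields
+# 003003000, so plain numeric -lt/-gt tests order versions correctly.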
+
+
+function do_cpython_build {
+    local py_ver=$1
+    check_var $py_ver
+    local ucs_setting=$2
+    check_var $ucs_setting
+    tar -xzf Python-$py_ver.tgz
+    pushd Python-$py_ver
+    if [ "$ucs_setting" = "none" ]; then
+        local unicode_flags=""
+        local dir_suffix=""
+    else
+        local unicode_flags="--enable-unicode=$ucs_setting"
+        local dir_suffix="-$ucs_setting"
+    fi
+    local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}"
+    mkdir -p ${prefix}/lib
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+
+    # NOTE --enable-shared generates the libpython shared library needed
+    # for linking some of the nupic.core test executables.
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+    make -j2 > /dev/null
+    make install > /dev/null
+    popd
+    echo "ZZZ looking for libpython"
+    find / -name 'libpython*.so*'
+    rm -rf Python-$py_ver
+    # Some Pythons install as bin/python3. Make them available as
+    # bin/python.
+    if [ -e ${prefix}/bin/python3 ]; then
+        ln -s python3 ${prefix}/bin/python
+    fi
+    # NOTE Make libpython shared library visible to python calls below
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
+    ln -s ${prefix} /opt/python/${abi_tag}
+}
+
+
+function build_cpython {
+    local py_ver=$1
+    check_var $py_ver
+    check_var $PYTHON_DOWNLOAD_URL
+    wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz
+    if [ $(lex_pyver $py_ver) -lt $(lex_pyver 3.3) ]; then
+        # NOTE We only need wide unicode for the nupic.bindings wheel
+        do_cpython_build $py_ver ucs2
+        do_cpython_build $py_ver ucs4
+    else
+        do_cpython_build $py_ver none
+    fi
+    rm -f Python-$py_ver.tgz
+}
+
+
+function build_cpythons {
+    check_var $GET_PIP_URL
+    curl -sLO $GET_PIP_URL
+    for py_ver in "$@"; do
+        build_cpython $py_ver
+    done
+    rm get-pip.py
+}
+
+
+function do_openssl_build {
+    ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function check_sha256sum {
+    local fname=$1
+    check_var ${fname}
+    local sha256=$2
+    check_var ${sha256}
+
+    echo "${sha256}  ${fname}" > ${fname}.sha256
+    sha256sum -c ${fname}.sha256
+    rm ${fname}.sha256
+}
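+# Usage sketch (hypothetical names): 'check_sha256sum foo.tar.gz "$EXPECTED_HASH"'
+# makes sha256sum -c exit non-zero on a mismatch, which aborts any caller
+# running under 'set -e'.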
+
+
+function build_openssl {
+    local openssl_fname=$1
+    check_var ${openssl_fname}
+    local openssl_sha256=$2
+    check_var ${openssl_sha256}
+    check_var ${OPENSSL_DOWNLOAD_URL}
+    curl -sLO ${OPENSSL_DOWNLOAD_URL}/${openssl_fname}.tar.gz
+    check_sha256sum ${openssl_fname}.tar.gz ${openssl_sha256}
+    tar -xzf ${openssl_fname}.tar.gz
+    (cd ${openssl_fname} && do_openssl_build)
+    rm -rf ${openssl_fname} ${openssl_fname}.tar.gz
+}
+
+
+function do_curl_build {
+    LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function build_curl {
+    local curl_fname=$1
+    check_var ${curl_fname}
+    local curl_sha256=$2
+    check_var ${curl_sha256}
+    check_var ${CURL_DOWNLOAD_URL}
+    curl -sLO ${CURL_DOWNLOAD_URL}/${curl_fname}.tar.bz2
+    check_sha256sum ${curl_fname}.tar.bz2 ${curl_sha256}
+    tar -jxf ${curl_fname}.tar.bz2
+    (cd ${curl_fname} && do_curl_build)
+    rm -rf ${curl_fname} ${curl_fname}.tar.bz2
+}
+
+
+function do_standard_install {
+    ./configure > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+
+function build_autoconf {
+    local autoconf_fname=$1
+    check_var ${autoconf_fname}
+    local autoconf_sha256=$2
+    check_var ${autoconf_sha256}
+    check_var ${AUTOCONF_DOWNLOAD_URL}
+    curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
+    check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
+    tar -zxf ${autoconf_fname}.tar.gz
+    (cd ${autoconf_fname} && do_standard_install)
+    rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
+}
diff --git a/tools/manylinux1/build_scripts/manylinux1-check.py b/tools/manylinux1/build_scripts/manylinux1-check.py
new file mode 100644
index 0000000000..a27eab1c77
--- /dev/null
+++ b/tools/manylinux1/build_scripts/manylinux1-check.py
@@ -0,0 +1,70 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Logic copied from PEP 513
+
+
+def is_manylinux1_compatible():
+    # Only Linux, and only x86-64 / i686
+    from distutils.util import get_platform
+    if get_platform() not in ["linux-x86_64", "linux-i686"]:
+        return False
+
+    # Check for presence of _manylinux module
+    try:
+        import _manylinux
+        return bool(_manylinux.manylinux1_compatible)
+    except (ImportError, AttributeError):
+        # Fall through to heuristic check below
+        pass
+
+    # Check glibc version. CentOS 5 uses glibc 2.5.
+    return have_compatible_glibc(2, 5)
+
+
+def have_compatible_glibc(major, minimum_minor):
+    import ctypes
+
+    process_namespace = ctypes.CDLL(None)
+    try:
+        gnu_get_libc_version = process_namespace.gnu_get_libc_version
+    except AttributeError:
+        # Symbol doesn't exist -> therefore, we are not linked to
+        # glibc.
+        return False
+
+    # Call gnu_get_libc_version, which returns a string like "2.5".
+    gnu_get_libc_version.restype = ctypes.c_char_p
+    version_str = gnu_get_libc_version()
+    # py2 / py3 compatibility:
+    if not isinstance(version_str, str):
+        version_str = version_str.decode("ascii")
+
+    # Parse string and check against requested version.
+    version = [int(piece) for piece in version_str.split(".")]
+    assert len(version) == 2
+    if major != version[0]:
+        return False
+    if minimum_minor > version[1]:
+        return False
+    return True
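+# For example, on the CentOS 5 base used for manylinux1 builds,
+# gnu_get_libc_version() returns "2.5", so have_compatible_glibc(2, 5)
+# returns True.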
+
+
+import sys
+if is_manylinux1_compatible():
+    print("%s is manylinux1 compatible" % (sys.executable, ))
+    sys.exit(0)
+else:
+    print("%s is NOT manylinux1 compatible" % (sys.executable, ))
+    sys.exit(1)
diff --git a/tools/manylinux1/build_scripts/python-tag-abi-tag.py b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
new file mode 100644
index 0000000000..cd2573314c
--- /dev/null
+++ b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
@@ -0,0 +1,21 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Utility script to print the python tag + the abi tag for a Python
+# See PEP 425 for exactly what these are, but an example would be:
+#   cp27-cp27mu
+
+from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
+
+print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))
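+# This script is invoked per interpreter by do_cpython_build in
+# build_utils.sh to name the /opt/python/<abi_tag> symlink.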
diff --git a/tools/manylinux1/build_scripts/ssl-check.py b/tools/manylinux1/build_scripts/ssl-check.py
new file mode 100644
index 0000000000..34a3116207
--- /dev/null
+++ b/tools/manylinux1/build_scripts/ssl-check.py
@@ -0,0 +1,46 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# cf. https://github.com/pypa/manylinux/issues/53
+
+GOOD_SSL = "https://google.com"
+BAD_SSL = "https://self-signed.badssl.com"
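+# badssl.com deliberately serves a self-signed certificate on this
+# subdomain, so a certificate-checking client must reject the connection.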
+
+import sys
+
+print("Testing SSL certificate checking for Python:", sys.version)
+
+if (sys.version_info[:2] < (2, 7) or (3, 0) <= sys.version_info[:2] < (3, 4)):
+    print("This version never checks SSL certs; skipping tests")
+    sys.exit(0)
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen
+    EXC = OSError
+else:
+    from urllib import urlopen
+    EXC = IOError
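+# Note: ssl.SSLError is a subclass of OSError on Python 3.3+, while the
+# legacy urllib on Python 2 surfaces certificate failures as IOError.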
+
+print("Connecting to %s should work" % (GOOD_SSL, ))
+urlopen(GOOD_SSL)
+print("...it did, yay.")
+
+print("Connecting to %s should fail" % (BAD_SSL, ))
+try:
+    urlopen(BAD_SSL)
+    # If we get here then we failed:
+    print("...it DIDN'T!!!!!11!!1one!")
+    sys.exit(1)
+except EXC:
+    print("...it did, yay.")
diff --git a/v1_api_demo/README.md b/v1_api_demo/README.md
deleted file mode 100644
index 9442f76941..0000000000
--- a/v1_api_demo/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-The examples in v1_api_demo are using v1_api now, and will be upgraded into v2_api later.
-Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future.
-
-Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and 
-[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
diff --git a/v1_api_demo/gan/.gitignore b/v1_api_demo/gan/.gitignore
deleted file mode 100644
index 93a6f5080a..0000000000
--- a/v1_api_demo/gan/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-output/
-uniform_params/
-cifar_params/
-mnist_params/
-*.png
-.pydevproject
-.project
-*.log
-*.pyc
-data/mnist_data/
-data/cifar-10-batches-py/
diff --git a/v1_api_demo/gan/README.md b/v1_api_demo/gan/README.md
deleted file mode 100644
index 1908b534b0..0000000000
--- a/v1_api_demo/gan/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Generative Adversarial Networks (GAN) 
-
-This demo implements GAN training described in the original GAN paper (https://arxiv.org/abs/1406.2661) and DCGAN (https://arxiv.org/abs/1511.06434).
-
-The general training procedures are implemented in gan_trainer.py. The neural network configurations are specified in gan_conf.py (for synthetic data) and gan_conf_image.py (for image data).
-
-In order to run the model, first download the corresponding data by running the shell script in ./data.
-Then you can run the command below. The flag -d specifies the training data (cifar, mnist or uniform) and flag --useGpu specifies whether to use gpu for training (0 is cpu, 1 is gpu).  
-
-$python gan_trainer.py -d cifar --use_gpu 1
-
-The generated images will be stored in ./cifar_samples/
-The corresponding models will be stored in ./cifar_params/
diff --git a/v1_api_demo/gan/data/get_mnist_data.sh b/v1_api_demo/gan/data/get_mnist_data.sh
deleted file mode 100755
index a77c81bf5a..0000000000
--- a/v1_api_demo/gan/data/get_mnist_data.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env sh
-# This script downloads the mnist data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/mnist_data"
-mkdir "$DIR/mnist_data"
-cd "$DIR/mnist_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
diff --git a/v1_api_demo/gan/gan_conf.py b/v1_api_demo/gan/gan_conf.py
deleted file mode 100644
index 86ac2dffe5..0000000000
--- a/v1_api_demo/gan/gan_conf.py
+++ /dev/null
@@ -1,151 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-mode = get_config_arg("mode", str, "generator")
-assert mode in set([
-    "generator", "discriminator", "generator_training", "discriminator_training"
-])
-
-is_generator_training = mode == "generator_training"
-is_discriminator_training = mode == "discriminator_training"
-is_generator = mode == "generator"
-is_discriminator = mode == "discriminator"
-
-# The network structure below follows the ref https://arxiv.org/abs/1406.2661
-# Here we used two hidden layers and batch_norm
-
-print('mode=%s' % mode)
-# the dim of the noise (z) as the input of the generator network
-noise_dim = 10
-# the dim of the hidden layer
-hidden_dim = 10
-# the dim of the generated sample
-sample_dim = 2
-
-settings(
-    batch_size=128,
-    learning_rate=1e-4,
-    learning_method=AdamOptimizer(beta1=0.5))
-
-
-def discriminator(sample):
-    """
-    discriminator ouputs the probablity of a sample is from generator
-    or real data.
-    The output has two dimenstional: dimension 0 is the probablity
-    of the sample is from generator and dimension 1 is the probabblity
-    of the sample is from real data.
-    """
-    param_attr = ParamAttr(is_static=is_generator_training)
-    bias_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=1.0, initial_std=0)
-
-    hidden = fc_layer(
-        input=sample,
-        name="dis_hidden",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=ReluActivation())
-
-    hidden2 = fc_layer(
-        input=hidden,
-        name="dis_hidden2",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    hidden_bn = batch_norm_layer(
-        hidden2,
-        act=ReluActivation(),
-        name="dis_hidden_bn",
-        bias_attr=bias_attr,
-        param_attr=ParamAttr(
-            is_static=is_generator_training, initial_mean=1.0,
-            initial_std=0.02),
-        use_global_stats=False)
-
-    return fc_layer(
-        input=hidden_bn,
-        name="dis_prob",
-        size=2,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=SoftmaxActivation())
-
-
-def generator(noise):
-    """
-    generator generates a sample given noise
-    """
-    param_attr = ParamAttr(is_static=is_discriminator_training)
-    bias_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
-
-    hidden = fc_layer(
-        input=noise,
-        name="gen_layer_hidden",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=ReluActivation())
-
-    hidden2 = fc_layer(
-        input=hidden,
-        name="gen_hidden2",
-        size=hidden_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    hidden_bn = batch_norm_layer(
-        hidden2,
-        act=ReluActivation(),
-        name="gen_layer_hidden_bn",
-        bias_attr=bias_attr,
-        param_attr=ParamAttr(
-            is_static=is_discriminator_training,
-            initial_mean=1.0,
-            initial_std=0.02),
-        use_global_stats=False)
-
-    return fc_layer(
-        input=hidden_bn,
-        name="gen_layer1",
-        size=sample_dim,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim)
-
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
diff --git a/v1_api_demo/gan/gan_conf_image.py b/v1_api_demo/gan/gan_conf_image.py
deleted file mode 100644
index c469227994..0000000000
--- a/v1_api_demo/gan/gan_conf_image.py
+++ /dev/null
@@ -1,298 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-mode = get_config_arg("mode", str, "generator")
-dataSource = get_config_arg("data", str, "mnist")
-assert mode in set([
-    "generator", "discriminator", "generator_training", "discriminator_training"
-])
-
-is_generator_training = mode == "generator_training"
-is_discriminator_training = mode == "discriminator_training"
-is_generator = mode == "generator"
-is_discriminator = mode == "discriminator"
-
-# The network structure below follows the dcgan paper 
-# (https://arxiv.org/abs/1511.06434)
-
-print('mode=%s' % mode)
-# the dim of the noise (z) as the input of the generator network
-noise_dim = 100
-# the number of filters in the layer in generator/discriminator that is 
-# closet to the image
-gf_dim = 64
-df_dim = 64
-if dataSource == "mnist":
-    sample_dim = 28  # image dim
-    c_dim = 1  # image color
-else:
-    sample_dim = 32
-    c_dim = 3
-s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
-s8, s16 = int(sample_dim / 8), int(sample_dim / 16)
-
-settings(
-    batch_size=128,
-    learning_rate=2e-4,
-    learning_method=AdamOptimizer(beta1=0.5))
-
-
-def conv_bn(input,
-            channels,
-            imgSize,
-            num_filters,
-            output_x,
-            stride,
-            name,
-            param_attr,
-            bias_attr,
-            param_attr_bn,
-            bn,
-            trans=False,
-            act=ReluActivation()):
-    """
-    conv_bn is a utility function that constructs a convolution/deconv layer 
-    with an optional batch_norm layer
-
-    :param bn: whether to use batch_norm_layer
-    :type bn: bool
-    :param trans: whether to use conv (False) or deconv (True)
-    :type trans: bool
-    """
-
-    # calculate the filter_size and padding size based on the given
-    # imgSize and ouput size
-    tmp = imgSize - (output_x - 1) * stride
-    if tmp <= 1 or tmp > 5:
-        raise ValueError("conv input-output dimension does not fit")
-    elif tmp <= 3:
-        filter_size = tmp + 2
-        padding = 1
-    else:
-        filter_size = tmp
-        padding = 0
-
-    print(imgSize, output_x, stride, filter_size, padding)
-
-    if trans:
-        nameApx = "_convt"
-    else:
-        nameApx = "_conv"
-
-    if bn:
-        conv = img_conv_layer(
-            input,
-            filter_size=filter_size,
-            num_filters=num_filters,
-            name=name + nameApx,
-            num_channels=channels,
-            act=LinearActivation(),
-            groups=1,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias_attr,
-            param_attr=param_attr,
-            shared_biases=True,
-            layer_attr=None,
-            filter_size_y=None,
-            stride_y=None,
-            padding_y=None,
-            trans=trans)
-
-        conv_bn = batch_norm_layer(
-            conv,
-            act=act,
-            name=name + nameApx + "_bn",
-            bias_attr=bias_attr,
-            param_attr=param_attr_bn,
-            use_global_stats=False)
-
-        return conv_bn
-    else:
-        conv = img_conv_layer(
-            input,
-            filter_size=filter_size,
-            num_filters=num_filters,
-            name=name + nameApx,
-            num_channels=channels,
-            act=act,
-            groups=1,
-            stride=stride,
-            padding=padding,
-            bias_attr=bias_attr,
-            param_attr=param_attr,
-            shared_biases=True,
-            layer_attr=None,
-            filter_size_y=None,
-            stride_y=None,
-            padding_y=None,
-            trans=trans)
-        return conv
-
-
-def generator(noise):
-    """
-    generator generates a sample given noise
-    """
-    param_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
-    bias_attr = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
-
-    param_attr_bn = ParamAttr(
-        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
-
-    h1 = fc_layer(
-        input=noise,
-        name="gen_layer_h1",
-        size=s8 * s8 * gf_dim * 4,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=LinearActivation())
-
-    h1_bn = batch_norm_layer(
-        h1,
-        act=ReluActivation(),
-        name="gen_layer_h1_bn",
-        bias_attr=bias_attr,
-        param_attr=param_attr_bn,
-        use_global_stats=False)
-
-    h2_bn = conv_bn(
-        h1_bn,
-        channels=gf_dim * 4,
-        output_x=s8,
-        num_filters=gf_dim * 2,
-        imgSize=s4,
-        stride=2,
-        name="gen_layer_h2",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True,
-        trans=True)
-
-    h3_bn = conv_bn(
-        h2_bn,
-        channels=gf_dim * 2,
-        output_x=s4,
-        num_filters=gf_dim,
-        imgSize=s2,
-        stride=2,
-        name="gen_layer_h3",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True,
-        trans=True)
-
-    return conv_bn(
-        h3_bn,
-        channels=gf_dim,
-        output_x=s2,
-        num_filters=c_dim,
-        imgSize=sample_dim,
-        stride=2,
-        name="gen_layer_h4",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=False,
-        trans=True,
-        act=TanhActivation())
-
-
-def discriminator(sample):
-    """
-    discriminator ouputs the probablity of a sample is from generator
-    or real data.
-    The output has two dimenstional: dimension 0 is the probablity
-    of the sample is from generator and dimension 1 is the probabblity
-    of the sample is from real data.
-    """
-    param_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
-    bias_attr = ParamAttr(
-        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
-
-    param_attr_bn = ParamAttr(
-        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
-
-    h0 = conv_bn(
-        sample,
-        channels=c_dim,
-        imgSize=sample_dim,
-        num_filters=df_dim,
-        output_x=s2,
-        stride=2,
-        name="dis_h0",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=False)
-
-    h1_bn = conv_bn(
-        h0,
-        channels=df_dim,
-        imgSize=s2,
-        num_filters=df_dim * 2,
-        output_x=s4,
-        stride=2,
-        name="dis_h1",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True)
-
-    h2_bn = conv_bn(
-        h1_bn,
-        channels=df_dim * 2,
-        imgSize=s4,
-        num_filters=df_dim * 4,
-        output_x=s8,
-        stride=2,
-        name="dis_h2",
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        param_attr_bn=param_attr_bn,
-        bn=True)
-
-    return fc_layer(
-        input=h2_bn,
-        name="dis_prob",
-        size=2,
-        bias_attr=bias_attr,
-        param_attr=param_attr,
-        act=SoftmaxActivation())
-
-
-if is_generator_training:
-    noise = data_layer(name="noise", size=noise_dim)
-    sample = generator(noise)
-
-if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)
-
-if is_generator_training or is_discriminator_training:
-    label = data_layer(name="label", size=1)
-    prob = discriminator(sample)
-    cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(
-        input=prob, label=label, name=mode + '_error')
-    outputs(cost)
-
-if is_generator:
-    noise = data_layer(name="noise", size=noise_dim)
-    outputs(generator(noise))
diff --git a/v1_api_demo/gan/gan_trainer.py b/v1_api_demo/gan/gan_trainer.py
deleted file mode 100644
index 4a26c230f7..0000000000
--- a/v1_api_demo/gan/gan_trainer.py
+++ /dev/null
@@ -1,349 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import random
-import numpy
-import cPickle
-import sys, os
-from PIL import Image
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-import py_paddle.swig_paddle as api
-import matplotlib.pyplot as plt
-
-
-def plot2DScatter(data, outputfile):
-    '''
-    Plot the data as a 2D scatter plot and save to outputfile
-    data needs to be two dimensinoal
-    '''
-    x = data[:, 0]
-    y = data[:, 1]
-    logger.info("The mean vector is %s" % numpy.mean(data, 0))
-    logger.info("The std vector is %s" % numpy.std(data, 0))
-
-    heatmap, xedges, yedges = numpy.histogram2d(x, y, bins=50)
-    extent = [xedges[0], xedges[-1], yedges[0], yedges[-1]]
-
-    plt.clf()
-    plt.scatter(x, y)
-    plt.savefig(outputfile, bbox_inches='tight')
-
-
-def CHECK_EQ(a, b):
-    assert a == b, "a=%s, b=%s" % (a, b)
-
-
-def copy_shared_parameters(src, dst):
-    '''
-    copy the parameters from src to dst
-    :param src: the source of the parameters
-    :type src: GradientMachine
-    :param dst: the destination of the parameters
-    :type dst: GradientMachine
-    '''
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-    src_params = dict([(p.getName(), p) for p in src_params])
-
-    for i in xrange(dst.getParameterSize()):
-        dst_param = dst.getParameter(i)
-        src_param = src_params.get(dst_param.getName(), None)
-        if src_param is None:
-            continue
-        src_value = src_param.getBuf(api.PARAMETER_VALUE)
-        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
-        CHECK_EQ(len(src_value), len(dst_value))
-        dst_value.copyFrom(src_value)
-        dst_param.setValueUpdated()
-
-
-def print_parameters(src):
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-
-    print "***************"
-    for p in src_params:
-        print "Name is %s" % p.getName()
-        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
-        )
-
-
-def load_mnist_data(imageFile):
-    f = open(imageFile, "rb")
-    f.read(16)
-
-    # Define number of samples for train/test
-    if "train" in imageFile:
-        n = 60000
-    else:
-        n = 10000
-
-    data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
-    data = data / 255.0 * 2.0 - 1.0
-
-    f.close()
-    return data.astype('float32')
-
-
-def load_cifar_data(cifar_path):
-    batch_size = 10000
-    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
-    for i in range(1, 6):
-        file = cifar_path + "/data_batch_" + str(i)
-        fo = open(file, 'rb')
-        dict = cPickle.load(fo)
-        fo.close()
-        data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
-
-    data = data / 255.0 * 2.0 - 1.0
-    return data
-
-
-# synthesize 2-D uniform data
-def load_uniform_data():
-    data = numpy.random.rand(1000000, 2).astype('float32')
-    return data
-
-
-def merge(images, size):
-    if images.shape[1] == 28 * 28:
-        h, w, c = 28, 28, 1
-    else:
-        h, w, c = 32, 32, 3
-    img = numpy.zeros((h * size[0], w * size[1], c))
-    for idx in xrange(size[0] * size[1]):
-        i = idx % size[1]
-        j = idx // size[1]
-        img[j*h:j*h+h, i*w:i*w+w, :] = \
-          ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
-    return img.astype('uint8')
-
-
-def save_images(images, path):
-    merged_img = merge(images, [8, 8])
-    if merged_img.shape[2] == 1:
-        im = Image.fromarray(numpy.squeeze(merged_img)).convert('RGB')
-    else:
-        im = Image.fromarray(merged_img, mode="RGB")
-    im.save(path)
-
-
-def get_real_samples(batch_size, data_np):
-    return data_np[numpy.random.choice(
-        data_np.shape[0], batch_size, replace=False), :]
-
-
-def get_noise(batch_size, noise_dim):
-    return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
-
-
-def get_fake_samples(generator_machine, batch_size, noise):
-    gen_inputs = api.Arguments.createArguments(1)
-    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    gen_outputs = api.Arguments.createArguments(0)
-    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
-    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
-    return fake_samples
-
-
-def get_training_loss(training_machine, inputs):
-    outputs = api.Arguments.createArguments(0)
-    training_machine.forward(inputs, outputs, api.PASS_TEST)
-    loss = outputs.getSlotValue(0).copyToNumpyMat()
-    return numpy.mean(loss)
-
-
-def prepare_discriminator_data_batch_pos(batch_size, data_np):
-    real_samples = get_real_samples(batch_size, data_np)
-    labels = numpy.ones(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(real_samples))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
-    return inputs
-
-
-def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
-    fake_samples = get_fake_samples(generator_machine, batch_size, noise)
-    labels = numpy.zeros(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(fake_samples))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
-    return inputs
-
-
-def prepare_generator_data_batch(batch_size, noise):
-    label = numpy.ones(batch_size, dtype='int32')
-    inputs = api.Arguments.createArguments(2)
-    inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(label))
-    return inputs
-
-
-def find(iterable, cond):
-    for item in iterable:
-        if cond(item):
-            return item
-    return None
-
-
-def get_layer_size(model_conf, layer_name):
-    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
-    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
-    return layer_conf.size
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
-    parser.add_argument(
-        "--use_gpu", default="1", help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
-    args = parser.parse_args()
-    data_source = args.data_source
-    use_gpu = args.use_gpu
-    assert data_source in ["mnist", "cifar", "uniform"]
-    assert use_gpu in ["0", "1"]
-
-    if not os.path.exists("./%s_samples/" % data_source):
-        os.makedirs("./%s_samples/" % data_source)
-
-    if not os.path.exists("./%s_params/" % data_source):
-        os.makedirs("./%s_params/" % data_source)
-
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-                   '--log_period=100', '--gpu_id=' + args.gpu_id,
-                   '--save_dir=' + "./%s_params/" % data_source)
-
-    if data_source == "uniform":
-        conf = "gan_conf.py"
-        num_iter = 10000
-    else:
-        conf = "gan_conf_image.py"
-        num_iter = 1000
-
-    gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-    dis_conf = parse_config(conf,
-                            "mode=discriminator_training,data=" + data_source)
-    generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
-    batch_size = dis_conf.opt_config.batch_size
-    noise_dim = get_layer_size(gen_conf.model_config, "noise")
-
-    if data_source == "mnist":
-        data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
-    elif data_source == "cifar":
-        data_np = load_cifar_data("./data/cifar-10-batches-py/")
-    else:
-        data_np = load_uniform_data()
-
-    # this creates a gradient machine for discriminator
-    dis_training_machine = api.GradientMachine.createFromConfigProto(
-        dis_conf.model_config)
-    # this create a gradient machine for generator    
-    gen_training_machine = api.GradientMachine.createFromConfigProto(
-        gen_conf.model_config)
-
-    # generator_machine is used to generate data only, which is used for
-    # training discriminator
-    logger.info(str(generator_conf.model_config))
-    generator_machine = api.GradientMachine.createFromConfigProto(
-        generator_conf.model_config)
-
-    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
-
-    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
-
-    dis_trainer.startTrain()
-    gen_trainer.startTrain()
-
-    # Sync parameters between networks (GradientMachine) at the beginning
-    copy_shared_parameters(gen_training_machine, dis_training_machine)
-    copy_shared_parameters(gen_training_machine, generator_machine)
-
-    # constrain that either discriminator or generator can not be trained
-    # consecutively more than MAX_strike times
-    curr_train = "dis"
-    curr_strike = 0
-    MAX_strike = 5
-
-    for train_pass in xrange(100):
-        dis_trainer.startTrainPass()
-        gen_trainer.startTrainPass()
-        for i in xrange(num_iter):
-            # Do forward pass in discriminator to get the dis_loss
-            noise = get_noise(batch_size, noise_dim)
-            data_batch_dis_pos = prepare_discriminator_data_batch_pos(
-                batch_size, data_np)
-            dis_loss_pos = get_training_loss(dis_training_machine,
-                                             data_batch_dis_pos)
-
-            data_batch_dis_neg = prepare_discriminator_data_batch_neg(
-                generator_machine, batch_size, noise)
-            dis_loss_neg = get_training_loss(dis_training_machine,
-                                             data_batch_dis_neg)
-
-            dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
-
-            # Do forward pass in generator to get the gen_loss
-            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
-            gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
-
-            if i % 100 == 0:
-                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
-                                                                 dis_loss_neg)
-                print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)
-
-            # Decide which network to train based on the training history
-            # And the relative size of the loss        
-            if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
-               ((curr_train == "gen" and curr_strike == MAX_strike) or dis_loss > gen_loss):
-                if curr_train == "dis":
-                    curr_strike += 1
-                else:
-                    curr_train = "dis"
-                    curr_strike = 1
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
-                copy_shared_parameters(dis_training_machine,
-                                       gen_training_machine)
-
-            else:
-                if curr_train == "gen":
-                    curr_strike += 1
-                else:
-                    curr_train = "gen"
-                    curr_strike = 1
-                gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
-                # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
-                # so that we do not need to copy shared parameters. 
-                copy_shared_parameters(gen_training_machine,
-                                       dis_training_machine)
-                copy_shared_parameters(gen_training_machine, generator_machine)
-
-        dis_trainer.finishTrainPass()
-        gen_trainer.finishTrainPass()
-        # At the end of each pass, save the generated samples/images
-        fake_samples = get_fake_samples(generator_machine, batch_size, noise)
-        if data_source == "uniform":
-            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
-                          (data_source, train_pass))
-        else:
-            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
-                        (data_source, train_pass))
-    dis_trainer.finishTrain()
-    gen_trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/mnist/.gitignore b/v1_api_demo/mnist/.gitignore
deleted file mode 100644
index 7e61d5e3a0..0000000000
--- a/v1_api_demo/mnist/.gitignore
+++ /dev/null
@@ -1,10 +0,0 @@
-data/raw_data
-data/*.list
-mnist_vgg_model
-plot.png
-train.log
-*pyc
-.ipynb_checkpoints
-params.pkl
-params.tar
-params.tar.gz
diff --git a/v1_api_demo/mnist/api_train.py b/v1_api_demo/mnist/api_train.py
deleted file mode 100644
index ea1caa7dd9..0000000000
--- a/v1_api_demo/mnist/api_train.py
+++ /dev/null
@@ -1,196 +0,0 @@
-"""
-A very basic example for how to use current Raw SWIG API to train mnist network.
-
-Current implementation uses Raw SWIG, which means the API call is directly \
-passed to C++ side of Paddle.
-
-The user api could be simpler and carefully designed.
-"""
-import random
-
-import numpy as np
-import paddle.v2 as paddle_v2
-import py_paddle.swig_paddle as api
-from paddle.trainer_config_helpers import *
-from py_paddle import DataProviderConverter
-
-from mnist_util import read_from_mnist
-
-
-def init_parameter(network):
-    assert isinstance(network, api.GradientMachine)
-    for each_param in network.getParameters():
-        assert isinstance(each_param, api.Parameter)
-        array_size = len(each_param)
-        array = np.random.uniform(-1.0, 1.0, array_size).astype('float32')
-        each_param.getBuf(api.PARAMETER_VALUE).copyFromNumpyArray(array)
-
-
-def generator_to_batch(generator, batch_size):
-    ret_val = list()
-    for each_item in generator:
-        ret_val.append(each_item)
-        if len(ret_val) == batch_size:
-            yield ret_val
-            ret_val = list()
-    if len(ret_val) != 0:
-        yield ret_val
-
-
-class BatchPool(object):
-    def __init__(self, generator, batch_size):
-        self.data = list(generator)
-        self.batch_size = batch_size
-
-    def __call__(self):
-        random.shuffle(self.data)
-        for offset in xrange(0, len(self.data), self.batch_size):
-            limit = min(offset + self.batch_size, len(self.data))
-            yield self.data[offset:limit]
-
-
-def input_order_converter(generator):
-    for each_item in generator:
-        yield each_item['pixel'], each_item['label']
-
-
-def main():
-    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
-
-    optimizer = paddle_v2.optimizer.Adam(
-        learning_rate=1e-4,
-        batch_size=1000,
-        model_average=ModelAverage(average_window=0.5),
-        regularization=L2Regularization(rate=0.5))
-
-    # Create Local Updater. Local means not run in cluster.
-    # For a cluster training, here we can change to createRemoteUpdater
-    # in future.
-    updater = optimizer.create_local_updater()
-    assert isinstance(updater, api.ParameterUpdater)
-
-    # define network
-    images = paddle_v2.layer.data(
-        name='pixel', type=paddle_v2.data_type.dense_vector(784))
-    label = paddle_v2.layer.data(
-        name='label', type=paddle_v2.data_type.integer_value(10))
-    hidden1 = paddle_v2.layer.fc(input=images, size=200)
-    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
-    inference = paddle_v2.layer.fc(input=hidden2,
-                                   size=10,
-                                   act=paddle_v2.activation.Softmax())
-    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
-
-    # Create Simple Gradient Machine.
-    model_config = paddle_v2.layer.parse_network(cost)
-    m = api.GradientMachine.createFromConfigProto(model_config,
-                                                  api.CREATE_MODE_NORMAL,
-                                                  optimizer.enable_types())
-
-    # This type check is not useful. Only enable type hint in IDE.
-    # Such as PyCharm
-    assert isinstance(m, api.GradientMachine)
-
-    # Initialize Parameter by numpy.
-    init_parameter(network=m)
-
-    # Initialize ParameterUpdater.
-    updater.init(m)
-
-    # DataProvider Converter is a utility convert Python Object to Paddle C++
-    # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(input_types=[images.type, label.type])
-
-    train_file = './data/raw_data/train'
-    test_file = './data/raw_data/t10k'
-
-    # start gradient machine.
-    # the gradient machine must be started before invoke forward/backward.
-    # not just for training, but also for inference.
-    m.start()
-
-    # evaluator can print error rate, etc. It is a C++ class.
-    batch_evaluator = m.makeEvaluator()
-    test_evaluator = m.makeEvaluator()
-
-    # Get Train Data.
-    # TrainData will stored in a data pool. Currently implementation is not care
-    # about memory, speed. Just a very naive implementation.
-    train_data_generator = input_order_converter(read_from_mnist(train_file))
-    train_data = BatchPool(train_data_generator, 512)
-
-    # outArgs is Neural Network forward result. Here is not useful, just passed
-    # to gradient_machine.forward
-    outArgs = api.Arguments.createArguments(0)
-
-    for pass_id in xrange(2):  # we train 2 passes.
-        updater.startPass()
-
-        for batch_id, data_batch in enumerate(train_data()):
-            # data_batch is input images.
-            # here, for online learning, we could get data_batch from network.
-
-            # Start update one batch.
-            pass_type = updater.startBatch(len(data_batch))
-
-            # Start BatchEvaluator.
-            # batch_evaluator can be used between start/finish.
-            batch_evaluator.start()
-
-            # forwardBackward is a shortcut for forward and backward.
-            # It is sometimes faster than invoke forward/backward separately,
-            # because in GradientMachine, it may be async.
-            m.forwardBackward(converter(data_batch), outArgs, pass_type)
-
-            for each_param in m.getParameters():
-                updater.update(each_param)
-
-            # Get cost. We use numpy to calculate total cost for this batch.
-            cost_vec = outArgs.getSlotValue(0)
-            cost_vec = cost_vec.copyToNumpyMat()
-            cost = cost_vec.sum() / len(data_batch)
-
-            # Make evaluator works.
-            m.eval(batch_evaluator)
-
-            # Print logs.
-            print 'Pass id', pass_id, 'Batch id', batch_id, 'with cost=', \
-                cost, batch_evaluator
-
-            batch_evaluator.finish()
-            # Finish batch.
-            #  * will clear gradient.
-            #  * ensure all values should be updated.
-            updater.finishBatch(cost)
-
-        # testing stage. use test data set to test current network.
-        updater.apply()
-        test_evaluator.start()
-        test_data_generator = input_order_converter(read_from_mnist(test_file))
-        for data_batch in generator_to_batch(test_data_generator, 512):
-            # in testing stage, only forward is needed.
-            m.forward(converter(data_batch), outArgs, api.PASS_TEST)
-            m.eval(test_evaluator)
-
-        # print error rate for test data set
-        print 'Pass', pass_id, ' test evaluator: ', test_evaluator
-        test_evaluator.finish()
-        updater.restore()
-
-        updater.catchUpWith()
-        params = m.getParameters()
-        for each_param in params:
-            assert isinstance(each_param, api.Parameter)
-            value = each_param.getBuf(api.PARAMETER_VALUE)
-            value = value.copyToNumpyArray()
-
-            # Here, we could save parameter to every where you want
-            print each_param.getName(), value
-
-        updater.finishPass()
-
-    m.finish()
-
-
-if __name__ == '__main__':
-    main()
diff --git a/v1_api_demo/mnist/data/get_mnist_data.sh b/v1_api_demo/mnist/data/get_mnist_data.sh
deleted file mode 100755
index 5a2e34026d..0000000000
--- a/v1_api_demo/mnist/data/get_mnist_data.sh
+++ /dev/null
@@ -1,21 +0,0 @@
-#!/usr/bin/env sh
-# This scripts downloads the mnist data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/raw_data"
-mkdir "$DIR/raw_data"
-cd "$DIR/raw_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
-
-cd $DIR
-rm -f *.list
-python generate_list.py
diff --git a/v1_api_demo/mnist/light_mnist.py b/v1_api_demo/mnist/light_mnist.py
deleted file mode 100644
index 3340905435..0000000000
--- a/v1_api_demo/mnist/light_mnist.py
+++ /dev/null
@@ -1,79 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg("is_predict", bool, False)
-
-####################Data Configuration ##################
-
-if not is_predict:
-    data_dir = './data/'
-    define_py_data_sources2(
-        train_list=data_dir + 'train.list',
-        test_list=data_dir + 'test.list',
-        module='mnist_provider',
-        obj='process')
-
-######################Algorithm Configuration #############
-settings(batch_size=50, learning_rate=0.001, learning_method=AdamOptimizer())
-
-#######################Network Configuration #############
-
-data_size = 1 * 28 * 28
-label_size = 10
-img = data_layer(name='pixel', size=data_size)
-
-
-# light cnn
-# A shallower cnn model: [CNN, BN, ReLU, Max-Pooling] x4 + FC x1
-# Easier to train for mnist dataset and quite efficient
-# Final performance is close to deeper ones on tasks such as digital and character classification 
-def light_cnn(input_image, num_channels, num_classes):
-    def __light__(ipt,
-                  num_filter=128,
-                  times=1,
-                  conv_filter_size=3,
-                  dropouts=0,
-                  num_channels_=None):
-        return img_conv_group(
-            input=ipt,
-            num_channels=num_channels_,
-            pool_size=2,
-            pool_stride=2,
-            conv_padding=0,
-            conv_num_filter=[num_filter] * times,
-            conv_filter_size=conv_filter_size,
-            conv_act=ReluActivation(),
-            conv_with_batchnorm=True,
-            conv_batchnorm_drop_rate=dropouts,
-            pool_type=MaxPooling())
-
-    tmp = __light__(input_image, num_filter=128, num_channels_=num_channels)
-    tmp = __light__(tmp, num_filter=128)
-    tmp = __light__(tmp, num_filter=128)
-    tmp = __light__(tmp, num_filter=128, conv_filter_size=1)
-
-    tmp = fc_layer(input=tmp, size=num_classes, act=SoftmaxActivation())
-    return tmp
-
-
-predict = light_cnn(input_image=img, num_channels=1, num_classes=label_size)
-
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    inputs(img, lbl)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
diff --git a/v1_api_demo/mnist/mnist_provider.py b/v1_api_demo/mnist/mnist_provider.py
deleted file mode 100644
index 888cfef1e7..0000000000
--- a/v1_api_demo/mnist/mnist_provider.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from paddle.trainer.PyDataProvider2 import *
-from mnist_util import read_from_mnist
-
-
-# Define a py data provider
-@provider(
-    input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)},
-    cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):  # settings is not used currently.
-    for each in read_from_mnist(filename):
-        yield each
diff --git a/v1_api_demo/mnist/mnist_util.py b/v1_api_demo/mnist/mnist_util.py
deleted file mode 100644
index 3fd88ae7ed..0000000000
--- a/v1_api_demo/mnist/mnist_util.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import numpy
-
-__all__ = ['read_from_mnist']
-
-
-def read_from_mnist(filename):
-    imgf = filename + "-images-idx3-ubyte"
-    labelf = filename + "-labels-idx1-ubyte"
-    f = open(imgf, "rb")
-    l = open(labelf, "rb")
-
-    f.read(16)
-    l.read(8)
-
-    # Define number of samples for train/test
-    if "train" in filename:
-        n = 60000
-    else:
-        n = 10000
-
-    images = numpy.fromfile(
-        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
-    images = images / 255.0 * 2.0 - 1.0
-    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
-
-    for i in xrange(n):
-        yield {"pixel": images[i, :], 'label': labels[i]}
-
-    f.close()
-    l.close()
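
read_from_mnist skips the first 16 and 8 bytes of the image and label files; those are the IDX headers. A small sketch of decoding them instead of skipping them, assuming the standard big-endian IDX layout (the file paths are illustrative):

```python
import struct

# IDX files start with big-endian int32 fields: a magic number, the item
# count, and (for image files) the row and column sizes.
with open("train-images-idx3-ubyte", "rb") as f:  # illustrative path
    magic, n, rows, cols = struct.unpack(">4i", f.read(16))
with open("train-labels-idx1-ubyte", "rb") as f:
    magic, n = struct.unpack(">2i", f.read(8))
```
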
diff --git a/v1_api_demo/mnist/vgg_16_mnist.py b/v1_api_demo/mnist/vgg_16_mnist.py
deleted file mode 100644
index a819b391c6..0000000000
--- a/v1_api_demo/mnist/vgg_16_mnist.py
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-is_predict = get_config_arg("is_predict", bool, False)
-
-####################Data Configuration ##################
-
-if not is_predict:
-    data_dir = './data/'
-    define_py_data_sources2(
-        train_list=data_dir + 'train.list',
-        test_list=data_dir + 'test.list',
-        module='mnist_provider',
-        obj='process')
-
-######################Algorithm Configuration #############
-settings(
-    batch_size=128,
-    learning_rate=0.1 / 128.0,
-    learning_method=MomentumOptimizer(0.9),
-    regularization=L2Regularization(0.0005 * 128))
-
-#######################Network Configuration #############
-
-data_size = 1 * 28 * 28
-label_size = 10
-img = data_layer(name='pixel', size=data_size)
-
-# small_vgg is predefined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=1, num_classes=label_size)
-
-if not is_predict:
-    lbl = data_layer(name="label", size=label_size)
-    inputs(img, lbl)
-    outputs(classification_cost(input=predict, label=lbl))
-else:
-    outputs(predict)
diff --git a/v1_api_demo/model_zoo/embedding/.gitignore b/v1_api_demo/model_zoo/embedding/.gitignore
deleted file mode 100644
index 908f5a3fb2..0000000000
--- a/v1_api_demo/model_zoo/embedding/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-baidu.dict
-model_*.emb
diff --git a/v1_api_demo/model_zoo/embedding/extract_para.py b/v1_api_demo/model_zoo/embedding/extract_para.py
deleted file mode 100755
index 570b90c1f7..0000000000
--- a/v1_api_demo/model_zoo/embedding/extract_para.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python extract_para.py --preModel PREMODEL --preDict PREDICT \
-                            --usrModel USRMODEL --usrDict USRDICT -d DIM
-
-Options:
-    -h, --help          show this help message and exit
-    --preModel PREMODEL the name of pretrained embedding model
-    --preDict PREDICT   the name of pretrained dictionary
-    --usrModel usrModel the name of output usr embedding model
-    --usrDict usrDict   the name of user specified dictionary
-    -d DIM              dimension of parameter
-"""
-from optparse import OptionParser
-import struct
-
-
-def get_row_index(preDict, usrDict):
-    """
-    Get the row positions for all words in user dictionary from pre-trained dictionary.
-    return: a list of row positions
-    Example: preDict='a\nb\nc\n', usrDict='a\nc\n', then return [0,2]
-    """
-    pos = []
-    index = dict()
-    with open(preDict, "r") as f:
-        for line_index, line in enumerate(f):
-            word = line.strip().split()[0]
-            index[word] = line_index
-    with open(usrDict, "r") as f:
-        for line in f:
-            word = line.strip().split()[0]
-            pos.append(index[word])
-    return pos
-
-
-def extract_parameters_by_usrDict(preModel, preDict, usrModel, usrDict,
-                                  paraDim):
-    """
-    Extract desired parameters from a pretrained embedding model based on user dictionary
-    """
-    if paraDim not in [32, 64, 128, 256]:
-        raise RuntimeError("We only support 32, 64, 128, 256 dimensions now")
-
-    fi = open(preModel, "rb")
-    fo = open(usrModel, "wb")
-
-    # write filehead
-    rowIndex = get_row_index(preDict, usrDict)
-    newHead = struct.pack("iil", 0, 4, len(rowIndex) * paraDim)
-    fo.write(newHead)
-    bytes = 4 * paraDim
-    for i in range(0, len(rowIndex)):
-        # find the absolute position of input file
-        fi.seek(rowIndex[i] * bytes + 16, 0)
-        fo.write(fi.read(bytes))
-
-    print "extract parameters finish, total", len(rowIndex), "lines"
-    fi.close()
-
-
-def main():
-    """
-    Main entry for running extract_para.py.
-    """
-    usage = "usage: \n" \
-            "python %prog --preModel PREMODEL --preDict PREDICT" \
-            " --usrModel USRMODEL --usrDict USRDICT -d DIM"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "--preModel",
-        action="store",
-        dest="preModel",
-        help="the name of pretrained embedding model")
-    parser.add_option(
-        "--preDict",
-        action="store",
-        dest="preDict",
-        help="the name of pretrained dictionary")
-    parser.add_option(
-        "--usrModel",
-        action="store",
-        dest="usrModel",
-        help="the name of output usr embedding model")
-    parser.add_option(
-        "--usrDict",
-        action="store",
-        dest="usrDict",
-        help="the name of user specified dictionary")
-    parser.add_option(
-        "-d", action="store", dest="dim", help="dimension of parameter")
-    (options, args) = parser.parse_args()
-    extract_parameters_by_usrDict(options.preModel, options.preDict,
-                                  options.usrModel, options.usrDict,
-                                  int(options.dim))
-
-
-if __name__ == '__main__':
-    main()
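
The seek arithmetic above relies on a fixed 16-byte header followed by densely packed float32 rows. A minimal sketch of the same layout, assuming an LP64 platform where struct's native 'l' is 8 bytes (so "iil" packs to exactly 16 bytes):

```python
import struct

PARA_DIM = 32
# version, floatSize, paraCount -- 4 + 4 + 8 bytes on LP64 platforms
header = struct.pack("iil", 0, 4, 10 * PARA_DIM)
assert struct.calcsize("iil") == 16

def row_offset(row, para_dim=PARA_DIM, header_size=16):
    # absolute file offset of one embedding row, as used by fi.seek() above
    return header_size + row * 4 * para_dim

assert row_offset(0) == 16
assert row_offset(2) == 16 + 2 * 4 * PARA_DIM
```
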
diff --git a/v1_api_demo/model_zoo/embedding/paraconvert.py b/v1_api_demo/model_zoo/embedding/paraconvert.py
deleted file mode 100755
index ce7a70efc4..0000000000
--- a/v1_api_demo/model_zoo/embedding/paraconvert.py
+++ /dev/null
@@ -1,159 +0,0 @@
-#!/bin/env python
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-Example:
-    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
-    python paraconvert.py --t2b -i INPUT -o OUTPUT
-
-Options:
-    -h, --help  show this help message and exit
-    --b2t       convert parameter file of embedding model from binary to text
-    --t2b       convert parameter file of embedding model from text to binary
-    -i INPUT    input parameter file name
-    -o OUTPUT   output parameter file name
-    -d DIM      dimension of parameter
-"""
-from optparse import OptionParser
-import struct
-
-
-def binary2text(input, output, paraDim):
-    """
-    Convert a binary parameter file of an embedding model to a text file.
-    input: the name of the input binary parameter file, with the format:
-           1) the first 16 bytes are the file header:
-                version(4 bytes): version of paddle, default = 0
-                floatSize(4 bytes): sizeof(float) = 4
-                paraCount(8 bytes): total number of parameters
-           2) the next (paraCount * 4) bytes are the parameters, 4 bytes each
-    output: the name of the output text parameter file, for example:
-           0,4,32156096
-           -0.7845433,1.1937413,-0.1704215,...
-           0.0000909,0.0009465,-0.0008813,...
-           ...
-           the format is:
-           1) the first line is the file header:
-              version=0, floatSize=4, paraCount=32156096
-           2) the remaining lines hold the parameters
-              a) each line holds paraDim parameters separated by ','
-              b) there are paraCount/paraDim lines (one per embedding word)
-    paraDim: dimension of the parameters
-    """
-    fi = open(input, "rb")
-    fo = open(output, "w")
-    """
-    """
-    version, floatSize, paraCount = struct.unpack("iil", fi.read(16))
-    newHead = ','.join([str(version), str(floatSize), str(paraCount)])
-    print >> fo, newHead
-
-    bytes = 4 * int(paraDim)
-    format = "%df" % int(paraDim)
-    context = fi.read(bytes)
-    line = 0
-
-    while context:
-        numbers = struct.unpack(format, context)
-        lst = []
-        for i in numbers:
-            lst.append('%8.7f' % i)
-        print >> fo, ','.join(lst)
-        context = fi.read(bytes)
-        line += 1
-    fi.close()
-    fo.close()
-    print "binary2text finish, total", line, "lines"
-
-
-def get_para_count(input):
-    """
-    Compute the total number of embedding parameters in the input text file.
-    input: the name of input text file
-    """
-    numRows = 1
-    paraDim = 0
-    with open(input) as f:
-        line = f.readline()
-        paraDim = len(line.split(","))
-        for line in f:
-            numRows += 1
-    return numRows * paraDim
-
-
-def text2binary(input, output, paddle_head=True):
-    """
-    Convert a text parameter file of an embedding model to a binary file.
-    input: the name of the input text parameter file, for example:
-           -0.7845433,1.1937413,-0.1704215,...
-           0.0000909,0.0009465,-0.0008813,...
-           ...
-           the format is:
-           1) it has no file header
-           2) each line stores the same number of parameters,
-              separated by commas ','
-    output: the name of the output binary parameter file, with the format:
-           1) the first 16 bytes are the file header:
-             version(4 bytes), floatSize(4 bytes), paraCount(8 bytes)
-           2) the next (paraCount * 4) bytes are the parameters, 4 bytes each
-    """
-    fi = open(input, "r")
-    fo = open(output, "wb")
-
-    newHead = struct.pack("iil", 0, 4, get_para_count(input))
-    fo.write(newHead)
-
-    count = 0
-    for line in fi:
-        line = line.strip().split(",")
-        for i in range(0, len(line)):
-            binary_data = struct.pack("f", float(line[i]))
-            fo.write(binary_data)
-        count += 1
-    fi.close()
-    fo.close()
-    print "text2binary finish, total", count, "lines"
-
-
-def main():
-    """
-    Main entry for running paraconvert.py.
-    """
-    usage = "usage: \n" \
-            "python %prog --b2t -i INPUT -o OUTPUT -d DIM \n" \
-            "python %prog --t2b -i INPUT -o OUTPUT"
-    parser = OptionParser(usage)
-    parser.add_option(
-        "--b2t",
-        action="store_true",
-        help="convert parameter file of embedding model from binary to text")
-    parser.add_option(
-        "--t2b",
-        action="store_true",
-        help="convert parameter file of embedding model from text to binary")
-    parser.add_option(
-        "-i", action="store", dest="input", help="input parameter file name")
-    parser.add_option(
-        "-o", action="store", dest="output", help="output parameter file name")
-    parser.add_option(
-        "-d", action="store", dest="dim", help="dimension of parameter")
-    (options, args) = parser.parse_args()
-    if options.b2t:
-        binary2text(options.input, options.output, options.dim)
-    if options.t2b:
-        text2binary(options.input, options.output)
-
-
-if __name__ == '__main__':
-    main()
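
Each row in the binary format is just paraDim packed float32 values, so a row can be round-tripped with the same "%df" format strings used above. A minimal sketch (the values are illustrative and chosen to be exactly representable in float32):

```python
import struct

dim = 4
row = (0.5, -1.25, 3.0, 0.0)           # exactly representable in float32
blob = struct.pack("%df" % dim, *row)  # one binary row, 4 bytes per value
assert len(blob) == 4 * dim
assert struct.unpack("%df" % dim, blob) == row
```
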
diff --git a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh b/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
deleted file mode 100755
index f61c65a935..0000000000
--- a/v1_api_demo/model_zoo/embedding/pre_DictAndModel.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-set -x
-BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
-
-DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
-ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
-          f88c8325ee6da6187f1080e8fe66c1cd
-          927cf70f27f860aff1a5703ebf7f1584
-          a52e43655cd25d279777ed509a1ae27b
-          b92c67fe9ff70fea53596080e351ac80)
-
-for ((i=0; i<${#ITEM_MD5[@]}; i++))
-do
-  FILENAME=${DOWNLOAD_ITEMS[${i}]}
-  REAL_MD5=`wget ${BASE_URL}/${FILENAME} -O - | tee ${FILENAME} | md5sum | cut -d ' ' -f 1`
-  EXPECTED_MD5=${ITEM_MD5[${i}]}
-  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ]
-done
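
The shell loop verifies each download against a known md5 digest. The same check can be done from Python with hashlib when md5sum is unavailable; a minimal sketch (the file name in the comment is illustrative):

```python
import hashlib

def md5_of(path, chunk_size=1 << 20):
    digest = hashlib.md5()
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(chunk_size), b""):
            digest.update(block)
    return digest.hexdigest()

# e.g.: assert md5_of("baidu.dict") == "fa03a12321eaab6c30a8fcc9442eaea3"
```
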
diff --git a/v1_api_demo/model_zoo/resnet/.gitignore b/v1_api_demo/model_zoo/resnet/.gitignore
deleted file mode 100644
index 7a64209b62..0000000000
--- a/v1_api_demo/model_zoo/resnet/.gitignore
+++ /dev/null
@@ -1,5 +0,0 @@
-fea_output/
-features/
-model.list
-ResNet_50.dot
-ResNet_50.png
diff --git a/v1_api_demo/model_zoo/resnet/classify.py b/v1_api_demo/model_zoo/resnet/classify.py
deleted file mode 100755
index 6074cc1d3a..0000000000
--- a/v1_api_demo/model_zoo/resnet/classify.py
+++ /dev/null
@@ -1,312 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import cPickle
-import logging
-from PIL import Image
-import numpy as np
-from optparse import OptionParser
-
-import paddle.utils.image_util as image_util
-
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import dense_vector
-from paddle.trainer.config_parser import parse_config
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-
-
-class ImageClassifier():
-    def __init__(self,
-                 train_conf,
-                 model_dir=None,
-                 resize_dim=256,
-                 crop_dim=224,
-                 use_gpu=True,
-                 mean_file=None,
-                 output_layer=None,
-                 oversample=False,
-                 is_color=True):
-        """
-        train_conf: network configuration file.
-        model_dir: string, directory of the model.
-        resize_dim: int, resized image size.
-        crop_dim: int, crop size.
-        mean_file: string, image mean file.
-        oversample: bool, oversampling means multiple crops, namely five
-                    patches (the four corner patches and the center
-                    patch) as well as their horizontal reflections,
-                    ten crops in all.
-        """
-        self.train_conf = train_conf
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.resize_dim = resize_dim
-        self.crop_dims = [crop_dim, crop_dim]
-        self.oversample = oversample
-        self.is_color = is_color
-
-        self.output_layer = output_layer
-        if self.output_layer:
-            assert isinstance(self.output_layer, basestring)
-            self.output_layer = self.output_layer.split(",")
-
-        self.transformer = image_util.ImageTransformer(is_color=is_color)
-        self.transformer.set_transpose((2, 0, 1))
-        self.transformer.set_channel_swap((2, 1, 0))
-
-        self.mean_file = mean_file
-        if self.mean_file is not None:
-            mean = np.load(self.mean_file)['data_mean']
-            mean = mean.reshape(3, self.crop_dims[0], self.crop_dims[1])
-            self.transformer.set_mean(mean)  # mean pixel
-        else:
-            # If you use three mean values, set them like this:
-            # these three mean values are calculated from ImageNet.
-            self.transformer.set_mean(np.array([103.939, 116.779, 123.68]))
-
-        conf_args = "is_test=1,use_gpu=%d,is_predict=1" % (int(use_gpu))
-        conf = parse_config(train_conf, conf_args)
-        swig_paddle.initPaddle("--use_gpu=%d" % (int(use_gpu)))
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        assert isinstance(self.network, swig_paddle.GradientMachine)
-        self.network.loadParameters(self.model_dir)
-
-        data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
-        slots = [dense_vector(data_size)]
-        self.converter = DataProviderConverter(slots)
-
-    def get_data(self, img_path):
-        """
-        1. load image from img_path.
-        2. resize or oversampling.
-        3. transformer data: transpose, channel swap, sub mean.
-        return K x H x W ndarray.
-
-        img_path: image path.
-        """
-        image = image_util.load_image(img_path, self.is_color)
-        # Another way to extract oversampled features is to crop and
-        # average from a large feature map computed on a large input
-        # image. That approach reduces the computation.
-        if self.oversample:
-            # image_util.resize_image: short side is self.resize_dim
-            image = image_util.resize_image(image, self.resize_dim)
-            image = np.array(image)
-            input = np.zeros(
-                (1, image.shape[0], image.shape[1], 3), dtype=np.float32)
-            input[0] = image.astype(np.float32)
-            input = image_util.oversample(input, self.crop_dims)
-        else:
-            image = image.resize(self.crop_dims, Image.ANTIALIAS)
-            input = np.zeros(
-                (1, self.crop_dims[0], self.crop_dims[1], 3), dtype=np.float32)
-            input[0] = np.array(image).astype(np.float32)
-
-        data_in = []
-        for img in input:
-            img = self.transformer.transformer(img).flatten()
-            data_in.append([img.tolist()])
-        # paddle input: [[[]],[[]],...], [[]] is one sample.
-        return data_in
-
-    def forward(self, data, output_layer):
-        """
-        Run a forward pass and return the outputs of the given layers.
-
-        data: py_paddle input data.
-        output_layer: name(s) of the layer(s) whose outputs are returned.
-        """
-        input = self.converter(data)
-        self.network.forwardTest(input)
-        output = self.network.getLayerOutputs(output_layer)
-        res = {}
-        if isinstance(output_layer, basestring):
-            output_layer = [output_layer]
-        for name in output_layer:
-            # For oversampling, average predictions across crops.
-            # If not, the shape of output[name]: (1, class_number),
-            # the mean is also applicable.
-            res[name] = output[name]['value'].mean(0)
-
-        return res
-
-    def predict(self, data_file):
-        """
-        Run a forward pass and predict labels.
-
-        data_file: input image list.
-        """
-        image_files = open(data_file, 'rb').readlines()
-        results = {}
-        if self.output_layer is None:
-            self.output_layer = ["output"]
-        for line in image_files:
-            image = line.split()[0]
-            data = self.get_data(image)
-            prob = self.forward(data, self.output_layer)
-            lab = np.argsort(-prob[self.output_layer[0]])
-            results[image] = lab[0]
-            logging.info("Label of %s is: %d", image, lab[0])
-        return results
-
-    def extract(self, data_file, output_dir, batch_size=10000):
-        """
-        Extract and save features of the output layers, which are
-        specified in Outputs() in the network configuration.
-
-        data_file: file name of input data.
-        output_dir: saved directory of extracted features.
-        batch_size: number of samples in one batch file.
-        """
-        if not os.path.exists(output_dir):
-            os.mkdir(output_dir)
-
-        sample_num = 0
-        batch_num = 0
-        image_feature = {}
-        image_files = open(data_file, 'rb').readlines()
-        for idx, line in enumerate(image_files):
-            image = line.split()[0]
-            data = self.get_data(image)
-            feature = self.forward(data, self.output_layer)
-            # save extracted features
-            file_name = image.split("/")[-1]
-            image_feature[file_name] = feature
-            sample_num += 1
-            if sample_num == batch_size:
-                batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
-                self.save_file(image_feature, batch_name)
-                logging.info('Finish batch %d', batch_num)
-                batch_num += 1
-                sample_num = 0
-                image_feature = {}
-            if idx % 1000 == 0:
-                logging.info('%d/%d, %s', idx, len(image_files), file_name)
-        if sample_num > 0:
-            batch_name = os.path.join(output_dir, 'batch_%d' % (batch_num))
-            self.save_file(image_feature, batch_name)
-            logging.info('Finish batch %d', batch_num)
-        logging.info('Done: make image feature batch')
-
-    def save_file(self, data, file):
-        of = open(file, 'wb')
-        cPickle.dump(data, of, protocol=cPickle.HIGHEST_PROTOCOL)
-
-
-def option_parser():
-    """
-    Parse command line options.
-    """
-    usage = "%prog -c config -i data_list -w model_dir [options]"
-    parser = OptionParser(usage="usage: %s" % usage)
-    parser.add_option(
-        "-j",
-        "--job",
-        action="store",
-        dest="job_type",
-        help="job type: predict, extract\
-                            predict: predicting,\
-                            extract: extract features")
-    parser.add_option(
-        "-c",
-        "--conf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-i", "--data", action="store", dest="data_file", help="image list")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    parser.add_option(
-        "-g",
-        "--use_gpu",
-        action="store",
-        dest="use_gpu",
-        default=True,
-        help="Whether to use gpu mode.")
-    parser.add_option(
-        "-o",
-        "--output_dir",
-        action="store",
-        dest="output_dir",
-        default="output",
-        help="output path")
-    parser.add_option(
-        "-m",
-        "--mean",
-        action="store",
-        dest="mean",
-        default=None,
-        help="mean file.")
-    parser.add_option(
-        "-p",
-        "--multi_crop",
-        action="store_true",
-        dest="multi_crop",
-        default=False,
-        help="Wether to use multiple crops on image.")
-    parser.add_option("-l", "--output_layer", action="store",
-                      dest="output_layer", default=None,
-                      help="--job=extract, specify layers to extract "\
-                           "features, --job=predict, specify layer of "
-                           "classification probability, output in resnet.py.")
-    return parser.parse_args()
-
-
-def main():
-    """
-    1. parse input arguments.
-    2. predict or extract features according to the job type.
-    """
-    options, args = option_parser()
-    obj = ImageClassifier(
-        options.train_conf,
-        options.model_path,
-        use_gpu=options.use_gpu,
-        mean_file=options.mean,
-        output_layer=options.output_layer,
-        oversample=options.multi_crop)
-    if options.job_type == "predict":
-        obj.predict(options.data_file)
-
-    elif options.job_type == "extract":
-        obj.extract(options.data_file, options.output_dir)
-
-
-if __name__ == '__main__':
-    main()
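
The preprocessing configured in __init__ amounts to an HWC-to-CHW transpose, an RGB-to-BGR channel swap, and mean subtraction. A minimal numpy sketch of those three steps, assuming a 224x224 color input (the random image is illustrative):

```python
import numpy as np

img = np.random.rand(224, 224, 3).astype(np.float32)  # H x W x C input
chw = img.transpose((2, 0, 1))                        # set_transpose((2, 0, 1))
bgr = chw[[2, 1, 0], :, :]                            # set_channel_swap((2, 1, 0))
mean = np.array([103.939, 116.779, 123.68], dtype=np.float32).reshape(3, 1, 1)
out = bgr - mean                                      # per-channel mean subtraction
assert out.shape == (3, 224, 224)
```
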
diff --git a/v1_api_demo/model_zoo/resnet/example/.gitignore b/v1_api_demo/model_zoo/resnet/example/.gitignore
deleted file mode 100644
index 4a2b5962a6..0000000000
--- a/v1_api_demo/model_zoo/resnet/example/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-*image_list_provider_copy_1.py
diff --git a/v1_api_demo/model_zoo/resnet/example/cat.jpg b/v1_api_demo/model_zoo/resnet/example/cat.jpg
deleted file mode 100644
index 47b01db90e..0000000000
Binary files a/v1_api_demo/model_zoo/resnet/example/cat.jpg and /dev/null differ
diff --git a/v1_api_demo/model_zoo/resnet/example/dog.jpg b/v1_api_demo/model_zoo/resnet/example/dog.jpg
deleted file mode 100644
index b9cc33cf06..0000000000
Binary files a/v1_api_demo/model_zoo/resnet/example/dog.jpg and /dev/null differ
diff --git a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py b/v1_api_demo/model_zoo/resnet/example/image_list_provider.py
deleted file mode 100644
index 2cd8eb8bf8..0000000000
--- a/v1_api_demo/model_zoo/resnet/example/image_list_provider.py
+++ /dev/null
@@ -1,102 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.utils.image_util import *
-from paddle.trainer.PyDataProvider2 import *
-
-
-def hook(settings, image_size, crop_size, color, file_list, is_train, **kwargs):
-    """
-    Description: init with a list of data files.
-    file_list is the name list of input files.
-    kwargs["load_data_args"] is the value of 'load_data_args'
-    which can be set in the config. Each argument is separated by a comma.
-    image_size: the crop image size.
-    mean_meta: the path of the meta file that stores the mean image.
-    mean_value: a mean value, not a file;
-                mean_meta and mean_value cannot be set at the same time.
-    color: 'color' means a color image. Otherwise, it means a gray image.
-    is_train: whether the data provider is used for training.
-              Data augmentation might be different for training and testing.
-    """
-    settings.img_size = image_size
-    settings.crop_size = crop_size
-    settings.mean_img_size = settings.crop_size
-    settings.color = color  # default is color
-    settings.is_train = is_train
-
-    settings.is_swap_channel = kwargs.get('swap_channel', None)
-    if settings.is_swap_channel is not None:
-        settings.swap_channel = settings.is_swap_channel
-        settings.is_swap_channel = True
-
-    if settings.color:
-        settings.img_input_size = settings.crop_size * settings.crop_size * 3
-    else:
-        settings.img_input_size = settings.crop_size * settings.crop_size
-
-    settings.file_list = file_list
-    settings.mean_meta = kwargs.get('mean_meta', None)
-    settings.mean_value = kwargs.get('mean_value', None)
-    # can not specify both mean_meta and mean_value.
-    assert not (settings.mean_meta and settings.mean_value)
-    if not settings.mean_meta:
-        settings.mean_value = kwargs.get('mean_value')
-        sz = settings.crop_size * settings.crop_size
-        settings.img_mean = np.zeros(sz * 3, dtype=np.single)
-        for idx, value in enumerate(settings.mean_value):
-            settings.img_mean[idx * sz:(idx + 1) * sz] = value
-        settings.img_mean = settings.img_mean.reshape(3, settings.crop_size,
-                                                      settings.crop_size)
-
-    else:
-        settings.img_mean = load_meta(settings.mean_meta,
-                                      settings.mean_img_size,
-                                      settings.crop_size, settings.color)
-
-    settings.input_types = [
-        dense_vector(settings.img_input_size),  # image feature
-        integer_value(1)
-    ]  # labels
-
-    settings.logger.info('Image short side: %s', settings.img_size)
-    settings.logger.info('Crop size: %s', settings.crop_size)
-    settings.logger.info('Meta path: %s', settings.mean_meta)
-    if settings.is_swap_channel:
-        settings.logger.info('swap channel: %s', settings.swap_channel)
-    settings.logger.info('DataProvider Initialization finished')
-
-
-@provider(init_hook=hook, should_shuffle=False)
-def processData(settings, file_list):
-    """
-    The main function for loading data.
-    Parse one line of the file list and yield the image feature and label.
-    file_list: one line of the list, in the form 'image_path label'.
-    """
-    img_path, lab = file_list.strip().split(' ')
-    img = Image.open(img_path)
-    img.load()
-    img = img.resize((settings.img_size, settings.img_size), Image.ANTIALIAS)
-    img = np.array(img).astype(np.float32)
-    if len(img.shape) == 3:
-        img = np.swapaxes(img, 1, 2)
-        img = np.swapaxes(img, 1, 0)
-    # swap channel
-    if settings.is_swap_channel:
-        img = img[settings.swap_channel, :, :]
-    img_feat = preprocess_img(img, settings.img_mean, settings.crop_size,
-                              settings.is_train, settings.color)
-    yield img_feat.tolist(), int(lab.strip())
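
The channel-by-channel fill of img_mean in hook() can equivalently be written with numpy broadcasting; a minimal sketch, assuming the three-value mean_value case:

```python
import numpy as np

crop_size = 224
mean_value = [103.939, 116.779, 123.68]
img_mean = (np.asarray(mean_value, dtype=np.single).reshape(3, 1, 1)
            * np.ones((3, crop_size, crop_size), dtype=np.single))
assert img_mean.shape == (3, crop_size, crop_size)
assert img_mean[1, 0, 0] == np.single(116.779)
```
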
diff --git a/v1_api_demo/model_zoo/resnet/example/test.list b/v1_api_demo/model_zoo/resnet/example/test.list
deleted file mode 100644
index 30bbf630b6..0000000000
--- a/v1_api_demo/model_zoo/resnet/example/test.list
+++ /dev/null
@@ -1,2 +0,0 @@
-example/dog.jpg 0
-example/cat.jpg 0
diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh b/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
deleted file mode 100755
index 5447aa92df..0000000000
--- a/v1_api_demo/model_zoo/resnet/extract_fea_c++.sh
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Set the names of the layers whose features you want to extract
-# in Outputs() of resnet.py,
-# e.g.: Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
-layer_num=50
-configure=./resnet.py
-model_path=./model/resnet_$layer_num
-fea_dir=fea_output
-# The output is a text file.
-# Each line holds one sample's features.
-# If you set N layer names in Outputs(),
-# each line contains N features separated by ";".
-
-# create model list file.
-model_list=./model.list
-echo $model_path > $model_list
-
-paddle train \
-  --local=true \
-  --job=test \
-  --config=$configure \
-  --model_list=$model_list \
-  --use_gpu=1 \
-  --predict_output_dir=$fea_dir \
-  --config_args=is_test=1,layer_num=$layer_num
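
A line of the resulting text output can be parsed back with two splits, one on ";" for layers and one on whitespace for values; a minimal sketch with an illustrative two-layer line (load_feature.py below implements the same parsing):

```python
line = "0.1 0.2 0.3;1.0 2.0"  # illustrative: two layer outputs for one sample
sample = [[float(v) for v in slot.split()]
          for slot in line.strip().split(";")]
assert sample == [[0.1, 0.2, 0.3], [1.0, 2.0]]
```
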
diff --git a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh b/v1_api_demo/model_zoo/resnet/extract_fea_py.sh
deleted file mode 100755
index 2e87152f7f..0000000000
--- a/v1_api_demo/model_zoo/resnet/extract_fea_py.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Note: if you use CPU mode, you need to set use_gpu=0 in classify.py, like this:
-#conf_args = "is_test=0,use_gpu=0,is_predict=1"
-#conf = parse_config(train_conf, conf_args)
-#swig_paddle.initPaddle("--use_gpu=0")
-python classify.py \
-     --job=extract \
-     --conf=resnet.py \
-     --use_gpu=1 \
-     --mean=model/mean_meta_224/mean.meta \
-     --model=model/resnet_50 \
-     --data=./example/test.list \
-     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
-     --output_dir=features
diff --git a/v1_api_demo/model_zoo/resnet/load_feature.py b/v1_api_demo/model_zoo/resnet/load_feature.py
deleted file mode 100644
index 5d3d0c0d30..0000000000
--- a/v1_api_demo/model_zoo/resnet/load_feature.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import sys
-import cPickle
-import logging
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
-logging.getLogger().setLevel(logging.INFO)
-
-
-def load_feature_c(file):
-    """
-    Load feature extracted by C++ interface.
-    Return a list.
-    file: feature file.
-    """
-    features = []
-    f = open(file, 'r')
-    for line in f:
-        sample = []
-        for slot in line.strip().split(";"):
-            fea = [float(val) for val in slot.strip().split()]
-            if fea:
-                sample.append(fea)
-        features.append(sample)
-    f.close()
-    return features
-
-
-def load_feature_py(feature_dir):
-    """
-    Load feature extracted by python interface.
-    Return a dictionary.
-    feature_dir: directory of feature file.
-    """
-    file_list = os.listdir(feature_dir)
-    file_list = [os.path.join(feature_dir, f) for f in file_list]
-    features = {}
-    for file_name in file_list:
-        with open(file_name, 'rb') as f:
-            feature = cPickle.load(f)
-            features.update(feature)
-            logging.info('Load feature file %s', file_name)
-    return features
-
-
-if __name__ == '__main__':
-    print load_feature_py(sys.argv[1])
-    #print load_feature_c(sys.argv[1]) 
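
The batch files written by classify.py and read by load_feature_py are plain cPickle dumps of a dict mapping file names to features. A minimal round-trip sketch, in Python 2 to match the demo code (the batch name and feature values are illustrative):

```python
import cPickle  # Python 2, matching the demo code

batch = {"cat.jpg": {"res5_3_branch2c_conv": [0.1, 0.2]}}  # illustrative
with open("batch_0", "wb") as f:
    cPickle.dump(batch, f, protocol=cPickle.HIGHEST_PROTOCOL)
with open("batch_0", "rb") as f:
    assert cPickle.load(f) == batch
```
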
diff --git a/v1_api_demo/model_zoo/resnet/net_diagram.sh b/v1_api_demo/model_zoo/resnet/net_diagram.sh
deleted file mode 100755
index 1b06ffa44e..0000000000
--- a/v1_api_demo/model_zoo/resnet/net_diagram.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-:'
-Visualize the deep residual network:
-1. Use make_model_diagram.py to generate the dot file.
-2. Use graphviz to convert the dot file.
-
-Usage:
-./net_diagram.sh
-'
-
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-img_type=png
-img_fileprefix=ResNet_50
-conf_filename=resnet.py
-dot_filename=ResNet_50.dot
-config_str="layer_num=50,data_provider=0"
-
-python -m paddle.utils.make_model_diagram $conf_filename $dot_filename $config_str
-
-# If you have installed graphviz, run:
-# dot -Tpng -o ResNet.png ResNet.dot
diff --git a/v1_api_demo/quick_start/.gitignore b/v1_api_demo/quick_start/.gitignore
deleted file mode 100644
index f71662563f..0000000000
--- a/v1_api_demo/quick_start/.gitignore
+++ /dev/null
@@ -1,15 +0,0 @@
-*.pyc
-data/dict.txt
-data/dict_all.txt
-data/labels.list
-data/mosesdecoder-master/
-data/reviews_Electronics_5.json.gz
-data/test.list
-data/test.txt
-data/train.list
-data/train.txt
-data/pred.list
-data/pred.txt
-dataprovider_copy_1.py
-train.log
-output
diff --git a/v1_api_demo/quick_start/api_predict.py b/v1_api_demo/quick_start/api_predict.py
deleted file mode 100755
index 9bdffe1006..0000000000
--- a/v1_api_demo/quick_start/api_predict.py
+++ /dev/null
@@ -1,147 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os, sys
-import numpy as np
-from optparse import OptionParser
-from py_paddle import swig_paddle, DataProviderConverter
-from paddle.trainer.PyDataProvider2 import sparse_binary_vector
-from paddle.trainer.config_parser import parse_config
-"""
-Usage: run following command to show help message.
-  python api_predict.py -h
-"""
-
-
-class QuickStartPrediction():
-    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
-        """
-        train_conf: trainer configuration file.
-        dict_file: word dictionary file name.
-        model_dir: directory of model.
-        """
-        self.train_conf = train_conf
-        self.dict_file = dict_file
-        self.word_dict = {}
-        self.dict_dim = self.load_dict()
-        self.model_dir = model_dir
-        if model_dir is None:
-            self.model_dir = os.path.dirname(train_conf)
-
-        self.label = None
-        if label_file is not None:
-            self.load_label(label_file)
-
-        conf = parse_config(train_conf, "is_predict=1")
-        self.network = swig_paddle.GradientMachine.createFromConfigProto(
-            conf.model_config)
-        self.network.loadParameters(self.model_dir)
-        input_types = [sparse_binary_vector(self.dict_dim)]
-        self.converter = DataProviderConverter(input_types)
-
-    def load_dict(self):
-        """
-        Load dictionary from self.dict_file.
-        """
-        for line_count, line in enumerate(open(self.dict_file, 'r')):
-            self.word_dict[line.strip().split('\t')[0]] = line_count
-        return len(self.word_dict)
-
-    def load_label(self, label_file):
-        """
-        Load label.
-        """
-        self.label = {}
-        for v in open(label_file, 'r'):
-            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
-
-    def get_index(self, data):
-        """
-        Transform words into integer indices according to the dictionary.
-        """
-        words = data.strip().split()
-        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
-        return word_slot
-
-    def batch_predict(self, data_batch):
-        input = self.converter(data_batch)
-        output = self.network.forwardTest(input)
-        prob = output[0]["id"].tolist()
-        print("predicting labels is:")
-        print prob
-
-
-def option_parser():
-    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
-    parser = OptionParser(usage="usage: %s [options]" % usage)
-    parser.add_option(
-        "-n",
-        "--tconf",
-        action="store",
-        dest="train_conf",
-        help="network config")
-    parser.add_option(
-        "-d",
-        "--dict",
-        action="store",
-        dest="dict_file",
-        help="dictionary file")
-    parser.add_option(
-        "-b",
-        "--label",
-        action="store",
-        dest="label",
-        default=None,
-        help="dictionary file")
-    parser.add_option(
-        "-c",
-        "--batch_size",
-        type="int",
-        action="store",
-        dest="batch_size",
-        default=1,
-        help="the batch size for prediction")
-    parser.add_option(
-        "-w",
-        "--model",
-        action="store",
-        dest="model_path",
-        default=None,
-        help="model path")
-    return parser.parse_args()
-
-
-def main():
-    options, args = option_parser()
-    train_conf = options.train_conf
-    batch_size = options.batch_size
-    dict_file = options.dict_file
-    model_path = options.model_path
-    label = options.label
-    swig_paddle.initPaddle("--use_gpu=0")
-    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
-
-    batch = []
-    labels = []
-    for line in sys.stdin:
-        [label, text] = line.split("\t")
-        labels.append(int(label))
-        batch.append([predict.get_index(text)])
-    print("labels is:")
-    print labels
-    predict.batch_predict(batch)
-
-
-if __name__ == '__main__':
-    main()
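
get_index implements a simple bag-of-words lookup: each word is mapped to its dictionary index and out-of-vocabulary words are dropped. A minimal self-contained sketch with an illustrative three-word dictionary:

```python
word_dict = {"fast": 0, "shipping": 1, "screen": 2}  # illustrative dictionary

def get_index(text):
    # out-of-vocabulary tokens (here "," and "cracked") are simply dropped
    return [word_dict[w] for w in text.strip().split() if w in word_dict]

assert get_index("fast shipping , cracked screen") == [0, 1, 2]
```
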
diff --git a/v1_api_demo/quick_start/api_predict.sh b/v1_api_demo/quick_start/api_predict.sh
deleted file mode 100755
index 4d9aa9e885..0000000000
--- a/v1_api_demo/quick_start/api_predict.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Note: the default model is pass-00002; you should make sure the model path
-# exists or change the model path.
-# Only tested on trainer_config.lr.py.
-model=output/model/pass-00001/
-config=trainer_config.lr.py
-label=data/labels.list
-dict=data/dict.txt
-batch_size=20
-head -n$batch_size data/test.txt | python api_predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=$dict \
-     --batch_size=$batch_size
diff --git a/v1_api_demo/quick_start/api_train.py b/v1_api_demo/quick_start/api_train.py
deleted file mode 100644
index 5699789daa..0000000000
--- a/v1_api_demo/quick_start/api_train.py
+++ /dev/null
@@ -1,122 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import itertools
-import random
-
-from paddle.trainer.config_parser import parse_config
-from py_paddle import swig_paddle as api
-from py_paddle import DataProviderConverter
-from paddle.trainer.PyDataProvider2 \
-    import integer_value, integer_value_sequence, sparse_binary_vector
-
-
-def parse_arguments():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--train_data", type=str, required=False, help="train data file")
-    parser.add_argument("--test_data", type=str, help="test data file")
-    parser.add_argument(
-        "--config", type=str, required=True, help="config file name")
-    parser.add_argument("--dict_file", required=True, help="dictionary file")
-    parser.add_argument(
-        "--seq", default=1, type=int, help="whether use sequence training")
-    parser.add_argument(
-        "--use_gpu", default=0, type=int, help="whether use GPU for training")
-    parser.add_argument(
-        "--trainer_count",
-        default=1,
-        type=int,
-        help="Number of threads for training")
-    parser.add_argument(
-        "--num_passes", default=5, type=int, help="Number of training passes")
-    return parser.parse_args()
-
-
-UNK_IDX = 0
-
-
-def load_data(file_name, word_dict):
-    with open(file_name, 'r') as f:
-        for line in f:
-            label, comment = line.strip().split('\t')
-            words = comment.split()
-            word_slot = [word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
-
-
-def load_dict(dict_file):
-    word_dict = dict()
-    with open(dict_file, 'r') as f:
-        for i, line in enumerate(f):
-            w = line.strip().split()[0]
-            word_dict[w] = i
-    return word_dict
-
-
-def main():
-    options = parse_arguments()
-    api.initPaddle("--use_gpu=%s" % options.use_gpu,
-                   "--trainer_count=%s" % options.trainer_count)
-
-    word_dict = load_dict(options.dict_file)
-    train_dataset = list(load_data(options.train_data, word_dict))
-    if options.test_data:
-        test_dataset = list(load_data(options.test_data, word_dict))
-    else:
-        test_dataset = None
-
-    trainer_config = parse_config(options.config,
-                                  "dict_file=%s" % options.dict_file)
-    # No need to have a data provider for the trainer
-    trainer_config.ClearField('data_config')
-    trainer_config.ClearField('test_data_config')
-
-    # create a GradientMachine from the model configuration
-    model = api.GradientMachine.createFromConfigProto(
-        trainer_config.model_config)
-    # create a trainer for the gradient machine
-    trainer = api.Trainer.create(trainer_config, model)
-
-    # create a data converter which converts data to PaddlePaddle
-    # internal format
-    input_types = [
-        integer_value_sequence(len(word_dict)) if options.seq else
-        sparse_binary_vector(len(word_dict)), integer_value(2)
-    ]
-    converter = DataProviderConverter(input_types)
-
-    batch_size = trainer_config.opt_config.batch_size
-    trainer.startTrain()
-    for train_pass in xrange(options.num_passes):
-        trainer.startTrainPass()
-        random.shuffle(train_dataset)
-        for pos in xrange(0, len(train_dataset), batch_size):
-            batch = itertools.islice(train_dataset, pos, pos + batch_size)
-            size = min(batch_size, len(train_dataset) - pos)
-            trainer.trainOneDataBatch(size, converter(batch))
-        trainer.finishTrainPass()
-        if test_dataset:
-            trainer.startTestPeriod()
-            for pos in xrange(0, len(test_dataset), batch_size):
-                batch = itertools.islice(test_dataset, pos, pos + batch_size)
-                size = min(batch_size, len(test_dataset) - pos)
-                trainer.testOneDataBatch(size, converter(batch))
-            trainer.finishTestPeriod()
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()
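
The training loop batches the shuffled dataset with itertools.islice, producing fixed-size batches and a short final one. A minimal sketch of that pattern on illustrative data:

```python
import itertools

data = list(range(10))  # stands in for the shuffled train_dataset
batch_size = 4
batches = [list(itertools.islice(data, pos, pos + batch_size))
           for pos in range(0, len(data), batch_size)]
assert batches == [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9]]
```
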
diff --git a/v1_api_demo/quick_start/api_train.sh b/v1_api_demo/quick_start/api_train.sh
deleted file mode 100755
index 9b2a4e2f22..0000000000
--- a/v1_api_demo/quick_start/api_train.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Note: if using trainer_config.emb.py, trainer_config.cnn.py
-# or trainer_config.lstm.py, you need to change --seq to --seq=1
-# because they are sequence models.
-python api_train.py \
-  --config=trainer_config.lr.py \
-  --trainer_count=2 \
-  --num_passes=15 \
-  --use_gpu=0 \
-  --seq=0 \
-  --train_data=data/train.txt \
-  --test_data=data/test.txt \
-  --dict_file=data/dict.txt \
-  2>&1 | tee 'train.log'
diff --git a/v1_api_demo/quick_start/cluster/cluster_train.sh b/v1_api_demo/quick_start/cluster/cluster_train.sh
deleted file mode 100755
index a7b1f01064..0000000000
--- a/v1_api_demo/quick_start/cluster/cluster_train.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-# Run pserver.sh before running this script.
-bin_dir=$(cd `dirname $0`; pwd)
-home_dir=$(cd "${bin_dir}/.."; pwd)
-source "$bin_dir/env.sh"
-
-model_dir="$bin_dir/output"
-log_file="$bin_dir/train.log"
-
-pushd "$home_dir"
-cfg=trainer_config.lr.py
-paddle train \
-  --start_pserver=false \
-  --config=$cfg \
-  --save_dir=${model_dir} \
-  --trainer_count=4 \
-  --local=0 \
-  --log_period=100 \
-  --num_passes=15 \
-  --use_gpu=false \
-  --show_parameter_stats_period=100 \
-  --test_all_data_in_one_period=1 \
-  --num_gradient_servers=1 \
-  --nics=`get_nics` \
-  --port=7164 \
-  --ports_num=1 \
-  --pservers="127.0.0.1" \
-  --comment="paddle_trainer" \
-  2>&1 | tee "$log_file"
-popd
diff --git a/v1_api_demo/quick_start/data/README.md b/v1_api_demo/quick_start/data/README.md
deleted file mode 100644
index 63abcf7ebf..0000000000
--- a/v1_api_demo/quick_start/data/README.md
+++ /dev/null
@@ -1,9 +0,0 @@
-This dataset consists of electronics product reviews associated with
-binary labels (positive/negative) for sentiment classification.
-
-The preprocessed data can be downloaded with the script `get_data.sh`.
-The data was derived from reviews_Electronics_5.json.gz at
-
-http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-
-If you want to process the raw data, you can use the script `proc_from_raw_data/get_data.sh`.
diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh b/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
deleted file mode 100755
index d976eaebfa..0000000000
--- a/v1_api_demo/quick_start/data/proc_from_raw_data/get_data.sh
+++ /dev/null
@@ -1,79 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# 1. size of pos : neg = 1:1.
-# 2. size of the test set = min(25k, len(all_data) * 0.1); the rest is the training set.
-# 3. the train set and test set are disjoint.
-
-set -e
-
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-cd $DIR
-
-# Download data
-echo "Downloading Amazon Electronics reviews data..."
-# http://jmcauley.ucsd.edu/data/amazon/
-wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
-echo "Downloading mosesdecoder..."
-# https://github.com/moses-smt/mosesdecoder
-wget https://github.com/moses-smt/mosesdecoder/archive/master.zip
-
-unzip master.zip
-rm master.zip
-
-##################
-# Preprocess data 
-echo "Preprocess data..."
-export LC_ALL=C
-UNAME_STR=`uname`
-
-if [ ${UNAME_STR} == 'Linux' ]; then
-  SHUF_PROG='shuf'
-else
-  SHUF_PROG='gshuf'
-fi
-
-mkdir -p tmp
-python preprocess.py -i reviews_Electronics_5.json.gz
-# uniq and shuffle
-cd tmp
-echo 'Uniq and shuffle...'
-cat pos_*|sort|uniq|${SHUF_PROG}> pos.shuffed
-cat neg_*|sort|uniq|${SHUF_PROG}> neg.shuffed
-
-min_len=`sed -n '$=' neg.shuffed`
-test_num=$((min_len/10))
-if [ $test_num -gt 12500 ];then
- test_num=12500
-fi
-train_num=$((min_len-test_num))
-
-head -n$train_num pos.shuffed >train.pos
-head -n$train_num neg.shuffed >train.neg
-tail -n$test_num pos.shuffed >test.pos
-tail -n$test_num neg.shuffed >test.neg
-
-cat train.pos train.neg | ${SHUF_PROG} >../train.txt
-cat test.pos test.neg | ${SHUF_PROG} >../test.txt
-
-cd -
-echo 'train.txt' > train.list
-echo 'test.txt' > test.list
-
-# use 30k dict
-rm -rf tmp
-mv dict.txt dict_all.txt
-cat dict_all.txt | head -n 30001 > dict.txt
-echo 'Done.'
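
The split computed in the shell above takes one tenth of the smaller class for testing, capped at 12,500 samples per class (25k test samples in total), and trains on the rest. The same arithmetic in Python, with an illustrative line count:

```python
min_len = 200000                       # illustrative line count of neg.shuffed
test_num = min(min_len // 10, 12500)   # one tenth, capped at 12.5k per class
train_num = min_len - test_num
assert (test_num, train_num) == (12500, 187500)
```
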
diff --git a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py b/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
deleted file mode 100755
index 72bd95f21d..0000000000
--- a/v1_api_demo/quick_start/data/proc_from_raw_data/preprocess.py
+++ /dev/null
@@ -1,223 +0,0 @@
-# -*- coding: UTF-8 -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-1. Tokenize the words and punctuation.
-2. pos sample: rating score 5; neg sample: rating score 1-2.
-
-Usage:
-    python preprocess.py -i data_file [random seed]
-"""
-
-import sys
-import os
-import operator
-import gzip
-from subprocess import Popen, PIPE
-from optparse import OptionParser
-import json
-from multiprocessing import Queue
-from multiprocessing import Pool
-import multiprocessing
-
-batch_size = 5000
-word_count = {}
-num_tokenize = max(1,
-                   multiprocessing.cpu_count() - 2)  # parse + tokenize + save
-max_queue_size = 8
-parse_queue = Queue(maxsize=max_queue_size + num_tokenize)
-tokenize_queue = Queue(maxsize=max_queue_size + num_tokenize)
-
-
-def create_dict(data):
-    """
-    Create a dictionary based on the data and save it in data_dir/dict.txt.
-    The first line is unk \t -1.
-    data: list, input data by batch.
-    """
-    for seq in data:
-        try:
-            for w in seq.lower().split():
-                if w not in word_count:
-                    word_count[w] = 1
-                else:
-                    word_count[w] += 1
-        except:
-            sys.stderr.write(seq + "\tERROR\n")
-
-
-def parse(path):
-    """
-    Open .gz file.
-    """
-    sys.stderr.write(path)
-    g = gzip.open(path, 'r')
-    for l in g:
-        yield json.loads(l)
-    g.close()
-
-
-def tokenize(sentences):
-    """
-    Use tokenizer.perl to tokenize input sentences.
-    tokenizer.perl is a tool from Moses.
-    sentences : a list of input sentences.
-    return: a list of processed text.
-    """
-    dir = './mosesdecoder-master/scripts/tokenizer/tokenizer.perl'
-    if not os.path.exists(dir):
-        sys.exit(
-            "The ./mosesdecoder-master/scripts/tokenizer/tokenizer.perl does not exists."
-        )
-    tokenizer_cmd = [dir, '-l', 'en', '-q', '-']
-    assert isinstance(sentences, list)
-    text = "\n".join(sentences)
-    tokenizer = Popen(tokenizer_cmd, stdin=PIPE, stdout=PIPE)
-    tok_text, _ = tokenizer.communicate(text)
-    toks = tok_text.split('\n')[:-1]
-    return toks
-
-
-def save_data(instance, data_dir, pre_fix, batch_num):
-    """
-    save data by batch
-    """
-    label = ['1' if pre_fix == 'pos' else '0' for i in range(len(instance))]
-    lines = ['%s\t%s' % (label[i], instance[i]) for i in range(len(label))]
-    file_name = os.path.join(data_dir, "%s_%s.txt" % (pre_fix, batch_num))
-    file(file_name, 'w').write('\n'.join(lines) + '\n')
-
-
-def tokenize_batch(id):
-    """
-    tokenize data by batch
-    """
-    while True:
-        num_batch, instance, pre_fix = parse_queue.get()
-        if num_batch == -1:  ### parse_queue finished
-            tokenize_queue.put((-1, None, None))
-            sys.stderr.write("Thread %s finish\n" % (id))
-            break
-        tokenize_instance = tokenize(instance)
-        tokenize_queue.put((num_batch, tokenize_instance, pre_fix))
-        sys.stderr.write('.')
-
-
-def save_batch(data_dir, num_tokenize, data_dir_dict):
-    """
-        save data by batch
-        build dict.txt
-    """
-    token_count = 0
-    while True:
-        num_batch, instance, pre_fix = tokenize_queue.get()
-        if num_batch == -1:
-            token_count += 1
-            if token_count == num_tokenize:  #### tokenize finished.
-                break
-            else:
-                continue
-        save_data(instance, data_dir, pre_fix, num_batch)
-        create_dict(instance)  ## update dict
-
-    sys.stderr.write("save file finish\n")
-    f = open(data_dir_dict, 'w')
-    f.write('%s\t%s\n' % ('unk', '-1'))
-    for k, v in sorted(word_count.items(), key=operator.itemgetter(1), \
-                       reverse=True):
-        f.write('%s\t%s\n' % (k, v))
-    f.close()
-    sys.stderr.write("build dict finish\n")
-
-
-def parse_batch(data, num_tokenize):
-    """
-    parse data by batch
-    parse -> tokenize -> save
-    """
-    raw_txt = parse(data)
-    neg, pos = [], []
-    count = 0
-    sys.stderr.write("extract raw data\n")
-    for l in raw_txt:
-        rating = l["overall"]
-        text = l["reviewText"].lower()  # # convert words to lower case
-        if rating == 5.0 and text:
-            pos.append(text)
-        if rating < 3.0 and text:
-            neg.append(text)
-        if len(pos) == batch_size or len(neg) == batch_size:
-            if len(pos) == batch_size:
-                batch = pos
-                pre_fix = 'pos'
-            else:
-                batch = neg
-                pre_fix = 'neg'
-
-            parse_queue.put((count, batch, pre_fix))
-            count += 1
-            if pre_fix == 'pos':
-                pos = []
-            else:
-                neg = []
-
-    if len(pos) > 0:
-        parse_queue.put((count, pos, 'pos'))
-        count += 1
-    if len(neg) > 0:
-        parse_queue.put((count, neg, 'neg'))
-        count += 1
-    for i in range(num_tokenize):
-        parse_queue.put((-1, None, None))  #### for tokenize's input finished
-    sys.stderr.write("parsing finish\n")
-
-
-def option_parser():
-    parser = OptionParser(usage="usage: python preprocess.py "\
-                                "-i data_path [options]")
-    parser.add_option(
-        "-i", "--data", action="store", dest="input", help="Input data path.")
-    parser.add_option(
-        "-s",
-        "--seed",
-        action="store",
-        dest="seed",
-        default=1024,
-        help="Set random seed.")
-    return parser.parse_args()
-
-
-def main():
-    reload(sys)
-    sys.setdefaultencoding('utf-8')
-    options, args = option_parser()
-    data = options.input
-    seed = options.seed
-    data_dir_dict = os.path.join(os.path.dirname(data), 'dict.txt')
-    data_dir = os.path.join(os.path.dirname(data), 'tmp')
-    pool = Pool(processes=num_tokenize + 2)
-    pool.apply_async(parse_batch, args=(data, num_tokenize))
-    for i in range(num_tokenize):
-        pool.apply_async(tokenize_batch, args=(str(i), ))
-    pool.apply_async(save_batch, args=(data_dir, num_tokenize, data_dir_dict))
-    pool.close()
-    pool.join()
-
-    file(os.path.join(os.path.dirname(data), 'labels.list'),
-         'w').write('neg\t0\npos\t1\n')
-
-
-if __name__ == '__main__':
-    main()
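
preprocess.py coordinates a parse -> tokenize -> save pipeline over two queues, shutting down by passing one poison pill (a batch id of -1) per tokenizer worker so the saver knows when all workers are done. A minimal sketch of that hand-off protocol, using threads instead of multiprocessing for brevity (all names here are illustrative):

```python
import threading
from queue import Queue

NUM_WORKERS = 2
parse_q, save_q = Queue(), Queue()

def tokenizer():
    while True:
        batch = parse_q.get()
        if batch is None:                 # poison pill: input exhausted
            save_q.put(None)              # forward the pill to the saver
            break
        save_q.put([w.upper() for w in batch])  # stand-in for real tokenization

def saver():
    finished = 0
    while finished < NUM_WORKERS:         # expect one pill per worker
        batch = save_q.get()
        if batch is None:
            finished += 1
        else:
            print("saved", batch)

workers = [threading.Thread(target=tokenizer) for _ in range(NUM_WORKERS)]
for t in workers:
    t.start()
for batch in [["a", "b"], ["c"]]:         # the "parse" stage
    parse_q.put(batch)
for _ in range(NUM_WORKERS):
    parse_q.put(None)
saver_thread = threading.Thread(target=saver)
saver_thread.start()
for t in workers:
    t.join()
saver_thread.join()
```
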
diff --git a/v1_api_demo/quick_start/dataprovider_bow.py b/v1_api_demo/quick_start/dataprovider_bow.py
deleted file mode 100644
index 2745495586..0000000000
--- a/v1_api_demo/quick_start/dataprovider_bow.py
+++ /dev/null
@@ -1,86 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer needs to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionary` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # settings.input_types specifies what data types the data provider
-    # generates.
-    settings.input_types = {
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        'word': sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        'label': integer_value(2)
-    }
-
-
-# Declaring a data provider. Its init hook is the 'initializer' function
-# defined above. It will cache the generated data of the first pass in
-# memory, so that no on-the-fly data generation is needed during later
-# passes.
-# `settings` is the same object used by initializer().
-# `file_name` is the name of a file listed in the train_list or test_list
-# file given to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield {'word': word_vector, 'label': int(label)}
-
-
-def predict_initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
-
-
-# Declaring a data provider for prediction. The difference with process
-# is that label is not generated.
-@provider(init_hook=predict_initializer, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            comment = line.strip().split()
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield {'word': word_vector}
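
To make the provider's contract concrete, here is a hypothetical walk-through of what process() yields for a single input line, with a toy three-word dictionary:

```python
# Toy dictionary; in the demo it is loaded from data/dict.txt.
word_dict = {"great": 1, "battery": 2, "life": 3}
UNK_IDX = 0  # id for out-of-vocabulary words

line = "1\tgreat battery life zzz"
label, comment = line.strip().split("\t")
word_vector = [word_dict.get(w, UNK_IDX) for w in comment.split()]
print({"word": word_vector, "label": int(label)})
# {'word': [1, 2, 3, 0], 'label': 1}  -- "zzz" falls back to UNK_IDX
```
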
diff --git a/v1_api_demo/quick_start/dataprovider_emb.py b/v1_api_demo/quick_start/dataprovider_emb.py
deleted file mode 100755
index ddfa3ce9b7..0000000000
--- a/v1_api_demo/quick_start/dataprovider_emb.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-
-UNK_IDX = 0
-
-
-def initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {
-        # Define the type of the first input as sequence of integer.
-        # The values of the integers range from 0 to len(dictionary)-1.
-        'word': integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        'label': integer_value(2)
-    }
-
-
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            label, comment = line.strip().split('\t')
-            words = comment.split()
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield {'word': word_slot, 'label': int(label)}
-
-
-def predict_initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
-
-
-@provider(init_hook=predict_initializer, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name, 'r') as f:
-        for line in f:
-            comment = line.strip().split()
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield {'word': word_slot}
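
The two providers encode the same text differently: dataprovider_bow feeds word ids into a sparse binary vector, where order and multiplicity are lost, while dataprovider_emb keeps the ordered id sequence for the embedding layer. A toy contrast with made-up ids:

```python
ids = [3, 1, 3, 2]        # e.g. "life great life battery"
bow = sorted(set(ids))    # sparse_binary_vector: which dims are 1 -> [1, 2, 3]
seq = ids                 # integer_value_sequence: order preserved
print(bow, seq)
```
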
diff --git a/v1_api_demo/quick_start/predict.sh b/v1_api_demo/quick_start/predict.sh
deleted file mode 100755
index e47c2dd01f..0000000000
--- a/v1_api_demo/quick_start/predict.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.lr.py
-#cfg=trainer_config.emb.py
-#cfg=trainer_config.cnn.py
-#cfg=trainer_config.lstm.py
-model="output/pass-00003"
-paddle train \
-    --config=$cfg \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. \
-2>&1 | tee 'predict.log'
-paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
-
-mv rank-00000 result.txt
diff --git a/v1_api_demo/quick_start/train.sh b/v1_api_demo/quick_start/train.sh
deleted file mode 100755
index 01697fed48..0000000000
--- a/v1_api_demo/quick_start/train.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.lr.py
-#cfg=trainer_config.emb.py
-#cfg=trainer_config.cnn.py
-#cfg=trainer_config.lstm.py
-#cfg=trainer_config.bidi-lstm.py
-#cfg=trainer_config.db-lstm.py
-#cfg=trainer_config.resnet-lstm.py
-paddle train \
-  --config=$cfg \
-  --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=100 \
-  --num_passes=15 \
-  --use_gpu=false \
-  --show_parameter_stats_period=100 \
-  --test_all_data_in_one_period=1 \
-  2>&1 | tee 'train.log'
-paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
diff --git a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py b/v1_api_demo/quick_start/trainer_config.bidi-lstm.py
deleted file mode 100644
index ca1d1f8d09..0000000000
--- a/v1_api_demo/quick_start/trainer_config.bidi-lstm.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-
-bi_lstm = bidirectional_lstm(input=emb, size=128)
-dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
-
-output = fc_layer(
-    input=dropout, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.cnn.py b/v1_api_demo/quick_start/trainer_config.cnn.py
deleted file mode 100644
index f8c3d511f3..0000000000
--- a/v1_api_demo/quick_start/trainer_config.cnn.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-data = data_layer(name="word", size=len(word_dict))
-embedding = embedding_layer(input=data, size=128)
-conv = sequence_conv_pool(input=embedding, context_len=3, hidden_size=512)
-output = fc_layer(input=conv, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.db-lstm.py b/v1_api_demo/quick_start/trainer_config.db-lstm.py
deleted file mode 100644
index fba802b460..0000000000
--- a/v1_api_demo/quick_start/trainer_config.db-lstm.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-
-hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
-lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
-
-input_layers = [hidden_0, lstm_0]
-
-for i in range(1, 8):
-    fc = fc_layer(input=input_layers, size=128)
-    lstm = lstmemory(
-        input=fc,
-        layer_attr=ExtraAttr(drop_rate=0.1),
-        reverse=(i % 2) == 1, )
-    input_layers = [fc, lstm]
-
-lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-
-output = fc_layer(
-    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.emb.py b/v1_api_demo/quick_start/trainer_config.emb.py
deleted file mode 100644
index 7410397ef6..0000000000
--- a/v1_api_demo/quick_start/trainer_config.emb.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size, learning_rate=2e-3, learning_method=AdamOptimizer())
-
-data = data_layer(name="word", size=len(word_dict))
-embedding = embedding_layer(input=data, size=128)
-avg = pooling_layer(input=embedding, pooling_type=AvgPooling())
-output = fc_layer(input=avg, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.lr.py b/v1_api_demo/quick_start/trainer_config.lr.py
deleted file mode 100644
index e5105aa895..0000000000
--- a/v1_api_demo/quick_start/trainer_config.lr.py
+++ /dev/null
@@ -1,72 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-
-# define the data sources for the model.
-# We need to use different process functions for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includes word IDs.
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_bow",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-# Define the data for text features. The size of the data layer is the number
-# of words in the dictionary.
-data = data_layer(name="word", size=len(word_dict))
-
-# Define a fully connected layer with softmax activation
-# (for two classes this is equivalent to logistic activation).
-output = fc_layer(input=data, size=2, act=SoftmaxActivation())
-
-if not is_predict:
-    # For training, we need label and cost
-
-    # define the category id for each example.
-    # The size of the data layer is the number of labels.
-    label = data_layer(name="label", size=2)
-
-    # Define cross-entropy classification loss and error.
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-else:
-    # For prediction, no label is needed. We output the
-    # classification result and the class probabilities.
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
diff --git a/v1_api_demo/quick_start/trainer_config.lstm.py b/v1_api_demo/quick_start/trainer_config.lstm.py
deleted file mode 100644
index 43b4ddac2d..0000000000
--- a/v1_api_demo/quick_start/trainer_config.lstm.py
+++ /dev/null
@@ -1,57 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-lstm = simple_lstm(
-    input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.25))
-lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
diff --git a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py b/v1_api_demo/quick_start/trainer_config.resnet-lstm.py
deleted file mode 100644
index 89a837abb7..0000000000
--- a/v1_api_demo/quick_start/trainer_config.resnet-lstm.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# edit-mode: -*- python -*-
-
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""
-This configuration is a demonstration of how to implement the stacked LSTM
-with residual connections, i.e. an LSTM layer takes the sum of the hidden states
-and inputs of the previous LSTM layer instead of only the hidden states.
-This architecture is from:
-Yonghui Wu, Mike Schuster, Zhifeng Chen, Quoc V. Le, Mohammad Norouzi,
-Wolfgang Macherey, Maxim Krikun, Yuan Cao, Qin Gao, Klaus Macherey,
-Jeff Klingner, Apurva Shah, Melvin Johnson, Xiaobing Liu, Lukasz Kaiser,
-Stephan Gouws, Yoshikiyo Kato, Taku Kudo, Hideto Kazawa, Keith Stevens,
-George Kurian, Nishant Patil, Wei Wang, Cliff Young, Jason Smith, Jason Riesa,
-Alex Rudnick, Oriol Vinyals, Greg Corrado, Macduff Hughes, Jeffrey Dean. 2016.
-Google's Neural Machine Translation System: Bridging the Gap between Human and
-Machine Translation. In arXiv https://arxiv.org/pdf/1609.08144v2.pdf
-Different from the architecture described in the paper, we use a stack of
-single-direction LSTM layers as the first layer instead of a bi-directional
-LSTM. Also, since this is demo code, we stack 4 layers instead of 8 to
-reduce computation time.
-"""
-
-from paddle.trainer_config_helpers import *
-
-dict_file = "./data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn,
-    test_list=tst,
-    module="dataprovider_emb",
-    obj=process,
-    args={"dictionary": word_dict})
-
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=2e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25)
-
-bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
-
-data = data_layer(name="word", size=len(word_dict))
-emb = embedding_layer(input=data, size=128)
-lstm = simple_lstm(input=emb, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
-
-previous_input, previous_hidden_state = emb, lstm
-
-for i in range(3):
-    # The input to the current layer is the sum of the hidden state
-    # and input of the previous layer.
-    current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(
-        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
-    previous_input, previous_hidden_state = current_input, hidden_state
-
-lstm = previous_hidden_state
-
-lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(
-    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
-
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid, output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
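
As a minimal sketch of the residual stacking this config builds, with a NumPy stand-in for the LSTM (fake_lstm is a placeholder, not a PaddlePaddle call): each layer receives the sum of the previous layer's input and hidden state, x_{l+1} = x_l + h_l, and produces h_{l+1} = LSTM(x_{l+1}).

```python
import numpy as np

def fake_lstm(x):
    # Stand-in for simple_lstm; any sequence-to-sequence map works here.
    return np.tanh(x)

x = np.random.randn(4, 128)   # toy "embedding" output: 4 steps, 128 dims
h = fake_lstm(x)
for _ in range(3):            # three residual layers, as in the loop above
    x = x + h                 # addto_layer: input + hidden state
    h = fake_lstm(x)          # next simple_lstm
print(h.shape)                # (4, 128)
```
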
diff --git a/v1_api_demo/sequence_tagging/data/test.list b/v1_api_demo/sequence_tagging/data/test.list
deleted file mode 100644
index 073c0a0c90..0000000000
--- a/v1_api_demo/sequence_tagging/data/test.list
+++ /dev/null
@@ -1 +0,0 @@
-data/test.txt.gz
diff --git a/v1_api_demo/sequence_tagging/data/train.list b/v1_api_demo/sequence_tagging/data/train.list
deleted file mode 100644
index 43c24d5f64..0000000000
--- a/v1_api_demo/sequence_tagging/data/train.list
+++ /dev/null
@@ -1 +0,0 @@
-data/train.txt.gz
diff --git a/v1_api_demo/sequence_tagging/dataprovider.py b/v1_api_demo/sequence_tagging/dataprovider.py
deleted file mode 100644
index bb4b4465bc..0000000000
--- a/v1_api_demo/sequence_tagging/dataprovider.py
+++ /dev/null
@@ -1,260 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import gzip
-import logging
-
-logging.basicConfig(
-    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s', )
-logger = logging.getLogger('paddle')
-logger.setLevel(logging.INFO)
-
-OOV_POLICY_IGNORE = 0
-OOV_POLICY_USE = 1
-OOV_POLICY_ERROR = 2
-
-num_original_columns = 3
-
-# Feature combination patterns.
-# [[-1,0], [0,0]]  means previous token at column 0 and current token at 
-# column 0 are combined as one feature.
-patterns = [
-    [[-2, 0]],
-    [[-1, 0]],
-    [[0, 0]],
-    [[1, 0]],
-    [[2, 0]],
-    [[-1, 0], [0, 0]],
-    [[0, 0], [1, 0]],
-    [[-2, 1]],
-    [[-1, 1]],
-    [[0, 1]],
-    [[1, 1]],
-    [[2, 1]],
-    [[-2, 1], [-1, 1]],
-    [[-1, 1], [0, 1]],
-    [[0, 1], [1, 1]],
-    [[1, 1], [2, 1]],
-    [[-2, 1], [-1, 1], [0, 1]],
-    [[-1, 1], [0, 1], [1, 1]],
-    [[0, 1], [1, 1], [2, 1]],
-]
-
-dict_label = {
-    'B-ADJP': 0,
-    'I-ADJP': 1,
-    'B-ADVP': 2,
-    'I-ADVP': 3,
-    'B-CONJP': 4,
-    'I-CONJP': 5,
-    'B-INTJ': 6,
-    'I-INTJ': 7,
-    'B-LST': 8,
-    'I-LST': 9,
-    'B-NP': 10,
-    'I-NP': 11,
-    'B-PP': 12,
-    'I-PP': 13,
-    'B-PRT': 14,
-    'I-PRT': 15,
-    'B-SBAR': 16,
-    'I-SBAR': 17,
-    'B-UCP': 18,
-    'I-UCP': 19,
-    'B-VP': 20,
-    'I-VP': 21,
-    'O': 22
-}
-
-
-def make_features(sequence):
-    length = len(sequence)
-    num_features = len(sequence[0])
-
-    def get_features(pos):
-        if pos < 0:
-            return ['#B%s' % -pos] * num_features
-        if pos >= length:
-            return ['#E%s' % (pos - length + 1)] * num_features
-        return sequence[pos]
-
-    for i in xrange(length):
-        for pattern in patterns:
-            fname = '/'.join([get_features(i + pos)[f] for pos, f in pattern])
-            sequence[i].append(fname)
-
-
-'''
-Source file format:
-Each line is for one timestep. The features are separated by space.
-An empty line indicates the end of a sequence.
-
-cutoff: a list of numbers. If the count of a feature is smaller than this,
-it will be ignored.
-If oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
-the i-th column.
-
-Returns a list of dicts, one for each column.
-'''
-
-
-def create_dictionaries(filename, cutoff, oov_policy):
-    def add_to_dict(sequence, dicts):
-        num_features = len(dicts)
-        for features in sequence:
-            l = len(features)
-            assert l == num_features, "Wrong number of features " + line
-            for i in xrange(l):
-                if features[i] in dicts[i]:
-                    dicts[i][features[i]] += 1
-                else:
-                    dicts[i][features[i]] = 1
-
-    num_features = len(cutoff)
-    dicts = []
-    for i in xrange(num_features):
-        dicts.append(dict())
-
-    f = gzip.open(filename, 'rb')
-
-    sequence = []
-
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            add_to_dict(sequence, dicts)
-            sequence = []
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    for i in xrange(num_features):
-        dct = dicts[i]
-        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
-        todo = []
-        for k, v in dct.iteritems():
-            if v < cutoff[i]:
-                todo.append(k)
-            else:
-                dct[k] = n
-                n += 1
-
-        if oov_policy[i] == OOV_POLICY_USE:
-            # placeholder so that len(dct) will be the number of features
-            # including OOV
-            dct['#OOV#'] = 0
-
-        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
-        for k in todo:
-            del dct[k]
-
-    f.close()
-    return dicts
-
-
-def initializer(settings, **xargs):
-    cutoff = [3, 1, 0]
-    cutoff += [3] * len(patterns)
-    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
-    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
-    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
-    dicts[2] = dict_label
-    settings.dicts = dicts
-    settings.oov_policy = oov_policy
-    input_types = []
-    num_features = len(dicts)
-    for i in xrange(num_original_columns):
-        input_types.append(integer_sequence(len(dicts[i])))
-        logger.info("slot %s size=%s" % (i, len(dicts[i])))
-    if patterns:
-        dim = 0
-        for i in xrange(num_original_columns, num_features):
-            dim += len(dicts[i])
-        input_types.append(sparse_binary_vector_sequence(dim))
-        logger.info("feature size=%s" % dim)
-    settings.input_types = input_types
-
-
-'''
-If oov_policy[i] == OOV_POLICY_USE, features in the i-th column that do not
-exist in dicts[i] will be assigned id 0.
-If oov_policy[i] == OOV_POLICY_ERROR, all features in the i-th column MUST
-exist in dicts[i].
-'''
-
-
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, filename):
-    input_file = filename
-    dicts = settings.dicts
-    oov_policy = settings.oov_policy
-
-    def gen_sample(sequence):
-        num_features = len(dicts)
-        sample = [list() for i in xrange(num_original_columns)]
-        if patterns:
-            sample.append([])
-        for features in sequence:
-            assert len(features) == num_features, \
-                "Wrong number of features: " + line
-            for i in xrange(num_original_columns):
-                id = dicts[i].get(features[i], -1)
-                if id != -1:
-                    sample[i].append(id)
-                elif oov_policy[i] == OOV_POLICY_IGNORE:
-                    sample[i].append(0xffffffff)
-                elif oov_policy[i] == OOV_POLICY_ERROR:
-                    logger.fatal("Unknown token: %s" % features[i])
-                else:
-                    sample[i].append(0)
-
-            if patterns:
-                dim = 0
-                vec = []
-                for i in xrange(num_original_columns, num_features):
-                    id = dicts[i].get(features[i], -1)
-                    if id != -1:
-                        vec.append(dim + id)
-                    elif oov_policy[i] == OOV_POLICY_IGNORE:
-                        pass
-                    elif oov_policy[i] == OOV_POLICY_ERROR:
-                        logger.fatal("Unknown token: %s" % features[i])
-                    else:
-                        vec.append(dim + 0)  # OOV_POLICY_USE: id 0 is the OOV slot
-
-                    dim += len(dicts[i])
-                sample[-1].append(vec)
-        return sample
-
-    num_features = len(dicts)
-    f = gzip.open(input_file, 'rb')
-
-    num_sequences = 0
-    sequence = []
-    for line in f:
-        line = line.strip()
-        if not line:
-            make_features(sequence)
-            yield gen_sample(sequence)
-            sequence = []
-            num_sequences += 1
-            continue
-        features = line.split(' ')
-        sequence.append(features)
-
-    f.close()
-
-    logger.info("num_sequences=%s" % num_sequences)
diff --git a/v1_api_demo/sequence_tagging/linear_crf.py b/v1_api_demo/sequence_tagging/linear_crf.py
deleted file mode 100644
index ea012ba1ae..0000000000
--- a/v1_api_demo/sequence_tagging/linear_crf.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-import math
-
-define_py_data_sources2(
-    train_list="data/train.list",
-    test_list="data/test.list",
-    module="dataprovider",
-    obj="process")
-
-batch_size = 1
-settings(
-    learning_method=MomentumOptimizer(),
-    batch_size=batch_size,
-    regularization=L2Regularization(batch_size * 1e-4),
-    model_average=ModelAverage(0.5),
-    learning_rate=1e-1,
-    learning_rate_decay_a=1e-5,
-    learning_rate_decay_b=0.25, )
-
-num_label_types = 23
-
-
-def get_simd_size(size):
-    return int(math.ceil(float(size) / 8)) * 8
-
-
-# Currently, in order to use sparse_update=True,
-# the size has to be aligned.
-num_label_types = get_simd_size(num_label_types)
-
-features = data_layer(name="features", size=76328)
-word = data_layer(name="word", size=6778)
-pos = data_layer(name="pos", size=44)
-chunk = data_layer(name="chunk", size=num_label_types)
-
-crf_input = fc_layer(
-    input=features,
-    size=num_label_types,
-    act=LinearActivation(),
-    bias_attr=False,
-    param_attr=ParamAttr(
-        initial_std=0, sparse_update=True))
-
-crf = crf_layer(
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(
-        name="crfw", initial_std=0), )
-
-crf_decoding = crf_decoding_layer(
-    size=num_label_types,
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(name="crfw"), )
-
-sum_evaluator(
-    name="error",
-    input=crf_decoding, )
-
-chunk_evaluator(
-    name="chunk_f1",
-    input=crf_decoding,
-    label=chunk,
-    chunk_scheme="IOB",
-    num_chunk_types=11, )
-
-inputs(word, pos, chunk, features)
-outputs(crf)
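
The get_simd_size() helper rounds the label count up to the next multiple of 8 so the sparse-updated parameter stays aligned, as the comment above notes. A quick stand-alone check of its behavior:

```python
import math

def get_simd_size(size):
    # Round up to the nearest multiple of 8 (same helper as in the config).
    return int(math.ceil(float(size) / 8)) * 8

print([get_simd_size(n) for n in (23, 24, 25)])  # [24, 24, 32]
```
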
diff --git a/v1_api_demo/sequence_tagging/readme.md b/v1_api_demo/sequence_tagging/readme.md
deleted file mode 100644
index 2e17fffb83..0000000000
--- a/v1_api_demo/sequence_tagging/readme.md
+++ /dev/null
@@ -1,45 +0,0 @@
-# Sequence Tagging
-
-This demo is a sequence model for assigning tags to each token in a sentence. The task is described on the <a href="http://www.cnts.ua.ac.be/conll2000/chunking">CoNLL-2000 Text Chunking</a> page.
-
-## Download data
-```bash
-cd demo/sequence_tagging
-./data/get_data.sh
-```
-
-## Train model
-```bash
-cd demo/sequence_tagging
-./train.sh
-```
-
-## Model description
-
-We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">Model name</th>
-<th scope="col" class="left">Number of parameters</th>
-<th scope="col" class="left">F1 score</th>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">linear_crf</td>
-<td class="left"> 1.8M </td>
-<td class="left"> 0.937</td>
-</tr>
-
-<tr>
-<td class="left">rnn_crf</td>
-<td class="left"> 960K </td>
-<td class="left">0.941</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
diff --git a/v1_api_demo/sequence_tagging/rnn_crf.py b/v1_api_demo/sequence_tagging/rnn_crf.py
deleted file mode 100644
index 937a34df10..0000000000
--- a/v1_api_demo/sequence_tagging/rnn_crf.py
+++ /dev/null
@@ -1,121 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-
-import math
-
-define_py_data_sources2(
-    train_list="data/train.list",
-    test_list="data/test.list",
-    module="dataprovider",
-    obj="process")
-
-batch_size = 16
-settings(
-    learning_method=MomentumOptimizer(),
-    batch_size=batch_size,
-    regularization=L2Regularization(batch_size * 1e-5),
-    model_average=ModelAverage(0.5),
-    learning_rate=2e-3,
-    learning_rate_decay_a=5e-7,
-    learning_rate_decay_b=0.5, )
-
-word_dim = 128
-hidden_dim = 128
-with_rnn = True
-
-initial_std = 1 / math.sqrt(hidden_dim)
-param_attr = ParamAttr(initial_std=initial_std)
-cpu_layer_attr = ExtraLayerAttribute(device=-1)
-
-default_device(0)
-
-num_label_types = 23
-
-features = data_layer(name="features", size=76328)
-word = data_layer(name="word", size=6778)
-pos = data_layer(name="pos", size=44)
-chunk = data_layer(
-    name="chunk", size=num_label_types, layer_attr=cpu_layer_attr)
-
-emb = embedding_layer(
-    input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
-
-hidden1 = mixed_layer(
-    size=hidden_dim,
-    act=STanhActivation(),
-    bias_attr=True,
-    input=[
-        full_matrix_projection(emb), table_projection(
-            pos, param_attr=param_attr)
-    ])
-
-if with_rnn:
-    rnn1 = recurrent_layer(
-        act=ReluActivation(),
-        bias_attr=True,
-        input=hidden1,
-        param_attr=ParamAttr(initial_std=0), )
-
-hidden2 = mixed_layer(
-    size=hidden_dim,
-    act=STanhActivation(),
-    bias_attr=True,
-    input=[full_matrix_projection(hidden1)] +
-    ([full_matrix_projection(
-        rnn1, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
-
-if with_rnn:
-    rnn2 = recurrent_layer(
-        reverse=True,
-        act=ReluActivation(),
-        bias_attr=True,
-        input=hidden2,
-        param_attr=ParamAttr(initial_std=0), )
-
-crf_input = mixed_layer(
-    size=num_label_types,
-    bias_attr=False,
-    input=[full_matrix_projection(hidden2), ] +
-    ([full_matrix_projection(
-        rnn2, param_attr=ParamAttr(initial_std=0))] if with_rnn else []), )
-
-crf = crf_layer(
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(
-        name="crfw", initial_std=0),
-    layer_attr=cpu_layer_attr, )
-
-crf_decoding = crf_decoding_layer(
-    size=num_label_types,
-    input=crf_input,
-    label=chunk,
-    param_attr=ParamAttr(name="crfw"),
-    layer_attr=cpu_layer_attr, )
-
-sum_evaluator(
-    name="error",
-    input=crf_decoding, )
-
-chunk_evaluator(
-    name="chunk_f1",
-    input=crf_decoding,
-    label=chunk,
-    chunk_scheme="IOB",
-    num_chunk_types=11, )
-
-inputs(word, pos, chunk, features)
-outputs(crf)
diff --git a/v1_api_demo/sequence_tagging/train.sh b/v1_api_demo/sequence_tagging/train.sh
deleted file mode 100755
index 37e196c842..0000000000
--- a/v1_api_demo/sequence_tagging/train.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-
-paddle train \
-       --config rnn_crf.py \
-       --parallel_nn=1 \
-       --use_gpu=1 \
-       --dot_period=10 \
-       --log_period=1000 \
-       --test_period=0 \
-       --num_passes=10 \
-2>&1 | tee 'train.log'
-paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/v1_api_demo/sequence_tagging/train_linear.sh b/v1_api_demo/sequence_tagging/train_linear.sh
deleted file mode 100755
index ad6e2d8ee7..0000000000
--- a/v1_api_demo/sequence_tagging/train_linear.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-#!/bin/bash
-
-paddle train \
-       --config linear_crf.py \
-       --use_gpu=0 \
-       --dot_period=100 \
-       --log_period=10000 \
-       --test_period=0 \
-       --num_passes=10
-2>&1 | tee 'train_linear.log'
-paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/v1_api_demo/traffic_prediction/README b/v1_api_demo/traffic_prediction/README
deleted file mode 100644
index 4c95188583..0000000000
--- a/v1_api_demo/traffic_prediction/README
+++ /dev/null
@@ -1,7 +0,0 @@
-Run with:
-cd ./data
-sh get_data.sh
-cd ..
-sh train.sh
-sh predict.sh
-
diff --git a/v1_api_demo/traffic_prediction/dataprovider.py b/v1_api_demo/traffic_prediction/dataprovider.py
deleted file mode 100644
index c7883b6950..0000000000
--- a/v1_api_demo/traffic_prediction/dataprovider.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer.PyDataProvider2 import *
-import sys
-import numpy as np
-TERM_NUM = 24
-FORECASTING_NUM = 24
-LABEL_VALUE_NUM = 4
-
-
-def initHook(settings, file_list, **kwargs):
-    """
-    Init hook, invoked before data processing. It sets settings.input_types and stores the data meta-information.
-
-    :param settings: global object. It will be passed to the process routine.
-    :type settings: object
-    :param file_list: the meta file object, passed from trainer_config.py but unused in this function.
-    :param kwargs: other unused arguments.
-    """
-    del kwargs  #unused 
-
-    settings.pool_size = sys.maxint
-    #Use a time series of past readings as the feature.
-    #A dense_vector is expressed as [float, float, ..., float].
-    settings.input_types = [dense_vector(TERM_NUM)]
-    #The next FORECASTING_NUM time slots are the prediction targets.
-    #Each predicted time point takes one of four states.
-    for i in range(FORECASTING_NUM):
-        settings.input_types.append(integer_value(LABEL_VALUE_NUM))
-
-
-@provider(
-    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
-def process(settings, file_name):
-    with open(file_name) as f:
-        #skip the header row of field names
-        f.next()
-        for row_num, line in enumerate(f):
-            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
-            # Get the max index.
-            end_time = len(speeds)
-            # Scanning and generating samples
-            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
-                # For dense slot
-                pre_spd = map(float, speeds[i - TERM_NUM:i])
-
-                # Labels for integer_value must start from 0, so subtract 1 from each.
-                fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
-
-                # A label to predict is missing; abandon the sample.
-                if -1 in fol_spd:
-                    continue
-                yield [pre_spd] + fol_spd
-
-
-def predict_initHook(settings, file_list, **kwargs):
-    settings.pool_size = sys.maxint
-    settings.input_types = [dense_vector(TERM_NUM)]
-
-
-@provider(init_hook=predict_initHook, should_shuffle=False)
-def process_predict(settings, file_name):
-    with open(file_name) as f:
-        #skip the header row of field names
-        f.next()
-        for row_num, line in enumerate(f):
-            speeds = map(int, line.rstrip('\r\n').split(","))
-            end_time = len(speeds)
-            pre_spd = map(float, speeds[end_time - TERM_NUM:end_time])
-            yield pre_spd
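
process() turns each row of speed readings into sliding-window samples: TERM_NUM past readings become the dense feature, the next FORECASTING_NUM labels (shifted to start at 0) become the targets, and windows containing a missing label are dropped. A toy re-creation of that loop with invented data:

```python
TERM_NUM, FORECASTING_NUM = 3, 2
speeds = [2, 3, 1, 4, 2, 0, 3, 2]   # toy states; 0 marks a missing label

for i in range(TERM_NUM, len(speeds) - FORECASTING_NUM):
    pre_spd = [float(s) for s in speeds[i - TERM_NUM:i]]
    fol_spd = [s - 1 for s in speeds[i:i + FORECASTING_NUM]]
    if -1 in fol_spd:               # a label in the window was missing
        continue
    print([pre_spd] + fol_spd)      # one training sample
# [[2.0, 3.0, 1.0], 3, 1]
```
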
diff --git a/v1_api_demo/traffic_prediction/gen_result.py b/v1_api_demo/traffic_prediction/gen_result.py
deleted file mode 100644
index 3da70b3031..0000000000
--- a/v1_api_demo/traffic_prediction/gen_result.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-res = []
-with open('./rank-00000') as f:
-    for line in f:
-        pred = map(int, line.strip('\r\n;').split(";"))
-        #raw predictions range from 0 to 3
-        res.append([i + 1 for i in pred])
-
-file_name = open('./data/pred.list').read().strip('\r\n')
-
-FORECASTING_NUM = 24
-header = [
-    'id',
-    '201604200805',
-    '201604200810',
-    '201604200815',
-    '201604200820',
-    '201604200825',
-    '201604200830',
-    '201604200835',
-    '201604200840',
-    '201604200845',
-    '201604200850',
-    '201604200855',
-    '201604200900',
-    '201604200905',
-    '201604200910',
-    '201604200915',
-    '201604200920',
-    '201604200925',
-    '201604200930',
-    '201604200935',
-    '201604200940',
-    '201604200945',
-    '201604200950',
-    '201604200955',
-    '201604201000',
-]
-###################
-## To CSV format ##
-###################
-with open(file_name) as f:
-    f.next()
-    print ','.join(header)
-    for row_num, line in enumerate(f):
-        fields = line.rstrip('\r\n').split(',')
-        linkid = fields[0]
-        print linkid + ',' + ','.join(map(str, res[row_num]))
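The conversion above is small enough to restate. A minimal sketch with invented
link ids and predictions (the real script reads ./rank-00000 and the file named
in ./data/pred.list):

    # Shift raw classes 0-3 back to the published range 1-4 and emit CSV rows.
    res = [[c + 1 for c in row] for row in [[0, 3, 2], [1, 1, 0]]]  # invented
    link_ids = ['1001', '1002']                                     # invented
    print(','.join(['id', 't1', 't2', 't3']))
    for link_id, row in zip(link_ids, res):
        print(link_id + ',' + ','.join(map(str, row)))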
diff --git a/v1_api_demo/traffic_prediction/train.sh b/v1_api_demo/traffic_prediction/train.sh
deleted file mode 100755
index 48dfc5604f..0000000000
--- a/v1_api_demo/traffic_prediction/train.sh
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -e
-
-cfg=trainer_config.py
-paddle train \
-  --config=$cfg \
-  --save_dir=./output \
-  --trainer_count=4 \
-  --log_period=1000 \
-  --dot_period=10 \
-  --num_passes=10 \
-  --use_gpu=false \
-  --show_parameter_stats_period=3000 \
-  2>&1 | tee 'train.log'
diff --git a/v1_api_demo/traffic_prediction/trainer_config.py b/v1_api_demo/traffic_prediction/trainer_config.py
deleted file mode 100755
index 52d678624a..0000000000
--- a/v1_api_demo/traffic_prediction/trainer_config.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from paddle.trainer_config_helpers import *
-
-################################### DATA Configuration #############################################
-is_predict = get_config_arg('is_predict', bool, False)
-trn = './data/train.list' if not is_predict else None
-tst = './data/test.list' if not is_predict else './data/pred.list'
-process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(
-    train_list=trn, test_list=tst, module="dataprovider", obj=process)
-################################### Parameter Configuration #######################################
-TERM_NUM = 24
-FORECASTING_NUM = 24
-emb_size = 16
-batch_size = 128 if not is_predict else 1
-settings(
-    batch_size=batch_size,
-    learning_rate=1e-3,
-    learning_method=RMSPropOptimizer())
-################################### Algorithm Configuration ########################################
-
-output_label = []
-
-link_encode = data_layer(name='link_encode', size=TERM_NUM)
-for i in xrange(FORECASTING_NUM):
-    # All tasks share the same weights.
-    link_param = ParamAttr(
-        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
-    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
-    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
-    if is_predict:
-        maxid = maxid_layer(score)
-        output_label.append(maxid)
-    else:
-        # Multi-task training.
-        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
-        cls = classification_cost(
-            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
-        output_label.append(cls)
-outputs(output_label)
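The weight sharing in the loop above comes entirely from reusing one ParamAttr
name across iterations. A minimal sketch of the same pattern ('_shared.w' is an
invented name):

    shared_w = ParamAttr(name='_shared.w', initial_max=1.0, initial_min=-1.0)
    # Both layers read and update the same weight matrix.
    vec_a = fc_layer(input=link_encode, size=emb_size, param_attr=shared_w)
    vec_b = fc_layer(input=link_encode, size=emb_size, param_attr=shared_w)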
diff --git a/v1_api_demo/vae/README.md b/v1_api_demo/vae/README.md
deleted file mode 100644
index e55d483b02..0000000000
--- a/v1_api_demo/vae/README.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# Variational Autoencoder (VAE)
-
-This demo implements VAE training described in the original paper (https://arxiv.org/abs/1312.6114).
-
-
-In order to run the model, first download the MNIST dataset by running the shell script in ./data.
-
-Then you can run the command below. The flag --use_gpu specifies whether to use the GPU for training (0 for CPU, 1 for GPU).
-
-$python vae_train.py [--use_gpu 1]
-
-The generated images will be stored in ./samples/
-The corresponding models will be stored in ./params/
diff --git a/v1_api_demo/vae/data/get_mnist_data.sh b/v1_api_demo/vae/data/get_mnist_data.sh
deleted file mode 100755
index a77c81bf5a..0000000000
--- a/v1_api_demo/vae/data/get_mnist_data.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/usr/bin/env sh
-# This script downloads the MNIST data and unzips it.
-set -e
-DIR="$( cd "$(dirname "$0")" ; pwd -P )"
-rm -rf "$DIR/mnist_data"
-mkdir "$DIR/mnist_data"
-cd "$DIR/mnist_data"
-
-echo "Downloading..."
-
-for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
-do
-    if [ ! -e $fname ]; then
-        wget --no-check-certificate http://yann.lecun.com/exdb/mnist/${fname}.gz
-        gunzip ${fname}.gz
-    fi
-done
diff --git a/v1_api_demo/vae/dataloader.py b/v1_api_demo/vae/dataloader.py
deleted file mode 100644
index e9ff95d44f..0000000000
--- a/v1_api_demo/vae/dataloader.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-
-
-class MNISTloader():
-    def __init__(self,
-                 data_path="./data/mnist_data/",
-                 batch_size=60,
-                 process='train'):
-        self.batch_size = batch_size
-        self.data_path = data_path
-        self._pointer = 0
-        self.image_batches = np.array([])
-        self.process = process
-
-    def _extract_images(self, filename, n):
-        f = open(filename, 'rb')
-        f.read(16)
-        data = np.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
-        # Map pixel values into [-1, 1].
-        data = data / 255. * 2. - 1
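-        # NOTE: the split count is hard-coded for the 60000-image training set.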
-        data_batches = np.split(data, 60000 / self.batch_size, 0)
-
-        f.close()
-
-        return data_batches
-
-    @property
-    def pointer(self):
-        return self._pointer
-
-    def load_data(self):
-        TRAIN_IMAGES = '%s/train-images-idx3-ubyte' % self.data_path
-        TEST_IMAGES = '%s/t10k-images-idx3-ubyte' % self.data_path
-
-        if self.process == 'train':
-            self.image_batches = self._extract_images(TRAIN_IMAGES, 60000)
-        else:
-            self.image_batches = self._extract_images(TEST_IMAGES, 10000)
-
-    def next_batch(self):
-        batch = self.image_batches[self._pointer]
-        self._pointer = (self._pointer + 1) % (60000 / self.batch_size)
-        return np.array(batch)
-
-    def reset_pointer(self):
-        self._pointer = 0
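A minimal usage sketch for the loader above, assuming the files fetched by
get_mnist_data.sh are already in ./data/mnist_data/:

    loader = MNISTloader(batch_size=60, process='train')
    loader.load_data()           # reads train-images-idx3-ubyte
    batch = loader.next_batch()  # ndarray of shape (60, 784), values in [-1, 1]
    loader.reset_pointer()       # rewind before the next pass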
diff --git a/v1_api_demo/vae/vae_conf.py b/v1_api_demo/vae/vae_conf.py
deleted file mode 100644
index 301dd23793..0000000000
--- a/v1_api_demo/vae/vae_conf.py
+++ /dev/null
@@ -1,116 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from paddle.trainer_config_helpers import *
-import numpy as np
-
-is_generating = get_config_arg("is_generating", bool, False)
-
-settings(batch_size=32, learning_rate=1e-3, learning_method=AdamOptimizer())
-
-X_dim = 28 * 28
-h_dim = 128
-z_dim = 100
-
-
-def reparameterization(mu, logvar):
-    eps = ParamAttr(initial_mean=0., initial_std=1.)
-    with mixed_layer() as sigma:
-        # sigma = exp(logvar / 2) is the standard deviation, not 0.5 * exp(logvar).
-        sigma += dotmul_projection(layer_math.exp(logvar * 0.5), param_attr=eps)
-    return mu + sigma
-
-
-def q_func(X):
-    """
-    Xavier initialization for the shared encoder weights.
-    """
-    param_attr = ParamAttr(
-        name='share.w', initial_mean=0., initial_std=1. / np.sqrt(X_dim / 2.))
-    mu_param = ParamAttr(
-        name='mu.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-    logvar_param = ParamAttr(
-        name='logvar.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-
-    bias_attr = ParamAttr(name='share.bias', initial_mean=0., initial_std=0.)
-    mu_bias = ParamAttr(name='mu.bias', initial_mean=0., initial_std=0.)
-    logvar_bias = ParamAttr(name='logvar.bias', initial_mean=0., initial_std=0.)
-
-    share_layer = fc_layer(
-        X,
-        size=h_dim,
-        param_attr=param_attr,
-        bias_attr=bias_attr,
-        act=ReluActivation())
-
-    return (fc_layer(
-        share_layer,
-        size=z_dim,
-        param_attr=mu_param,
-        bias_attr=mu_bias,
-        act=LinearActivation()), fc_layer(
-            share_layer,
-            size=z_dim,
-            param_attr=logvar_param,
-            bias_attr=logvar_bias,
-            act=LinearActivation()))
-
-
-def generator(z):
-
-    hidden_param = ParamAttr(
-        name='hidden.w', initial_mean=0., initial_std=1. / np.sqrt(z_dim / 2.))
-    hidden_bias = ParamAttr(name='hidden.bias', initial_mean=0., initial_std=0.)
-    prob_param = ParamAttr(
-        name='prob.w', initial_mean=0., initial_std=1. / np.sqrt(h_dim / 2.))
-    prob_bias = ParamAttr(name='prob.bias', initial_mean=0., initial_std=0.)
-
-    hidden_layer = fc_layer(
-        z,
-        size=h_dim,
-        act=ReluActivation(),
-        param_attr=hidden_param,
-        bias_attr=hidden_bias)
-    prob = fc_layer(
-        hidden_layer,
-        size=X_dim,
-        act=SigmoidActivation(),
-        param_attr=prob_param,
-        bias_attr=prob_bias)
-
-    return prob
-
-
-def reconstruct_error(prob, X):
-    cost = multi_binary_label_cross_entropy(input=prob, label=X)
-    return cost
-
-
-def KL_loss(mu, logvar):
-    with mixed_layer() as mu_square:
-        mu_square += dotmul_operator(mu, mu, scale=1.)
-
-    cost = 0.5 * sum_cost(layer_math.exp(logvar) + mu_square - 1. - logvar)
-
-    return cost
-
-
-if not is_generating:
-    x_batch = data_layer(name='x_batch', size=X_dim)
-    mu, logvar = q_func(x_batch)
-    z_samples = reparameterization(mu, logvar)
-    prob = generator(z_samples)
-    outputs(reconstruct_error(prob, x_batch) + KL_loss(mu, logvar))
-else:
-    z_samples = data_layer(name='noise', size=z_dim)
-    outputs(generator(z_samples))
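The reparameterization and KL terms above follow the standard VAE formulation,
z = mu + exp(logvar / 2) * eps with eps ~ N(0, I). A NumPy sketch of the same
math, outside the Paddle layer API:

    import numpy as np

    def reparameterize(mu, logvar):
        # z = mu + sigma * eps, where sigma = exp(0.5 * logvar).
        eps = np.random.randn(*mu.shape)
        return mu + np.exp(0.5 * logvar) * eps

    def kl_loss(mu, logvar):
        # KL(q(z|x) || N(0, I)) = 0.5 * sum(exp(logvar) + mu^2 - 1 - logvar)
        return 0.5 * np.sum(np.exp(logvar) + mu ** 2 - 1.0 - logvar)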
diff --git a/v1_api_demo/vae/vae_train.py b/v1_api_demo/vae/vae_train.py
deleted file mode 100644
index 1babb011c7..0000000000
--- a/v1_api_demo/vae/vae_train.py
+++ /dev/null
@@ -1,175 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import random
-import numpy as np
-import cPickle
-import sys, os
-from PIL import Image
-
-from paddle.trainer.config_parser import parse_config
-from paddle.trainer.config_parser import logger
-import py_paddle.swig_paddle as api
-import dataloader
-import matplotlib.pyplot as plt
-import matplotlib.gridspec as gridspec
-
-
-def plot_samples(samples):
-    fig = plt.figure(figsize=(4, 4))
-    gs = gridspec.GridSpec(4, 4)
-    gs.update(wspace=0.05, hspace=0.05)
-    for i, sample in enumerate(samples):
-        plt.subplot(gs[i])
-        plt.axis('off')
-        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
-
-    return fig
-
-
-def CHECK_EQ(a, b):
-    assert a == b, "a=%s, b=%s" % (a, b)
-
-
-def get_fake_samples(generator_machine, batch_size, noise):
-    gen_inputs = api.Arguments.createArguments(1)
-    gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
-    gen_outputs = api.Arguments.createArguments(0)
-    generator_machine.forward(gen_inputs, gen_outputs, api.PASS_TEST)
-    fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
-    return fake_samples
-
-
-def copy_shared_parameters(src, dst):
-    '''
-    copy the parameters from src to dst
-    :param src: the source of the parameters
-    :type src: GradientMachine
-    :param dst: the destination of the parameters
-    :type dst: GradientMachine
-    '''
-    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
-    src_params = dict([(p.getName(), p) for p in src_params])
-
-    for i in xrange(dst.getParameterSize()):
-        dst_param = dst.getParameter(i)
-        src_param = src_params.get(dst_param.getName(), None)
-        if src_param is None:
-            continue
-        src_value = src_param.getBuf(api.PARAMETER_VALUE)
-        dst_value = dst_param.getBuf(api.PARAMETER_VALUE)
-        CHECK_EQ(len(src_value), len(dst_value))
-        dst_value.copyFrom(src_value)
-        dst_param.setValueUpdated()
-
-
-def find(iterable, cond):
-    for item in iterable:
-        if cond(item):
-            return item
-    return None
-
-
-def get_layer_size(model_conf, layer_name):
-    layer_conf = find(model_conf.layers, lambda x: x.name == layer_name)
-    assert layer_conf is not None, "Cannot find '%s' layer" % layer_name
-    return layer_conf.size
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--use_gpu", default="1", help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
-    args = parser.parse_args()
-    use_gpu = args.use_gpu
-    assert use_gpu in ["0", "1"]
-
-    if not os.path.exists("./samples/"):
-        os.makedirs("./samples/")
-
-    if not os.path.exists("./params/"):
-        os.makedirs("./params/")
-
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
-                   '--log_period=1000', '--gpu_id=' + args.gpu_id,
-                   '--save_dir=' + "./params/")
-
-    conf = "vae_conf.py"
-
-    trainer_conf = parse_config(conf, "is_generating=False")
-    gener_conf = parse_config(conf, "is_generating=True")
-
-    batch_size = trainer_conf.opt_config.batch_size
-
-    noise_dim = get_layer_size(gener_conf.model_config, "noise")
-
-    mnist = dataloader.MNISTloader(batch_size=batch_size)
-    mnist.load_data()
-
-    training_machine = api.GradientMachine.createFromConfigProto(
-        trainer_conf.model_config)
-
-    generator_machine = api.GradientMachine.createFromConfigProto(
-        gener_conf.model_config)
-
-    trainer = api.Trainer.create(trainer_conf, training_machine)
-
-    trainer.startTrain()
-
-    for train_pass in xrange(100):
-        trainer.startTrainPass()
-        mnist.reset_pointer()
-        i = 0
-        it = 0
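-        # Iterate until the loader's pointer wraps back to 0 (one full pass);
-        # `i == 0` keeps the loop alive on the very first iteration.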
-        while mnist.pointer != 0 or i == 0:
-            X = mnist.next_batch().astype('float32')
-
-            inputs = api.Arguments.createArguments(1)
-            inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(X))
-
-            trainer.trainOneDataBatch(batch_size, inputs)
-
-            if it % 1000 == 0:
-
-                outputs = api.Arguments.createArguments(0)
-                training_machine.forward(inputs, outputs, api.PASS_TEST)
-                loss = np.mean(outputs.getSlotValue(0).copyToNumpyMat())
-                print "\niter: {}".format(str(it).zfill(3))
-                print "VAE loss: {}".format(str(loss).zfill(3))
-
-                # Sync parameters from the training network into the generator network.
-                copy_shared_parameters(training_machine, generator_machine)
-
-                z_samples = np.random.randn(batch_size,
-                                            noise_dim).astype('float32')
-                samples = get_fake_samples(generator_machine, batch_size,
-                                           z_samples)
-
-                # Plot the first 16 generated samples as one figure.
-                figure = plot_samples(samples[:16])
-                plt.savefig(
-                    "./samples/{}_{}.png".format(
-                        str(train_pass).zfill(3), str(i).zfill(3)),
-                    bbox_inches='tight')
-                plt.close(figure)
-                i += 1
-            it += 1
-
-        trainer.finishTrainPass()
-    trainer.finishTrain()
-
-
-if __name__ == '__main__':
-    main()